In [None]:
import json
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class RiskIdentificationModel(nn.Module):
    def __init__(self, input_dim):
        super(RiskIdentificationModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.layers(x)

def generate_simulated_dataset():
    # Simulated dataset with TC, workgroup, and risk features
    np.random.seed(42)
    n_samples = 5000
    
    data = {
        'TC': np.random.randint(1, 9, n_samples),
        #'Workgroup': np.random.randint(1, 20, n_samples),
        # removing workgroup, this seems a little silly. 
        
        'ApplicationAge': np.random.normal(20, 5, n_samples),
        'PreviousRejectionRate': np.random.uniform(0, 1, n_samples),
        'ComplexityScore': np.random.normal(50, 10, n_samples),
        'PriorArtCitations': np.random.randint(0, 50, n_samples),
        'AllowanceRate': np.random.uniform(0, 1, n_samples),
        'RiskLevel': np.random.randint(0, 2, n_samples)
    }
    
    df = pd.DataFrame(data)
    return df

def prepare_data(df):
    # Prepare features and labels
    features = ['TC', 'Workgroup', 'ApplicationAge', 
                'PreviousRejectionRate', 'ComplexityScore', 
                'PriorArtCitations', 'AllowanceRate']
    
    X = df[features]
    y = df['RiskLevel']
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Convert to PyTorch tensors
    X_tensor = torch.FloatTensor(X_scaled)
    y_tensor = torch.FloatTensor(y.values).unsqueeze(1)
    
    return X_tensor, y_tensor, scaler

def train_risk_model(X_train, y_train, X_val, y_val):
    input_dim = X_train.shape[1]
    model = RiskIdentificationModel(input_dim)
    
    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training loop
    epochs = 100
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        
        loss.backward()
        optimizer.step()
        
        # Validation
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, y_val)
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}: Train Loss {loss.item()}, Val Loss {val_loss.item()}')
    
    return model

def identify_high_risk_areas(model, X, df, scaler):
    # Predict risk probabilities
    model.eval()
    with torch.no_grad():
        risk_probs = model(X)
    
    # Add probabilities back to dataframe
    df['RiskProbability'] = risk_probs.numpy()
    
    # Create a copy to work with for feature aggregation
    df_with_prob = df.copy()
    df_with_prob['RiskProbability'] = risk_probs.numpy()
    
    # Group by TC and Workgroup to get mean values of all features
    high_risk_areas = df_with_prob.groupby(['TC', 'Workgroup']).agg({
        'ApplicationAge': 'mean',
        'PreviousRejectionRate': 'mean',
        'ComplexityScore': 'mean',
        'PriorArtCitations': 'mean',
        'AllowanceRate': 'mean',
        'RiskLevel': 'mean',
        'RiskProbability': 'mean'
    }).reset_index()
    
    # Sort by RiskProbability (descending)
    high_risk_areas = high_risk_areas.sort_values('RiskProbability', ascending=False)
    
    return high_risk_areas


def main():
    # Generate simulated dataset
    df = generate_simulated_dataset()
    
    # Prepare data
    X_tensor, y_tensor, scaler = prepare_data(df)
    
    # Split data
    X_train, X_val, y_train, y_val = train_test_split(
        X_tensor, y_tensor, test_size=0.2, random_state=42
    )
    
    # Train model
    model = train_risk_model(X_train, y_train, X_val, y_val)
    
    # Identify high-risk areas
    high_risk_areas = identify_high_risk_areas(model, X_tensor, df, scaler)

       # Save high-risk areas to JSON file
    output_file = "high_risk_areas.json"
    
    dir_path = os.path.dirname(output_file)
    if dir_path == "":
        dir_path = "."  # If no directory (i.e., file in current dir), just use current dir
    else:
        dir_path = dir_path  # Already has path
    
    # Create directory if needed
    os.makedirs(dir_path, exist_ok=True)
    
    # Convert DataFrame to list of dicts (for JSON serialization)
    high_risk_areas_json = high_risk_areas.to_dict(orient='records')
    
    # Write to JSON file
    with open(output_file, 'w') as f:
        json.dump(high_risk_areas_json, f, indent=4)
    
    print("\nTop 10 High-Risk Areas:")
    print(high_risk_areas.head(10))



if __name__ == "__main__":
    main()

Epoch 0: Train Loss 0.7000737190246582, Val Loss 0.6979249715805054
Epoch 10: Train Loss 0.6954935193061829, Val Loss 0.6931849122047424
Epoch 20: Train Loss 0.6928402781486511, Val Loss 0.6931735277175903
Epoch 30: Train Loss 0.692310094833374, Val Loss 0.6934025883674622
Epoch 40: Train Loss 0.6928223967552185, Val Loss 0.6935884952545166
Epoch 50: Train Loss 0.6918793320655823, Val Loss 0.6937255263328552
Epoch 60: Train Loss 0.6907819509506226, Val Loss 0.693825900554657
Epoch 70: Train Loss 0.6910154819488525, Val Loss 0.6940131783485413
Epoch 80: Train Loss 0.6909488439559937, Val Loss 0.6944582462310791
Epoch 90: Train Loss 0.6897420287132263, Val Loss 0.6947230100631714

Top 10 High-Risk Areas:
     TC  Workgroup  ApplicationAge  PreviousRejectionRate  ComplexityScore  \
134   8          2       20.622919               0.423613        49.968232   
114   7          1       21.935356               0.383025        48.527392   
57    4          1       20.490379               0.445

In [2]:
def calculate_application_age(application_data):
    """
    Calculate application age from filing date to current date
    """
    current_date = datetime.now()
    application_age = (current_date - application_data['filing_date']).days / 365.25
    return application_age

In [3]:
def calculate_rejection_rate(examiner_history):
    """
    Calculate rejection rate based on historical office actions
    """
    total_actions = len(examiner_history)
    rejected_actions = sum(1 for action in examiner_history if action == 'rejected')
    
    rejection_rate = rejected_actions / total_actions if total_actions > 0 else 0
    return rejection_rate

In [None]:
def calculate_complexity_score(application_details):
    """
    Multi-dimensional complexity scoring
    """
    # Technical complexity factors
    claim_complexity = len(application_details['claims'])
    technical_domain_complexity = get_domain_complexity_factor(application_details['domain'])
    
    # Patent family size
    patent_family_size = len(application_details['related_patents'])
    
    # Interdisciplinary score
    interdisciplinary_score = calculate_interdisciplinary_score(application_details)
    
    # Weighted complexity calculation
    complexity_score = (
        0.3 * claim_complexity + 
        0.3 * technical_domain_complexity + 
        0.2 * patent_family_size + 
        0.2 * interdisciplinary_score
    )
    
    return complexity_score

def get_domain_complexity_factor(domain):
    """
    Predefined complexity mapping for technical domains
    """
    domain_complexity = {
        'computer_science': 0.8,
        'biotechnology': 0.9,
        'mechanical_engineering': 0.6,
        'electrical_engineering': 0.7
        # Add more domains
    }
    return domain_complexity.get(domain, 0.5)

In [None]:
def extract_prior_art_citations(patent_application):
    """
    Extract and analyze prior art citations
    """
    # Patent database citation analysis
    citations = patent_application['citations']
    
    # Citation metrics
    citation_count = len(citations)
    unique_citation_sources = len(set(citation['source'] for citation in citations))
    
    # Citation age analysis
    current_year = datetime.now().year
    citation_age_distribution = [
        current_year - citation['year'] 
        for citation in citations
    ]
    
    # Advanced citation analysis
    citation_metrics = {
        'total_citations': citation_count,
        'unique_sources': unique_citation_sources,
        'avg_citation_age': np.mean(citation_age_distribution),
        'citation_diversity_score': calculate_citation_diversity(citations)
    }
    
    return citation_metrics

def calculate_citation_diversity(citations):
    """
    Calculate diversity of citation sources
    """
    source_domains = [citation['domain'] for citation in citations]
    unique_domains = len(set(source_domains))
    return unique_domains / len(citations) if citations else 0

In [None]:
def calculate_allowance_rate(examiner_history):
    """
    Calculate patent allowance rate
    """
    total_applications = len(examiner_history)
    allowed_applications = sum(1 for app in examiner_history if app['status'] == 'allowed')
    
    allowance_rate = allowed_applications / total_applications if total_applications > 0 else 0
    
    return {
        'allowance_rate': allowance_rate,
        'total_applications': total_applications,
        'allowed_applications': allowed_applications
    }

In [None]:
class PatentDataWarehouse:
    def __init__(self):
        self.database = {
            'applications': [],
            'citations': [],
            'examiner_history': []
        }
    
    def collect_data(self, data_source):
        # Implement data collection from various sources
        pass
    
    def preprocess_data(self):
        # Clean and standardize data
        pass
    
    def feature_engineering(self):
        # Generate complex features
        pass

In [None]:
def calculate_risk_level(application_data):
    # Weighted risk factors
    risk_components = {
        'examination_complexity': 0.25,
        'prior_rejection_history': 0.20,
        'citation_complexity': 0.15,
        'prosecution_history': 0.15,
        'technical_domain_risk': 0.15,
        'examiner_performance': 0.10
    }
    
    # Detailed risk calculation
    risk_scores = {
        'examination_complexity': calculate_examination_complexity(application_data),
        'prior_rejection_history': calculate_prior_rejection_risk(application_data),
        'citation_complexity': calculate_citation_risk(application_data),
        'prosecution_history': calculate_prosecution_history_risk(application_data),
        'technical_domain_risk': calculate_technical_domain_risk(application_data),
        'examiner_performance': calculate_examiner_performance_risk(application_data)
    }
    
    # Weighted risk aggregation
    total_risk_score = sum(
        risk_scores[component] * weight 
        for component, weight in risk_components.items()
    )
    
    # Risk level categorization
    return categorize_risk_level(total_risk_score)

def categorize_risk_level(risk_score):
    """
    Convert continuous risk score to discrete risk levels
    """
    if risk_score < 0.2:
        return 0  # Low Risk
    elif risk_score < 0.4:
        return 1  # Medium-Low Risk
    elif risk_score < 0.6:
        return 2  # Medium Risk
    elif risk_score < 0.8:
        return 3  # High-Medium Risk
    else:
        return 4  # High Risk

In [None]:
def calculate_examination_complexity(application_data):
    """
    Assess complexity of patent examination
    """
    complexity_factors = [
        len(application_data['claims']),
        application_data['total_pages'],
        application_data['unique_independent_claims']
    ]
    
    # Normalize and aggregate complexity
    normalized_complexity = [
        (factor - min(complexity_factors)) / 
        (max(complexity_factors) - min(complexity_factors) + 1e-8)
        for factor in complexity_factors
    ]
    
    return np.mean(normalized_complexity)

def calculate_prior_rejection_risk(application_data):
    """
    Evaluate risk based on prior rejections
    """
    rejection_history = application_data['rejection_history']
    
    # Multiple rejection indicators
    risk_indicators = [
        len(rejection_history),  # Number of rejections
        any(rejection['type'] == 'final' for rejection in rejection_history),
        sum(1 for rejection in rejection_history if rejection['type'] == 'non-final') > 2
    ]
    
    # Weighted risk calculation
    weights = [0.5, 0.3, 0.2]
    return np.dot(risk_indicators, weights)

def calculate_citation_risk(application_data):
    """
    Assess risk through citation analysis
    """
    citations = application_data['citations']
    
    citation_risk_factors = [
        len(citations),  # Total citations
        len(set(citation['source_country'] for citation in citations)),  # Citation diversity
        sum(1 for citation in citations if citation['age'] < 5)  # Recent citations
    ]
    
    # Normalize and aggregate
    normalized_factors = [
        (factor - min(citation_risk_factors)) / 
        (max(citation_risk_factors) - min(citation_risk_factors) + 1e-8)
        for factor in citation_risk_factors
    ]
    
    return np.mean(normalized_factors)

def calculate_prosecution_history_risk(application_data):
    """
    Evaluate prosecution complexity and duration
    """
    prosecution_data = application_data['prosecution_history']
    
    risk_indicators = [
        prosecution_data['total_office_actions'],
        prosecution_data['prosecution_duration'],
        prosecution_data['amendments_count']
    ]
    
    # Normalize risk indicators
    normalized_indicators = [
        (indicator - min(risk_indicators)) / 
        (max(risk_indicators) - min(risk_indicators) + 1e-8)
        for indicator in risk_indicators
    ]
    
    return np.mean(normalized_indicators)

def calculate_technical_domain_risk(application_data):
    """
    Assess risk based on technical domain characteristics
    """
    domain_risk_mapping = {
        'biotechnology': 0.8,
        'artificial_intelligence': 0.7,
        'blockchain': 0.6,
        'quantum_computing': 0.9,
        'default': 0.5
    }
    
    domain = application_data['technical_domain']
    base_domain_risk = domain_risk_mapping.get(domain, domain_risk_mapping['default'])
    
    # Additional domain-specific risk factors
    domain_complexity_factors = [
        application_data['interdisciplinary_score'],
        application_data['emerging_technology_indicator']
    ]
    
    return base_domain_risk * np.mean(domain_complexity_factors)

def calculate_examiner_performance_risk(application_data):
    """
    Evaluate risk through examiner-specific factors
    """
    examiner_data = application_data['examiner_profile']
    
    performance_indicators = [
        examiner_data['allowance_rate'],
        examiner_data['average_prosecution_time'],
        examiner_data['technical_specialization_match']
    ]
    
    # Inverse relationship with risk
    risk_score = 1 - np.mean(performance_indicators)
    
    return risk_score

In [None]:
def calculate_technical_specialization_match(examiner_profile, patent_application):
    """
    Calculate the alignment between examiner's technical expertise 
    and the patent application's technical domain
    """
    # Examiner's technical background
    examiner_expertise = {
        'primary_art_unit': examiner_profile['art_unit'],
        'technical_domains': examiner_profile['technical_domains'],
        'education_background': examiner_profile['education'],
        'publication_areas': examiner_profile['research_publications']
    }
    
    # Patent application technical characteristics
    application_tech_profile = {
        'primary_domain': patent_application['technical_domain'],
        'secondary_domains': patent_application['related_domains'],
        'claim_keywords': extract_technical_keywords(patent_application['claims']),
        'abstract_keywords': extract_technical_keywords(patent_application['abstract'])
    }
    
    # Matching techniques
    matching_scores = [
        domain_similarity_score(examiner_expertise, application_tech_profile),
        art_unit_alignment_score(examiner_expertise, application_tech_profile),
        keyword_matching_score(examiner_expertise, application_tech_profile),
        educational_background_relevance(examiner_expertise, application_tech_profile)
    ]
    
    # Weighted average of matching scores
    weights = [0.3, 0.25, 0.25, 0.2]
    technical_specialization_match = np.dot(matching_scores, weights)
    
    return technical_specialization_match

def domain_similarity_score(examiner_expertise, application_tech_profile):
    """
    Calculate similarity between examiner's domains and application domains
    """
    examiner_domains = set(examiner_expertise['technical_domains'])
    application_domains = set([application_tech_profile['primary_domain']] + 
                               application_tech_profile['secondary_domains'])
    
    # Jaccard similarity
    intersection = len(examiner_domains.intersection(application_domains))
    union = len(examiner_domains.union(application_domains))
    
    return intersection / union if union > 0 else 0

def art_unit_alignment_score(examiner_expertise, application_tech_profile):
    """
    Assess alignment between examiner's art unit and application domain
    """
    art_unit_domain_mapping = {
        '2100': ['computer_technology', 'artificial_intelligence'],
        '1600': ['biotechnology', 'molecular_biology'],
        '3600': ['electrical_engineering', 'telecommunications']
        # Add more mappings
    }
    
    examiner_art_unit = examiner_expertise['primary_art_unit']
    primary_domain = application_tech_profile['primary_domain']
    
    # Check if domain matches art unit domains
    matching_domains = art_unit_domain_mapping.get(examiner_art_unit, [])
    
    return 1.0 if primary_domain in matching_domains else 0.0

def keyword_matching_score(examiner_expertise, application_tech_profile):
    """
    Match technical keywords from publications and application
    """
    examiner_keywords = set(
        keyword.lower() 
        for publication in examiner_expertise['publication_areas']
        for keyword in extract_technical_keywords(publication)
    )
    
    application_keywords = set(
        keyword.lower()
        for keyword_source in [
            application_tech_profile['claim_keywords'],
            application_tech_profile['abstract_keywords']
        ]
        for keyword in keyword_source
    )
    
    # Calculate keyword overlap
    intersection = len(examiner_keywords.intersection(application_keywords))
    union = len(examiner_keywords.union(application_keywords))
    
    return intersection / union if union > 0 else 0

def educational_background_relevance(examiner_expertise, application_tech_profile):
    """
    Assess relevance of examiner's educational background
    """
    education_domain_mapping = {
        'Computer Science': ['computer_technology', 'artificial_intelligence'],
        'Electrical Engineering': ['electronics', 'telecommunications'],
        'Biotechnology': ['molecular_biology', 'genetic_engineering'],
        # Add more mappings
    }
    
    examiner_education = examiner_expertise['education_background']
    primary_domain = application_tech_profile['primary_domain']
    
    # Check educational background domain alignment
    matching_domains = education_domain_mapping.get(examiner_education, [])
    
    return 1.0 if primary_domain in matching_domains else 0.5

def extract_technical_keywords(text):
    """
    Extract technical keywords from text
    """
    # Advanced NLP-based keyword extraction
    import spacy
    
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    
    # Extract nouns and technical terms
    keywords = [
        token.lemma_.lower() 
        for token in doc 
        if token.pos_ in ['NOUN', 'PROPN'] and 
           len(token.lemma_) > 2
    ]
    
    return list(set(keywords))

# Example usage
examiner_profile = {
    'art_unit': '2100',
    'technical_domains': ['computer_technology', 'machine_learning'],
    'education': 'Computer Science',
    'research_publications': ['Machine Learning Techniques', 'AI Applications']
}

patent_application = {
    'technical_domain': 'artificial_intelligence',
    'related_domains': ['machine_learning', 'data_science'],
    'claims': 'A method for machine learning...',
    'abstract': 'The invention relates to artificial intelligence techniques...'
}

specialization_match = calculate_technical_specialization_match(
    examiner_profile, 
    patent_application
)
print(f"Technical Specialization Match: {specialization_match}")

ModuleNotFoundError: No module named 'spacy'