In [None]:
import glob
import json
import pickle
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy import stats
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler

import libraries.claude_prompts as cp
import libraries.llm_functions as lf
import libraries.neo4j_lib as nl
from libraries.claude_prompts import RED_FLAGS

In [None]:
import os
import sys
import tiktoken
import concurrent.futures
import asyncio
import time
import logging
import re
from typing import Any, List, Dict, Optional
from openai import RateLimitError  # Correct import
import backoff

In [None]:
from llama_index.core.memory import ChatMemoryBuffer
from llama_index.core import VectorStoreIndex, Document

from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from tqdm import tqdm


In [None]:
def create_chat_engine(
    advert: str, memory: ChatMemoryBuffer, llm_instance: Any
) -> Optional[Any]:
    """Build a context-mode chat engine over a single advert.

    The advert text is wrapped in one ``Document``, indexed with
    ``VectorStoreIndex.from_documents``, and the index is turned into a chat
    engine via ``as_chat_engine``.

    Args:
        advert: Raw text of the advert to index.
        memory: Chat memory buffer handed to the engine.
        llm_instance: LLM object handed to the engine.

    Returns:
        Whatever ``index.as_chat_engine(...)`` returns for ``chat_mode="context"``.
    """
    # Persona passed as the engine's system prompt.
    analyst_persona = (
        "As a career forensic analyst, you have deep insight into crime "
        "and criminal activity, especially human trafficking. Investigate "
        "the online recruitment advert and extract pertinent details."
    )

    # One advert -> one document -> one index.
    advert_index = VectorStoreIndex.from_documents([Document(text=advert)])

    engine = advert_index.as_chat_engine(
        chat_mode="context",
        llm=llm_instance,
        memory=memory,
        system_prompt=analyst_persona,
    )
    return engine

In [None]:
def get_llm(
    model: str = "gpt-4o-mini",
    temperature: float = 0,
    request_timeout: float = 120.0,
) -> Any:
    """Create the OpenAI LLM client used by the chat engine.

    Called with no arguments this returns the same configuration as before
    (``gpt-4o-mini``, temperature 0, 120 s request timeout); the keyword
    arguments let a cell try another model or timeout without editing this
    function.

    Args:
        model: Model name forwarded to ``OpenAI``.
        temperature: Sampling temperature forwarded to ``OpenAI``.
        request_timeout: Request timeout forwarded to ``OpenAI``.

    Returns:
        The constructed ``OpenAI`` LLM instance.
    """
    # NOTE: an Ollama-backed variant (llama3.1, selected when the prompt was
    # "assure_prompt") used to be sketched here as commented-out code; it was
    # never active and has been dropped.
    return OpenAI(
        temperature=temperature,
        model=model,
        request_timeout=request_timeout,
    )

In [None]:
# Posting selected for a single manual audit run. IDn is handed to the neo4j
# helper below (presumably the node's internal ID — confirm against
# nl.get_neo4j_advert_analysis).
IDn = 573388
# Name of the prompt whose analysis is audited; also used as a key into
# cp.CLAUDE_PROMPTS by create_audit_prompt.
prompt_name = 'bypass_prompt'
# The helper returns a pair that is unpacked into the advert and the analysis
# result for this posting/prompt combination.
advert, result = nl.get_neo4j_advert_analysis(IDn, prompt_name)

In [None]:
# Display the analysis result for manual inspection.
result

In [None]:
def create_audit_prompt(prompt_name: str, result: str) -> str:
    """Build the question asking the LLM to audit a previous answer.

    Args:
        prompt_name: Key into ``cp.CLAUDE_PROMPTS`` identifying the question
            that was originally asked.
        result: The answer that was recorded for that question.

    Returns:
        A prompt that quotes the original question in a fenced block, then
        the recorded answer, and asks whether that answer is correct.

    Raises:
        KeyError: If ``prompt_name`` is not a key of ``cp.CLAUDE_PROMPTS``.
    """
    # Keep the looked-up text in its own variable instead of overwriting the
    # ``prompt_name`` parameter.
    prompt_text = cp.CLAUDE_PROMPTS[prompt_name]
    # The original separator was "\n\ " — an invalid escape sequence that put
    # a literal backslash into the prompt. A blank line is what was intended.
    audit_prompt = (
        "Is the correct answer to the following question\n\n "
        f"```{prompt_text}```\n\n{result}\n\n?"
    )
    return audit_prompt

In [None]:
# Chat memory with an 8192-token limit for the audit conversation.
memory = ChatMemoryBuffer.from_defaults(token_limit=8192)
# LLM client from get_llm().
llm_instance = get_llm()
# Chat engine built over the advert fetched earlier.
chat_engine = create_chat_engine(advert, memory, llm_instance)

In [None]:
# Build the audit question from the prompt name and the analysis result.
audit_prompt = create_audit_prompt(prompt_name, result)

In [None]:
# Display the generated audit prompt for inspection.
audit_prompt

In [None]:
# Pass the chat engine and the audit prompt to the project helper; the return
# value is kept so it can be written back in the next cell.
response = lf.audit_analysis(chat_engine, audit_prompt)

In [None]:
# Hand the audit response to the neo4j helper for this posting/prompt pair
# (presumably persists it as an Audit record — confirm in libraries.neo4j_lib).
nl.write_audit_to_neo4j(IDn, prompt_name, response)

In [None]:
# Cypher query selecting postings that still need an audit for `prompt_name`:
#   - the posting belongs to a Group with country_id = 1,
#   - its text is neither NULL nor empty,
#   - it has an Analysis linked via HAS_ANALYSIS with type = $prompt_name,
#   - that Analysis has no HAS_AUDIT relationship of the same type.
# Each row carries the posting's ID (as IDn), its post_id and its text (as advert).
#
# NOTE(review): the assignment below overwrites `result`, which earlier held
# the single-posting analysis passed to create_audit_prompt. Re-running that
# earlier cell after this one would feed it the query result instead —
# consider a distinct name once all downstream uses are known.
query = """
MATCH (g:Group)-[:HAS_POSTING]-(n:Posting)-[:HAS_ANALYSIS {type: $prompt_name}]-(analysis:Analysis)
WHERE g.country_id = 1
  AND n.text IS NOT NULL
  AND n.text <> ""
  AND (NOT EXISTS {
    MATCH (analysis)-[:HAS_AUDIT {type: $prompt_name}]-(:Audit)
  })
RETURN ID(n) AS IDn, n.post_id AS post_id, n.text AS advert
"""
result = nl.execute_neo4j_query(query, {"prompt_name": prompt_name})

In [None]:
# Display the rows returned by the un-audited-postings query.
result

In [None]:
def load_data_splits(timestamp, splits_dir="data/splits"):
    """
    Load saved data splits for model training and evaluation.

    Reads one pickle per split (``{name}_{timestamp}.pkl``) and the matching
    ``split_info_{timestamp}.json`` from ``splits_dir``.

    Args:
        timestamp: Suffix identifying the saved run, e.g. ``"20241114_115648"``.
        splits_dir: Directory holding the split files. Defaults to the
            original hard-coded location ``"data/splits"``.

    Returns:
        ``(splits, split_info)`` where ``splits`` maps each of ``X_train``,
        ``X_holdout``, ``y_train``, ``y_holdout``, ``X_full`` and ``y_full``
        to the unpickled object, and ``split_info`` is the parsed JSON.

    Raises:
        FileNotFoundError: If any expected file is missing.
    """
    split_names = (
        "X_train",
        "X_holdout",
        "y_train",
        "y_holdout",
        "X_full",
        "y_full",
    )

    splits = {}
    for split_type in split_names:
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only point this at split files produced by this project.
        with open(f"{splits_dir}/{split_type}_{timestamp}.pkl", "rb") as f:
            splits[split_type] = pickle.load(f)

    # Explicit encoding so the result does not depend on the platform default.
    with open(
        f"{splits_dir}/split_info_{timestamp}.json", "r", encoding="utf-8"
    ) as f:
        split_info = json.load(f)

    return splits, split_info

In [None]:
def analyze_feature_distributions(X, feature_names):
    """Summarise per-feature counts and sparsity and save two diagnostic plots.

    For every name in ``feature_names`` the column sum, ``1 - sum / len(X)``
    (reported as "sparsity") and the number of distinct values are collected.
    The sparsity figure equals the share of zero rows only when the column is
    a 0/1 indicator.

    Side effects:
        Writes ``feature_sparsity.png`` (bar chart of sparsity per feature)
        and ``feature_patterns.png`` (heatmap of non-zero presence for the
        first 50 rows) to the current working directory.

    Args:
        X: DataFrame containing at least the columns in ``feature_names``.
        feature_names: Column names to analyse.

    Returns:
        DataFrame with columns ``feature``, ``count``, ``sparsity`` and
        ``unique_values``, sorted by ``sparsity`` in descending order.
    """
    n_rows = len(X)

    # Gather the three per-feature statistics in a single pass over the names.
    totals = []
    sparsities = []
    distinct_counts = []
    for name in feature_names:
        column = X[name]
        totals.append(column.sum())
        sparsities.append(1 - (column.sum() / n_rows))
        distinct_counts.append(len(column.unique()))

    summary = pd.DataFrame({
        'feature': feature_names,
        'count': totals,
        'sparsity': sparsities,
        'unique_values': distinct_counts,
    })
    # Sparsest features first.
    summary = summary.sort_values('sparsity', ascending=False)

    # Plot 1: sparsity per feature.
    plt.figure(figsize=(15, 10))
    sns.barplot(data=summary, x='sparsity', y='feature')
    plt.title('Feature Sparsity Analysis')
    plt.tight_layout()
    plt.savefig('feature_sparsity.png')
    plt.close()

    # Plot 2: which features are non-zero in the first 50 samples.
    first_rows_presence = X[feature_names].astype(bool).astype(int).iloc[:50]
    plt.figure(figsize=(20, 10))
    sns.heatmap(
        first_rows_presence,
        xticklabels=feature_names,
        yticklabels=False,
        cmap='YlOrRd',
    )
    plt.title('Feature Presence Patterns (First 50 Samples)')
    plt.tight_layout()
    plt.savefig('feature_patterns.png')
    plt.close()

    return summary

def analyze_feature_importance(X, y, feature_names):
    """Analyze feature importance and correlation with target.

    For each feature the absolute Pearson correlation with ``y`` and the
    mutual information with ``y`` are computed, then plotted against each
    other with every point labelled by its feature name.

    Fixes over the previous version:
      * ``mutual_info_regression`` is imported at the top of the notebook
        (it was referenced here without any import).
      * Mutual information is computed on ``X[feature_names]`` rather than on
        all of ``X``, so it lines up with ``feature_names`` even when ``X``
        carries extra columns.
      * The estimate is seeded so re-running the cell gives the same numbers.

    Side effects:
        Writes ``feature_importance.png`` to the current working directory.

    Args:
        X: DataFrame containing at least the columns in ``feature_names``.
        y: Target values; a Series, or any sequence with one value per row
            of ``X``.
        feature_names: Column names to analyse.

    Returns:
        DataFrame with columns ``feature``, ``correlation`` and
        ``mutual_info``, sorted by ``correlation`` in descending order.
        A constant column yields a NaN correlation and sorts last.
    """
    features = list(feature_names)
    X_selected = X[features]

    # Series.corr needs a Series; align a plain sequence to X's index.
    target = y if isinstance(y, pd.Series) else pd.Series(y, index=X.index)

    correlations = pd.DataFrame({
        'feature': features,
        'correlation': [abs(X_selected[col].corr(target)) for col in features],
        'mutual_info': mutual_info_regression(X_selected, target, random_state=42),
    }).sort_values('correlation', ascending=False)

    # Visualize correlations
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=correlations, x='correlation', y='mutual_info')
    for _, row in correlations.iterrows():
        plt.annotate(row['feature'], (row['correlation'], row['mutual_info']))
    plt.title('Feature Importance Analysis')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

    return correlations

def _cv_r2_summary(estimator, features, target, cv, **cv_kwargs):
    """Cross-validate ``estimator`` with R² scoring; return mean/std of the fold scores."""
    scores = cross_val_score(
        estimator, features, target, cv=cv, scoring='r2', **cv_kwargs
    )
    return {'mean_score': scores.mean(), 'std_score': scores.std()}


def evaluate_balancing_strategies(X, y, feature_names, model, cv=5,
                                  sparse_threshold=0.95, random_state=42):
    """Evaluate different balancing strategies with cross-validated R².

    Strategies compared:
      * ``baseline``: ``model`` on ``X`` unchanged.
      * ``remove_sparse``: drop features whose ``1 - sum / len(X)`` exceeds
        ``sparse_threshold``.
      * ``smote`` / ``adasyn``: oversampling applied *inside* each
        cross-validation fold through an imblearn ``Pipeline``. Resampling
        the whole data set before cross-validation lets synthetic points
        derived from validation rows leak into training folds and inflates
        the score, so the sampler is fitted on the training part of each
        fold only and scores are measured on untouched rows.
      * ``feature_grouping``: sum semantically related indicator columns
        into one column per group.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        feature_names: Columns considered for sparse-feature removal.
        model: Estimator compatible with ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        sparse_threshold: Sparsity above which a feature is removed.
        random_state: Seed for the SMOTE and ADASYN samplers.

    Returns:
        ``defaultdict`` keyed by strategy name. Each value holds
        ``mean_score`` and ``std_score`` or, when the strategy could not be
        evaluated, an ``error`` message. ``remove_sparse`` additionally
        carries ``removed_features``; ``feature_grouping`` carries ``groups``
        (the groups actually applied).
    """
    results = defaultdict(dict)

    # Original baseline
    results['baseline'] = _cv_r2_summary(model, X, y, cv)

    # Strategy 1: Remove very sparse features
    n_rows = len(X)
    sparse_features = [f for f in feature_names
                       if (1 - X[f].sum() / n_rows) > sparse_threshold]
    X_no_sparse = X.drop(columns=sparse_features)
    if X_no_sparse.shape[1] == 0:
        # Nothing left to train on; record it rather than crash in the scorer.
        results['remove_sparse'] = {
            'error': 'all features exceed the sparsity threshold',
            'removed_features': sparse_features,
        }
    else:
        results['remove_sparse'] = {
            **_cv_r2_summary(model, X_no_sparse, y, cv),
            'removed_features': sparse_features,
        }

    # Strategies 2 and 3: SMOTE and ADASYN, resampling training folds only.
    for strategy_name, sampler_cls in (('smote', SMOTE), ('adasyn', ADASYN)):
        try:
            resample_then_fit = ImbPipeline([
                ('sampler', sampler_cls(random_state=random_state)),
                ('model', model),
            ])
            # error_score='raise' makes a sampler failure surface here so it
            # is recorded below instead of turning into NaN scores.
            results[strategy_name] = _cv_r2_summary(
                resample_then_fit, X, y, cv, error_score='raise'
            )
        except Exception as e:
            # Best effort: a sampler that cannot handle this target should
            # not abort the comparison of the remaining strategies.
            results[strategy_name] = {'error': str(e)}

    # Strategy 4: Feature grouping (example with semantic grouping)
    feature_groups = {
        'location_related': ['drop_off_at_secure_location_prompt', 'no_location_prompt', 'multiple_provinces_prompt'],
        'communication': ['callback_request_prompt', 'suspicious_email_prompt', 'language_switch_prompt'],
        'hiring_process': ['immediate_hiring_prompt', 'requires_references', 'unrealistic_hiring_number_prompt'],
        'targeting': ['gender_specific_prompt', 'target_specific_group_prompt', 'recruit_students_prompt'],
        'job_details': ['vague_description_prompt', 'unusual_hours_prompt', 'overseas_prompt']
    }

    X_grouped = X.copy()
    applied_groups = {}
    for group_name, members in feature_groups.items():
        # Only group columns that exist in X; a missing member used to raise
        # KeyError and abort the whole evaluation.
        present = [f for f in members if f in X.columns]
        if not present:
            continue
        X_grouped[group_name] = X[present].sum(axis=1)
        X_grouped = X_grouped.drop(columns=present)
        applied_groups[group_name] = present

    results['feature_grouping'] = {
        **_cv_r2_summary(model, X_grouped, y, cv),
        'groups': applied_groups,
    }

    return results

def visualize_results(results):
    """Plot mean scores of the evaluated strategies as a labelled bar chart.

    Strategies whose entry has no ``mean_score`` key (for instance those that
    only recorded an ``error``) are left out of the chart.

    Side effects:
        Writes ``strategy_comparison.png`` to the current working directory.

    Args:
        results: Mapping of strategy name to a metrics dict; plotted entries
            must provide both ``mean_score`` and ``std_score``.
    """
    # Keep only the strategies that produced a score.
    scored = [
        (name, entry['mean_score'], entry['std_score'])
        for name, entry in results.items()
        if 'mean_score' in entry
    ]
    labels = [item[0] for item in scored]
    averages = [item[1] for item in scored]
    spreads = [item[2] for item in scored]

    # Bars for the means, black whiskers for the standard deviations.
    plt.figure(figsize=(12, 6))
    rects = plt.bar(labels, averages)
    plt.errorbar(labels, averages, yerr=spreads, fmt='none', color='black', capsize=5)

    # Print each bar's value, centred horizontally at the bar's height.
    for rect in rects:
        top = rect.get_height()
        centre_x = rect.get_x() + rect.get_width()/2.
        plt.text(centre_x, top, f'{top:.3f}', ha='center', va='bottom')

    plt.title('Performance Comparison of Different Strategies')
    plt.ylabel('R² Score')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('strategy_comparison.png')
    plt.close()

def main_analysis(X, y, feature_names, model):
    """Run the distribution, importance and balancing analyses in sequence.

    Progress and intermediate results are printed as each step finishes, and
    the strategy comparison chart is produced at the end.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with ``X``.
        feature_names: Feature columns to analyse.
        model: Estimator passed to ``evaluate_balancing_strategies``.

    Returns:
        Dict with keys ``distribution_stats``, ``importance_analysis`` and
        ``strategy_results`` holding the output of the respective step.
    """
    # Step 1: per-feature counts and sparsity.
    print("1. Analyzing Feature Distributions...")
    feature_stats = analyze_feature_distributions(X, feature_names)
    print("\nFeature Statistics:")
    print(feature_stats)

    # Step 2: correlation / mutual information with the target.
    print("\n2. Analyzing Feature Importance...")
    feature_importance = analyze_feature_importance(X, y, feature_names)
    print("\nFeature Importance:")
    print(feature_importance)

    # Step 3: compare the balancing strategies.
    print("\n3. Evaluating Balancing Strategies...")
    per_strategy = evaluate_balancing_strategies(X, y, feature_names, model)
    print("\nStrategy Results:")
    for strategy_name, outcome in per_strategy.items():
        print(f"\n{strategy_name}:")
        print(outcome)

    # Step 4: chart of the comparison.
    print("\n4. Visualizing Results...")
    visualize_results(per_strategy)

    report = {
        'distribution_stats': feature_stats,
        'importance_analysis': feature_importance,
        'strategy_results': per_strategy,
    }
    return report

In [None]:
def _latest_metrics_file(pattern):
    """Return the lexicographically last path matching ``pattern``.

    With timestamp-suffixed file names, the last path in sorted order is the
    most recent run. Raises ``FileNotFoundError`` naming the pattern when
    nothing matches, instead of the bare ``IndexError`` that indexing an
    empty list would give.
    """
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No metrics file matches pattern: {pattern!r}")
    return matches[-1]


# Most recent metrics file for each of the three models being compared.
original = _latest_metrics_file("results/metrics_advanced_redflag_model_*.json")
simplified = _latest_metrics_file("results/metrics_stacking_model_*.json")
deep_learning = _latest_metrics_file("results/metrics_deep_learning_model_*.json")

In [None]:
# Suffix identifying which saved split files to load.
timestamp='20241114_115648'
# splits: dict of the unpickled split objects; split_info: parsed JSON metadata.
splits, split_info = load_data_splits(timestamp)

In [None]:
# NOTE(review): splits.keys() is not the last expression in this cell, so its
# value is not displayed; move it to its own cell if the key list is wanted.
splits.keys()
# Use the 'X_full' / 'y_full' entries as the working feature matrix and target.
X=splits['X_full']
y=splits['y_full']

In [None]:
# Column-wise totals of X.
X.sum()

In [None]:
# Display name -> path of the metrics JSON selected for that model.
model_files = {
    "Original Model": original,
    "Simplified Model": simplified,
    "Deep Learning Model": deep_learning,
}

# Load metrics
# Only the "holdout_metrics" entry of each JSON file is kept, keyed by the
# model's display name.
metrics = {}
for model_name, file_path in model_files.items():
    with open(file_path, "r") as f:
        data = json.load(f)
        metrics[model_name] = data["holdout_metrics"]

In [None]:
# First, let's see the target distribution
# (frequency of each distinct target value, ordered by the value itself)
print("Target value distribution:")
print(y.value_counts().sort_index())

# Look at feature co-occurrence
# Pairwise correlation between the columns of X named in RED_FLAGS.
correlation_matrix = X[RED_FLAGS].corr()

# Check if certain red flags tend to occur together
# This helps understand if synthetic samples would be realistic

In [None]:
# Display the red-flag correlation matrix.
correlation_matrix