In [19]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz # For fuzzy string matching
from fuzzywuzzy import process # For more advanced fuzzy matching (though not directly used in this version, good to have)
from sklearn.model_selection import train_test_split, GridSearchCV # For splitting data and hyperparameter tuning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import random
import string
import joblib # For saving/loading models

In [4]:
pip install python-Levenshtein


Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-win_amd64.whl (100 kB)
   ---------------------------------------- 0.0/100.4 kB ? eta -:--:--
   ------------------------ --------------- 61.4/100.4 kB 1.6 MB/s eta 0:00:01
   ---------------------------------------- 100.4/100.4 kB 1.4 MB/s eta 0:00:00
Downloading rapidfuzz-3.13.0-cp311-cp311-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.6 MB 2.3 MB/s eta 0:00:01
   --- -----------------------------

In [2]:
!pip install fuzzywuzzy


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [20]:

# --- Configuration and Global Variables ---
NUM_CUSTOMERS = 500 # Number of dummy customer records
NUM_TRANSACTIONS = 5000000 # Number of dummy transaction records (not directly used in this script, but from previous step)

# Configuration for loading common names from Excel
COMMON_NAMES_FILE = 'Name_list.xlsx' # Your Excel file for common names
COMMON_NAMES_SHEET = 'Names' # Your sheet name within the Excel file

# List of common full names (will be loaded from Excel or use default if file not found)
COMMON_FULL_NAMES = [] # Will be populated by load_common_names_from_excel

# Country Risk Map (simulated external data)
COUNTRY_RISK_MAP = {
    'IRAN': 'HIGH', 'NORTH KOREA': 'HIGH', 'SYRIA': 'HIGH', 'CUBA': 'HIGH', 'VENEZUULA': 'HIGH',
    'RUSSIA': 'MEDIUM', 'CHINA': 'MEDIUM', 'INDIA': 'LOW', 'USA': 'LOW', 'UK': 'LOW',
    'GERMANY': 'LOW', 'FRANCE': 'LOW', 'BRAZIL': 'MEDIUM', 'SOUTH AFRICA': 'MEDIUM',
    'NIGERIA': 'MEDIUM', 'AFGHANISTAN': 'HIGH', 'YEMEN': 'HIGH', 'SOMALIA': 'HIGH',
    'LEBANON': 'MEDIUM', 'PAKISTAN': 'MEDIUM'
}
HIGH_RISK_COUNTRIES = [country for country, risk in COUNTRY_RISK_MAP.items() if risk == 'HIGH']


In [21]:

# --- Helper Functions for Data Loading and Generation ---

def load_common_names_from_excel(filepath, sheet_name):
    """
    Loads a list of full names from an Excel file.
    Assumes the names are in a column named 'Sanctioned_name' in the specified sheet.
    Provides a fallback to a hardcoded list if the file or column is not found.
    """
    try:
        names_df = pd.read_excel(filepath, sheet_name=sheet_name)
        if 'Sanctioned_name' in names_df.columns:
            print(f"Successfully loaded common names from '{filepath}' sheet '{sheet_name}'.")
            return names_df['Sanctioned_name'].astype(str).tolist()
        else:
            print(f"Error: 'Sanctioned_name' column not found in '{sheet_name}' of '{filepath}'.")
            return _get_default_common_names()
    except FileNotFoundError:
        print(f"Warning: Common names file '{filepath}' not found. Using default hardcoded names.")
        return _get_default_common_names()
    except Exception as e:
        print(f"An error occurred while loading common names from Excel: {e}")
        return _get_default_common_names()

def _get_default_common_names():
    """Provides a hardcoded list of common names as a fallback."""
    return [
        'John Smith', 'Jane Johnson', 'Michael Williams', 'Emily Brown', 'David Jones',
        'Sarah Garcia', 'Chris Miller', 'Anna Davis', 'Robert Rodriguez', 'Maria Martinez',
        'William Taylor', 'Olivia Wilson', 'James Moore', 'Sophia White', 'Benjamin Green',
        'Isabella Hall', 'Lucas King', 'Mia Wright', 'Henry Lopez', 'Charlotte Hill'
    ]

def load_or_generate_initial_data(sanctions_csv_path='UK Sanctions List_mean.csv',
                                   customer_data_path='customer_data.csv',
                                   num_customers=NUM_CUSTOMERS):
    """
    Loads cleaned sanctions and customer data. If files are not found,
    it generates minimal dummy data for demonstration.
    """
    global COMMON_FULL_NAMES # Declare global to modify the list
    COMMON_FULL_NAMES = load_common_names_from_excel(COMMON_NAMES_FILE, COMMON_NAMES_SHEET)
    if not COMMON_FULL_NAMES:
        print("FATAL: COMMON_FULL_NAMES list is empty. Cannot proceed with data generation.")
        return pd.DataFrame(), pd.DataFrame() # Return empty DFs

    sanctions_df_cleaned = pd.DataFrame()
    customer_df = pd.DataFrame()

    try:
        # Load sanctions data
        # Assuming 'header=1' for UK Sanctions List_mean.csv based on previous context
        raw_sanctions_df = pd.read_csv(sanctions_csv_path, encoding='utf-8', header=1)
        print(f"Successfully loaded {sanctions_csv_path}. Shape: {raw_sanctions_df.shape}")

        # Clean and standardize sanctions data
        name_col = 'Name 6'
        address_col = 'Address 6'
        dob_col = 'DOB 6'
        nationality_col = 'Nationality 6'
        type_col = 'Type'
        id_col = 'ID'

        # Robust column handling
        if name_col not in raw_sanctions_df.columns:
            print(f"Warning: '{name_col}' not found in sanctions CSV. Please check column names.")
            # Fallback to a generic 'Name' if exists, or create a placeholder
            name_col = 'Name' if 'Name' in raw_sanctions_df.columns else raw_sanctions_df.columns[0] # Take first col as fallback
            print(f"Using '{name_col}' as name column fallback.")
        # Similar checks for other columns if necessary

        sanctions_df_cleaned = raw_sanctions_df.copy()
        sanctions_df_cleaned['Sanctioned_Name'] = sanctions_df_cleaned[name_col].astype(str).str.upper().str.strip()
        sanctions_df_cleaned['Sanctioned_Address'] = sanctions_df_cleaned[address_col].astype(str).str.upper().str.strip()
        sanctions_df_cleaned['Sanctioned_DOB'] = pd.to_datetime(sanctions_df_cleaned[dob_col], errors='coerce').dt.strftime('%Y-%m-%d')
        sanctions_df_cleaned['Sanctioned_Nationality'] = sanctions_df_cleaned[nationality_col].astype(str).str.upper().str.strip()
        sanctions_df_cleaned['Sanction_Type'] = sanctions_df_cleaned[type_col].astype(str).str.upper().str.strip()
        sanctions_df_cleaned['Sanctioned_ID'] = sanctions_df_cleaned[id_col].astype(str) if id_col in sanctions_df_cleaned.columns else [f'S{i:04d}' for i in range(len(sanctions_df_cleaned))]

        sanctions_df_cleaned = sanctions_df_cleaned[[
            'Sanctioned_ID', 'Sanctioned_Name', 'Sanctioned_Address',
            'Sanctioned_DOB', 'Sanctioned_Nationality', 'Sanction_Type'
        ]].copy()
        sanctions_df_cleaned = sanctions_df_cleaned[
            (sanctions_df_cleaned['Sanctioned_Name'] != 'UNKNOWN SANCTIONED NAME') &
            (sanctions_df_cleaned['Sanctioned_Name'] != 'NAN') &
            (sanctions_df_cleaned['Sanctioned_Name'].str.strip() != '')
        ].reset_index(drop=True)
        print("Cleaned Sanctions Data (first 3 rows):")
        print(sanctions_df_cleaned.head(3))
        sanctions_df_cleaned.to_csv('sanctions_list_cleaned.csv', index=False) # Save cleaned version

    except FileNotFoundError:
        print(f"Error: Sanctions file '{sanctions_csv_path}' not found. Generating minimal dummy sanctions data.")
        sanctions_data_fallback = {
            'Sanctioned_ID': [f'S{i:04d}' for i in range(1, 11)],
            'Sanctioned_Name': ['JOHN DOE', 'JANE SMITH', 'ALI BABA', 'VLADIMIR PUTIN', 'KIM JONG-UN',
                                'MOHAMMAD AL-SHAMSI', 'FATIMA ZAHRA', 'GLOBAL OIL CORP', 'SEA DRAGON VESSEL', 'NORTH STAR BANK'],
            'Sanctioned_Address': ['123 MAIN ST, TEHRAN, IRAN', '456 OAK AVE, DAMASCUS, SYRIA', '789 DESERT RD, BAGHDAD, IRAQ',
                                   'KREMLIN, MOSCOW, RUSSIA', 'PYONGYANG, NORTH KOREA', 'RIYADH, SAUDI ARABIA',
                                   'BEIRUT, LEBANON', 'SHANGHAI, CHINA', 'PORT OF BANDAR ABBAS, IRAN', 'MOSCOW, RUSSIA'],
            'Sanctioned_DOB': ['1970-01-15', '1985-03-20', '1960-11-01', '1952-10-07', '1984-01-08',
                               '1975-07-22', '1990-09-10', np.nan, np.nan, np.nan],
            'Sanctioned_Nationality': ['IRANIAN', 'SYRIAN', 'IRAQI', 'RUSSIAN', 'NORTH KOREAN',
                                       'SAUDI', 'LEBANESE', np.nan, np.nan, np.nan],
            'Sanction_Type': ['INDIVIDUAL', 'INDIVIDUAL', 'ENTITY', 'INDIVIDUAL', 'INDIVIDUAL',
                              'INDIVIDUAL', 'INDIVIDUAL', 'ENTITY', 'VESSEL', 'ENTITY']
        }
        sanctions_df_cleaned = pd.DataFrame(sanctions_data_fallback)
        print("Generated fallback sanctions data.")
    except Exception as e:
        print(f"An unexpected error occurred during sanctions data loading/cleaning: {e}")
        sanctions_df_cleaned = pd.DataFrame()


    try:
        customer_df = pd.read_csv(customer_data_path)
        print(f"Successfully loaded customer data from {customer_data_path}. Shape: {customer_df.shape}")
    except FileNotFoundError:
        print(f"Customer data file '{customer_data_path}' not found. Generating dummy customer data.")
        customers = []
        for i in range(1, num_customers + 1):
            customer_id = f'CUST{i:05d}'
            customer_name = random.choice(COMMON_FULL_NAMES)
            customer_address = f"{random.randint(100, 999)} {random.choice(['Main St', 'Oak Ave', 'Pine Ln'])}"
            customer_dob = (datetime.date(1950, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 50))).strftime('%Y-%m-%d')
            customer_nationality = random.choice(list(COUNTRY_RISK_MAP.keys()))
            customer_country = random.choice(list(COUNTRY_RISK_MAP.keys()))
            customer_industry = random.choice(['Financial Services', 'Retail', 'Technology', 'Manufacturing', 'Healthcare'])
            onboarding_date = (datetime.date(2020, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 3))).strftime('%Y-%m-%d')

            # Introduce some 'risky' customers that might match sanctions list
            if i % 10 == 0 and not sanctions_df_cleaned.empty:
                sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
                customer_name = sanctioned_entity['Sanctioned_Name'].replace('A', 'a', 1).replace('E', 'e', 1) # Slight variation
                customer_address = sanctioned_entity['Sanctioned_Address'].replace('ST', 'Street', 1) # Slight variation
                customer_dob = sanctioned_entity['Sanctioned_DOB'] # Exact DOB match
                customer_nationality = sanctioned_entity['Sanctioned_Nationality']
                customer_country = sanctioned_entity['Sanctioned_Nationality']

            customers.append({
                'Customer_ID': customer_id,
                'Customer_Name': customer_name,
                'Customer_Address': customer_address,
                'Customer_DOB': customer_dob,
                'Customer_Nationality': customer_nationality,
                'Customer_Country': customer_country,
                'Customer_Industry': customer_industry,
                'Onboarding_Date': onboarding_date
            })
        customer_df = pd.DataFrame(customers)
        customer_df.to_csv(customer_data_path, index=False) # Save generated customer data
        print("Generated dummy customer data.")
    except Exception as e:
        print(f"An unexpected error occurred during customer data loading/generation: {e}")
        customer_df = pd.DataFrame()

    return sanctions_df_cleaned, customer_df



In [22]:

# --- Feature Engineering Function ---
def calculate_sanctions_features(df):
    """
    Calculates various matching and risk features for each customer-sanctioned entity pair.
    Uses more robust string matching.
    """
    # Ensure columns are string type and clean them
    df['Customer_Name_Clean'] = df['Customer_Name'].astype(str).str.upper().str.strip()
    df['Sanctioned_Name_Clean'] = df['Sanctioned_Name'].astype(str).str.upper().str.strip()
    df['Customer_Address_Clean'] = df['Customer_Address'].astype(str).str.upper().str.strip()
    df['Sanctioned_Address_Clean'] = df['Sanctioned_Address'].astype(str).str.upper().str.strip()
    df['Customer_Nationality_Clean'] = df['Customer_Nationality'].astype(str).str.upper().str.strip()
    df['Sanctioned_Nationality_Clean'] = df['Sanctioned_Nationality'].astype(str).str.upper().str.strip()
    df['Customer_Country_Clean'] = df['Customer_Country'].astype(str).str.upper().str.strip()

    # Name Fuzzy Match Scores
    # fuzz.ratio: Simple Levenshtein ratio
    df['name_fuzz_ratio'] = df.apply(
        lambda row: fuzz.ratio(row['Customer_Name_Clean'], row['Sanctioned_Name_Clean']), axis=1
    )
    # fuzz.token_sort_ratio: Sorts tokens before matching, good for reordered names
    df['name_token_sort_ratio'] = df.apply(
        lambda row: fuzz.token_sort_ratio(row['Customer_Name_Clean'], row['Sanctioned_Name_Clean']), axis=1
    )
    # fuzz.token_set_ratio: Considers common tokens, robust to extra words
    df['name_token_set_ratio'] = df.apply(
        lambda row: fuzz.token_set_ratio(row['Customer_Name_Clean'], row['Sanctioned_Name_Clean']), axis=1
    )
    # Take the max of these for a comprehensive name score
    df['name_match_score'] = df[['name_fuzz_ratio', 'name_token_sort_ratio', 'name_token_set_ratio']].max(axis=1)


    # Address Fuzzy Match Score (using token_set_ratio for robustness to word order)
    df['address_match_score'] = df.apply(
        lambda row: fuzz.token_set_ratio(row['Customer_Address_Clean'], row['Sanctioned_Address_Clean']), axis=1
    )

    # Date of Birth Match (Exact match)
    df['dob_match'] = df.apply(
        lambda row: 1 if (pd.notna(row['Customer_DOB']) and pd.notna(row['Sanctioned_DOB']) and
                         str(row['Customer_DOB']) == str(row['Sanctioned_DOB'])) else 0, axis=1
    )
    
    # Nationality Match (Exact match)
    df['nationality_match'] = df.apply(
        lambda row: 1 if (pd.notna(row['Customer_Nationality_Clean']) and pd.notna(row['Sanctioned_Nationality_Clean']) and
                         row['Customer_Nationality_Clean'] == row['Sanctioned_Nationality_Clean']) else 0, axis=1
    )

    # Country Risk Factor for Customer's Country
    df['customer_country_risk_score'] = df['Customer_Country_Clean'].map(
        {k: (10 if v == 'HIGH' else 5 if v == 'MEDIUM' else 1) for k, v in COUNTRY_RISK_MAP.items()}
    ).fillna(0) # Assign 0 if country not in map

    # Sanction Type Encoding (Numerical severity)
    sanction_type_severity = {
        'INDIVIDUAL': 10,
        'ENTITY': 8,
        'VESSEL': 5,
        'AIRCRAFT': 5,
        'NAN': 0,
        'UNKNOWN': 0 # Handle any other missing/unknown types
    }
    df['sanction_type_severity_score'] = df['Sanction_Type'].map(sanction_type_severity).fillna(0)

    # Interaction features (example)
    df['name_country_interaction'] = df['name_match_score'] * df['customer_country_risk_score']
    df['name_dob_interaction'] = df['name_match_score'] * df['dob_match']

    return df


In [23]:

# --- Training Data Generation and Labeling ---
num_samples=20000
num_true_positives = int(num_samples * 0.05)
def generate_and_label_training_data(customer_df, sanctions_df_cleaned, num_samples):
    """
    Generates a simulated labeled dataset for training the sanctions screening model.
    It creates a mix of true positives and true negatives based on predefined rules.
    """
    print(f"\n--- Generating Simulated Training Data with {num_samples} samples ---")
    
    training_samples = []
    
    # Strategy 1: Generate True Positives (simulated matches)
    num_true_positives = int(num_samples * 0.05) # Aim for 5% true positives
    for _ in range(num_true_positives):
        if sanctions_df_cleaned.empty or customer_df.empty:
            break
        sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
        customer_entity = customer_df.sample(1).iloc[0]

        # Create a "true match" by making customer data very similar to sanctioned
        # Introduce slight variations to simulate fuzzy matches
        cust_name_tp = sanctioned_entity['Sanctioned_Name']
        if len(cust_name_tp) > 3: # Introduce a typo for fuzzy match
            idx = random.randint(0, len(cust_name_tp) - 1)
            cust_name_tp = cust_name_tp[:idx] + random.choice(string.ascii_uppercase) + cust_name_tp[idx+1:]
        
        cust_address_tp = sanctioned_entity['Sanctioned_Address']
        if len(cust_address_tp) > 5: # Introduce a slight address variation
            idx = random.randint(0, len(cust_address_tp) - 1)
            cust_address_tp = cust_address_tp[:idx] + random.choice(string.ascii_uppercase) + cust_address_tp[idx+1:]

        cust_dob_tp = sanctioned_entity['Sanctioned_DOB']
        cust_nationality_tp = sanctioned_entity['Sanctioned_Nationality']
        cust_country_tp = sanctioned_entity['Sanctioned_Nationality'] # Assume country is same as nationality for TP

        training_samples.append({
            'Customer_ID': customer_entity['Customer_ID'],
            'Customer_Name': cust_name_tp,
            'Customer_Address': cust_address_tp,
            'Customer_DOB': cust_dob_tp,
            'Customer_Nationality': cust_nationality_tp,
            'Customer_Country': cust_country_tp,
            'Sanctioned_ID': sanctioned_entity['Sanctioned_ID'],
            'Sanctioned_Name': sanctioned_entity['Sanctioned_Name'],
            'Sanctioned_Address': sanctioned_entity['Sanctioned_Address'],
            'Sanctioned_DOB': sanctioned_entity['Sanctioned_DOB'],
            'Sanctioned_Nationality': sanctioned_entity['Sanctioned_Nationality'],
            'Sanction_Type': sanctioned_entity['Sanction_Type'],
            'is_sanction_match': 1 # Label as True Positive
        })


In [24]:

    # Strategy 2: Generate True Negatives (simulated non-matches)
    num_true_negatives = num_samples - num_true_positives
    for _ in range(num_true_negatives):
        if sanctions_df_cleaned.empty or customer_df.empty:
            break
        sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
        customer_entity = customer_df.sample(1).iloc[0]

        # Ensure it's a "true negative" by picking very different entities
        # Or by ensuring names/addresses are very different
        cust_name_tn = random.choice(COMMON_FULL_NAMES)
        cust_address_tn = f"{random.randint(1000, 9999)} {random.choice(['Road', 'Lane', 'Square'])}"
        cust_dob_tn = (datetime.date(1940, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 60))).strftime('%Y-%m-%d')
        cust_nationality_tn = random.choice(LOW_RISK_COUNTRIES)
        cust_country_tn = random.choice(LOW_RISK_COUNTRIES)

        training_samples.append({
            'Customer_ID': customer_entity['Customer_ID'],
            'Customer_Name': cust_name_tn,
            'Customer_Address': cust_address_tn,
            'Customer_DOB': cust_dob_tn,
            'Customer_Nationality': cust_nationality_tn,
            'Customer_Country': cust_country_tn,
            'Sanctioned_ID': sanctioned_entity['Sanctioned_ID'],
            'Sanctioned_Name': sanctioned_entity['Sanctioned_Name'],
            'Sanctioned_Address': sanctioned_entity['Sanctioned_Address'],
            'Sanctioned_DOB': sanctioned_entity['Sanctioned_DOB'],
            'Sanctioned_Nationality': sanctioned_entity['Sanctioned_Nationality'],
            'Sanction_Type': sanctioned_entity['Sanction_Type'],
            'is_sanction_match': 0 # Label as True Negative
        })
    
    training_df = pd.DataFrame(training_samples)
    training_df_features = calculate_sanctions_features(training_df.copy())
    
    # Fill any remaining NaNs in features with 0 (or a more sophisticated imputation)
    feature_cols = [
        'name_match_score', 'address_match_score', 'dob_match',
        'nationality_match', 'customer_country_risk_score', 'sanction_type_severity_score',
        'name_country_interaction', 'name_dob_interaction'
    ]
    training_df_features[feature_cols] = training_df_features[feature_cols].fillna(0)

    print(f"Generated {len(training_df_features)} training samples.")
    print(f"Class distribution:\n{training_df_features['is_sanction_match'].value_counts(normalize=True)}")
    print("Sample of training data features and labels:")
    print(training_df_features[['Customer_ID', 'Sanctioned_ID', 'name_match_score', 'dob_match', 'is_sanction_match']].head())
    
    return training_df_features


NameError: name 'sanctions_df_cleaned' is not defined

In [27]:

# --- Model Training Function ---
def train_sanctions_model(X_train, y_train, model_type='GradientBoosting'):
    """
    Trains a sanctions screening classification model.
    Supports Logistic Regression and Gradient Boosting.
    """
    print(f"\n--- Training {model_type} Model ---")
    if model_type == 'LogisticRegression':
        model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced', max_iter=1000)
        param_grid = {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2']
        }
    elif model_type == 'GradientBoosting':
        model = GradientBoostingClassifier(random_state=42)
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5]
        }
    else:
        raise ValueError("model_type must be 'LogisticRegression' or 'GradientBoosting'")

    # Use GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    print(f"Best parameters for {model_type}: {grid_search.best_params_}")
    print(f"Best ROC AUC score for {model_type}: {grid_search.best_score_:.4f}")
    
    best_model = grid_search.best_estimator_
    return best_model

# --- Model Evaluation Function ---
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Evaluates the trained model and plots ROC and Precision-Recall curves."""
    print(f"\n--- Evaluation for {model_name} ---")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print(f"\nClassification Report for {model_name}:")
    print(classification_report(y_test, y_pred))
    
    roc_auc = roc_auc_score(y_test, y_proba)
    print(f"ROC AUC Score for {model_name}: {roc_auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.title(f'Confusion Matrix for {model_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model_name}')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    auprc = auc(recall, precision)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, label=f'{model_name} (AUPRC = {auprc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {model_name}')
    plt.legend()
    plt.grid(True)
    plt.show()

    # Feature Importance (if applicable)
    if hasattr(model, 'feature_importances_'):
        print("\nFeature Importance:")
        feature_importance = pd.Series(model.feature_importances_, index=X_test.columns).sort_values(ascending=False)
        print(feature_importance)
        plt.figure(figsize=(10, 6))
        sns.barplot(x=feature_importance.values, y=feature_importance.index)
        plt.title(f'Feature Importance for {model_name}')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.show()


In [30]:

# --- Sanctions Screening Function ---
def perform_sanctions_screening(customers_df, sanctions_df, model, features, alert_threshold=0.5):
    """
    Performs sanctions screening on customer data using the trained ML model.
    Implements a basic blocking strategy to reduce the number of pairs for feature calculation.
    """
    print(f"\n--- Performing Sanctions Screening on {len(customers_df)} Customers ---")
    
    all_screening_results = []

    # Apply a simple blocking strategy: only compare if first letter of last name matches
    # This is a very basic blocking; real systems use more sophisticated methods (e.g., phonetic keys, n-grams)
    customers_df['Customer_First_Letter'] = customers_df['Customer_Name'].astype(str).str.upper().str[0]
    sanctions_df['Sanctioned_First_Letter'] = sanctions_df['Sanctioned_Name'].astype(str).str.upper().str[0]

    unique_first_letters = customers_df['Customer_First_Letter'].unique()

    for letter in unique_first_letters:
        customer_block = customers_df[customers_df['Customer_First_Letter'] == letter].copy()
        sanction_block = sanctions_df[sanctions_df['Sanctioned_First_Letter'] == letter].copy()

        if sanction_block.empty:
            continue # No sanctioned entities starting with this letter

        block_pairs = []
        for cust_idx, customer in customer_block.iterrows():
            for sanc_idx, sanctioned in sanction_block.iterrows():
                block_pairs.append({
                    'Customer_ID': customer['Customer_ID'],
                    'Customer_Name': customer['Customer_Name'],
                    'Customer_Address': customer['Customer_Address'],
                    'Customer_DOB': customer['Customer_DOB'],
                    'Customer_Nationality': customer['Customer_Nationality'],
                    'Customer_Country': customer['Customer_Country'],
                    'Sanctioned_ID': sanctioned['Sanctioned_ID'],
                    'Sanctioned_Name': sanctioned['Sanctioned_Name'],
                    'Sanctioned_Address': sanctioned['Sanctioned_Address'],
                    'Sanctioned_DOB': sanctioned['Sanctioned_DOB'],
                    'Sanctioned_Nationality': sanctioned['Sanctioned_Nationality'],
                    'Sanction_Type': sanctioned['Sanction_Type']
                })
        
        if not block_pairs:
            continue

        screening_df_block = pd.DataFrame(block_pairs)
        
        # Calculate features for this block
        screening_df_block_features = calculate_sanctions_features(screening_df_block.copy())
        
        # Select only the features used for training and fill NaNs
        X_screen_block = screening_df_block_features[features].fillna(0)
        
        # Predict probabilities
        screening_df_block_features['Sanction_Match_Probability'] = model.predict_proba(X_screen_block)[:, 1]
        all_screening_results.append(screening_df_block_features)

    if not all_screening_results:
        print("No potential matches found after blocking strategy. All customers are OK.")
        # Create an empty DataFrame with expected columns if no results
        return pd.DataFrame(columns=['Customer_ID', 'Customer_Name', 'Max_Sanction_Match_Probability', 'Top_Matched_Sanctioned_Entity', 'Sanction_Alert_Flag'])

    full_screening_df = pd.concat(all_screening_results, ignore_index=True)

    # Aggregate results per customer: take the maximum probability found across all sanctioned entities
    # and identify the top matched sanctioned entity name
    final_screening_results = full_screening_df.groupby('Customer_ID').agg(
        Customer_Name=('Customer_Name', 'first'),
        Max_Sanction_Match_Probability=('Sanction_Match_Probability', 'max'),
        Top_Matched_Sanctioned_Entity=('Sanctioned_Name', lambda x: x.iloc[x.index[np.argmax(full_screening_df.loc[x.index, 'Sanction_Match_Probability'])]])
    ).reset_index()

    final_screening_results['Sanction_Alert_Flag'] = np.where(
        final_screening_results['Max_Sanction_Match_Probability'] >= alert_threshold,
        'ALERT', 'OK'
    )

    print(f"\n--- Final Sanctions Screening Results Summary ---")
    print(f"Total ALERTS: {final_screening_results[final_screening_results['Sanction_Alert_Flag'] == 'ALERT'].shape[0]}")
    print(f"Total OK: {final_screening_results[final_screening_results['Sanction_Alert_Flag'] == 'OK'].shape[0]}")
    
    print(f"\nTop 10 Sanction Alerts (by probability):")
    print(final_screening_results[final_screening_results['Sanction_Alert_Flag'] == 'ALERT']
          .sort_values(by='Max_Sanction_Match_Probability', ascending=False).head(10))
    
    print(f"\nSample of OK results (first 5):")
    print(final_screening_results[final_screening_results['Sanction_Alert_Flag'] == 'OK'].head(5))

    return final_screening_results


In [31]:

# --- Main Execution Flow ---
if __name__ == "__main__":
    print("--- Starting Sanctions Screening ML Model Implementation ---")

    # 1. Load or Generate Initial Data
    sanctions_df, customer_df = load_or_generate_initial_data()

    if sanctions_df.empty or customer_df.empty:
        print("Initial data loading/generation failed. Exiting.")
        exit()

    # 2. Generate Labeled Training Data
    # Adjust num_samples based on your computational resources and desired training data size
    training_data_features_labels = generate_and_label_training_data(customer_df, sanctions_df, num_samples=50000)

    # Define features for the ML model
    ml_features = [
        'name_match_score', 'address_match_score', 'dob_match',
        'nationality_match', 'customer_country_risk_score', 'sanction_type_severity_score',
        'name_country_interaction', 'name_dob_interaction'
    ]

    X = training_data_features_labels[ml_features]
    y = training_data_features_labels['is_sanction_match']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    print(f"\nTraining data split: X_train={X_train.shape}, y_train={y_train.shape}")
    print(f"Testing data split: X_test={X_test.shape}, y_test={y_test.shape}")

    # 3. Train the Sanctions Screening Model
    # You can choose 'LogisticRegression' or 'GradientBoosting'
    sanctions_model = train_sanctions_model(X_train, y_train, model_type='GradientBoosting')

    # Save the trained model
    model_filename = 'sanctions_screening_gb_model.joblib'
    joblib.dump(sanctions_model, model_filename)
    print(f"Trained model saved to '{model_filename}'")

    # 4. Evaluate the Model
    evaluate_model(sanctions_model, X_test, y_test, model_name="Gradient Boosting Sanctions Model")

    # 5. Perform Sanctions Screening on Customer Data
    # Load the trained model if needed (e.g., in a production environment)
    # loaded_model = joblib.load(model_filename)
    
    final_screening_results_df = perform_sanctions_screening(
        customer_df, sanctions_df, sanctions_model, ml_features, alert_threshold=0.5
    )

    # You can now analyze final_screening_results_df
    # For example, save it to CSV:
    final_screening_results_df.to_csv('final_sanctions_screening_results.csv', index=False)
    print("\nFinal screening results saved to 'final_sanctions_screening_results.csv'")

    print("\n--- Sanctions Screening ML Model Implementation Complete ---")



--- Starting Sanctions Screening ML Model Implementation ---
Successfully loaded common names from 'Name_list.xlsx' sheet 'Names'.
An unexpected error occurred during sanctions data loading/cleaning: 'utf-8' codec can't decode byte 0x99 in position 12: invalid start byte
Successfully loaded customer data from customer_data.csv. Shape: (12820, 8)
Initial data loading/generation failed. Exiting.

--- Generating Simulated Training Data with 50000 samples ---


TypeError: 'NoneType' object is not subscriptable