In [1]:
import pandas as pd
import numpy as np
import datetime
import random
import joblib
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, IsolationForest
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp
import string
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# --- Configuration and Global Variables (consistent with previous steps) ---
NUM_CUSTOMERS = 500
NUM_TRANSACTIONS = 100000

# Country Risk Map (simulated external data)
COUNTRY_RISK_MAP = {
    'IRAN': 'HIGH', 'NORTH KOREA': 'HIGH', 'SYRIA': 'HIGH', 'CUBA': 'HIGH', 'VENEZUULA': 'HIGH',
    'RUSSIA': 'MEDIUM', 'CHINA': 'MEDIUM', 'INDIA': 'LOW', 'USA': 'LOW', 'UK': 'LOW',
    'GERMANY': 'LOW', 'FRANCE': 'LOW', 'BRAZIL': 'MEDIUM', 'SOUTH AFRICA': 'MEDIUM',
    'NIGERIA': 'MEDIUM', 'AFGHANISTAN': 'HIGH', 'YEMEN': 'HIGH', 'SOMALIA': 'HIGH',
    'LEBANON': 'MEDIUM', 'PAKISTAN': 'MEDIUM'
}
HIGH_RISK_COUNTRIES = [country for country, risk in COUNTRY_RISK_MAP.items() if risk == 'HIGH']
LOW_RISK_COUNTRIES = [country for country, risk in COUNTRY_RISK_MAP.items() if risk == 'LOW']

# --- Helper Functions (copied for self-containment and consistency) ---

def load_common_names_from_excel(filepath, sheet_name):
    try:
        names_df = pd.read_excel(filepath, sheet_name=sheet_name)
        if 'Sanctioned_name' in names_df.columns:
            return names_df['Sanctioned_name'].astype(str).tolist()
        else:
            return _get_default_common_names()
    except FileNotFoundError:
        return _get_default_common_names()
    except Exception as e:
        return _get_default_common_names()

def _get_default_common_names():
    return [
        'John Smith', 'Jane Johnson', 'Michael Williams', 'Emily Brown', 'David Jones',
        'Sarah Garcia', 'Chris Miller', 'Anna Davis', 'Robert Rodriguez', 'Maria Martinez',
        'William Taylor', 'Olivia Wilson', 'James Moore', 'Sophia White', 'Benjamin Green',
        'Isabella Hall', 'Lucas King', 'Mia Wright', 'Henry Lopez', 'Charlotte Hill'
    ]

COMMON_FULL_NAMES = load_common_names_from_excel('Name_list.xlsx', 'Names')
if not COMMON_FULL_NAMES:
    print("Warning: COMMON_FULL_NAMES list is empty after attempting to load from Excel. Using default names.")
    COMMON_FULL_NAMES = _get_default_common_names()

def load_or_generate_initial_data(sanctions_csv_path='UK Sanctions List_mean.csv',
                                   customer_data_path='customer_data.csv',
                                   num_customers=None):
    if num_customers is None:
        num_customers = NUM_CUSTOMERS

    sanctions_df_cleaned = pd.DataFrame()
    customer_df = pd.DataFrame()

    try:
        raw_sanctions_df = pd.read_csv(sanctions_csv_path, encoding='latin1', header=1)
        name_col = 'Name 6'
        address_col = 'Address 6'
        dob_col = 'DOB 6'
        nationality_col = 'Nationality 6'
        type_col = 'Type'
        id_col = 'ID'

        actual_name_col = name_col if name_col in raw_sanctions_df.columns else ('Name' if 'Name' in raw_sanctions_df.columns else None)
        actual_address_col = address_col if address_col in raw_sanctions_df.columns else ('Address' if 'Address' in raw_sanctions_df.columns else None)
        actual_dob_col = dob_col if dob_col in raw_sanctions_df.columns else ('DOB' if 'DOB' in raw_sanctions_df.columns else None)
        actual_nationality_col = nationality_col if nationality_col in raw_sanctions_df.columns else ('Nationality' if 'Nationality' in raw_sanctions_df.columns else None)
        actual_type_col = type_col if type_col in raw_sanctions_df.columns else ('Type' if 'Type' in raw_sanctions_df.columns else None)
        actual_id_col = id_col if id_col in raw_sanctions_df.columns else ('ID' if 'ID' in raw_sanctions_df.columns else None)

        if not actual_name_col:
            raise ValueError("No name column found")

        sanctions_df_cleaned = raw_sanctions_df.copy()
        sanctions_df_cleaned['Sanctioned_Name'] = sanctions_df_cleaned[actual_name_col].astype(str).str.upper().str.strip()
        
        sanctions_df_cleaned['Sanctioned_Address'] = sanctions_df_cleaned[actual_address_col].astype(str).str.upper().str.strip() if actual_address_col and actual_address_col in sanctions_df_cleaned.columns else np.nan
        sanctions_df_cleaned['Sanctioned_DOB'] = pd.to_datetime(sanctions_df_cleaned[actual_dob_col], errors='coerce').dt.strftime('%Y-%m-%d') if actual_dob_col and actual_dob_col in sanctions_df_cleaned.columns else np.nan
        sanctions_df_cleaned['Sanctioned_Nationality'] = sanctions_df_cleaned[actual_nationality_col].astype(str).str.upper().str.strip() if actual_nationality_col and actual_nationality_col in sanctions_df_cleaned.columns else np.nan
        sanctions_df_cleaned['Sanction_Type'] = sanctions_df_cleaned[actual_type_col].astype(str).str.upper().str.strip() if actual_type_col and actual_type_col in sanctions_df_cleaned.columns else np.nan
        sanctions_df_cleaned['Sanctioned_ID'] = sanctions_df_cleaned[actual_id_col].astype(str) if actual_id_col and actual_id_col in sanctions_df_cleaned.columns else [f'S{i:04d}' for i in range(len(sanctions_df_cleaned))]

        sanctions_df_cleaned = sanctions_df_cleaned[[
            'Sanctioned_ID', 'Sanctioned_Name', 'Sanctioned_Address',
            'Sanctioned_DOB', 'Sanctioned_Nationality', 'Sanction_Type'
        ]].copy()
        sanctions_df_cleaned = sanctions_df_cleaned[
            (sanctions_df_cleaned['Sanctioned_Name'] != 'UNKNOWN SANCTIONED NAME') &
            (sanctions_df_cleaned['Sanctioned_Name'] != 'NAN') &
            (sanctions_df_cleaned['Sanctioned_Name'].str.strip() != '')
        ].reset_index(drop=True)

    except FileNotFoundError:
        sanctions_df_cleaned = pd.DataFrame({
            'Sanctioned_ID': [f'S{i:04d}' for i in range(1, 101)],
            'Sanctioned_Name': [f'SANCTIONED PERSON {i}' for i in range(1, 101)],
            'Sanctioned_Address': [f'{i*10} MAIN ST, HIGH RISK COUNTRY' for i in range(1, 101)],
            'Sanctioned_DOB': [f'{1950 + i}-01-01' for i in range(100)],
            'Sanctioned_Nationality': random.choices(HIGH_RISK_COUNTRIES, k=100),
            'Sanction_Type': random.choices(['INDIVIDUAL', 'ENTITY'], k=100)
        })
    except Exception as e:
        sanctions_df_cleaned = pd.DataFrame({
            'Sanctioned_ID': [f'S{i:04d}' for i in range(1, 101)],
            'Sanctioned_Name': [f'SANCTIONED PERSON {i}' for i in range(1, 101)],
            'Sanctioned_Address': [f'{i*10} MAIN ST, HIGH RISK COUNTRY' for i in range(1, 101)],
            'Sanctioned_DOB': [f'{1950 + i}-01-01' for i in range(100)],
            'Sanctioned_Nationality': random.choices(HIGH_RISK_COUNTRIES, k=100),
            'Sanction_Type': random.choices(['INDIVIDUAL', 'ENTITY'], k=100)
        })


    try:
        customer_df = pd.read_csv(customer_data_path)
        current_cols_lower = {col.lower(): col for col in customer_df.columns}
        
        expected_customer_cols_mapping = {
            'customer_id': 'Customer_ID',
            'customer_name': 'Customer_Name',
            'customer_address': 'Customer_Address',
            'customer_dob': 'Customer_DOB',
            'customer_nationality': 'Customer_Nationality',
            'customer_country': 'Customer_Country',
            'customer_industry': 'Customer_Industry',
            'onboarding_date': 'Onboarding_Date'
        }
        
        rename_dict = {}
        for old_col_lower, new_col_proper in expected_customer_cols_mapping.items():
            if old_col_lower in current_cols_lower:
                rename_dict[current_cols_lower[old_col_lower]] = new_col_proper
            elif new_col_proper not in customer_df.columns:
                customer_df[new_col_proper] = np.nan

        if rename_dict:
            customer_df.rename(columns=rename_dict, inplace=True)
            
        required_customer_cols = ['Customer_ID', 'Customer_Name', 'Customer_Address', 'Customer_DOB', 'Customer_Nationality', 'Customer_Country']
        if not all(col in customer_df.columns for col in required_customer_cols):
            missing_cols = [col for col in required_customer_cols if col not in customer_df.columns]
            customer_df = pd.DataFrame() 

    except FileNotFoundError:
        customers = []
        for i in range(1, num_customers + 1):
            customer_id = f'CUST{i:05d}'
            customer_name = random.choice(COMMON_FULL_NAMES)
            customer_address = f"{random.randint(100, 999)} {random.choice(['Main St', 'Oak Ave', 'Pine Ln'])}"
            customer_dob = (datetime.date(1950, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 50))).strftime('%Y-%m-%d')
            customer_nationality = random.choice(list(COUNTRY_RISK_MAP.keys()))
            customer_country = random.choice(list(COUNTRY_RISK_MAP.keys()))
            customer_industry = random.choice(['Financial Services', 'Retail', 'Technology', 'Manufacturing', 'Healthcare'])
            onboarding_date = (datetime.date(2020, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 3))).strftime('%Y-%m-%d')
            customers.append({
                'Customer_ID': customer_id, 'Customer_Name': customer_name, 'Customer_Address': customer_address,
                'Customer_DOB': customer_dob, 'Customer_Nationality': customer_nationality, 'Customer_Country': customer_country,
                'Customer_Industry': customer_industry, 'Onboarding_Date': onboarding_date
            })
        customer_df = pd.DataFrame(customers)
    except Exception as e:
        customers = []
        for i in range(1, num_customers + 1):
            customer_id = f'CUST{i:05d}'
            customer_name = random.choice(COMMON_FULL_NAMES)
            customer_address = f"{random.randint(100, 999)} {random.choice(['Main St', 'Oak Ave', 'Pine Ln'])}"
            customer_dob = (datetime.date(1950, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 50))).strftime('%Y-%m-%d')
            customer_nationality = random.choice(list(COUNTRY_RISK_MAP.keys()))
            customer_country = random.choice(list(COUNTRY_RISK_MAP.keys()))
            customer_industry = random.choice(['Financial Services', 'Retail', 'Technology', 'Manufacturing', 'Healthcare'])
            onboarding_date = (datetime.date(2020, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 3))).strftime('%Y-%m-%d')
            customers.append({
                'Customer_ID': customer_id, 'Customer_Name': customer_name, 'Customer_Address': customer_address,
                'Customer_DOB': customer_dob, 'Customer_Nationality': customer_nationality, 'Customer_Country': customer_country,
                'Customer_Industry': customer_industry, 'Onboarding_Date': onboarding_date
            })
        customer_df = pd.DataFrame(customers)

    return sanctions_df_cleaned, customer_df

def calculate_sanctions_features(df):
    expected_input_cols = [
        'Customer_ID', 'Customer_Name', 'Customer_Address', 'Customer_DOB', 'Customer_Nationality', 'Customer_Country',
        'Sanctioned_ID', 'Sanctioned_Name', 'Sanctioned_Address', 'Sanctioned_DOB', 'Sanctioned_Nationality', 'Sanction_Type'
    ]
    if df.empty:
        all_expected_output_cols = expected_input_cols + [
            'Customer_Name_Clean', 'Sanctioned_Name_Clean', 'Customer_Address_Clean', 'Sanctioned_Address_Clean',
            'Customer_Nationality_Clean', 'Sanctioned_Nationality_Clean', 'Customer_Country_Clean',
            'name_fuzz_ratio', 'name_token_sort_ratio', 'name_token_set_ratio', 'name_match_score',
            'address_match_score', 'dob_match', 'nationality_match', 'customer_country_risk_score',
            'sanction_type_severity_score', 'name_country_interaction', 'name_dob_interaction'
        ]
        if 'is_sanction_match' in df.columns:
            all_expected_output_cols.append('is_sanction_match')
        return pd.DataFrame(columns=all_expected_output_cols)

    for col in expected_input_cols:
        if col not in df.columns:
            df[col] = np.nan

    df['Customer_Name_Clean'] = df['Customer_Name'].astype(str).str.upper().str.strip()
    df['Sanctioned_Name_Clean'] = df['Sanctioned_Name'].astype(str).str.upper().str.strip()
    df['Customer_Address_Clean'] = df['Customer_Address'].astype(str).str.upper().str.strip()
    df['Sanctioned_Address_Clean'] = df['Sanctioned_Address'].astype(str).str.upper().str.strip()
    df['Customer_Nationality_Clean'] = df['Customer_Nationality'].astype(str).str.upper().str.strip()
    df['Sanctioned_Nationality_Clean'] = df['Sanctioned_Nationality'].astype(str).str.upper().str.strip()
    df['Customer_Country_Clean'] = df['Customer_Country'].astype(str).str.upper().str.strip()

    df['name_fuzz_ratio'] = df.apply(lambda row: fuzz.ratio(row['Customer_Name_Clean'], row['Sanctioned_Name_Clean']), axis=1)
    df['name_token_sort_ratio'] = df.apply(lambda row: fuzz.token_sort_ratio(row['Customer_Name_Clean'], row['Sanctioned_Name_Clean']), axis=1)
    df['name_token_set_ratio'] = df.apply(lambda row: fuzz.token_set_ratio(row['Customer_Name_Clean'], row['Sanctioned_Name_Clean']), axis=1)
    df['name_match_score'] = df[['name_fuzz_ratio', 'name_token_sort_ratio', 'name_token_set_ratio']].max(axis=1)
    df['address_match_score'] = df.apply(lambda row: fuzz.token_set_ratio(row['Customer_Address_Clean'], row['Sanctioned_Address_Clean']), axis=1)
    df['dob_match'] = df.apply(lambda row: 1 if (pd.notna(row['Customer_DOB']) and pd.notna(row['Sanctioned_DOB']) and str(row['Customer_DOB']) == str(row['Sanctioned_DOB'])) else 0, axis=1)
    df['nationality_match'] = df.apply(lambda row: 1 if (pd.notna(row['Customer_Nationality_Clean']) and pd.notna(row['Sanctioned_Nationality_Clean']) and row['Customer_Nationality_Clean'] == row['Sanctioned_Nationality_Clean']) else 0, axis=1)

    df['customer_country_risk_score'] = df['Customer_Country_Clean'].map(
        {k: (10 if v == 'HIGH' else 5 if v == 'MEDIUM' else 1) for k, v in COUNTRY_RISK_MAP.items()}
    ).fillna(0)

    sanction_type_severity = {
        'INDIVIDUAL': 10, 'ENTITY': 8, 'VESSEL': 5, 'AIRCRAFT': 5, 'NAN': 0, 'UNKNOWN': 0
    }
    df['sanction_type_severity_score'] = df['Sanction_Type'].map(sanction_type_severity).fillna(0)
    df['name_country_interaction'] = df['name_match_score'] * df['customer_country_risk_score']
    df['name_dob_interaction'] = df['name_match_score'] * df['dob_match']
    return df

def generate_transaction_data(customer_df, num_transactions=None):
    if num_transactions is None:
        num_transactions = NUM_TRANSACTIONS

    transactions = []
    transaction_types = ['DEPOSIT', 'WITHDRAWAL', 'TRANSFER_IN', 'TRANSFER_OUT', 'PAYMENT']
    currencies = ['USD', 'EUR', 'GBP', 'JPY']
    
    if customer_df.empty:
        return pd.DataFrame(columns=[
            'Transaction_ID', 'Customer_ID', 'Transaction_Date', 'Transaction_Type',
            'Amount', 'Currency', 'Sender_ID', 'Receiver_ID', 'Sender_Country',
            'Receiver_Country', 'Is_Suspicious_Label'
        ])

    customer_ids = customer_df['Customer_ID'].tolist()
    customer_countries = customer_df.set_index('Customer_ID')['Customer_Country'].to_dict()

    for i in range(num_transactions):
        trans_id = f'TRANS{i:07d}'
        customer_id = random.choice(customer_ids)
        trans_date = (datetime.date(2023, 1, 1) + datetime.timedelta(days=random.randint(0, 364))).strftime('%Y-%m-%d')
        trans_type = random.choice(transaction_types)
        amount = round(random.uniform(10, 10000), 2)
        currency = random.choice(currencies)
        
        sender_id = customer_id
        receiver_id = random.choice(customer_ids)
        while sender_id == receiver_id and trans_type in ['TRANSFER_IN', 'TRANSFER_OUT']:
            receiver_id = random.choice(customer_ids)

        sender_country = customer_countries.get(sender_id, random.choice(list(COUNTRY_RISK_MAP.keys())))
        receiver_country = customer_countries.get(receiver_id, random.choice(list(COUNTRY_RISK_MAP.keys())))

        is_suspicious = 0
        if random.random() < 0.01:
            is_suspicious = 1
            susp_type = random.choice(['large_amount', 'high_risk_country', 'structuring'])

            if susp_type == 'large_amount':
                amount = round(random.uniform(50000, 1000000), 2)
            elif susp_type == 'high_risk_country':
                if trans_type in ['TRANSFER_OUT', 'PAYMENT']:
                    receiver_country = random.choice(HIGH_RISK_COUNTRIES)
                else:
                    sender_country = random.choice(HIGH_RISK_COUNTRIES)
                amount = round(random.uniform(5000, 50000), 2)
            elif susp_type == 'structuring':
                amount = round(random.uniform(8000, 9900), 2)
                trans_type = random.choice(['DEPOSIT', 'WITHDRAWAL'])

        transactions.append({
            'Transaction_ID': trans_id,
            'Customer_ID': customer_id,
            'Transaction_Date': trans_date,
            'Transaction_Type': trans_type,
            'Amount': amount,
            'Currency': currency,
            'Sender_ID': sender_id,
            'Receiver_ID': receiver_id,
            'Sender_Country': sender_country,
            'Receiver_Country': receiver_country,
            'Is_Suspicious_Label': is_suspicious
        })

    transaction_df = pd.DataFrame(transactions)
    transaction_df['Transaction_Date'] = pd.to_datetime(transaction_df['Transaction_Date'])
    
    return transaction_df

def feature_engineer_transactions(transactions_df, customer_df):
    required_cols = ['Customer_ID', 'Transaction_Date', 'Amount', 'Transaction_Type', 'Sender_Country', 'Receiver_Country']
    for col in required_cols:
        if col not in transactions_df.columns:
            transactions_df[col] = np.nan

    transactions_df['Transaction_Date'] = pd.to_datetime(transactions_df['Transaction_Date'], errors='coerce')
    transactions_df.dropna(subset=['Transaction_Date'], inplace=True)

    transactions_df['Amount_USD'] = transactions_df['Amount']
    transactions_df['Transaction_Hour'] = transactions_df['Transaction_Date'].dt.hour
    transactions_df['Transaction_DayOfWeek'] = transactions_df['Transaction_Date'].dt.dayofweek

    transactions_df['Sender_Country_Risk_Score'] = transactions_df['Sender_Country'].map(
        {k: (10 if v == 'HIGH' else 5 if v == 'MEDIUM' else 1) for k, v in COUNTRY_RISK_MAP.items()}
    ).fillna(0)
    transactions_df['Receiver_Country_Risk_Score'] = transactions_df['Receiver_Country'].map(
        {k: (10 if v == 'HIGH' else 5 if v == 'MEDIUM' else 1) for k, v in COUNTRY_RISK_MAP.items()}
    ).fillna(0)
    transactions_df['Geographic_Risk_Score'] = transactions_df[['Sender_Country_Risk_Score', 'Receiver_Country_Risk_Score']].max(axis=1)

    transactions_df = pd.get_dummies(transactions_df, columns=['Transaction_Type'], prefix='TxType', dummy_na=False)

    customer_agg_features = transactions_df.groupby('Customer_ID').agg(
        Total_Amount=('Amount_USD', 'sum'),
        Avg_Amount=('Amount_USD', 'mean'),
        Num_Transactions=('Transaction_ID', 'count'),
        Max_Amount=('Amount_USD', 'max'),
        Min_Amount=('Amount_USD', 'min'),
        Unique_Counterparties=('Receiver_ID', lambda x: x.nunique()),
    ).reset_index()

    transactions_df_features = pd.merge(transactions_df, customer_agg_features, on='Customer_ID', how='left')
    transactions_df_features['Amount_Geo_Risk_Interaction'] = transactions_df_features['Amount_USD'] * transactions_df_features['Geographic_Risk_Score']

    model_features = [
        'Amount_USD', 'Transaction_Hour', 'Transaction_DayOfWeek',
        'Sender_Country_Risk_Score', 'Receiver_Country_Risk_Score', 'Geographic_Risk_Score',
        'Total_Amount', 'Avg_Amount', 'Num_Transactions', 'Max_Amount', 'Min_Amount', 'Unique_Counterparties',
        'Amount_Geo_Risk_Interaction'
    ]
    
    for col in transactions_df_features.columns:
        if col.startswith('TxType_'):
            model_features.append(col)
    
    for feature in model_features:
        if feature not in transactions_df_features.columns:
            transactions_df_features[feature] = 0

    return transactions_df_features, model_features

def train_aml_model(X_train_scaled, contamination_rate=0.01):
    model = IsolationForest(random_state=42, contamination=contamination_rate, n_estimators=200, max_features=1.0)
    model.fit(X_train_scaled)
    return model

def train_sanctions_model(X_train, y_train, model_type='GradientBoosting'):
    if model_type == 'LogisticRegression':
        model = LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced', max_iter=1000)
        param_grid = {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2']
        }
    elif model_type == 'GradientBoosting':
        model = GradientBoostingClassifier(random_state=42)
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5]
        }
    else:
        raise ValueError("model_type must be 'LogisticRegression' or 'GradientBoosting'")

    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=0)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

def generate_and_label_training_data(customer_df, sanctions_df_cleaned, num_samples=20000, start_date_offset_years=0):
    expected_training_df_cols = [
        'Customer_ID', 'Customer_Name', 'Customer_Address', 'Customer_DOB', 'Customer_Nationality', 'Customer_Country',
        'Sanctioned_ID', 'Sanctioned_Name', 'Sanctioned_Address', 'Sanctioned_DOB', 'Sanctioned_Nationality', 'Sanction_Type',
        'is_sanction_match'
    ]
    if customer_df.empty or sanctions_df_cleaned.empty:
        return pd.DataFrame(columns=expected_training_df_cols)
    
    if len(customer_df) < 1 or len(sanctions_df_cleaned) < 1:
        return pd.DataFrame(columns=expected_training_df_cols)

    training_samples = []
    num_true_positives = int(num_samples * 0.05)
    for _ in range(num_true_positives):
        sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
        customer_entity = customer_df.sample(1).iloc[0]
        cust_name_tp = sanctioned_entity['Sanctioned_Name']
        if len(cust_name_tp) > 3:
            idx = random.randint(0, len(cust_name_tp) - 1)
            cust_name_tp = cust_name_tp[:idx] + random.choice(string.ascii_uppercase) + cust_name_tp[idx+1:]
        cust_address_tp = sanctioned_entity['Sanctioned_Address']
        if len(cust_address_tp) > 5:
            idx = random.randint(0, len(cust_address_tp) - 1)
            cust_address_tp = cust_address_tp[:idx] + random.choice(string.ascii_uppercase) + cust_address_tp[idx+1:]
        
        # Adjust DOB for backtesting if needed, e.g., shift earlier
        original_dob = pd.to_datetime(sanctioned_entity['Sanctioned_DOB'], errors='coerce')
        if pd.notna(original_dob):
            cust_dob_tp = (original_dob - pd.DateOffset(years=start_date_offset_years)).strftime('%Y-%m-%d')
        else:
            cust_dob_tp = (datetime.date(1950, 1, 1) - datetime.timedelta(days=365*start_date_offset_years)).strftime('%Y-%m-%d')

        cust_nationality_tp = sanctioned_entity['Sanctioned_Nationality']
        cust_country_tp = sanctioned_entity['Sanctioned_Nationality']
        training_samples.append({
            'Customer_ID': customer_entity['Customer_ID'], 'Customer_Name': cust_name_tp, 'Customer_Address': cust_address_tp,
            'Customer_DOB': cust_dob_tp, 'Customer_Nationality': cust_nationality_tp, 'Customer_Country': cust_country_tp,
            'Sanctioned_ID': sanctioned_entity['Sanctioned_ID'], 'Sanctioned_Name': sanctioned_entity['Sanctioned_Name'],
            'Sanctioned_Address': sanctioned_entity['Sanctioned_Address'], 'Sanctioned_DOB': sanctioned_entity['Sanctioned_DOB'],
            'Sanctioned_Nationality': sanctioned_entity['Sanctioned_Nationality'], 'Sanction_Type': sanctioned_entity['Sanction_Type'],
            'is_sanction_match': 1
        })

    num_true_negatives = num_samples - num_true_positives
    for _ in range(num_true_negatives):
        sanctioned_entity = sanctions_df_cleaned.sample(1).iloc[0]
        customer_entity = customer_df.sample(1).iloc[0]
        cust_name_tn = random.choice(COMMON_FULL_NAMES)
        cust_address_tn = f"{random.randint(1000, 9999)} {random.choice(['Road', 'Lane', 'Square'])}"
        
        cust_dob_tn = (datetime.date(1940, 1, 1) + datetime.timedelta(days=random.randint(0, 365 * 60)) - datetime.timedelta(days=365*start_date_offset_years)).strftime('%Y-%m-%d')
        cust_nationality_tn = random.choice(list(COUNTRY_RISK_MAP.keys()))
        cust_country_tn = random.choice(list(COUNTRY_RISK_MAP.keys()))
        training_samples.append({
            'Customer_ID': customer_entity['Customer_ID'], 'Customer_Name': cust_name_tn, 'Customer_Address': cust_address_tn,
            'Customer_DOB': cust_dob_tn, 'Customer_Nationality': cust_nationality_tn, 'Customer_Country': cust_country_tn,
            'Sanctioned_ID': sanctioned_entity['Sanctioned_ID'], 'Sanctioned_Name': sanctioned_entity['Sanctioned_Name'],
            'Sanctioned_Address': sanctioned_entity['Sanctioned_Address'], 'Sanctioned_DOB': sanctioned_entity['Sanctioned_DOB'],
            'Sanctioned_Nationality': sanctioned_entity['Sanctioned_Nationality'], 'Sanction_Type': sanctioned_entity['Sanction_Type'],
            'is_sanction_match': 0
        })
    
    training_df = pd.DataFrame(training_samples, columns=expected_training_df_cols)
    required_cols_for_features_input = [
        'Customer_ID', 'Customer_Name', 'Customer_Address', 'Customer_DOB', 'Customer_Nationality', 'Customer_Country',
        'Sanctioned_ID', 'Sanctioned_Name', 'Sanctioned_Address', 'Sanctioned_DOB', 'Sanctioned_Nationality', 'Sanction_Type'
    ]
    for col in required_cols_for_features_input:
        if col not in training_df.columns:
            training_df[col] = np.nan

    training_df_features = calculate_sanctions_features(training_df.copy())
    
    if 'is_sanction_match' not in training_df_features.columns:
        if 'is_sanction_match' in training_df.columns:
            training_df_features['is_sanction_match'] = training_df['is_sanction_match']
        else:
            training_df_features['is_sanction_match'] = 0

    feature_cols = [
        'name_match_score', 'address_match_score', 'dob_match',
        'nationality_match', 'customer_country_risk_score', 'sanction_type_severity_score',
        'name_country_interaction', 'name_dob_interaction'
    ]
    for col in feature_cols:
        if col not in training_df_features.columns:
            training_df_features[col] = 0

    training_df_features[feature_cols] = training_df_features[feature_cols].fillna(0)
    return training_df_features

def evaluate_model_performance(model_type, model, X_test, y_test_true, model_name="Model"):
    """
    Evaluates and prints performance metrics.
    """
    if model_type == 'Sanctions':
        y_proba = model.predict_proba(X_test)[:, 1]
        y_pred = model.predict(X_test)
        
        report = classification_report(y_test_true, y_pred, output_dict=True)
        roc_auc = roc_auc_score(y_test_true, y_proba)

        print(f"  {model_name} Performance:")
        print(f"    Accuracy: {report['accuracy']:.4f}")
        print(f"    Precision (Sanctioned): {report['1']['precision']:.4f}")
        print(f"    Recall (Sanctioned): {report['1']['recall']:.4f}")
        print(f"    F1-Score (Sanctioned): {report['1']['f1-score']:.4f}")
        print(f"    ROC AUC: {roc_auc:.4f}")

    elif model_type == 'AML':
        y_pred_aml = np.where(model.predict(X_test) == -1, 1, 0) # -1 for outlier -> 1 for suspicious
        print(f"  {model_name} Performance:")
        print(classification_report(y_test_true, y_pred_aml, target_names=['Normal', 'Suspicious']))
        try:
            anomaly_scores = model.decision_function(X_test)
            roc_auc = roc_auc_score(y_test_true, -anomaly_scores)
            print(f"    ROC AUC: {roc_auc:.4f}")
        except ValueError as e:
            print(f"    Could not compute ROC AUC: {e}. (Likely only one class present in true labels)")
    else:
        print("Invalid model_type for evaluation.")

# --- Main Execution Flow for Backtesting ---
if __name__ == "__main__":
    print("--- Starting Backtesting Analysis ---")

    # 1. Load Initial Data and Trained Models
    # We load initial data to use for generating *new* historical data
    sanctions_df, customer_df = load_or_generate_initial_data()
    if customer_df.empty or sanctions_df.empty:
        print("FATAL: Customer or sanctions data is empty. Cannot proceed with backtesting. Exiting.")
        exit()

    sanctions_model_filename = 'sanctions_screening_gb_model.joblib'
    aml_model_filename = 'aml_isolation_forest_model.joblib'
    aml_scaler_filename = 'aml_scaler.joblib'

    sanctions_model = None
    aml_model = None
    aml_scaler = None

    try:
        sanctions_model = joblib.load(sanctions_model_filename)
        print(f"Loaded Sanctions Screening Model from '{sanctions_model_filename}'.")
    except FileNotFoundError:
        print(f"Warning: Sanctions model '{sanctions_model_filename}' not found. Skipping sanctions backtesting.")

    try:
        aml_model = joblib.load(aml_model_filename)
        aml_scaler = joblib.load(aml_scaler_filename)
        print(f"Loaded AML Isolation Forest Model and Scaler from '{aml_model_filename}' and '{aml_scaler_filename}'.")
    except FileNotFoundError:
        print(f"Warning: AML model or scaler not found. Skipping AML backtesting.")

    # Define feature lists (ensure consistency with training)
    sanctions_ml_features = [
        'name_match_score', 'address_match_score', 'dob_match',
        'nationality_match', 'customer_country_risk_score', 'sanction_type_severity_score',
        'name_country_interaction', 'name_dob_interaction'
    ]
    
    # For AML features, we need to generate some data to get the full list of TxType_ columns
    temp_aml_data, temp_aml_features_list = feature_engineer_transactions(generate_transaction_data(customer_df.head(10)), customer_df.head(10))
    aml_features_list = temp_aml_features_list # This will include all possible TxType_ columns

    # --- 2. Backtest Sanctions Screening Model ---
    if sanctions_model:
        print("\n--- Running Sanctions Model Backtest ---")
        # Simulate historical sanctions test data (e.g., from a period 1 year ago)
        # Using a distinct set of samples, potentially with slight historical variations in labels if needed
        historical_sanctions_test_data = generate_and_label_training_data(customer_df.copy(), sanctions_df.copy(), num_samples=5000, start_date_offset_years=1)
        
        if not historical_sanctions_test_data.empty:
            # Ensure all expected features are present, fill with 0 if not
            for feature in sanctions_ml_features:
                if feature not in historical_sanctions_test_data.columns:
                    historical_sanctions_test_data[feature] = 0

            X_historical_sanctions = historical_sanctions_test_data[sanctions_ml_features].fillna(0)
            y_historical_sanctions = historical_sanctions_test_data['is_sanction_match']

            if len(y_historical_sanctions.unique()) < 2:
                print("Warning: Historical sanctions data has only one class in true labels. Cannot compute full classification report.")
            else:
                evaluate_model_performance('Sanctions', sanctions_model, X_historical_sanctions, y_historical_sanctions, "Historical Sanctions Model")
        else:
            print("Skipping Sanctions Model Backtest due to empty historical data.")
    else:
        print("Sanctions Model not loaded, skipping backtest.")

    # --- 3. Backtest AML Transaction Monitoring Model ---
    if aml_model and aml_scaler:
        print("\n--- Running AML Model Backtest ---")
        # Simulate historical AML transaction data
        historical_aml_raw = generate_transaction_data(customer_df.copy(), num_transactions=20000)
        
        if not historical_aml_raw.empty:
            historical_aml_features, _ = feature_engineer_transactions(historical_aml_raw.copy(), customer_df.copy())
            
            # Ensure all expected features are present in historical_aml_features
            for feature in aml_features_list:
                if feature not in historical_aml_features.columns:
                    historical_aml_features[feature] = 0

            X_historical_aml = historical_aml_features[aml_features_list].fillna(0)
            y_historical_aml_labels = historical_aml_features['Is_Suspicious_Label']

            # Scale historical data using the *same scaler* that was fitted on the training data
            X_historical_aml_scaled = aml_scaler.transform(X_historical_aml)
            
            evaluate_model_performance('AML', aml_model, X_historical_aml_scaled, y_historical_aml_labels, "Historical AML Model")
        else:
            print("Skipping AML Model Backtest due to empty historical data.")
    else:
        print("AML Model or Scaler not loaded, skipping backtest.")

    print("\n--- Backtesting Analysis Complete ---")


--- Starting Backtesting Analysis ---
Loaded Sanctions Screening Model from 'sanctions_screening_gb_model.joblib'.
Loaded AML Isolation Forest Model and Scaler from 'aml_isolation_forest_model.joblib' and 'aml_scaler.joblib'.

--- Running Sanctions Model Backtest ---


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  Historical Sanctions Model Performance:
    Accuracy: 0.9500
    Precision (Sanctioned): 0.0000
    Recall (Sanctioned): 0.0000
    F1-Score (Sanctioned): 0.0000
    ROC AUC: 1.0000

--- Running AML Model Backtest ---
  Historical AML Model Performance:
              precision    recall  f1-score   support

      Normal       0.99      1.00      0.99     19795
  Suspicious       0.50      0.31      0.38       205

    accuracy                           0.99     20000
   macro avg       0.75      0.65      0.69     20000
weighted avg       0.99      0.99      0.99     20000

    ROC AUC: 0.8071

--- Backtesting Analysis Complete ---
