# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.sparse import issparse
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
import joblib

# --- 1. Load data ---

In [2]:
print("--- 1. Loading Feature Engineered Data ---")
df = pd.read_pickle('../data/feature_engineered_transactions.pkl')
print(f"Data loaded: Rows={len(df)}, Columns={len(df.columns)}")

X = df.drop('is_fraud', axis=1)
y = df['is_fraud']

# Drop identifiers
identifiers = ['customer_id', 'device_id', 'transaction_id']
X = X.drop(columns=[c for c in identifiers if c in X.columns])

categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64', 'int32']).columns.tolist()
print(f"Numerical: {len(numerical_features)}, Categorical: {len(categorical_features)}")

--- 1. Loading Feature Engineered Data ---
Data loaded: Rows=10200, Columns=36
Numerical: 23, Categorical: 9


# --- 2. Train/test split ---

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print(f"Train={len(X_train)}, Test={len(X_test)}, Fraud rate={y_train.mean():.4f}")

Train=8160, Test=2040, Fraud rate=0.0194


# --- 3. Preprocessing for XGBoost & LightGBM ---

In [4]:
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median'))
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))  # FIX
])
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --- 4. Function to evaluate models ---

In [5]:
def evaluate_model(model, X_test, y_test, model_name):
    y_pred_proba = model.predict_proba(X_test)[:,1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
    auc_pr = auc(recall, precision)
    f1_scores = 2*(precision[:-1]*recall[:-1])/(precision[:-1]+recall[:-1])
    f1_scores = np.nan_to_num(f1_scores)
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx]
    y_pred_opt = (y_pred_proba >= best_threshold).astype(int)
    print(f"\n--- {model_name} ---")
    print(f"AUC-ROC: {auc_roc:.4f}, AUC-PR: {auc_pr:.4f}, Best threshold={best_threshold:.4f}")
    print(classification_report(y_test, y_pred_opt, target_names=['Non-Fraud','Fraud']))

# --- 5. XGBoost ---

In [6]:
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    use_label_encoder=False,
    scale_pos_weight=(len(y_train)-y_train.sum())/y_train.sum()
)
xgb_model.fit(X_train_processed, y_train)
evaluate_model(xgb_model, X_test_processed, y_test, "XGBoost")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- XGBoost ---
AUC-ROC: 0.7148, AUC-PR: 0.0848, Best threshold=0.3090
              precision    recall  f1-score   support

   Non-Fraud       0.98      0.97      0.98      2001
       Fraud       0.12      0.23      0.16        39

    accuracy                           0.95      2040
   macro avg       0.55      0.60      0.57      2040
weighted avg       0.97      0.95      0.96      2040



# --- 6. LightGBM ---

In [None]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    is_unbalance=True,  
    random_state=42
)
lgb_model.fit(X_train_processed, y_train)
evaluate_model(lgb_model, X_test_processed, y_test, "LightGBM")

[LightGBM] [Info] Number of positive: 158, number of negative: 8002
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002636 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1726
[LightGBM] [Info] Number of data points in the train set: 8160, number of used features: 67
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019363 -> initscore=-3.924852
[LightGBM] [Info] Start training from score -3.924852

--- LightGBM ---
AUC-ROC: 0.6814, AUC-PR: 0.0723, Best threshold=0.4379
              precision    recall  f1-score   support

   Non-Fraud       0.98      0.98      0.98      2001
       Fraud       0.12      0.13      0.12        39

    accuracy                           0.97      2040
   macro avg       0.55      0.55      0.55      2040
weighted avg       0.97      0.97      0.97      2040





# --- 7. CatBoost ---

In [9]:
# --- 7. Hyperparameter Tuning for CatBoost (Best Performer) ---
print("\n--- 7. CatBoost Hyperparameter Tuning (Randomized Search) ---")

# Initialize results list to collect model performance metrics
results = []

# Calculate class weight for imbalanced data
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

# Define the parameter grid for Randomized Search
param_dist = {
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'depth': [5, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 10],
    'random_seed': [42]
}

base_cat_model = CatBoostClassifier(
    iterations=500,
    eval_metric='F1',
    verbose=0,
    early_stopping_rounds=50,
    scale_pos_weight=pos_weight,
    allow_writing_files=False
)

f1_scorer = make_scorer(f1_score, pos_label=1)

random_search = RandomizedSearchCV(
    estimator=base_cat_model,
    param_distributions=param_dist,
    n_iter=10,
    scoring=f1_scorer,
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit the search on the raw training data (CatBoost handles categories)
random_search.fit(X_train, y_train, cat_features=categorical_features)

best_cat_model = random_search.best_estimator_
# Evaluate the best model on the test set
y_pred_proba = best_cat_model.predict_proba(X_test)[:,1]
auc_roc = roc_auc_score(y_test, y_pred_proba)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)
auc_pr = auc(recall, precision)
f1_scores = 2*(precision[:-1]*recall[:-1])/(precision[:-1]+recall[:-1])
f1_scores = np.nan_to_num(f1_scores)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
y_pred_opt = (y_pred_proba >= best_threshold).astype(int)
print(f"\n--- CatBoost (Tuned) ---")
print(f"AUC-ROC: {auc_roc:.4f}, AUC-PR: {auc_pr:.4f}, Best threshold={best_threshold:.4f}")
print(classification_report(y_test, y_pred_opt, target_names=['Non-Fraud','Fraud']))

# Extract metrics for results tracking
report_dict = classification_report(y_test, y_pred_opt, target_names=['Non-Fraud','Fraud'], output_dict=True)
results.append({
    'model_name': 'CatBoost (Tuned)',
    'model_object': best_cat_model,
    'auc_roc': auc_roc,
    'auc_pr': auc_pr,
    'f1_fraud': report_dict['Fraud']['f1-score'],
    'precision_fraud': report_dict['Fraud']['precision'],
    'recall_fraud': report_dict['Fraud']['recall']
})
# Evaluate the best model on the test set
results.append(evaluate_model(best_cat_model, X_test, y_test, "CatBoost (Tuned)"))


--- 7. CatBoost Hyperparameter Tuning (Randomized Search) ---
Fitting 3 folds for each of 10 candidates, totalling 30 fits

--- CatBoost (Tuned) ---
AUC-ROC: 0.7150, AUC-PR: 0.0700, Best threshold=0.6705
              precision    recall  f1-score   support

   Non-Fraud       0.98      0.99      0.98      2001
       Fraud       0.15      0.13      0.14        39

    accuracy                           0.97      2040
   macro avg       0.57      0.56      0.56      2040
weighted avg       0.97      0.97      0.97      2040


--- CatBoost (Tuned) ---
AUC-ROC: 0.7150, AUC-PR: 0.0700, Best threshold=0.6705
              precision    recall  f1-score   support

   Non-Fraud       0.98      0.99      0.98      2001
       Fraud       0.15      0.13      0.14        39

    accuracy                           0.97      2040
   macro avg       0.57      0.56      0.56      2040
weighted avg       0.97      0.97      0.97      2040



  f1_scores = 2*(precision[:-1]*recall[:-1])/(precision[:-1]+recall[:-1])
  f1_scores = 2*(precision[:-1]*recall[:-1])/(precision[:-1]+recall[:-1])


# --- 8. Save models ---

In [10]:
print("\n--- 8. Final Model Performance Summary ---")
results = [r for r in results if r is not None] 
results_df = pd.DataFrame(results)

if results_df.empty:
    print("ERROR: No models successfully trained or evaluated. Cannot proceed with final selection.")
else:
    # Selecting the model with the highest F1-Score for the fraud class
    best_row = results_df.sort_values(by='f1_fraud', ascending=False).iloc[0]
    final_model = best_row['model_object']
    final_model_name = best_row['model_name']

    print(results_df[['model_name', 'auc_roc', 'auc_pr', 'f1_fraud', 'precision_fraud', 'recall_fraud']].sort_values(by='f1_fraud', ascending=False).to_string(index=False))
    


--- 8. Final Model Performance Summary ---
      model_name  auc_roc   auc_pr  f1_fraud  precision_fraud  recall_fraud
CatBoost (Tuned)  0.71495 0.070049  0.138889         0.151515      0.128205


# --- 9. Feature Importance Analysis for the Best Model (XGBoost) ---

In [None]:
print(f"\n--- 9. Feature Importance Analysis for Selected Model: {final_model_name} ---")

if final_model_name == 'XGBoost':
        # This model uses the preprocessor, so we extract feature names from it
        feature_names = preprocessor.get_feature_names_out()
        importance = final_model.feature_importances_
        
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values(by='importance', ascending=False)
        
        # Save the preprocessor as the best model needs it
        joblib.dump(preprocessor, '../models/preprocessor.joblib')
        print("Preprocessor saved to '../models/preprocessor.joblib'.")

        # Save the best model
        joblib.dump(final_model, f'../models/{final_model_name}_fraud_model.joblib')
        print(f"Final model ({final_model_name}) saved to '../models/{final_model_name}_fraud_model.joblib'.")

        print("\nTop 15 Most Important Features:")
        print(importance_df.head(15).to_string(index=False))

elif final_model_name.startswith('CatBoost'):
        importance = final_model.get_feature_importance()
        feature_names = X_train.columns
        
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values(by='importance', ascending=False)
        
        joblib.dump(final_model, f'../models/{final_model_name}_fraud_model.joblib')
        print(f"Final model ({final_model_name}) saved to '../models/{final_model_name}_fraud_model.joblib'.")

        print("\nTop 15 Most Important Features:")
        print(importance_df.head(15).to_string(index=False))
        
else:
        print("Selected model is LightGBM or another type. Skipping detailed feature importance extraction.")


--- 9. Feature Importance Analysis for Selected Model: CatBoost (Tuned) ---
Final model (CatBoost (Tuned)) saved to '../models/CatBoost (Tuned)_fraud_model.joblib'.

Top 15 Most Important Features:
                  feature  importance
            ip_risk_score   10.877432
         txn_day_of_month    8.906903
      risk_score_internal    7.733498
      mean_amount_prev_3d    7.666782
       device_trust_score    6.879754
                      fee    6.639405
                 txn_hour    5.778870
exchange_rate_src_to_dest    4.941788
          txn_day_of_week    4.606931
        txn_count_prev_3d    4.501585
         account_age_days    4.286968
                  channel    4.055739
               amount_src    3.784763
               amount_usd    3.645717
                 kyc_tier    2.310178
