# Model Training 

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Make the parent of the current working directory importable.
# NOTE(review): no project-local import is visible in this notebook - confirm
# that this path tweak is still required.
import os
import sys

project_root = os.path.abspath('..')
if project_root not in sys.path:
    # The guard keeps repeated runs of this cell from stacking duplicate entries.
    sys.path.append(project_root)

# Load the processed dataset into a DataFrame
df = pd.read_csv('../data/processed_data.csv')

In [3]:
# Separate the label column from the predictor columns
target_column = 'FraudResult'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two partitions, and the fixed seed makes
# the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Model pipelines. Both classifiers use class_weight='balanced' and the same
# random seed; the step names ('scaler', 'classifier') are what the
# hyperparameter grids refer to.

# Logistic regression: features are standardised before the classifier
lr_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=42)),
]
lr_pipe = Pipeline(lr_steps)

# Random forest: a single-step pipeline, no scaling stage
rf_steps = [
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42)),
]
rf_pipe = Pipeline(rf_steps)

In [4]:
# Candidate hyperparameters. The 'classifier__' prefix routes each setting to
# the pipeline step named 'classifier'.
lr_param_grid = dict(
    classifier__C=[0.001, 0.01, 0.1, 1, 10, 100],
    classifier__solver=['liblinear', 'saga'],
    classifier__penalty=['l1', 'l2'],
)

rf_param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5],
    classifier__max_features=['sqrt', 'log2'],
)

# Both searches follow the same protocol: 5-fold cross-validation, candidates
# ranked by ROC-AUC, n_jobs=-1, and verbose progress messages.
search_settings = {'cv': 5, 'scoring': 'roc_auc', 'n_jobs': -1, 'verbose': 1}

lr_grid = GridSearchCV(lr_pipe, lr_param_grid, **search_settings)
rf_grid = GridSearchCV(rf_pipe, rf_param_grid, **search_settings)


In [5]:
# Run each grid search on the training split, announcing it first
training_plan = (
    ("Training Logistic Regression...", lr_grid),
    ("\nTraining Random Forest...", rf_grid),
)
for banner, search in training_plan:
    print(banner)
    search.fit(X_train, y_train)

# fit() returns the search object itself, so echoing the last one fitted keeps
# the cell's displayed value.
rf_grid


Training Logistic Regression...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

Training Random Forest...
Fitting 5 folds for each of 24 candidates, totalling 120 fits


# Model Evaluation 

In [6]:
def evaluate_model(model, X_test, y_test):
    """Print a classification report and the ROC-AUC score for a fitted model.

    The model must provide ``predict`` and ``predict_proba``; column 1 of the
    probability output is the score handed to ``roc_auc_score``. Nothing is
    returned - the results are only printed.
    """
    predicted_labels = model.predict(X_test)
    positive_class_scores = model.predict_proba(X_test)[:, 1]

    # The report is printed before the AUC is computed, as in a plain
    # top-to-bottom evaluation.
    report = classification_report(y_test, predicted_labels)
    print(report)

    auc = roc_auc_score(y_test, positive_class_scores)
    print(f"ROC-AUC Score: {auc:.4f}\n")

# Report the winning hyperparameters and the held-out performance of each search
tuned_searches = (
    ("Logistic Regression Results:", lr_grid),
    ("Random Forest Results:", rf_grid),
)
for heading, search in tuned_searches:
    print(heading)
    print("Best Parameters:", search.best_params_)
    evaluate_model(search.best_estimator_, X_test, y_test)


Logistic Regression Results:
Best Parameters: {'classifier__C': 0.001, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     19094
           1       0.24      0.97      0.39        39

    accuracy                           0.99     19133
   macro avg       0.62      0.98      0.69     19133
weighted avg       1.00      0.99      1.00     19133

ROC-AUC Score: 0.9988

Random Forest Results:
Best Parameters: {'classifier__max_depth': 10, 'classifier__max_features': 'log2', 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     19094
           1       0.66      0.90      0.76        39

    accuracy                           1.00     19133
   macro avg       0.83      0.95      0.88     19133
weighted avg       1.00      1.00      1.00     19133

ROC-AUC Score: 0.9992

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Score a fitted classifier on held-out data, print a report, return the metrics.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``; column 1 of the
        probability output is used as the score for ROC-AUC.
    X_test : feature matrix, passed unchanged to the model.
    y_test : true labels for ``X_test``.
    model_name : str, default "Model"
        Label used in the printed heading. It is optional so that the
        three-argument call style ``evaluate_model(model, X_test, y_test)``
        used by the earlier evaluation cell keeps working once this
        definition has replaced the earlier one.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'roc_auc'``. The confusion matrix is printed but not returned.
    """
    # Generate predictions and probabilities
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)

    # Print metrics
    print(f"\n{model_name} Evaluation:")
    print("=============================")
    print(f"Accuracy:    {accuracy:.4f}")
    print(f"Precision:   {precision:.4f}")
    print(f"Recall:      {recall:.4f} ")
    print(f"F1 Score:    {f1:.4f}")
    print(f"ROC-AUC:     {roc_auc:.4f}")

    # Confusion matrix, preceded by a legend for its cells.
    # NOTE(review): the legend assumes two classes with the negative label
    # sorting first (e.g. 0/1) - confirm for other label encodings.
    print("\nConfusion Matrix:")
    print("[[TN  FP]")
    print(" [FN  TP]]")
    print(cm)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc
    }

# Score each tuned model on the held-out split. evaluate_model prints a
# per-model report and hands back the metrics as a dict.
print("\nModel Evaluation Results:")
lr_metrics = evaluate_model(lr_grid.best_estimator_, X_test, y_test, "Logistic Regression")
rf_metrics = evaluate_model(rf_grid.best_estimator_, X_test, y_test, "Random Forest")

# Side-by-side table: one row per metric, one column per model
print("\nModel Comparison:")
table_header = f"{'Metric':<12} {'Logistic Regression':<20} {'Random Forest':<15}"
print(table_header)
print("-" * 50)
for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc'):
    lr_value, rf_value = lr_metrics[metric], rf_metrics[metric]
    print(f"{metric:<12} {lr_value:<20.4f} {rf_value:<15.4f}")


Model Evaluation Results:

Logistic Regression Evaluation:
Accuracy:    0.9938
Precision:   0.2436
Recall:      0.9744 
F1 Score:    0.3897
ROC-AUC:     0.9988

Confusion Matrix:
[[TN  FP]
 [FN  TP]]
[[18976   118]
 [    1    38]]

Random Forest Evaluation:
Accuracy:    0.9989
Precision:   0.6604
Recall:      0.8974 
F1 Score:    0.7609
ROC-AUC:     0.9992

Confusion Matrix:
[[TN  FP]
 [FN  TP]]
[[19076    18]
 [    4    35]]

Model Comparison:
Metric       Logistic Regression  Random Forest  
--------------------------------------------------
accuracy     0.9938               0.9989         
precision    0.2436               0.6604         
recall       0.9744               0.8974         
f1           0.3897               0.7609         
roc_auc      0.9988               0.9992         


# Model Selection 
Based on the metrics above, Random Forest is the better model for this fraud-detection task.

Accuracy:

* Random Forest (0.9989) > Logistic Regression (0.9938)
* But accuracy is misleading in imbalanced datasets (common in fraud detection)


Precision (Anti-False Positive):

* RF: 0.6604 vs LR: 0.2436
* RF's precision is about 2.7× higher; per the confusion matrices it raises 18 false alarms versus 118 for LR (roughly 6.5× fewer)

Recall (Fraud Detection Rate):

* LR: 0.9744 vs RF: 0.8974
* LR misses fewer frauds (1 vs 4 of the 39 in the test set), but at the cost of far more false positives (118 vs 18)

F1 (Balance):

* RF: 0.7609 vs LR: 0.3897
* RF better balances precision/recall

# Feature Importance for Random Forest

In [12]:
# Rank the input columns by the importance scores of the tuned random forest
rf_classifier = rf_grid.best_estimator_.named_steps['classifier']
importances = rf_classifier.feature_importances_

# Pair every column name with its score, highest score first. The pipeline has
# no step that changes the columns, so X.columns lines up with the scores.
feature_importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importances})
    .sort_values('Importance', ascending=False)
)

print("Top 10 Important Features:")
print(feature_importance.head(10))

Top 10 Important Features:
         Feature  Importance
4          Value    0.176038
12     MaxAmount    0.169245
9      AvgAmount    0.161366
3         Amount    0.119453
7    TotalDebits    0.092914
11     StdAmount    0.087797
6    TotalAmount    0.068713
1      ProductId    0.031363
0     ProviderId    0.020528
8   TotalCredits    0.012502
