In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load preprocessed data
data = pd.read_csv('../data/processed_creditcard.csv')

In [2]:
# Split into features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (226980, 30)
Testing set shape: (56746, 30)


In [3]:
# Handle Class Imbalance
from imblearn.over_sampling import SMOTE

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Resampled training set shape:", X_train_resampled.shape)

Resampled training set shape: (453204, 30)


Use SMOTE (Synthetic Minority Oversampling Technique) to balance the dataset. This will generate synthetic samples of the minority class (frauds) to balance the class distribution.

In [4]:
# Use a smaller subset for tuning
X_train_sample, _, y_train_sample, _ = train_test_split(X_train_resampled, y_train_resampled, train_size=0.2, random_state=42)

In [5]:
# Define Models and Hyperparameter Grids
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Define models
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(class_weight='balanced', random_state=42)
}

# Define hyperparameter grids
# Use fewer iterations and narrower ranges
param_grids = {
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l2']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.1, 0.2],
        'max_depth': [3, 5]
    },
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5]
    }
}

We just defined the models you want to optimize (i.e, Random Forest, Logistic Regression, Gradient Boosting) and their respective hyperparameter grids.

In [6]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Dictionary to store the best models
best_models = {}

# Perform Randomized Search for each model
# Perform Randomized Search with fewer iterations
for model_name, model in models.items():
    print(f"Optimizing {model_name}...")
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grids[model_name],
        n_iter=5,  # Fewer iterations
        cv=3,
        scoring='f1',
        n_jobs=-1,
        random_state=42
    )
    random_search.fit(X_train_sample, y_train_sample)
    best_models[model_name] = random_search.best_estimator_
    print(f"Best parameters for {model_name}:", random_search.best_params_)
    print("-----------------------------------------------------------------------")

Optimizing Logistic Regression...




Best parameters for Logistic Regression: {'penalty': 'l2', 'C': 10}
-----------------------------------------------------------------------
Optimizing Gradient Boosting...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for Gradient Boosting: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.2}
-----------------------------------------------------------------------
Optimizing Random Forest...
Best parameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 2, 'max_depth': 20}
-----------------------------------------------------------------------


We use GridSearchCV to find the best hyperparameters for each model. Optimize for F1-score since it balances precision and recall.

In [9]:
# Select the Best Model
from sklearn.metrics import classification_report

# Compare models and select the best one
best_model_name = None
best_f1_score = 0

for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    f1_score = report['weighted avg']['f1-score']
    
    if f1_score > best_f1_score:
        best_f1_score = f1_score
        best_model_name = model_name

print(f"The best model is {best_model_name} with an F1-score of {best_f1_score:.2f}")

The best model is Random Forest with an F1-score of 1.00


Compare the performance of all optimized models and select the one with the highest F1-score and AUPRC.
Given the high class imbalance (frauds account for only 0.172% of transactions), it's crucial to focus on metrics like Precision-Recall Curve (AUPRC) and F1-score rather than accuracy. Accuracy can be misleading in such cases because a model that always predicts the majority class (non-fraud) would still achieve high accuracy but fail to detect fraud.

In [10]:
# Save the Final Model
import joblib

# Save the final model
best_model = best_models[best_model_name]
joblib.dump(best_model, '../models/final_model.pkl')

['../models/final_model.pkl']

In [14]:
from sklearn.metrics import precision_recall_curve, auc

# Generate Performance Report
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()

# Add AUPRC to the report
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)
report_df.loc['AUPRC'] = {'precision': auprc, 'recall': '', 'f1-score': '', 'support': ''}

# Save report
report_df.to_csv('../reports/final_model_performance.txt', sep='\t')

Document Findings
Update the project documentation to include the results of Phase 4. Highlight:

The best-performing model and its hyperparameters.

Key metrics (F1-score, AUPRC, Confusion Matrix).

Insights from the Precision-Recall Curve.