In [None]:
# Credit Card Fraud
# https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [2]:
import zipfile
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE  # SMOTE for oversampling the minority class
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import joblib

In [3]:
# Path to the zip file
zip_file = 'Credit Card Fraud1.zip'
extract_folder = 'Credit Card Fraud1'  # Folder to extract the contents

# Extract the contents of the zip file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# Check if the file exists after extraction
extracted_files = os.listdir(extract_folder)
print(f"Files extracted: {extracted_files}")

# Now load the CSV file
data = pd.read_csv(os.path.join(extract_folder, 'creditcard.csv'))

# Inspect the first few rows of the dataset
print(data.head())

# Check for missing values and general information
print(data.info())

# Check for class distribution
print(data['Class'].value_counts())

Files extracted: ['creditcard.csv']
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010 

In [4]:
# Feature columns (X) and target column (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Standardize all features (including 'Time' and 'Amount')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Scale all features

# Convert the scaled data back to DataFrame
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# Check the distribution of the classes before splitting
print("Class distribution before splitting:")
print(y.value_counts())

# Handle class imbalance by oversampling the minority class (frauds)
# Using SMOTE for oversampling the fraud cases (Class == 1)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Check the new class distribution
print("\nClass distribution after SMOTE oversampling:")
print(pd.Series(y_resampled).value_counts())

# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Model training using Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Model evaluation
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"\nROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")


Class distribution before splitting:
Class
0    284315
1       492
Name: count, dtype: int64

Class distribution after SMOTE oversampling:
Class
0    284315
1    284315
Name: count, dtype: int64

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56863
           1       1.00      1.00      1.00     56863

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726


ROC-AUC Score: 1.0000


In [6]:
# Feature columns (X) and target column (y)
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to balance the classes in the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the new class distribution
print(pd.Series(y_train_resampled).value_counts())

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')
grid_search.fit(X_train_resampled, y_train_resampled)

# Best parameters from GridSearchCV
print(f"Best Parameters: {grid_search.best_params_}")

# Evaluate the model on the test set using the best parameters
best_rf_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf_model.predict(X_test)
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]  # Get probability scores for ROC-AUC

# Evaluate the model's performance
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate the ROC-AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")


Class
0    227451
1    227451
Name: count, dtype: int64
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.83      0.83        98

    accuracy                           1.00     56962
   macro avg       0.91      0.91      0.91     56962
weighted avg       1.00      1.00      1.00     56962

ROC-AUC Score: 0.9812


In [8]:
# Save the Model (Optional)

# Save the best model from GridSearchCV (not the initial rf_model)
joblib.dump(best_rf_model, 'credit_card_fraud_model.pkl')

print("Model saved as 'credit_card_fraud_model.pkl'")

Model saved as 'credit_card_fraud_model.pkl'
