In [1]:
import pandas as pd

# Read the dataset from creditcard_cleaned.csv and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/creditcard_cleaned.csv')
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"train shape: {X_train.shape}, test shape: {X_test.shape}")

train shape: (199364, 30), test shape: (85443, 30)


In [3]:
# Build a 50/50 training sample by undersampling: keep every Class == 1 row of
# the training split and draw an equally sized random subset (fixed seed) of
# the Class == 0 rows.
train_data = pd.concat([X_train, y_train], axis=1)
train_labels = train_data['Class']

fraud = train_data.loc[train_labels == 1]
non_fraud = train_data.loc[train_labels == 0].sample(
    n=len(fraud),
    random_state=42,
)

# Stack the two groups, then split back into target and features.
balanced_train = pd.concat([fraud, non_fraud])
y_train_balanced = balanced_train['Class']
X_train_balanced = balanced_train.drop(columns='Class')

print("balanced class distribution:\n", y_train_balanced.value_counts())


balanced class distribution:
 Class
1    344
0    344
Name: count, dtype: int64


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

In [5]:
# Fit logistic regression on the balanced training sample (fit returns the
# estimator itself, so construction and fitting can be chained).
lr = LogisticRegression(max_iter=7000, random_state=42).fit(
    X_train_balanced, y_train_balanced
)

# Score the test split: hard labels feed the report, accuracy and F1;
# the probability column for class 1 feeds ROC AUC.
y_pred_lr = lr.predict(X_test)
y_proba_lr = lr.predict_proba(X_test)[:, 1]

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("AUC Score:", roc_auc_score(y_test, y_proba_lr))

Logistic Regression Results:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85295
           1       0.06      0.89      0.12       148

    accuracy                           0.98     85443
   macro avg       0.53      0.93      0.55     85443
weighted avg       1.00      0.98      0.99     85443

Accuracy: 0.9774703603571971
F1 Score: 0.11979881115683585
AUC Score: 0.9709607990075778


In [6]:
# Fit a decision tree (default hyperparameters, fixed seed) on the balanced
# training sample.
dt = DecisionTreeClassifier(random_state=42).fit(
    X_train_balanced, y_train_balanced
)

# Score the test split: hard labels feed the report, accuracy and F1;
# the probability column for class 1 feeds ROC AUC.
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:, 1]

print("Decision Tree Results:")
print(classification_report(y_test, y_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print("AUC Score:", roc_auc_score(y_test, y_proba_dt))


Decision Tree Results:
              precision    recall  f1-score   support

           0       1.00      0.89      0.94     85295
           1       0.01      0.89      0.03       148

    accuracy                           0.89     85443
   macro avg       0.51      0.89      0.48     85443
weighted avg       1.00      0.89      0.94     85443

Accuracy: 0.8866729866694756
F1 Score: 0.02654066552729466
AUC Score: 0.8892779114773369


In [7]:
# Fit a random forest (default hyperparameters, fixed seed) on the balanced
# training sample.
rf = RandomForestClassifier(random_state=42).fit(
    X_train_balanced, y_train_balanced
)

# Score the test split: hard labels feed the report, accuracy and F1;
# the probability column for class 1 feeds ROC AUC.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("AUC Score:", roc_auc_score(y_test, y_proba_rf))

Random Forest Results:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85295
           1       0.08      0.86      0.14       148

    accuracy                           0.98     85443
   macro avg       0.54      0.92      0.57     85443
weighted avg       1.00      0.98      0.99     85443

Accuracy: 0.9821635476282434
F1 Score: 0.14285714285714285
AUC Score: 0.9747718965814984


In [8]:
from sklearn.model_selection import GridSearchCV 
# Hyperparameter search for the random forest, selecting by F1 with 3-fold
# cross-validation. Only `max_depth` actually varies; the other entries pin a
# single value each, so the grid has two candidates.
param_grid = {
    'n_estimators': [50],
    'max_depth': [None, 10],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Use a dedicated name for the search template instead of rebinding `rf`.
# GridSearchCV works on clones of the estimator it is given, so the object
# passed in stays unfitted; reusing `rf` here would replace the fitted forest
# from the earlier cell with an unfitted one and break any later `rf.predict`.
rf_search_template = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf_search_template,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,   # use all available cores
    verbose=1
)

# NOTE: the search is fitted on the full training split (X_train / y_train),
# not on the undersampled X_train_balanced used by the earlier models.
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score from GridSearchCV:", grid_search.best_score_)

# With the default refit=True, best_estimator_ is the winning configuration
# retrained on all of X_train.
best_rf = grid_search.best_estimator_

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best F1 Score from GridSearchCV: 0.8444966812233158


In [9]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score

# Evaluate the tuned random forest on the held-out test split: hard labels
# feed the report, accuracy and F1; the probability column for class 1 feeds
# ROC AUC.
y_pred_best = best_rf.predict(X_test)
y_proba_best = best_rf.predict_proba(X_test)[:, 1]

print("Evaluation on Test Set:")
print(classification_report(y_test, y_pred_best))
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("F1 Score:", f1_score(y_test, y_pred_best))
print("AUC Score:", roc_auc_score(y_test, y_proba_best))


Evaluation on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.97      0.76      0.85       148

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy: 0.9995318516437859
F1 Score: 0.8484848484848485
AUC Score: 0.9280247962952107


In [10]:
import pickle

# Serialize the tuned estimator to best_model.pkl in the current working
# directory. Pickle files can execute arbitrary code when loaded, so only
# unpickle this file from a trusted location.
with open("best_model.pkl", mode="wb") as model_file:
    pickle.dump(best_rf, model_file)

print("Model saved as best_model.pkl")

Model saved as best_model.pkl
