In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix 

In [None]:
# Load the raw credit-card transactions.
df = pd.read_csv('./data/creditcard.csv')

# `DataFrame.info()` prints its summary and returns None, so it must not be
# passed to `display` (that would render a stray "None" after the preview).
df.info()
display(df.head())

### Data Preprocessing

In [None]:
# Features: every column except the timestamp and the label.
X = df.drop(['Time', 'Class'], axis=1)
# Target: the `Class` column; its sum is reported below as the fraud count.
y = df.Class

n_fraud = y.sum()
print(f'X shape {X.shape}, y shape {y.shape} fraud cases {n_fraud}')

# Class distribution followed by the raw label column.
class_counts = y.value_counts()
display('y_value counts', class_counts, df['Class'])

In [None]:
# Horizontal bar per class to visualise how skewed the label distribution is.
class_palette = ['green', 'blue']
sn.countplot(data=df, y='Class', palette=class_palette)

The count plot above shows that we have an imbalanced dataset in which fraud cases are much rarer than non-fraud cases. This may lead our model to classify almost every case as the majority class, i.e. the non-fraud case.

In [None]:
# Stratify on the label so both splits keep the same fraud ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Fit the scaler on the training split only (no information from the test
# split leaks into it), then apply the SAME transformation to both splits so
# the models are evaluated on features with the scale they were trained on.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Keep X_test a DataFrame so later cells that call `X_test.values` keep working.
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)
print(X_train.shape , y_train.shape)

### Model Evaluation / Selection


Due to the imbalance in our dataset, we will use the F1 score and the confusion matrix to evaluate our models.

In [None]:
def heatmap(y_test, preds):
    """Plot the confusion matrix of `preds` against `y_test`, titled with the F1 score.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    preds : numpy.ndarray (or any object with ``.sum()``)
        Predicted class labels for the same samples.

    Returns
    -------
    None
        Draws a new 9x9 matplotlib figure and prints the sum of `preds`,
        which equals the number of predicted fraud cases when labels are
        0/1 with 1 = fraud.
    """
    plt.figure(figsize=(9,9))
    cm = confusion_matrix(y_test, preds)
    # Confusion-matrix cells are integer counts, so annotate them as integers
    # rather than floats with meaningless decimals.
    sn.heatmap(cm, annot=True, linewidths=.5, cmap='Blues_r', square=True, fmt='d')
    plt.ylabel('Actual')
    plt.xlabel('predicted')
    plt.title(f'f1_score {f1_score(y_test, preds)}', size=15)
    # This is the count of *predicted* positives, not the true number of frauds.
    print(f'number of predicted fraud cases {preds.sum()}')


In [None]:
# Histogram-based gradient boosting: small learning rate with a large
# iteration budget. `validation_fraction` and `n_iter_no_change` only take
# effect when early stopping is active (sklearn's default 'auto' setting
# decides this from the training-set size).
gbc = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=2000,
    max_leaf_nodes=6,
    validation_fraction=0.2,
    n_iter_no_change=15,
    random_state=42,
)
gbc.fit(X_train, y_train)

# NOTE(review): the model is fitted on X_train as prepared in the split cell;
# X_test must have gone through the same preprocessing - confirm there.
preds = gbc.predict(X_test.values)
heatmap(y_test, preds)


In [None]:
# Random forest baseline. The seed is fixed (42, as used elsewhere in this
# notebook) so the bootstrap samples and feature sub-sampling - and therefore
# the reported F1 score - are reproducible across "Restart & Run All".
# n_jobs=-1 trains the 500 trees in parallel; it does not change the results.
rdc = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)
rdc.fit(X_train, y_train)
preds = rdc.predict(X_test.values)
heatmap(y_test, preds)