In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support

## Data Importation

In [6]:
data_path = "./creditcard.csv"
data = pd.read_csv(data_path)
data.Time = data.Time.astype(int)  # Always positive natural numbers
data["Class"] = data["Class"].astype(int)

In [7]:
data.drop(["Time"], axis=1, inplace=True)

In [9]:
y = data["Class"]
X = data.drop(["Class"], axis=1)
target_labels = ["Fraud", "Normal"]

In [12]:
X.head(5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


## Data Overview

In [44]:
print("Unique data types: {}".format(np.unique(data.dtypes)))

Unique data types: [dtype('int32') dtype('int64') dtype('float64')]


In [42]:
print("Any NULL value in the data: {}".format(data.isna().any().any()))

Any NULL value in the data: False


In [59]:
X.iloc[:, :10].describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499


In [60]:
X.iloc[:, 10:20].describe()

Unnamed: 0,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,1.768627e-15,9.170318e-16,-1.810658e-15,1.693438e-15,1.479045e-15,3.482336e-15,1.392007e-15,-7.528491e-16,4.328772e-16,9.049732e-16
std,1.08885,1.020713,0.9992014,0.9952742,0.9585956,0.915316,0.8762529,0.8493371,0.8381762,0.8140405
min,-24.58826,-4.797473,-18.68371,-5.791881,-19.21433,-4.498945,-14.12985,-25.1628,-9.498746,-7.213527
25%,-0.5354257,-0.7624942,-0.4055715,-0.6485393,-0.425574,-0.5828843,-0.4680368,-0.4837483,-0.4988498,-0.4562989
50%,-0.09291738,-0.03275735,0.1400326,-0.01356806,0.05060132,0.04807155,0.06641332,-0.06567575,-0.003636312,0.003734823
75%,0.4539234,0.7395934,0.618238,0.662505,0.4931498,0.6488208,0.5232963,0.399675,0.5008067,0.4589494
max,23.74514,12.01891,7.848392,7.126883,10.52677,8.877742,17.31511,9.253526,5.041069,5.591971


In [61]:
X.iloc[:, 20:].describe()

Unnamed: 0,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,5.085503e-16,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619
std,0.770925,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109
min,-54.49772,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0
25%,-0.2117214,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6
50%,-0.06248109,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0
75%,0.1330408,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165
max,39.4209,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16


In [58]:
y.describe()

count    284807.000000
mean          0.001727
std           0.041527
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: Class, dtype: float64

## Class Distribution 

In [16]:
print("Percentage of frauds in the data: {:.4f} %".format(y[y == 1].count() / y[y == 0].count() * 100))

Percentage of frauds in the data: 0.1730 %


## Split train/test 

In [26]:
from sklearn.model_selection import train_test_split

test_ratio = 0.5
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=13)

In [27]:
print("Frauds in the training set: {}".format(y_train[y_train == 1].count()))
print("Frauds in the testing set: {}".format(y_test[y_test == 1].count()))

Frauds in the training set: 251
Frauds in the testing set: 241


## Normalize data 

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train.iloc[:, 1:])
X_train["Amount"] = scaler.fit_transform(X_train["Amount"].values.reshape(-1, 1))
# X_test_scaled = scaler.transform(X_test.iloc[:, 1:])
X_test["Amount"] = scaler.transform(X_test["Amount"].values.reshape(-1, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [24]:
print("Train shape: {}".format(X_train.shape))
print("Test shape: {}".format(X_test.shape))

Train shape: (142403, 29)
Test shape: (142404, 29)


## XGBoost 

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

xgb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, max_depth=7, random_state=13)
xgb.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=13,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [35]:
probs = xgb.predict_proba(X_test)
probs = probs[:, 1]
y_pred = xgb.predict(X_test)

In [36]:
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f_score, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Accuracy: {:0.4f}, Precision: {:0.4f}, Recall: {:0.4f}, F-score: {:0.4f}".format(
                accuracy, precision, recall, f_score))
print()
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))  # True on the lines, predicted in the columns [[tn, fp], [fn, tp]]
print()
print("Classification Report")
print(classification_report(y_test, y_pred, target_names=target_labels, digits=4))

Accuracy: 0.9993, Precision: 0.8145, Recall: 0.7469, F-score: 0.7792

Confusion Matrix
[[142122     41]
 [    61    180]]

Classification Report
              precision    recall  f1-score   support

       Fraud     0.9996    0.9997    0.9996    142163
      Normal     0.8145    0.7469    0.7792       241

   micro avg     0.9993    0.9993    0.9993    142404
   macro avg     0.9070    0.8733    0.8894    142404
weighted avg     0.9993    0.9993    0.9993    142404



## Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

anomaly_ratio = data["Class"][data["Class"] == 1].count() / len(data)
class_weights = {0: 1, 1: anomaly_ratio}

rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=13, class_weight=class_weights)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True,
            class_weight={0: 1, 1: 0.001727485630620034}, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=13, verbose=0, warm_start=False)

In [38]:
probs = rf.predict_proba(X_test)
probs = probs[:, 1]
y_pred = rf.predict(X_test)

In [39]:
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f_score, support = precision_recall_fscore_support(y_test, y_pred, average='binary')
print("Accuracy: {:0.4f}, Precision: {:0.4f}, Recall: {:0.4f}, F-score: {:0.4f}".format(
                accuracy, precision, recall, f_score))
print()
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))  # True on the lines, predicted in the columns [[tn, fp], [fn, tp]]
print()
print("Classification Report")
print(classification_report(y_test, y_pred, target_names=target_labels, digits=4))

Accuracy: 0.9995, Precision: 0.9534, Recall: 0.7635, F-score: 0.8479

Confusion Matrix
[[142154      9]
 [    57    184]]

Classification Report
              precision    recall  f1-score   support

       Fraud     0.9996    0.9999    0.9998    142163
      Normal     0.9534    0.7635    0.8479       241

   micro avg     0.9995    0.9995    0.9995    142404
   macro avg     0.9765    0.8817    0.9238    142404
weighted avg     0.9995    0.9995    0.9995    142404

