In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the credit card transactions from disk into a DataFrame,
# then display per-column summary statistics as the cell output.
csv_path = 'creditcard.csv'
c_data = pd.read_csv(csv_path)
c_data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [8]:
# Pull the label column (Y) apart from the remaining feature columns (X),
# then hold out a quarter of the rows as a test set with a fixed seed.
from sklearn.model_selection import train_test_split
Y = c_data['Class']
X = c_data.drop(columns='Class')
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    random_state=27,
    test_size=0.25,
)

In [12]:
# Logistic Classifier (baseline, trained on the training split as-is)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Build the estimator, then fit it on the training split
lrc = LogisticRegression(solver='liblinear')
lrc.fit(X_train, y_train)

# Predict labels for the held-out test set
lrc_pred = lrc.predict(X_test)

lrc_accuracy = accuracy_score(y_test, lrc_pred)
print(f'Accuracy Score: {lrc_accuracy}')

# Count how many test rows were predicted as each class
predictions = pd.DataFrame(lrc_pred)
predictions[0].value_counts()

Accuracy Score: 0.9992135052386169


0    71110
1       92
Name: 0, dtype: int64

# Method 1 - Check other performance metrics

In [14]:
# General metrics - confusion matrix, precision, recall, F1
#
# Confusion matrix: a table showing correct predictions and the types of
#   incorrect predictions.
# Precision: the number of true positives divided by all positive predictions
#   (also called Positive Predictive Value). It measures a classifier's
#   exactness; low precision indicates a high number of false positives.
# Recall: the number of true positives divided by the number of positive
#   values in the test data (also called Sensitivity or the True Positive
#   Rate). It measures a classifier's completeness; low recall indicates a
#   high number of false negatives.
# F1 score: the harmonic mean of precision and recall,
#   F1 = 2 * precision * recall / (precision + recall).
from sklearn.metrics import recall_score, f1_score

# F1 score of the logistic-regression predictions (lrc_pred) against y_test
print(f'F1 Score: {f1_score(y_test, lrc_pred)}')

# Recall of the same predictions
print(f'Recall Score: {recall_score(y_test, lrc_pred)}')

F1 Score: 0.75
Recall Score: 0.6363636363636364


# Method 2 - Change the algorithm

In [15]:
# Random forest is a good choice. As previous work in transcations_esimator file.
from sklearn.ensemble import RandomForestClassifier

# Train model. random_state is pinned to the same seed (27) used for the
# train/test split and the resampling calls in this notebook; without it the
# forest is built with fresh randomness on every run and the scores printed
# below cannot be reproduced.
rfc = RandomForestClassifier(n_estimators=10, random_state=27).fit(X_train, y_train)

# Predict on the test set
rfc_pred = rfc.predict(X_test)

# Report the same three metrics used for the logistic-regression baseline
print(f'Accuracy Score: {accuracy_score(y_test, rfc_pred)}')
print(f'F1 Score: {f1_score(y_test, rfc_pred)}')
print(f'Recall Score: {recall_score(y_test, rfc_pred)}')

Accuracy Score: 0.9995646189713772
F1 Score: 0.8702928870292888
Recall Score: 0.7878787878787878


# Method 3 - Resampling techniques — Oversample the minority class

In [17]:
from sklearn.utils import resample

# Concatenate the training features and labels back together so rows can be
# filtered by class. The combined frame gets its own name instead of being
# assigned to `X`: rebinding `X` would replace the feature matrix with a frame
# that also contains the `Class` label column.
train_df = pd.concat([X_train, y_train], axis=1)

# Separate minority and majority classes
not_fraud = train_df[train_df.Class == 0]
fraud = train_df[train_df.Class == 1]

# Upsample the minority class
fraud_upsampled = resample(fraud,
                           replace=True,  # sample with replacement
                           n_samples=len(not_fraud),  # match number in majority class
                           random_state=27)  # reproducible results

# Combine majority and upsampled minority
new = pd.concat([not_fraud, fraud_upsampled])

# Check the class balance now
new.Class.value_counts()

1    213245
0    213245
Name: Class, dtype: int64

In [19]:
# Trying logistic regression again with the balanced (oversampled) dataset.
# The oversampled features/labels are bound to their own names so the original
# X_train / y_train split is left intact; overwriting it would make any earlier
# cell that is re-run train on resampled data instead of the original split.
y_train_up = new.Class
X_train_up = new.drop('Class', axis=1)

nlrc = LogisticRegression(solver='liblinear').fit(X_train_up, y_train_up)

# Evaluate on the untouched test set
new_pred = nlrc.predict(X_test)
print(f'Accuracy Score: {accuracy_score(y_test, new_pred)}')
print(f'F1 Score: {f1_score(y_test, new_pred)}')
print(f'Recall Score: {recall_score(y_test, new_pred)}')

# Compared with the baseline logistic regression: recall increases,
# but accuracy and F1 both decrease.

Accuracy Score: 0.9807589674447347
F1 Score: 0.14375000000000002
Recall Score: 0.8712121212121212


# Method 4 - Resampling techniques — Undersample the majority class

In [22]:
# Undersample: draw, without replacement, as many non-fraud rows
# as there are fraud rows (fixed seed for reproducible results).
minority_count = len(fraud)
not_fraud_downsampled = resample(
    not_fraud,
    n_samples=minority_count,
    replace=False,
    random_state=27,
)

# Stack the reduced majority sample on top of the untouched minority rows
down_new = pd.concat([not_fraud_downsampled, fraud])

# Class balance after undersampling
down_new['Class'].value_counts()

1    360
0    360
Name: Class, dtype: int64

In [23]:
# Trying logistic regression again with the undersampled dataset.
# The undersampled features/labels are bound to their own names so the original
# X_train / y_train split is left intact; overwriting it would make any earlier
# cell that is re-run train on resampled data instead of the original split.
y_train_down = down_new.Class
X_train_down = down_new.drop('Class', axis=1)

dlrc = LogisticRegression(solver='liblinear').fit(X_train_down, y_train_down)

# Evaluate on the untouched test set
dnew_pred = dlrc.predict(X_test)
print(f'Accuracy Score: {accuracy_score(y_test, dnew_pred)}')
print(f'F1 Score: {f1_score(y_test, dnew_pred)}')
print(f'Recall Score: {recall_score(y_test, dnew_pred)}')

# The performance of undersampling is worse than that of oversampling.

Accuracy Score: 0.9758574197354007
F1 Score: 0.11710323574730355
Recall Score: 0.8636363636363636


# Method 5 - Generate synthetic samples (SMOTE)

In [27]:
from imblearn.over_sampling import SMOTE

# Separate input features and target
y = c_data.Class
X = c_data.drop('Class', axis=1)

# Set up testing and training sets (same test_size and seed as the first split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# Resample the training split only; the test split is left as drawn.
sm = SMOTE(random_state=27)
# fit_resample is the supported name for this call; the older fit_sample alias
# was deprecated and is no longer available in current imbalanced-learn releases.
X_train, y_train = sm.fit_resample(X_train, y_train)

In [28]:
# Fit a logistic regression on the current X_train / y_train
# (as left by the preceding SMOTE resampling cell)
smote = LogisticRegression(solver='liblinear')
smote.fit(X_train, y_train)

# Score the model on the test split with the same three metrics as before
smote_pred = smote.predict(X_test)
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('F1', f1_score),
    ('Recall', recall_score),
):
    print(f'{metric_label} Score: {metric_fn(y_test, smote_pred)}')

Accuracy Score: 0.9858571388444145
F1 Score: 0.18461538461538463
Recall Score: 0.8636363636363636


## For the credit card fraud problem, random forest and method 5 (SMOTE) are probably the best options.