In [None]:
import csv

import numpy as np
import pandas as pd
import seaborn as sns
import os
import matplotlib
import matplotlib.pyplot as plt

from csv import reader

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_auc_score

In [None]:
# import math

# def sigmoid(x):
#     a = []
#     for item in x:
#         a.append(1/(1+math.exp(-item)))
#     return a

# x = np.arange(-8., 8., 0.2)
# sig = sigmoid(x)

# hline = np.full((80,1), 0.5)

# plt.plot(x,sig, color="black")
# plt.plot(x, hline, color="blue", linestyle = "dashed")

# plt.xlim(-8,8)
# #plt.hlines(0.5, xmin=-8, xmax=8, linestyles="dashed", colors="red")
# plt.show()

## Data Input & Sanity Check

In [None]:
# Load the transactions; the CSV is expected in the notebook's working directory.
Fraud_df = pd.read_csv("creditcard.csv")

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
Fraud_df

It appears there are no null values within our dataset. This is to be expected, however, as a PCA transformation has already been applied for GDPR reasons, and thus a level of data cleansing has likely already taken place.

In [None]:
# List every column that contains missing values; prints nothing when the data is complete.
null_counts = Fraud_df.isnull().sum()
for column_name, missing in null_counts[null_counts > 0].items():
    print(f"{column_name}: {missing}")

Below we can see that two transactions have a Time of 0 seconds, one of which is necessarily the first transaction itself. Over a 48-hour period with 284,807 instances of card usage this is not unlikely. It is interesting to note, however, that 1,825 transactions were for an amount of €0.
Finally, we can see that of 284,807 transactions, 284,315 were legitimate (99.83\%), and therefore 492 were fraudulent. This will be the crux of our machine learning problem, as such a high class imbalance has the potential to lead to false negatives.

In [None]:
# Number of exact zeros in each column.
Fraud_df.eq(0).sum(axis=0)

## Analysis

#### Class Imbalance

In [None]:
# Class counts side by side: linear scale (left) and logarithmic scale (right),
# the latter so the small fraud bar is actually visible.
_, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10), sharex=True)

# The column is passed as x=...; recent seaborn releases no longer accept it positionally.
sns.countplot(x='Class', data=Fraud_df, ax=ax1)
ax1.set_title('Class Imbalance', fontsize=14)
ax1.set_ylabel("Count")

sns.countplot(x='Class', data=Fraud_df, ax=ax2)
ax2.set_yscale('log')
ax2.set_title('Class Imbalance (Logarithmic)', fontsize=14)
ax2.set_ylabel("Count")
ax2.set_ylim(bottom=1)

#### Fraud vs Spend

Overall, it initially appears from this dataset that the average case of fraud is for lower value transactions, with a maximum stolen of little over €2,000, as opposed to the legitimate transaction's maximum of over €25,000

Once we account for outliers, we can see that the median transaction amount for fraudulent transactions is lower than the median for legitimate transactions, although the deviation (IQR) is notably higher. The overall higher range of (non-anomalous) fraudulent cases however leads to a higher mean for these.

In [None]:
# One 2x2 grid: legitimate only, fraud only, both classes, and both classes
# without outliers but with the mean drawn as a dashed line.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Adjust the spacing on the figure that holds the plots (not on a throw-away figure).
fig.subplots_adjust(wspace=0.75)

s1 = sns.boxplot(data=Fraud_df.loc[Fraud_df['Class'] == 0], x="Class", y="Amount",
                 color="Blue", ax=axes[0, 0])

s2 = sns.boxplot(data=Fraud_df.loc[Fraud_df['Class'] == 1], x="Class", y="Amount",
                 color="Orange", ax=axes[0, 1])

sns.boxplot(data=Fraud_df, x="Class", y="Amount", palette=["Blue", "Orange"], ax=axes[1, 0])

sns.boxplot(data=Fraud_df, x="Class", y="Amount", showmeans=True, meanline=True,
            showfliers=False, palette=["Blue", "Orange"], ax=axes[1, 1])

# Empty proxy lines exist only to give the legend its three entries.
legend_ax = axes[1, 1]
legend_ax.plot([], [], '-', linewidth=8, color='Blue', label='No Fraud')
legend_ax.plot([], [], '-', linewidth=8, color='Orange', label='Fraud')
legend_ax.plot([], [], '--', linewidth=1, color='Green', label='Mean')
legend_ax.legend()

In [None]:
# Summary statistics of Time and Amount, computed separately per class.
summary_columns = ["Time", "Amount"]
Legit_det = Fraud_df.loc[Fraud_df["Class"] == 0, summary_columns].describe()
Fraud_det = Fraud_df.loc[Fraud_df["Class"] == 1, summary_columns].describe()

display(Legit_det, Fraud_det)

#### Fraud vs Time

Unfortunately due to the confidentiality requirements of the original dataset we are unable to view the objective times of card transactions, however we are able to see timings relative to the first spend of the dataset. This means that while we may not know the hour of day transactions occurred, we can see trends in spending across the two day data gathering period.

In [None]:
# Overlay the Time density of both classes on one axes.
g = sns.kdeplot(Fraud_df.loc[Fraud_df["Class"] == 0, "Time"], label="No Fraud")
g = sns.kdeplot(Fraud_df.loc[Fraud_df["Class"] == 1, "Time"], label="Fraud", ax=g)

# NOTE(review): the density ticks are relabelled by hand as counts; the same
# hard-coded mapping is shown for both curves although the classes differ in size.
density_ticks = [0.00000114285, 0.00000228571, 0.00000342857, 0.00000457142,
                 0.00000571428, 0.00000685714, 0.000008]
count_labels = [2000, 4000, 6000, 8000, 10000, 12000, 14000]
g.set_yticks(density_ticks)
g.set_yticklabels(count_labels)

g.set_xlim(-10000, 200000)
g.set_ylabel("Count")
g.legend()

In [None]:
# Histogram of transaction times per class, in 5000-second bins.
legit_times = Fraud_df.loc[Fraud_df["Class"] == 0, "Time"]
fraud_times = Fraud_df.loc[Fraud_df["Class"] == 1, "Time"]

t1 = sns.displot(Fraud_df, x=legit_times, binwidth=5000)
t1.fig.set_size_inches(12, 4)

t2 = sns.displot(Fraud_df, x=fraud_times, binwidth=5000)
t2.fig.set_size_inches(12, 4)

In [None]:
# Density histogram (34 bins) with a KDE overlay of transaction time, one panel
# per class. histplot(kde=True, stat="density") replaces the deprecated distplot.
fig, (ax_legit, ax_fraud) = plt.subplots(2, 1)
fig.subplots_adjust(hspace=0.5)

for panel, class_value, panel_title in ((ax_legit, 0, "No Fraud"), (ax_fraud, 1, "Fraud")):
    sns.histplot(x=Fraud_df.loc[Fraud_df["Class"] == class_value, "Time"],
                 bins=34, kde=True, stat="density", ax=panel)
    panel.set_title(panel_title)
    # Same x-range on both panels so they can be compared directly.
    panel.set_xlim(0, 180000)

As can be seen here, there is a drop in legitimate transactions around 100,000 seconds after data collection began (~ 27 hours, 45 minutes). Fraudulent transactions, though dropping around 20,000 seconds later, or 5 and a half hours, do not drop as significantly. Given the data collection period length, we might assume that the 100,000 second drop corresponds with the early hours of the morning (It would be reasonable to start and end data collection at midnight on both days for record gathering simplicity)

Below we filter for variables with a correlation coefficient to transaction amount with absolute value above 0.25, we can see that V2 has a significant negative correlation, along with weaker negative correlations from V5.

We have weak positive correlations from V7, and V20.

In [None]:
# Compute the correlation matrix once, then keep the columns whose correlation
# with Amount exceeds 0.25 in absolute value.
amount_corr = Fraud_df.corr()["Amount"]
amount_corr[amount_corr.abs() > 0.25]

Doing the same for correlation with Fraud, we get only weak correlations, all negative, of V12, V14, and V17. When we create an algorithm to predict fraud we would therefore expect these to play a more heavily weighted part.

In [None]:
# Compute the correlation matrix once, then keep the columns whose correlation
# with Class exceeds 0.25 in absolute value.
class_corr = Fraud_df.corr()["Class"]
class_corr[class_corr.abs() > 0.25]

## Preventing Over/Underfitting

Due to the class imbalance of our dataset if we use it without further refinement we will be subject to our algorithms overfitting the data, that is, tending towards the assumption that most transactions are not fraudulent, simply because this is more likely to be correct by chance.

#### For Undersampling:

We randomly select an equal number of legitimate transactions as there are fraudulent, giving us a dataset of 984 transactions, with a 50/50 split by class. The random sampling function also shuffles these.

In [None]:
#Create a sample dataset with a 50/50 split of fraudulent to genuine transactions
No_Fraud_Under_Samp = Fraud_df[Fraud_df["Class"] == 0].sample(n=492)
Fraud_Under_Samp = Fraud_df[Fraud_df["Class"] == 1].sample(n=492)

Under_Samp_Fraud = pd.concat([Fraud_Under_Samp, No_Fraud_Under_Samp]).sample(frac=1)

Under_Samp_Fraud

In [None]:
# Sanity check: class sizes of the undersampled dataset.
Under_Genuine_No = (Under_Samp_Fraud["Class"] == 0).sum()
Under_Fraud_No = (Under_Samp_Fraud["Class"] == 1).sum()

for class_size in (Under_Genuine_No, Under_Fraud_No):
    print(class_size)

#### For Oversampling:

We keep a random sample of 95,000 genuine transactions and randomly select 5,000 fraudulent transactions (allowing for repeat selections, since only 492 exist), giving us a dataset of 100,000 transactions of which 5% are fraudulent (a 1:19 fraud to genuine class split). The random sampling function also shuffles these.

In [None]:
#Create a sample dataset with 5% fraudulent transactions
No_Fraud_Over_Samp = Fraud_df[Fraud_df["Class"] == 0].sample(n=95000)
Fraud_Over_Samp = Fraud_df[Fraud_df["Class"] == 1].sample(n=5000, replace=True)

Over_Samp_Fraud = pd.concat([Fraud_Over_Samp, No_Fraud_Over_Samp]).sample(frac=1)

Over_Samp_Fraud

In [None]:
# Sanity check: class sizes of the oversampled dataset.
Over_Genuine_No = (Over_Samp_Fraud["Class"] == 0).sum()
Over_Fraud_No = (Over_Samp_Fraud["Class"] == 1).sum()

for class_size in (Over_Genuine_No, Over_Fraud_No):
    print(class_size)

## Machine Learning

### Undersample

First we split our training and testing data:

In [None]:
# Separate the label column from the feature columns.
Under_Samp_Class = Under_Samp_Fraud["Class"]
Under_Samp_Minus_Class = Under_Samp_Fraud.drop(columns="Class")

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% is used for training.
Input_train, Input_test, Res_train, Res_test = train_test_split(
    Under_Samp_Minus_Class, Under_Samp_Class, test_size=0.3, random_state=42
)

# Continue with plain NumPy arrays; the models below are fitted on these.
Input_train, Input_test = Input_train.values, Input_test.values
Res_train, Res_test = Res_train.values, Res_test.values

print("Transactions in X_train dataset: ", Input_train.shape)
print("Transaction classes in y_train dataset: ", Res_train.shape)

print("Transactions in X_test dataset: ", Input_test.shape)
print("Transaction classes in y_test dataset: ", Res_test.shape)

#### Logistic Regression

In [None]:
# Baseline logistic regression: accuracy on the training rows and mean 5-fold CV accuracy.
LogReg = LogisticRegression(max_iter=1000).fit(Input_train, Res_train)

LogReg_Train_Acc = accuracy_score(Res_train, LogReg.predict(Input_train))
LogReg_Cross_Val = cross_val_score(LogReg, Input_train, Res_train, cv=5)

print("Classifiers: Logistic Regression has a Training Accuracy of", 100*LogReg_Train_Acc, "%")
print("Classifiers: Logistic Regression has a CrossVal score of", 100*LogReg_Cross_Val.mean(), "%")

Optimising parameters for best fit using GridSearchCV. (This turns out to have only a minor effect on the accuracy, around 0.2% !)

In [None]:
# Grid-search the regularisation strength and iteration cap of an L2 logistic regression.
LogReg_Para = {
    "penalty": ['l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "max_iter": [500, 1000, 1500],
}

GridLogReg = GridSearchCV(LogisticRegression(), LogReg_Para).fit(Input_train, Res_train)

# The winning configuration, refitted by GridSearchCV.
Log_Reg = GridLogReg.best_estimator_

In [None]:
# Same two scores as for the baseline, now for the grid-search winner.
Log_Reg_Train_Acc = accuracy_score(Res_train, Log_Reg.predict(Input_train))
Log_Reg_Cross_Val = cross_val_score(Log_Reg, Input_train, Res_train, cv=5)

print("Classifiers: Logistic Regression has a Training Accuracy of", 100*Log_Reg_Train_Acc, "%")
print("Classifiers: Logistic Regression has a CrossVal score of", 100*Log_Reg_Cross_Val.mean(), "%")

In [None]:
def plot_learning_curve(title, data, Samp_Minus_Class, Samp_Class, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 10), verbose=0):
    """Plot training and cross-validation scores against training-set size.

    Parameters
    ----------
    title : str
        Title of the plot.
    data : estimator
        Estimator handed to ``learning_curve``.
    Samp_Minus_Class, Samp_Class : array-like
        Features and labels handed to ``learning_curve``.
    axes : matplotlib Axes, optional
        Axes to draw on. A new 10x5 figure is created when this is ``None`` or
        is not something that can be drawn on.
    ylim : (low, high), optional
        Limits of the score axis.
    cv, n_jobs, train_sizes, verbose
        Forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, as before.
    """
    # Existing callers pass a (low, high) tuple positionally in the `axes` slot,
    # so only honour `axes` when it really is drawable.
    if axes is None or not hasattr(axes, "fill_between"):
        _, axes = plt.subplots(1, 1, figsize=(10, 5))

    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training dataset size")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        data, Samp_Minus_Class, Samp_Class, cv=cv, n_jobs=n_jobs,
        train_sizes=train_sizes, verbose=verbose)

    # Mean and spread over the CV splits, one value per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    axes.grid()
    # Shaded bands show +/- one standard deviation around each mean curve.
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1,
                      color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1,
                      color="g")
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
              label="Training Cross-Val score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
              label="Testing Cross-Val score")
    axes.legend(loc="best")
    axes.set_title(title, fontsize=14)

    return plt

In [None]:
# 100 random 70/30 splits per training size.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
# The y-range is passed by keyword: the fifth positional parameter of
# plot_learning_curve is `axes`, not `ylim`.
plot_learning_curve("Logistic Regression Learning Curve", Log_Reg, Input_train, Res_train, ylim=(0.85, 1.00), cv=cv, n_jobs=-1)

Unfortunately, when used to predict fraud in the entire dataset, our model predicts over 20x as many fraud cases as there actually are.

In [None]:
# Predicted fraud count divided by the actual fraud count over the whole dataset.
# .values matches the array input the model was fitted on; the denominator is
# read from the data instead of being hard-coded.
Log_Reg.predict(Fraud_df.drop("Class", axis=1).values).sum() / Fraud_df["Class"].sum()

In [None]:
def plot_confusion_matrix(y_true, y_predicted, title):
    """Draw a 2x2 confusion-matrix heatmap for the non-fraud (0) / fraud (1) classes.

    Cells are coloured by their share of the true class (row-normalised) and
    annotated with the raw counts.
    """
    # Fix the label order so the matrix is always 2x2 and matches the tick
    # labels, even when one class is missing from the inputs.
    cm = confusion_matrix(y_true, y_predicted, labels=[0, 1])

    # Row-normalise; a class with no true samples keeps a row of zeros instead
    # of dividing by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

    ax = sns.heatmap(cm_norm, annot=cm, fmt="d")
    ax.set(xticklabels=["non-fraud", "fraud"], yticklabels=["non-fraud", "fraud"])
    ax.set_ylim([0,2])
    plt.title(title)
    plt.ylabel('Real Classes')
    plt.xlabel('Predicted Classes')
    plt.show()

In [None]:
# Confusion matrix and headline error rates of Log_Reg on the training split.
train_pred = Log_Reg.predict(Input_train)
plot_confusion_matrix(Res_train, train_pred, "Training Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_train, train_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Log_Reg on the held-out test split.
test_pred = Log_Reg.predict(Input_test)
plot_confusion_matrix(Res_test, test_pred, "Testing Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_test, test_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Log_Reg on the entire dataset.
# Predict once, on an array like the one the model was fitted on.
# NOTE(review): the entire dataset includes the rows the model was trained on.
full_pred = Log_Reg.predict(Fraud_df.drop("Class", axis=1).values)
plot_confusion_matrix(Fraud_df["Class"], full_pred, "Entire Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Fraud_df["Class"], full_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

#### Random Forest

In [None]:
def print_score(label, prediction, train=True):
    """Print the accuracy of `prediction` against `label`, tagged as train or test.

    Nothing is printed when `train` is falsy without being equal to False
    (for example None).
    """
    if train:
        split_name = "Train"
    elif train == False:
        split_name = "Test"
    else:
        return
    print(f"{split_name} Accuracy Score: {accuracy_score(label, prediction) * 100:.2f}%")

In [None]:
# Baseline 100-tree random forest: accuracy on the training rows and mean 5-fold CV accuracy.
RanFor = RandomForestClassifier(n_estimators=100, oob_score=False).fit(Input_train, Res_train)

RanFor_Training_Acc = accuracy_score(Res_train, RanFor.predict(Input_train))
RanFor_Cross_Val = cross_val_score(RanFor, Input_train, Res_train, cv=5)

print("Classifiers: Random Forest has a Training Accuracy of", 100*RanFor_Training_Acc, "%")
print("Classifiers: Random Forest has a CrossVal score of", 100*RanFor_Cross_Val.mean(), "%")

Optimising using GridSearch:

In [None]:
# Exhaustive 5-fold grid search over forest size, feature subset size and
# leaf/split constraints.
RanFor_Para = {
    "bootstrap": [True],
    "max_features": [5, 10, 15, 20, 25, 30],
    "min_samples_leaf": [1, 2, 3, 4],
    "min_samples_split": [2, 4, 6, 8, 10],
    "n_estimators": [10, 50, 100, 200, 300, 500],
}

GridRanFor = GridSearchCV(RandomForestClassifier(), RanFor_Para, cv=5, n_jobs=-1, verbose=5)
GridRanFor.fit(Input_train, Res_train)

# The winning configuration, refitted by GridSearchCV.
Ran_For = GridRanFor.best_estimator_

Ran_For_Training = accuracy_score(Res_train, Ran_For.predict(Input_train))
Ran_For_Cross_Val = cross_val_score(Ran_For, Input_train, Res_train, cv=5)

print("Classifiers: Random Forest has a Training Accuracy of", 100*Ran_For_Training, "%")
print("Classifiers: Random Forest has a CrossVal score of", 100*Ran_For_Cross_Val.mean(), "%")

Unfortunately, when used to predict fraud in the entire dataset, our model predicts over 10x as many fraud cases as there actually are.

In [None]:
# Predicted fraud count divided by the actual fraud count over the whole dataset.
# .values matches the array input the model was fitted on; the denominator is
# read from the data instead of being hard-coded.
Ran_For.predict(Fraud_df.drop("Class", axis=1).values).sum() / Fraud_df["Class"].sum()

In [None]:
# Hyper-parameters of the baseline forest, then of the grid-search winner.
for forest in (RanFor, Ran_For):
    print(forest.get_params())

In [None]:
# Fall back to the untuned forest when it cross-validated better than the grid-search winner.
baseline_is_better = RanFor_Cross_Val.mean() > Ran_For_Cross_Val.mean()
if baseline_is_better:
    Ran_For = RanFor

In [None]:
# 100 random 70/30 splits per training size.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
# The y-range is passed by keyword: the fifth positional parameter of
# plot_learning_curve is `axes`, not `ylim`.
plot_learning_curve("Random Forest Learning Curve", Ran_For, Input_train, Res_train, ylim=(0.85, 1.00), cv=cv, n_jobs=-1)

In [None]:
# Confusion matrix and headline error rates of Ran_For on the training split.
train_pred = Ran_For.predict(Input_train)
plot_confusion_matrix(Res_train, train_pred, "Training Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_train, train_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Ran_For on the held-out test split.
test_pred = Ran_For.predict(Input_test)
plot_confusion_matrix(Res_test, test_pred, "Testing Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_test, test_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Ran_For on the entire dataset.
# Predict once, on an array like the one the model was fitted on.
# NOTE(review): the entire dataset includes the rows the model was trained on.
full_pred = Ran_For.predict(Fraud_df.drop("Class", axis=1).values)
plot_confusion_matrix(Fraud_df["Class"], full_pred, "Entire Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Fraud_df["Class"], full_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

#### Linear Discriminant Analysis

In [None]:
# Linear discriminant analysis: accuracy on the training rows and mean 5-fold CV accuracy.
LDA = LinearDiscriminantAnalysis().fit(Input_train, Res_train)

LDA_Training_Acc = accuracy_score(Res_train, LDA.predict(Input_train))
LDA_Cross_Val = cross_val_score(LDA, Input_train, Res_train, cv=5)

print("Classifiers: LDA has a Training Accuracy of", 100*LDA_Training_Acc, "%")
print("Classifiers: LDA has a CrossVal score of", 100*LDA_Cross_Val.mean(), "%")

In [None]:
# Predicted fraud count divided by the actual fraud count over the whole dataset.
# .values matches the array input the model was fitted on; the denominator is
# read from the data instead of being hard-coded.
LDA.predict(Fraud_df.drop(["Class"], axis=1).values).sum() / Fraud_df["Class"].sum()

In [None]:
# 100 random 70/30 splits per training size.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
# The y-range is passed by keyword: the fifth positional parameter of
# plot_learning_curve is `axes`, not `ylim`.
plot_learning_curve("Linear Discriminant Analysis Learning Curve", LDA, Input_train, Res_train, ylim=(0.85, 1.00), cv=cv, n_jobs=4)

In [None]:
# Confusion matrix and headline error rates of LDA on the training split.
train_pred = LDA.predict(Input_train)
plot_confusion_matrix(Res_train, train_pred, "Training Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_train, train_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of LDA on the held-out test split.
test_pred = LDA.predict(Input_test)
plot_confusion_matrix(Res_test, test_pred, "Testing Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_test, test_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of LDA on the entire dataset.
# Predict once, on an array like the one the model was fitted on.
# NOTE(review): the entire dataset includes the rows the model was trained on.
full_pred = LDA.predict(Fraud_df.drop("Class", axis=1).values)
plot_confusion_matrix(Fraud_df["Class"], full_pred, "Entire Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Fraud_df["Class"], full_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

### Oversample

In [None]:
# Separate the label column from the feature columns.
Over_Samp_Class = Over_Samp_Fraud["Class"]
Over_Samp_Minus_Class = Over_Samp_Fraud.drop(columns="Class")

In [None]:
# Hold out 30% of the rows for testing; the remaining 70% is used for training.
# This rebinds the split variables used by the undersampling section.
Input_train, Input_test, Res_train, Res_test = train_test_split(
    Over_Samp_Minus_Class, Over_Samp_Class, test_size=0.3, random_state=42
)

# Continue with plain NumPy arrays; the models below are fitted on these.
Input_train, Input_test = Input_train.values, Input_test.values
Res_train, Res_test = Res_train.values, Res_test.values

In [None]:
# Baseline logistic regression: accuracy on the training rows and mean 5-fold CV accuracy.
LogReg = LogisticRegression(max_iter=1000).fit(Input_train, Res_train)

LogReg_Train_Acc = accuracy_score(Res_train, LogReg.predict(Input_train))
LogReg_Cross_Val = cross_val_score(LogReg, Input_train, Res_train, cv=5)

print("Classifiers: Logistic Regression has a Training Accuracy of", 100*LogReg_Train_Acc, "%")
print("Classifiers: Logistic Regression has a CrossVal score of", 100*LogReg_Cross_Val.mean(), "%")

Optimising parameters for best fit using GridSearchCV. (This turns out to have only a minor effect on the accuracy, around 0.2% !)

In [None]:
# Grid-search the regularisation strength and iteration cap of an L2 logistic regression.
LogReg_Para = {
    "penalty": ['l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "max_iter": [500, 1000, 1500],
}

GridLogReg = GridSearchCV(LogisticRegression(), LogReg_Para, n_jobs=-1, verbose=10)
GridLogReg.fit(Input_train, Res_train)

# The winning configuration, refitted by GridSearchCV.
Log_Reg = GridLogReg.best_estimator_

In [None]:
# Same two scores as for the baseline, now for the grid-search winner.
Log_Reg_Train_Acc = accuracy_score(Res_train, Log_Reg.predict(Input_train))
Log_Reg_Cross_Val = cross_val_score(Log_Reg, Input_train, Res_train, cv=5)

print("Classifiers: Logistic Regression has a Training Accuracy of", 100*Log_Reg_Train_Acc, "%")
print("Classifiers: Logistic Regression has a CrossVal score of", 100*Log_Reg_Cross_Val.mean(), "%")

In [None]:
# 100 random 70/30 splits per training size.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
# The title is passed positionally (positional arguments may not follow a keyword
# argument) and the y-range by keyword, because the fifth positional parameter of
# plot_learning_curve is `axes`, not `ylim`.
plot_learning_curve("Logistic Regression Learning Curve", Log_Reg, Input_train, Res_train, ylim=(0.85, 1.00), cv=cv, n_jobs=-1, verbose=10)

Unfortunately, when used to predict fraud in the entire dataset, our model predicts over twice as many fraud cases as there actually are.

In [None]:
# Predicted fraud count divided by the actual fraud count over the whole dataset.
# .values matches the array input the model was fitted on; the denominator is
# read from the data instead of being hard-coded.
Log_Reg.predict(Fraud_df.drop("Class", axis=1).values).sum() / Fraud_df["Class"].sum()

In [None]:
# NOTE(review): a function with this name is already defined earlier in the
# notebook; this later definition shadows it.
def plot_confusion_matrix(y_true, y_predicted, title):
    """Draw a 2x2 confusion-matrix heatmap for the non-fraud (0) / fraud (1) classes.

    Cells are coloured by their share of the true class (row-normalised) and
    annotated with the raw counts.
    """
    # Fix the label order so the matrix is always 2x2 and matches the tick
    # labels, even when one class is missing from the inputs.
    cm = confusion_matrix(y_true, y_predicted, labels=[0, 1])

    # Row-normalise; a class with no true samples keeps a row of zeros instead
    # of dividing by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

    ax = sns.heatmap(cm_norm, annot=cm, fmt="d")
    ax.set(xticklabels=["non-fraud", "fraud"], yticklabels=["non-fraud", "fraud"])
    ax.set_ylim([0,2])
    plt.title(title)
    plt.ylabel('Real Classes')
    plt.xlabel('Predicted Classes')
    plt.show()

In [None]:
# Confusion matrix and headline error rates of Log_Reg on the training split.
train_pred = Log_Reg.predict(Input_train)
plot_confusion_matrix(Res_train, train_pred, "Training Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_train, train_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Log_Reg on the held-out test split.
test_pred = Log_Reg.predict(Input_test)
plot_confusion_matrix(Res_test, test_pred, "Testing Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_test, test_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Log_Reg on the entire dataset.
# Predict once, on an array like the one the model was fitted on.
# NOTE(review): the entire dataset includes the rows the model was trained on.
full_pred = Log_Reg.predict(Fraud_df.drop("Class", axis=1).values)
plot_confusion_matrix(Fraud_df["Class"], full_pred, "Entire Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Fraud_df["Class"], full_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

#### Random Forest

In [None]:
# NOTE(review): a function with this name is already defined earlier in the
# notebook; this later definition shadows it.
def print_score(label, prediction, train=True):
    """Print the accuracy of `prediction` against `label`, tagged as train or test.

    Nothing is printed when `train` is falsy without being equal to False
    (for example None).
    """
    if train:
        split_name = "Train"
    elif train == False:
        split_name = "Test"
    else:
        return
    print(f"{split_name} Accuracy Score: {accuracy_score(label, prediction) * 100:.2f}%")

In [None]:
# Baseline 100-tree random forest: accuracy on the training rows and mean 5-fold CV accuracy.
RanFor = RandomForestClassifier(n_estimators=100, oob_score=False).fit(Input_train, Res_train)

RanFor_Training_Acc = accuracy_score(Res_train, RanFor.predict(Input_train))
RanFor_Cross_Val = cross_val_score(RanFor, Input_train, Res_train, cv=5)

print("Classifiers: Random Forest has a Training Accuracy of", 100*RanFor_Training_Acc, "%")
print("Classifiers: Random Forest has a CrossVal score of", 100*RanFor_Cross_Val.mean(), "%")

Previously we optimised via gridsearch, however due to the size of the oversampled dataset this is not practical here, also the accuracy of the default classifier seems as close to perfect as we can expect.

In [None]:
#RanFor_Para = {"bootstrap": [True], "max_features": [5, 10, 15, 20, 25, 30], "min_samples_leaf": [1, 2, 3, 4], \
#"min_samples_split": [2, 4, 6, 8, 10], "n_estimators": [5, 10, 50, 100]}

#GridRanFor = GridSearchCV(estimator = RanFor, param_grid = RanFor_Para, cv = 5, n_jobs = -1, verbose = 10)

#GridRanFor.fit(Input_train, Res_train)

#Ran_For = GridRanFor.best_estimator_

#Ran_For_Cross_Val = cross_val_score(Ran_For, Input_train, Res_train, cv=5)
#Ran_For_Training = accuracy_score(Res_train, Ran_For.predict(Input_train))

#print("Classifiers: Random Forest has a Training Accuracy of", 100*Ran_For_Training, "%")
#print("Classifiers: Random Forest has a CrossVal score of", 100*Ran_For_Cross_Val.mean(), "%")

In [None]:
# No grid search in this section: the later cells use the default forest under the name Ran_For.
Ran_For = RanFor

When used to predict fraud in the entire dataset, our model predicts relatively accurately the actual number of cases

In [None]:
# Predicted fraud count divided by the actual fraud count over the whole dataset.
# .values matches the array input the model was fitted on; the denominator is
# read from the data instead of being hard-coded.
Ran_For.predict(Fraud_df.drop("Class", axis=1).values).sum() / Fraud_df["Class"].sum()

In [None]:
# 10 random 70/30 splits per training size.
cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
# The y-range is passed by keyword: the fifth positional parameter of
# plot_learning_curve is `axes`, not `ylim`.
plot_learning_curve("Random Forest Learning Curve", Ran_For, Input_train, Res_train, ylim=(0.85, 1.00), cv=cv, n_jobs=-1, verbose=10)

In [None]:
# Confusion matrix and headline error rates of Ran_For on the training split.
train_pred = Ran_For.predict(Input_train)
plot_confusion_matrix(Res_train, train_pred, "Training Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_train, train_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Ran_For on the held-out test split.
test_pred = Ran_For.predict(Input_test)
plot_confusion_matrix(Res_test, test_pred, "Testing Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_test, test_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of Ran_For on the entire dataset.
# Predict once, on an array like the one the model was fitted on.
# NOTE(review): the entire dataset includes the rows the model was trained on.
full_pred = Ran_For.predict(Fraud_df.drop("Class", axis=1).values)
plot_confusion_matrix(Fraud_df["Class"], full_pred, "Entire Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Fraud_df["Class"], full_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

#### Linear Discriminant Analysis

In [None]:
# Linear discriminant analysis: accuracy on the training rows and mean 5-fold CV accuracy.
LDA = LinearDiscriminantAnalysis().fit(Input_train, Res_train)

LDA_Training_Acc = accuracy_score(Res_train, LDA.predict(Input_train))
LDA_Cross_Val = cross_val_score(LDA, Input_train, Res_train, cv=5)

print("Classifiers: LDA has a Training Accuracy of", 100*LDA_Training_Acc, "%")
print("Classifiers: LDA has a CrossVal score of", 100*LDA_Cross_Val.mean(), "%")

When used to predict fraud in the entire dataset, our model predicts almost perfectly the actual number of cases

In [None]:
# Predicted fraud count divided by the actual fraud count over the whole dataset.
# .values matches the array input the model was fitted on; the denominator is
# read from the data instead of being hard-coded.
LDA.predict(Fraud_df.drop(["Class"], axis=1).values).sum() / Fraud_df["Class"].sum()

In [None]:
# 100 random 70/30 splits per training size.
cv = ShuffleSplit(n_splits=100, test_size=0.3, random_state=42)
# The y-range is passed by keyword: the fifth positional parameter of
# plot_learning_curve is `axes`, not `ylim`.
plot_learning_curve("Linear Discriminant Analysis Learning Curve", LDA, Input_train, Res_train, ylim=(0.85, 1.00), cv=cv, n_jobs=-1)

In [None]:
# Confusion matrix and headline error rates of LDA on the training split.
train_pred = LDA.predict(Input_train)
plot_confusion_matrix(Res_train, train_pred, "Training Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_train, train_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of LDA on the held-out test split.
test_pred = LDA.predict(Input_test)
plot_confusion_matrix(Res_test, test_pred, "Testing Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Res_test, test_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

In [None]:
# Confusion matrix and headline error rates of LDA on the entire dataset.
# Predict once, on an array like the one the model was fitted on.
# NOTE(review): the entire dataset includes the rows the model was trained on.
full_pred = LDA.predict(Fraud_df.drop("Class", axis=1).values)
plot_confusion_matrix(Fraud_df["Class"], full_pred, "Entire Dataset Matrix")
tn, fp, fn, tp = confusion_matrix(Fraud_df["Class"], full_pred).ravel()

print("Percentage of fraud missed: ", round(100*(fn / (fn+tp)), 2), "%")
print("Percentage of genuine mis-classified: ", round(100*(fp / (fp+tn)), 2), "%")
print("Total accuracy: ", round(100*((tp + tn)/(tp+tn+fp+fn)), 2), "%")

#### Comparing accuracy scores (ROC)

In [None]:
# Out-of-fold scores for the ROC comparison. Every model contributes a continuous
# score; hard 0/1 predictions would reduce each ROC curve to a single operating
# point and understate the AUC.
Log_Reg_pred = cross_val_predict(Log_Reg, Input_train, Res_train, cv=5, method="decision_function")

# Probability of the positive (fraud) class.
Forest_pred = cross_val_predict(Ran_For, Input_train, Res_train, cv=5, method="predict_proba")[:, 1]

LDA_pred = cross_val_predict(LDA, Input_train, Res_train, cv=5, method="decision_function")

In [None]:
# ROC AUC of each model on the training split (out-of-fold predictions).
print('Logistic Regression: ', roc_auc_score(Res_train, Log_Reg_pred))

# Forest_pred comes from the random forest (Ran_For), so label it as such.
print('Random Forest Classifier: ', roc_auc_score(Res_train, Forest_pred))

print('LDA Classifier: ', roc_auc_score(Res_train, LDA_pred))

Source for below ROC graph: https://github.com/sxu11/Data_Science/blob/master/Anomaly_Detection/DealingWithImbalancedDataSets.py

In [None]:
def graph_roc_curve_multiple(log_fpr, log_tpr, for_fpr, for_tpr, LDA_fpr, LDA_tpr):
    """Draw the ROC curves of the three classifiers on a single figure.

    Each legend entry shows the area under the curve that was passed in,
    computed here with the trapezoidal rule. The function therefore depends
    only on its arguments, not on prediction arrays held in notebook globals
    that later cells may overwrite.

    Parameters
    ----------
    log_fpr, log_tpr : array-like
        False/true positive rates for the logistic regression model.
    for_fpr, for_tpr : array-like
        False/true positive rates for the random forest model.
    LDA_fpr, LDA_tpr : array-like
        False/true positive rates for the linear discriminant model.
        Each pair is expected to have equal length, ordered by increasing
        false positive rate (the form ``roc_curve`` produces).

    Returns
    -------
    None
        The figure is left as the current pyplot figure for the caller to show.
    """
    curves = [
        ('Logistic Regression Classifier Score: {:.4f}', log_fpr, log_tpr),
        ('Random Forest Classifier Score: {:.4f}', for_fpr, for_tpr),
        ('Linear Discriminant Classifier Score: {:.4f}', LDA_fpr, LDA_tpr),
    ]

    plt.figure(figsize=(16,8))
    plt.title('ROC Curve', fontsize=18)
    for label_template, fpr, tpr in curves:
        fpr = np.asarray(fpr, dtype=float)
        tpr = np.asarray(tpr, dtype=float)
        # Trapezoidal area under this curve: sum of width * mean height per segment.
        area = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))
        plt.plot(fpr, tpr, label=label_template.format(area))
    # Diagonal reference line: the curve of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1.005])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()

In [None]:
# ROC operating points for each model on the training split.
Log_fpr, Log_tpr, Log_thresold = roc_curve(y_true=Res_train, y_score=Log_Reg_pred)
Forest_fpr, Forest_tpr, Forest_threshold = roc_curve(y_true=Res_train, y_score=Forest_pred)
LDA_fpr, LDA_tpr, LDA_thresold = roc_curve(y_true=Res_train, y_score=LDA_pred)

graph_roc_curve_multiple(
    Log_fpr, Log_tpr,
    Forest_fpr, Forest_tpr,
    LDA_fpr, LDA_tpr,
)
plt.show()

In [None]:
# 5-fold cross-validated decision scores for the logistic model over the whole dataset.
Log_Reg_pred = cross_val_predict(
    Log_Reg,
    Fraud_df.drop("Class", axis=1),
    Fraud_df["Class"],
    cv=5,
    method="decision_function",
    n_jobs=-1,
    verbose=20,
)

In [None]:
# Use a continuous score rather than hard 0/1 labels so the ROC curve and AUC
# computed from Forest_pred are meaningful. Column 1 of predict_proba is used
# as the fraud score.
# NOTE(review): assumes Ran_For exposes predict_proba and that the labels are 0/1 — confirm.
Forest_pred = cross_val_predict(Ran_For, Fraud_df.drop("Class", axis=1), Fraud_df["Class"], n_jobs=-1, cv=5, method="predict_proba")[:, 1]

In [None]:
# Use the discriminant score rather than hard 0/1 labels so the ROC curve and
# AUC computed from LDA_pred are meaningful and comparable with Log_Reg_pred.
LDA_pred = cross_val_predict(LDA, Fraud_df.drop("Class", axis=1), Fraud_df["Class"], n_jobs=-1, cv=5, method="decision_function")

In [None]:
# ROC AUC of each model's cross-validated output over the whole dataset.
print('Logistic Regression: ', roc_auc_score(Fraud_df["Class"], Log_Reg_pred))

# Forest_pred comes from Ran_For; the label previously said "Decision Tree".
print('Random Forest Classifier: ', roc_auc_score(Fraud_df["Class"], Forest_pred))

print('LDA Classifier: ', roc_auc_score(Fraud_df["Class"], LDA_pred))

In [None]:
def graph_roc_curve_full_dataset(log_fpr, log_tpr, for_fpr, for_tpr, LDA_fpr, LDA_tpr):
    """Draw the full-dataset ROC curves of the three classifiers on one figure.

    Each legend entry shows the area under the curve that was passed in,
    computed here with the trapezoidal rule. The function therefore depends
    only on its arguments, not on ``Fraud_df`` or prediction arrays held in
    notebook globals.

    Parameters
    ----------
    log_fpr, log_tpr : array-like
        False/true positive rates for the logistic regression model.
    for_fpr, for_tpr : array-like
        False/true positive rates for the random forest model.
    LDA_fpr, LDA_tpr : array-like
        False/true positive rates for the linear discriminant model.
        Each pair is expected to have equal length, ordered by increasing
        false positive rate (the form ``roc_curve`` produces).

    Returns
    -------
    None
        The figure is left as the current pyplot figure for the caller to show.
    """
    curves = [
        ('Logistic Regression Classifier Score: {:.4f}', log_fpr, log_tpr),
        ('Random Forest Classifier Score: {:.4f}', for_fpr, for_tpr),
        ('Linear Discriminant Classifier Score: {:.4f}', LDA_fpr, LDA_tpr),
    ]

    plt.figure(figsize=(16,8))
    plt.title('ROC Curve', fontsize=18)
    for label_template, fpr, tpr in curves:
        fpr = np.asarray(fpr, dtype=float)
        tpr = np.asarray(tpr, dtype=float)
        # Trapezoidal area under this curve: sum of width * mean height per segment.
        area = float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))
        plt.plot(fpr, tpr, label=label_template.format(area))
    # Diagonal reference line: the curve of a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.01, 1, 0, 1.005])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
                arrowprops=dict(facecolor='#6E726D', shrink=0.05),
                )
    plt.legend()

In [None]:
# ROC operating points for each model over the whole dataset.
Log_fpr, Log_tpr, Log_thresold = roc_curve(y_true=Fraud_df["Class"], y_score=Log_Reg_pred)
Forest_fpr, Forest_tpr, Forest_threshold = roc_curve(y_true=Fraud_df["Class"], y_score=Forest_pred)
LDA_fpr, LDA_tpr, LDA_thresold = roc_curve(y_true=Fraud_df["Class"], y_score=LDA_pred)

graph_roc_curve_full_dataset(
    Log_fpr, Log_tpr,
    Forest_fpr, Forest_tpr,
    LDA_fpr, LDA_tpr,
)
plt.show()