# In [34]:
# Importing required packages
import warnings
warnings.filterwarnings("ignore")  # silence library warnings raised during import and fitting

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix,precision_recall_curve,accuracy_score
from sklearn.metrics import auc,f1_score,roc_auc_score,roc_curve,recall_score
from sklearn.metrics import precision_score,recall_score,classification_report
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
try:
    # Current scikit-learn releases expose the splitter here.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the legacy module.
    from sklearn.cross_validation import train_test_split

# Load the transactions, standardise the amount, and build the train/test split
data = pd.read_csv("creditcard.csv")

# The scaler expects a 2-D column vector, hence the reshape.
# NOTE(review): the scaler is fitted on every row before the train/test split.
amount_column = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = StandardScaler().fit_transform(amount_column)
data = data.drop(['Time', 'Amount'], axis=1)

# Boolean column masks keep both X and Y as DataFrames (Y has the single "Class" column).
is_label = data.columns == 'Class'
X = data.iloc[:, ~is_label]
Y = data.iloc[:, is_label]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=21, stratify=Y)

# Re-attach the label to the training features so rows can be filtered by class.
train = pd.concat([X_train, Y_train], axis=1)
train_class = train["Class"]
fraud = train[train_class == 1]
valid = train[train_class == 0]

# 10/90 Under-Sampling
# Keep every training fraud row and draw nine legitimate rows per fraud row.
# The legitimate count is derived from `fraud` instead of a hard-coded row count,
# and the draws are seeded (same seed as the train/test split) so reruns match.
n_fraud = len(fraud)
n_valid = min(n_fraud * 9, len(valid))  # never ask for more rows than exist
valid_90 = valid.sample(n=n_valid, random_state=21)
data1_train = pd.concat([fraud, valid_90])
# Shuffle so the two classes are interleaved, then renumber the rows.
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# "Class" is the last column of `train`, so features / label are split by position.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Hyperparameter Tuning
# Score every candidate n_neighbors on the under-sampled training set by F1.
neighbor_grid = range(1, 100)
f1 = []
for n in neighbor_grid:
    clf = LocalOutlierFactor(n_neighbors=n, contamination=0.10, n_jobs=-1)
    # fit_predict fits the model itself, so a separate fit() call would only
    # repeat the same work.
    y_pred = clf.fit_predict(X_train)
    # LOF labels outliers -1 and inliers 1; recode to fraud=1 / valid=0.
    y_pred = np.where(y_pred == -1, 1, 0)
    f = f1_score(Y_train, y_pred)
    f1.append(f)
# Map the position of the best score back to the n_neighbors that produced it.
best_n_neighbors = neighbor_grid[f1.index(max(f1))]
print("Best f-1 Score is : ",max(f1) )
print("Best f-1 Score parameters grid no. : ", best_n_neighbors)

# Performance
# Fit LOF on the test rows with the tuned n_neighbors. The value is taken from
# the search above rather than a literal copied from an earlier run, which goes
# stale whenever the training sample changes.
lof = LocalOutlierFactor(n_neighbors=best_n_neighbors, contamination=0.10, n_jobs=-1)
y_pred = np.where(lof.fit_predict(X_test) == -1, 1, 0)
cnf_matrix = confusion_matrix(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
print("Accuracy:" ,accuracy)
print("Classification report is : ")
print(classification_report(Y_test,y_pred))
print("Cofusion Matrix is : ")
print(cnf_matrix)

# 20/80 Under-Sampling
# Keep every training fraud row and draw four legitimate rows per fraud row.
# The count comes from `fraud` rather than a hard-coded number, and the draws
# are seeded (same seed as the train/test split) so reruns match.
n_fraud = len(fraud)
n_valid = min(n_fraud * 4, len(valid))  # never ask for more rows than exist
valid_80 = valid.sample(n=n_valid, random_state=21)
data1_train = pd.concat([fraud, valid_80])
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# "Class" is the last column of `train`, so features / label are split by position.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Score every candidate n_neighbors on the training sample by F1.
neighbor_grid = range(1, 100)
f1 = []
for n in neighbor_grid:
    clf = LocalOutlierFactor(n_neighbors=n, contamination=0.20, n_jobs=-1)
    # fit_predict already fits the model; no separate fit() call is needed.
    y_pred = clf.fit_predict(X_train)
    # LOF labels outliers -1 and inliers 1; recode to fraud=1 / valid=0.
    y_pred = np.where(y_pred == -1, 1, 0)
    f = f1_score(Y_train, y_pred)
    f1.append(f)
best_n_neighbors = neighbor_grid[f1.index(max(f1))]
print("Best f-1 Score is : ",max(f1) )
print("Best f-1 Score parameters grid no. : ", best_n_neighbors)
# Test Set performance
# Use the tuned n_neighbors instead of a literal copied from an earlier run.
lof = LocalOutlierFactor(n_neighbors=best_n_neighbors, contamination=0.20, n_jobs=-1)
y_pred = np.where(lof.fit_predict(X_test) == -1, 1, 0)
cnf_matrix = confusion_matrix(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
print("Accuracy:" ,accuracy)
print("Classification report is : ")
print(classification_report(Y_test,y_pred))
print("Cofusion Matrix is : ")
print(cnf_matrix)

# 50/50 Under-Sampling
# Keep every training fraud row and draw ONE legitimate row per fraud row.
# (Drawing two per fraud row would give a 33/67 mix, which does not match the
# 50/50 label or the contamination=0.50 used below.) The draws are seeded with
# the same seed as the train/test split so reruns match.
n_fraud = len(fraud)
n_valid = min(n_fraud, len(valid))  # never ask for more rows than exist
valid_50 = valid.sample(n=n_valid, random_state=21)
data1_train = pd.concat([fraud, valid_50])
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# "Class" is the last column of `train`, so features / label are split by position.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Score every candidate n_neighbors on the training sample by F1.
neighbor_grid = range(1, 100)
f1 = []
for n in neighbor_grid:
    clf = LocalOutlierFactor(n_neighbors=n, contamination=0.50, n_jobs=-1)
    # fit_predict already fits the model; no separate fit() call is needed.
    y_pred = clf.fit_predict(X_train)
    # LOF labels outliers -1 and inliers 1; recode to fraud=1 / valid=0.
    y_pred = np.where(y_pred == -1, 1, 0)
    f = f1_score(Y_train, y_pred)
    f1.append(f)
best_n_neighbors = neighbor_grid[f1.index(max(f1))]
print("Best f-1 Score is : ",max(f1) )
print("Best f-1 Score parameters grid no. : ", best_n_neighbors)
# Test Set Performance
# Use the tuned n_neighbors instead of a literal copied from an earlier run.
lof = LocalOutlierFactor(n_neighbors=best_n_neighbors, contamination=0.50, n_jobs=-1)
y_pred = np.where(lof.fit_predict(X_test) == -1, 1, 0)
cnf_matrix = confusion_matrix(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
print("Accuracy:" ,accuracy)
print("Classification report is : ")
print(classification_report(Y_test,y_pred))
print("Cofusion Matrix is : ")
print(cnf_matrix)

# 90/10 Under-sampling: wide n_neighbors sweep (150-300) at contamination 0.10.
# NOTE(review): this loop scores against whatever X_train / Y_train currently
# hold; read top to bottom, the 90/10 sample is only drawn after this loop.
# Confirm which training sample was intended.
neighbor_grid = range(150, 301)
Accuracy = []
Precision = []
Recall = []
for n in neighbor_grid:
    lof = LocalOutlierFactor(n_neighbors=n, contamination=0.10)
    # LOF labels outliers -1 and inliers 1; recode to fraud=1 / valid=0.
    y_pred = np.where(lof.fit_predict(X_train) == -1, 1, 0)
    cnf_matrix = confusion_matrix(Y_train, y_pred)
    accuracy = accuracy_score(Y_train, y_pred)
    precision = precision_score(Y_train, y_pred)
    recall = recall_score(Y_train, y_pred)
    print("Accuracy:" ,accuracy)
    print("Classification report is : ")
    print(classification_report(Y_train,y_pred))
    print("Cofusion Matrix is : ")
    print(cnf_matrix)
    Accuracy.append(accuracy)
    Precision.append(precision)
    Recall.append(recall)
# Translate each best-score position back to the n_neighbors value that
# produced it. The grid starts at 150, so "position + 1" would be wrong.
print("Best Accuracy is : ", max(Accuracy))
print("Best Accuracy parameters  : ", neighbor_grid[Accuracy.index(max(Accuracy))])
print("Best Precision is : ", max(Precision))
print("Best precision parameters is : ", neighbor_grid[Precision.index(max(Precision))])
print("Best Recall is : ",max(Recall) )
print("Best recall parameters is : ", neighbor_grid[Recall.index(max(Recall))])
# 90/10 Under-Sampling: draw the sample, tune n_neighbors, evaluate on the test set.
# Keep every training fraud row and draw one legitimate row per nine fraud rows.
# The count comes from `fraud` rather than a hard-coded number, and the draws
# are seeded (same seed as the train/test split) so reruns match.
n_fraud = len(fraud)
n_valid = min(max(1, int(round(n_fraud / 9.0))), len(valid))
valid_10 = valid.sample(n=n_valid, random_state=21)
data1_train = pd.concat([fraud, valid_10])
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# "Class" is the last column of `train`, so features / label are split by position.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Score every candidate n_neighbors on the training sample by F1.
neighbor_grid = range(1, 100)
f1 = []
for n in neighbor_grid:
    clf = LocalOutlierFactor(n_neighbors=n, contamination=0.10, n_jobs=-1)
    # fit_predict already fits the model; no separate fit() call is needed.
    y_pred = clf.fit_predict(X_train)
    # LOF labels outliers -1 and inliers 1; recode to fraud=1 / valid=0.
    y_pred = np.where(y_pred == -1, 1, 0)
    f = f1_score(Y_train, y_pred)
    f1.append(f)
best_n_neighbors = neighbor_grid[f1.index(max(f1))]
print("Best f-1 Score is : ",max(f1) )
print("Best f-1 Score parameters grid no. : ", best_n_neighbors)
# Test Set Performance
# Use the tuned n_neighbors instead of a literal copied from an earlier run.
lof = LocalOutlierFactor(n_neighbors=best_n_neighbors, contamination=0.10, n_jobs=-1)
y_pred = np.where(lof.fit_predict(X_test) == -1, 1, 0)
cnf_matrix = confusion_matrix(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
print("Accuracy:" ,accuracy)
print("Classification report is : ")
print(classification_report(Y_test,y_pred))
print("Cofusion Matrix is : ")
print(cnf_matrix)

# Performance without Undersampling/Oversampling
# NOTE(review): n_neighbors is selected on X_test / Y_test and the final report
# below is computed on the same rows, so the reported scores are optimistic.
neighbor_grid = range(1, 101)
f1 = []
for n in neighbor_grid:
    clf = LocalOutlierFactor(n_neighbors=n, contamination=0.01, n_jobs=-1)
    # fit_predict already fits the model; a separate fit() on the full test set
    # would double the cost of every candidate.
    y_pred = clf.fit_predict(X_test)
    # LOF labels outliers -1 and inliers 1; recode to fraud=1 / valid=0.
    y_pred = np.where(y_pred == -1, 1, 0)
    f = f1_score(Y_test, y_pred)
    f1.append(f)
best_n_neighbors = neighbor_grid[f1.index(max(f1))]
print("Best f-1 Score is : ",max(f1) )
print("Best f-1 Score parameters grid no. : ", best_n_neighbors)
# Report with the best n_neighbors found above instead of a literal copied
# from an earlier run.
lof = LocalOutlierFactor(n_neighbors=best_n_neighbors, contamination=0.01, n_jobs=-1)
y_pred = np.where(lof.fit_predict(X_test) == -1, 1, 0)
cnf_matrix = confusion_matrix(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
print("Accuracy:" ,accuracy)
print("Classification report is : ")
print(classification_report(Y_test,y_pred))
print("Cofusion Matrix is : ")
print(cnf_matrix)