# In [2]:
# Import required packages
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix,precision_recall_curve,accuracy_score
from sklearn.metrics import auc,f1_score,roc_auc_score,roc_curve,recall_score
from sklearn.metrics import precision_score,recall_score,classification_report
from sklearn.preprocessing import StandardScaler

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the legacy sklearn.cross_validation module was removed in 0.20, so it is
# only used as a fallback for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Import data and preprocessing
data = pd.read_csv("creditcard.csv")

# Replace the raw 'Amount' column by a standardised copy ('normAmount') and
# drop 'Time'; the remaining columns are used as model features.
# NOTE(review): the scaler is fitted on the whole data set before the
# train/test split, so test-set statistics influence the feature -- confirm
# this is acceptable for the reported metrics.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)

# X holds every column except the label; Y is kept as a one-column DataFrame
# so it can be concatenated back onto X_train below.
X = data.iloc[:, data.columns != 'Class']
Y = data.iloc[:, data.columns == 'Class']

# 80/20 split, stratified on the label. The call is wrapped in parentheses:
# the original spread the assignment over several lines without any line
# continuation, which is a syntax error.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=21, stratify=Y)

# Training rows split by class; the undersampling sections draw from these.
train = pd.concat([X_train, Y_train], axis=1)
fraud = train[train["Class"] == 1]
valid = train[train["Class"] == 0]

# Baseline: Isolation Forest fitted on the full training split (no
# undersampling).
#
# Hyper-parameter tuning: exhaustive search over max_features (outer loop)
# and n_estimators (inner loop), scored with F1 on the positive class.
# NOTE(review): the grid is scored on X_test/Y_test, the same data the final
# model is reported on below, so the test metrics are optimistic -- consider
# tuning on a validation split carved out of X_train instead.
max_features_grid = range(1, 29)
n_estimators_grid = range(1, 150)
f1 = []
for m in max_features_grid:
    for n in n_estimators_grid:
        clf = IsolationForest(n_estimators=n, max_features=m,
                              contamination=0.010, random_state=21)
        clf.fit(X_train)
        # predict() yields -1 for outliers and 1 for inliers; relabel to the
        # data set's convention (1 = fraud, 0 = valid). The training-set
        # decision_function that used to be computed here was never read.
        y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
        f1.append(f1_score(Y_test, y_pred))

best_f1 = max(f1)
best_idx = f1.index(best_f1)
# Translate the flat (row-major) grid position back into the two parameters.
m_pos, n_pos = divmod(best_idx, len(n_estimators_grid))
# The labels no longer say "10/90": this section uses the full training data.
print("Best f-1 Score for Isolation Forest is : ", best_f1)
print("Best f-1 Score for Isolation Forest parameters grid no. : ",
      best_idx + 1)
print("Best parameters (max_features, n_estimators) : ",
      (max_features_grid[m_pos], n_estimators_grid[n_pos]))

# Results
# NOTE(review): n_estimators=69 with max_features=1 is grid no. 69 in the
# search order above; presumably copied from an earlier run -- confirm it
# still matches the "grid no." printed by the search.
clf = IsolationForest(n_estimators=69, max_features=1,
                      contamination=0.010, random_state=21)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train)
yt = np.where(clf.predict(X_train) == -1, 1, 0)
y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
print("Train Accuracy: ", accuracy_score(Y_train, yt))
print("training Classification report : ")
print(classification_report(Y_train, yt))
print("training Confusion Matrix : ")
print(confusion_matrix(Y_train, yt))
print("Accuracy: ", accuracy)
print("Classification report : ")
print(classification_report(Y_test, y_pred))
print("Confusion Matrix : ")
print(confusion_matrix(Y_test, y_pred))

# For 10/90 Undersampling
# Keep every training fraud row and draw ten times as many valid rows.
# The fraud count is taken from the data instead of the hard-coded 394, and
# both sampling steps are seeded so the run is reproducible like the rest of
# the script (random_state=21 everywhere else).
# NOTE(review): a 10:1 ratio gives 1/11 (about 9.1%) fraud, not exactly 10%.
n_fraud = len(fraud)
valid90 = valid.sample(n=n_fraud * 10, random_state=21)
train90 = pd.concat([fraud, valid90])
train90 = train90.sample(frac=1, random_state=21).reset_index(drop=True)
X_train90 = train90.iloc[:, train90.columns != "Class"]
Y_train90 = train90.iloc[:, train90.columns == "Class"]
# Hold out 25% of the undersampled data for scoring the grid.
X_train, X_cross, Y_train, Y_cross = train_test_split(
    X_train90, Y_train90, test_size=0.25, random_state=21)

# Hyper-parameter tuning: max_features in the outer loop, n_estimators in the
# inner loop, scored with F1 on the hold-out part.
max_features_grid = range(1, 29)
n_estimators_grid = range(1, 150)
f1 = []
for m in max_features_grid:
    for n in n_estimators_grid:
        clf = IsolationForest(n_estimators=n, max_features=m,
                              contamination=0.10, random_state=21)
        clf.fit(X_train)
        # predict() yields -1 for outliers and 1 for inliers; relabel to
        # 1 = fraud, 0 = valid. The unused decision_function call that ran
        # on every iteration has been dropped.
        y_pred = np.where(clf.predict(X_cross) == -1, 1, 0)
        f1.append(f1_score(Y_cross, y_pred))

best_f1 = max(f1)
best_idx = f1.index(best_f1)
# Translate the flat (row-major) grid position back into the two parameters.
m_pos, n_pos = divmod(best_idx, len(n_estimators_grid))
print("Best f-1 Score 10/90 for Isolation Forest is : ", best_f1)
print("Best f-1 Score 10/90 for Isolation Forest parameters grid no. : ",
      best_idx + 1)
print("Best parameters (max_features, n_estimators) : ",
      (max_features_grid[m_pos], n_estimators_grid[n_pos]))

# Results
# contamination is passed explicitly so the reported model uses the same
# setting as the search above; the library default differs between
# scikit-learn releases.
# NOTE(review): n_estimators=69 / max_features=1 is grid no. 69; presumably
# copied from an earlier run -- confirm against the search output.
clf = IsolationForest(n_estimators=69, max_features=1,
                      contamination=0.10, random_state=21)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train)
yt = np.where(clf.predict(X_train) == -1, 1, 0)
# The final figures are computed on the untouched, imbalanced test split.
y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
print("Train Accuracy: ", accuracy_score(Y_train, yt))
print("training Classification report : ")
print(classification_report(Y_train, yt))
print("training Confusion Matrix : ")
print(confusion_matrix(Y_train, yt))
print("Accuracy: ", accuracy)
print("Classification report : ")
print(classification_report(Y_test, y_pred))
print("Confusion Matrix : ")
print(confusion_matrix(Y_test, y_pred))

# Similarly for 20/80 Undersampling
# Keep every training fraud row and draw four times as many valid rows
# (one fraud per five rows). The fraud count comes from the data instead of
# the hard-coded 394, and both sampling steps are seeded for reproducibility.
n_fraud = len(fraud)
valid80 = valid.sample(n=n_fraud * 4, random_state=21)
train80 = pd.concat([fraud, valid80])
train80 = train80.sample(frac=1, random_state=21).reset_index(drop=True)
X_train80 = train80.iloc[:, train80.columns != "Class"]
Y_train80 = train80.iloc[:, train80.columns == "Class"]
# Hold out 25% of the undersampled data for scoring the grid.
X_train, X_cross, Y_train, Y_cross = train_test_split(
    X_train80, Y_train80, test_size=0.25, random_state=21)

# Hyper-parameter tuning: max_features in the outer loop, n_estimators in the
# inner loop, scored with F1 on the hold-out part.
max_features_grid = range(1, 29)
n_estimators_grid = range(1, 150)
f1 = []
for m in max_features_grid:
    for n in n_estimators_grid:
        clf = IsolationForest(n_estimators=n, max_features=m,
                              contamination=0.20, random_state=21)
        clf.fit(X_train)
        # predict() yields -1 for outliers and 1 for inliers; relabel to
        # 1 = fraud, 0 = valid. The unused decision_function call that ran
        # on every iteration has been dropped.
        y_pred = np.where(clf.predict(X_cross) == -1, 1, 0)
        f1.append(f1_score(Y_cross, y_pred))

best_f1 = max(f1)
best_idx = f1.index(best_f1)
# Translate the flat (row-major) grid position back into the two parameters.
m_pos, n_pos = divmod(best_idx, len(n_estimators_grid))
# Labels corrected: they previously said "10/90" (copied from that section).
print("Best f-1 Score 20/80 for Isolation Forest is : ", best_f1)
print("Best f-1 Score 20/80 for Isolation Forest parameters grid no. : ",
      best_idx + 1)
print("Best parameters (max_features, n_estimators) : ",
      (max_features_grid[m_pos], n_estimators_grid[n_pos]))

# Results
# The final model now uses the contamination the grid was tuned with; it was
# previously omitted, so the library default was applied instead of 0.20.
# NOTE(review): n_estimators=74 / max_features=1 is grid no. 74; presumably
# copied from an earlier run -- confirm against the search output.
clf = IsolationForest(n_estimators=74, max_features=1,
                      contamination=0.20, random_state=21)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train)
yt = np.where(clf.predict(X_train) == -1, 1, 0)
# The final figures are computed on the untouched, imbalanced test split.
y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
print("Train Accuracy: ", accuracy_score(Y_train, yt))
print("training Classification report : ")
print(classification_report(Y_train, yt))
print("training Confusion Matrix : ")
print(confusion_matrix(Y_train, yt))
print("Accuracy: ", accuracy)
print("Classification report : ")
print(classification_report(Y_test, y_pred))
print("Confusion Matrix : ")
print(confusion_matrix(Y_test, y_pred))

# Similarly for 50/50 undersampling
# Keep every training fraud row and draw the same number of valid rows. The
# fraud count comes from the data instead of the hard-coded 394, and both
# sampling steps are seeded for reproducibility.
n_fraud = len(fraud)
valid50 = valid.sample(n=n_fraud, random_state=21)
train50 = pd.concat([fraud, valid50])
train50 = train50.sample(frac=1, random_state=21).reset_index(drop=True)
X_train50 = train50.iloc[:, train50.columns != "Class"]
Y_train50 = train50.iloc[:, train50.columns == "Class"]
# Hold out 25% of the undersampled data for scoring the grid.
X_train, X_cross, Y_train, Y_cross = train_test_split(
    X_train50, Y_train50, test_size=0.25, random_state=21)

# Hyper-parameter tuning: max_features in the outer loop, n_estimators in the
# inner loop, scored with F1 on the hold-out part.
max_features_grid = range(1, 29)
n_estimators_grid = range(1, 150)
f1 = []
for m in max_features_grid:
    for n in n_estimators_grid:
        clf = IsolationForest(n_estimators=n, max_features=m,
                              contamination=0.50, random_state=21)
        clf.fit(X_train)
        # predict() yields -1 for outliers and 1 for inliers; relabel to
        # 1 = fraud, 0 = valid. The unused decision_function call that ran
        # on every iteration has been dropped.
        y_pred = np.where(clf.predict(X_cross) == -1, 1, 0)
        f1.append(f1_score(Y_cross, y_pred))

best_f1 = max(f1)
best_idx = f1.index(best_f1)
# Translate the flat (row-major) grid position back into the two parameters.
m_pos, n_pos = divmod(best_idx, len(n_estimators_grid))
print("Best f-1 Score is : ", best_f1)
print("Best f-1 Score parameters grid no. : ", best_idx + 1)
print("Best parameters (max_features, n_estimators) : ",
      (max_features_grid[m_pos], n_estimators_grid[n_pos]))

# Results
# The final model now uses the contamination the grid was tuned with; it was
# previously omitted, so the library default was applied instead of 0.50.
# NOTE(review): n_estimators=74 / max_features=1 is grid no. 74; presumably
# copied from an earlier run -- confirm against the search output.
clf = IsolationForest(n_estimators=74, max_features=1,
                      contamination=0.50, random_state=21)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train)
yt = np.where(clf.predict(X_train) == -1, 1, 0)
# The final figures are computed on the untouched, imbalanced test split.
y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
print("Train Accuracy: ", accuracy_score(Y_train, yt))
print("training Classification report : ")
print(classification_report(Y_train, yt))
print("training Confusion Matrix : ")
print(confusion_matrix(Y_train, yt))
print("Accuracy: ", accuracy)
print("Classification report : ")
print(classification_report(Y_test, y_pred))
print("Confusion Matrix : ")
print(confusion_matrix(Y_test, y_pred))

# Similarly for 90/10 undersampling
# Keep every training fraud row and add only 41 valid rows, so fraud is the
# large majority of this training set. Both sampling steps are seeded for
# reproducibility (random_state=21, as used elsewhere in the script).
valid10 = valid.sample(n=41, random_state=21)
train10 = pd.concat([fraud, valid10])
train10 = train10.sample(frac=1, random_state=21).reset_index(drop=True)
X_train10 = train10.iloc[:, train10.columns != "Class"]
Y_train10 = train10.iloc[:, train10.columns == "Class"]
# Hold out 25% of the undersampled data for scoring the grid.
X_train, X_cross, Y_train, Y_cross = train_test_split(
    X_train10, Y_train10, test_size=0.25, random_state=21)

# scikit-learn documents a float contamination as lying in (0, 0.5]. The 0.90
# used previously is outside that range and is rejected by releases that
# validate the parameter, so the largest permitted value is used instead.
# NOTE(review): with fraud as the majority class the "anomalies are rare"
# premise of Isolation Forest no longer holds -- confirm this experiment is
# still meaningful.
contamination_90_10 = 0.5

# Hyper-parameter tuning: max_features in the outer loop, n_estimators in the
# inner loop, scored with F1 on the hold-out part.
max_features_grid = range(1, 29)
n_estimators_grid = range(1, 150)
f1 = []
for m in max_features_grid:
    for n in n_estimators_grid:
        clf = IsolationForest(n_estimators=n, max_features=m,
                              contamination=contamination_90_10,
                              random_state=21)
        clf.fit(X_train)
        # predict() yields -1 for outliers and 1 for inliers; relabel to
        # 1 = fraud, 0 = valid. The unused decision_function call that ran
        # on every iteration has been dropped.
        y_pred = np.where(clf.predict(X_cross) == -1, 1, 0)
        f1.append(f1_score(Y_cross, y_pred))

best_f1 = max(f1)
best_idx = f1.index(best_f1)
# Translate the flat (row-major) grid position back into the two parameters.
m_pos, n_pos = divmod(best_idx, len(n_estimators_grid))
print("Best f-1 Score is : ", best_f1)
print("Best f-1 Score parameters grid no. : ", best_idx + 1)
print("Best parameters (max_features, n_estimators) : ",
      (max_features_grid[m_pos], n_estimators_grid[n_pos]))

# Results
# The final model uses the same contamination as the search; it was
# previously omitted, so the library default was applied instead.
# NOTE(review): n_estimators=64 / max_features=1 is grid no. 64; presumably
# copied from an earlier run -- confirm against the search output.
clf = IsolationForest(n_estimators=64, max_features=1,
                      contamination=contamination_90_10, random_state=21)
clf.fit(X_train)
scores_pred = clf.decision_function(X_train)
yt = np.where(clf.predict(X_train) == -1, 1, 0)
# The final figures are computed on the untouched, imbalanced test split.
y_pred = np.where(clf.predict(X_test) == -1, 1, 0)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
accuracy = accuracy_score(Y_test, y_pred)
print("Train Accuracy: ", accuracy_score(Y_train, yt))
print("training Classification report : ")
print(classification_report(Y_train, yt))
print("training Confusion Matrix : ")
print(confusion_matrix(Y_train, yt))
print("Accuracy: ", accuracy)
print("Classification report : ")
print(classification_report(Y_test, y_pred))
print("Confusion Matrix : ")
print(confusion_matrix(Y_test, y_pred))
