In [None]:
# Import required packages
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,accuracy_score
from sklearn.metrics import auc,f1_score,roc_auc_score,roc_curve,recall_score
from sklearn.metrics import precision_score,recall_score,classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from imblearn.over_sampling import SMOTE

# Load the credit-card transactions and build the modelling frame
data = pd.read_csv("creditcard.csv")

# Standardise the transaction amount into a new 'normAmount' column, then
# drop the raw 'Amount' column together with 'Time'.
amount_2d = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = StandardScaler().fit_transform(amount_2d)
data = data.drop(['Time', 'Amount'], axis=1)

# Separate the 'Class' label column from the feature columns.
label_mask = data.columns == 'Class'
X = data.iloc[:, ~label_mask]
Y = data.iloc[:, label_mask]

# Hold out 20% of the rows for testing, stratified on the label.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=21, stratify=Y)

# Re-join the training features with their labels ('Class' ends up as the
# last column) and split the training rows by class for later resampling.
train = pd.concat([X_train, Y_train], axis=1)
train_labels = train["Class"]
fraud = train[train_labels == 1]
valid = train[train_labels == 0]

# 10/90 undersampling: keep every training fraud row and draw nine times as
# many legitimate rows, so fraud is 10% of the resampled training set.
# The sample size is derived from len(fraud) instead of a hard-coded count so
# the ratio holds for whatever train/test split was used; the draws are seeded
# so the experiment is repeatable.
valid_90 = valid.sample(n=len(fraud) * 9, random_state=21)
data1_train = pd.concat([fraud, valid_90])
# Shuffle the rows so the two classes are interleaved.
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# 'Class' is the last column of the frame; everything before it is a feature.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Hyper-parameter tuning: exhaustive 5-fold grid search scored on F1
rf = RandomForestClassifier(random_state=21, class_weight='balanced')
param_grid = {'n_estimators': range(1, 100),
              'min_samples_split': range(2, 100)}
CV_lr = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1')
CV_lr.fit(X=X_train, y=Y_train)
best_param = CV_lr.best_params_
print("Best Paramters for Random Forest: ", best_param)

# Results: refit with the parameters the search just selected (rather than
# values copied from an earlier run) and score on the resampled training set
# and on the untouched test set.
rf = RandomForestClassifier(random_state=21, class_weight='balanced',
                            **best_param)
rf.fit(X_train, Y_train)
y_pred1 = rf.predict(X_train)
y_pred10 = rf.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred10))
print("Classification report for train set")
print(classification_report(Y_train, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train, y_pred1))
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred10))
print("Classification report for test set")
print(classification_report(Y_test, y_pred10))

# 20/80 undersampling: every training fraud row plus four times as many
# legitimate rows, so fraud is 20% of the resampled training set.
# The sample size follows len(fraud) instead of a hard-coded count, and the
# draws are seeded so the experiment is repeatable.
valid_80 = valid.sample(n=len(fraud) * 4, random_state=21)
data1_train = pd.concat([fraud, valid_80])
# Shuffle the rows so the two classes are interleaved.
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# 'Class' is the last column of the frame; everything before it is a feature.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Hyper-parameter tuning: exhaustive 5-fold grid search scored on F1
rf = RandomForestClassifier(random_state=21, class_weight='balanced')
param_grid = {'n_estimators': range(1, 100),
              'min_samples_split': range(2, 100)}
CV_lr = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1')
CV_lr.fit(X=X_train, y=Y_train)
best_param = CV_lr.best_params_
print("Best Paramters for Random Forest: ", best_param)

# Results: refit with the parameters the search just selected (rather than
# values copied from an earlier run) and score on the resampled training set
# and on the untouched test set.
rf = RandomForestClassifier(random_state=21, class_weight='balanced',
                            **best_param)
rf.fit(X_train, Y_train)
y_pred1 = rf.predict(X_train)
y_pred20 = rf.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred20))
print("Classification report for train set")
print(classification_report(Y_train, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train, y_pred1))
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred20))
print("Classification report for test set")
print(classification_report(Y_test, y_pred20))

# 50/50 undersampling: every training fraud row plus an equal number of
# legitimate rows. (Drawing twice the fraud count, as before, produced a
# one-third / two-thirds mix rather than a balanced set.) The draws are
# seeded so the experiment is repeatable.
valid_50 = valid.sample(n=len(fraud), random_state=21)
data1_train = pd.concat([fraud, valid_50])
# Shuffle the rows so the two classes are interleaved.
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# 'Class' is the last column of the frame; everything before it is a feature.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Hyper-parameter tuning: exhaustive 5-fold grid search scored on F1
rf = RandomForestClassifier(random_state=21, class_weight='balanced')
param_grid = {'n_estimators': range(1, 100),
              'min_samples_split': range(2, 100)}
CV_lr = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1')
CV_lr.fit(X=X_train, y=Y_train)
best_param = CV_lr.best_params_
print("Best Paramters for Random Forest: ", best_param)

# Results: refit with the parameters the search just selected (rather than
# values copied from an earlier run) and score on the resampled training set
# and on the untouched test set.
rf = RandomForestClassifier(random_state=21, class_weight='balanced',
                            **best_param)
rf.fit(X_train, Y_train)
y_pred1 = rf.predict(X_train)
y_pred50 = rf.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred50))
print("Classification report for train set")
print(classification_report(Y_train, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train, y_pred1))
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred50))
print("Classification report for test set")
print(classification_report(Y_test, y_pred50))

# 90/10 undersampling: every training fraud row plus one ninth as many
# legitimate rows, so fraud is 90% of the resampled training set.
# The sample size follows len(fraud) instead of a hard-coded count, and the
# draws are seeded so the experiment is repeatable.
valid_10 = valid.sample(n=int(round(len(fraud) / 9.0)), random_state=21)
data1_train = pd.concat([fraud, valid_10])
# Shuffle the rows so the two classes are interleaved.
data1_train = data1_train.sample(frac=1, random_state=21).reset_index(drop=True)
# 'Class' is the last column of the frame; everything before it is a feature.
X_train = data1_train.iloc[:, :-1]
Y_train = data1_train.iloc[:, -1]

# Hyper-parameter tuning: exhaustive 5-fold grid search scored on F1
rf = RandomForestClassifier(random_state=21, class_weight='balanced')
param_grid = {'n_estimators': range(1, 100),
              'min_samples_split': range(2, 100)}
CV_lr = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1')
CV_lr.fit(X=X_train, y=Y_train)
best_param = CV_lr.best_params_
print("Best Paramters for Random Forest: ", best_param)

# Results: refit with the parameters the search just selected (rather than
# values copied from an earlier run) and score on the resampled training set
# and on the untouched test set.
rf = RandomForestClassifier(random_state=21, class_weight='balanced',
                            **best_param)
rf.fit(X_train, Y_train)
y_pred1 = rf.predict(X_train)
y_pred90 = rf.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred90))
print("Classification report for train set")
print(classification_report(Y_train, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train, y_pred1))
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred90))
print("Classification report for test set")
print(classification_report(Y_test, y_pred90))

# Without undersampling or oversampling: tune and fit on the complete
# training partition.
# X_train / Y_train were overwritten by the undersampling experiments, so they
# are rebuilt from `train` ('Class' is its last column).
X_train = train.iloc[:, :-1]
Y_train = train.iloc[:, -1]

# Hyper-parameter tuning: exhaustive 5-fold grid search scored on F1.
# The search now runs on the full training partition, which is far larger
# than the undersampled sets, so the folds are evaluated in parallel.
rf = RandomForestClassifier(random_state=21, class_weight='balanced')
param_grid = {'n_estimators': range(1, 50),
              'min_samples_split': range(2, 50)}
CV_lr = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1',
                     n_jobs=-1)
CV_lr.fit(X=X_train, y=Y_train)
best_param = CV_lr.best_params_
print("Best Paramters for Random Forest: ", best_param)

# Results: refit with the selected parameters on the training partition only.
# Fitting on the whole data set (X, Y) would include the test rows and make
# the test scores below meaningless.
rf = RandomForestClassifier(random_state=21, class_weight='balanced',
                            **best_param)
rf.fit(X_train, Y_train)
y_pred1 = rf.predict(X_train)
y_pred2 = rf.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_train, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train, y_pred1))
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# Oversampling: synthesise extra minority-class rows with borderline SMOTE.
# X_train / Y_train were overwritten by the undersampling experiments, so they
# are rebuilt from the complete training partition `train` ('Class' is its
# last column) before resampling.
X_train = train.iloc[:, :-1]
Y_train = train.iloc[:, -1]
# Seeded so the synthetic rows are the same on every run.
sm = SMOTE(kind='borderline2', random_state=21)
X_resampled, Y_resampled = sm.fit_sample(X_train, Y_train)

# Hyper-parameter tuning: 5-fold grid search scored on F1, run in parallel.
# NOTE(review): the folds are cut after resampling, so synthetic rows derived
# from a validation fold's neighbours can sit in the training folds and the
# cross-validated F1 is probably optimistic; confirm whether that matters here.
rf = RandomForestClassifier(random_state=21, class_weight='balanced')
param_grid = {'n_estimators': range(80, 121),
              'min_samples_split': range(2, 11)}
CV_lr = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1',
                     n_jobs=-1)
CV_lr.fit(X=X_resampled, y=Y_resampled)
best_param = CV_lr.best_params_
print("Best Paramters: ", best_param)

# Results: refit with the parameters the search selected. The previous
# hard-coded call passed min_child_weight, which RandomForestClassifier does
# not accept, and a min_samples_split outside the searched grid.
rf = RandomForestClassifier(random_state=21, class_weight='balanced',
                            n_jobs=-1, **best_param)
rf.fit(X_resampled, Y_resampled)
y_pred1 = rf.predict(X_resampled)
y_pred2 = rf.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_resampled, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_resampled, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_resampled, y_pred1))
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))