# In [None]:
# Import required packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,accuracy_score
from sklearn.metrics import auc,f1_score,roc_auc_score,roc_curve,recall_score
from sklearn.metrics import precision_score,recall_score,classification_report
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE

# Load the transactions and replace the raw amount with a standardised copy.
data = pd.read_csv("creditcard.csv")
amount_2d = data['Amount'].values.reshape(-1, 1)  # scaler expects a 2-D array
data['normAmount'] = StandardScaler().fit_transform(amount_2d)
data = data.drop(['Time', 'Amount'], axis=1)

# Separate the feature columns from the 'Class' label column.
label_mask = data.columns == 'Class'
X = data.iloc[:, ~label_mask]
Y = data.iloc[:, label_mask]

# Stratified 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=21, stratify=Y
)

# Re-attach the labels to the training features so the rows can be split
# by class for the undersampling experiments.
train = pd.concat([X_train, Y_train], axis=1)
class_is_one = train["Class"] == 1
class_is_zero = train["Class"] == 0
fraud = train[class_is_one]
valid = train[class_is_zero]

# Baseline: no undersampling or oversampling, default XGBoost parameters.
xg = XGBClassifier()
xg.fit(X_train, Y_train)
y_pred1 = xg.predict(X_train)  # predictions on the training split
y_pred2 = xg.predict(X_test)   # predictions on the held-out test split

# Accuracy first, then the per-class reports and confusion matrices.
print("Score on train set is: ", accuracy_score(Y_train, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_train, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train, y_pred1))
# Fixed label: this matrix is computed on the test split, not the train split.
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# 10/90 undersampling: keep every fraud row, draw a random subset of the
# valid rows, and train on that reduced set.
# NOTE(review): 394 is presumably the number of fraud rows in the training
# split (i.e. len(fraud)) -- confirm. If so, 394*10 gives a 1:10 fraud:valid
# mix, whereas an exact 10/90 mix would need 394*9 valid rows.
# NOTE(review): sample() is called without random_state, so the drawn subset
# (and every score printed below) can change from run to run.
valid90 = valid.sample(n=(394*10))
train90 = pd.concat([fraud, valid90])
# Shuffle so fraud and valid rows are interleaved, then renumber the rows.
train90 = train90.sample(frac=1).reset_index(drop=True)
X_train90 = train90.iloc[:, train90.columns != "Class"]
Y_train90 = train90.iloc[:, train90.columns == "Class"]

# Hyperparameter tuning: exhaustive 5-fold grid search scored on F1.
# The grid holds 5 * 99 * 49 * 100 = 2,425,500 combinations, so this step
# is extremely long-running.
xg = XGBClassifier()
param_grid = {'learning_rate': np.arange(0.01, 0.5, 0.1),
              'n_estimators': range(1, 100),
              'min_child_weight': range(1, 50),
              'gamma': range(0, 100)}
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid, cv=5,
                     scoring='f1', n_jobs=-1)
CV_lr.fit(X=X_train90, y=Y_train90)
best_param = CV_lr.best_params_
print("Best Paramters for 10/90 Split: ", best_param)

# Results: refit with fixed hyperparameters. These are hard-coded rather than
# read from best_param (presumably copied from an earlier search run --
# confirm they still match), then evaluated on the undersampled training
# set and on the untouched test split.
xg = XGBClassifier(learning_rate=0.41, n_estimators=9,
                   min_child_weight=1, gamma=6)
xg.fit(X_train90, Y_train90)
y_pred1 = xg.predict(X_train90)
y_pred2 = xg.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train90, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_train90, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train90, y_pred1))
# Fixed label: this matrix is computed on the test split, not the train split.
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# 20/80 undersampling: keep every fraud row and draw 394*4 valid rows.
# NOTE(review): 394 is presumably the number of fraud rows in the training
# split (i.e. len(fraud)) -- confirm.
# NOTE(review): sample() is called without random_state, so the drawn subset
# (and every score printed below) can change from run to run.
valid80 = valid.sample(n=(394*4))
train80 = pd.concat([fraud, valid80])
# Shuffle so fraud and valid rows are interleaved, then renumber the rows.
train80 = train80.sample(frac=1).reset_index(drop=True)
X_train80 = train80.iloc[:, train80.columns != "Class"]
Y_train80 = train80.iloc[:, train80.columns == "Class"]

# Hyperparameter tuning: exhaustive 5-fold grid search scored on F1
# (5 * 99 * 49 * 100 = 2,425,500 combinations -- extremely long-running).
xg = XGBClassifier()
param_grid = {'learning_rate': np.arange(0.01, 0.5, 0.1),
              'n_estimators': range(1, 100),
              'min_child_weight': range(1, 50),
              'gamma': range(0, 100)}
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid, cv=5,
                     scoring='f1', n_jobs=-1)
CV_lr.fit(X=X_train80, y=Y_train80)
best_param = CV_lr.best_params_
print("Best Paramters for 20/80 splits: ", best_param)

# Results: refit with fixed hyperparameters. These are hard-coded rather than
# read from best_param (presumably copied from an earlier search run --
# confirm they still match), then evaluated on the undersampled training
# set and on the untouched test split.
xg = XGBClassifier(learning_rate=0.41, n_estimators=9,
                   min_child_weight=5, gamma=3)
xg.fit(X_train80, Y_train80)
y_pred1 = xg.predict(X_train80)
y_pred2 = xg.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train80, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_train80, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train80, y_pred1))
# Fixed label: this matrix is computed on the test split, not the train split.
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# 50/50 undersampling: keep every fraud row and draw 394 valid rows.
# NOTE(review): 394 is presumably the number of fraud rows in the training
# split (i.e. len(fraud)) -- confirm.
# NOTE(review): sample() is called without random_state, so the drawn subset
# (and every score printed below) can change from run to run.
valid50 = valid.sample(n=(394))
train50 = pd.concat([fraud, valid50])
# Shuffle so fraud and valid rows are interleaved, then renumber the rows.
train50 = train50.sample(frac=1).reset_index(drop=True)
X_train50 = train50.iloc[:, train50.columns != "Class"]
Y_train50 = train50.iloc[:, train50.columns == "Class"]

# Hyperparameter tuning: exhaustive 5-fold grid search scored on F1
# (5 * 99 * 49 * 100 = 2,425,500 combinations -- extremely long-running).
xg = XGBClassifier()
param_grid = {'learning_rate': np.arange(0.01, 0.5, 0.1),
              'n_estimators': range(1, 100),
              'min_child_weight': range(1, 50),
              'gamma': range(0, 100)}
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     cv=5, scoring='f1', n_jobs=-1)
CV_lr.fit(X=X_train50, y=Y_train50)
best_param = CV_lr.best_params_
print("Best Paramters for 50/50 Splits: ", best_param)

# Results: refit with fixed hyperparameters. These are hard-coded rather than
# read from best_param (presumably copied from an earlier search run --
# confirm they still match), then evaluated on the undersampled training
# set and on the untouched test split.
xg = XGBClassifier(learning_rate=.41, n_estimators=9,
                   min_child_weight=1, gamma=0)
xg.fit(X_train50, Y_train50)
y_pred1 = xg.predict(X_train50)
y_pred2 = xg.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train50, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_train50, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train50, y_pred1))
# Fixed label: this matrix is computed on the test split, not the train split.
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# 90/10 undersampling: keep every fraud row and draw only 44 valid rows.
# NOTE(review): 44 is presumably len(fraud) / 9 rounded (394 / 9 ~= 43.8)
# -- confirm against the actual fraud count in the training split.
# NOTE(review): sample() is called without random_state, so the drawn subset
# (and every score printed below) can change from run to run.
valid10 = valid.sample(n=(44))
train10 = pd.concat([fraud, valid10])
# Shuffle so fraud and valid rows are interleaved, then renumber the rows.
train10 = train10.sample(frac=1).reset_index(drop=True)
X_train10 = train10.iloc[:, train10.columns != "Class"]
Y_train10 = train10.iloc[:, train10.columns == "Class"]

# Hyperparameter tuning: exhaustive 5-fold grid search scored on F1
# (5 * 99 * 49 * 100 = 2,425,500 combinations -- extremely long-running).
# Start from a fresh default estimator, as every other section does, instead
# of reusing whatever model a previous section left in `xg`.
xg = XGBClassifier()
param_grid = {'learning_rate': np.arange(0.01, 0.5, 0.1),
              'n_estimators': range(1, 100),
              'min_child_weight': range(1, 50),
              'gamma': range(0, 100)}
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     cv=5, scoring='f1', n_jobs=-1)
CV_lr.fit(X=X_train10, y=Y_train10)
best_param = CV_lr.best_params_
print("Best Paramters for 90/10 Split is: ", best_param)

# Results: refit with fixed hyperparameters. These are hard-coded rather than
# read from best_param (presumably copied from an earlier search run --
# confirm they still match), then evaluated on the undersampled training
# set and on the untouched test split.
# Fixed keyword: the original passed `gammma` (three m's), which is not the
# `gamma` parameter, so the intended gamma=3 was never applied.
xg = XGBClassifier(learning_rate=0.11, min_child_weight=6,
                   n_estimators=3, gamma=3)
xg.fit(X_train10, Y_train10)
y_pred1 = xg.predict(X_train10)
y_pred2 = xg.predict(X_test)
print("Score on train set is: ", accuracy_score(Y_train10, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_train10, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_train10, y_pred1))
# Fixed label: this matrix is computed on the test split, not the train split.
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# Oversampling with SMOTE.
# Import data: three pre-split CSV files; in each one the last column is used
# as the label and all preceding columns as features.
# NOTE(review): what distinguishes 'weight.csv' from 'test.csv' is not visible
# here -- it is only used as a second evaluation set below.
train = pd.read_csv('train.csv')
weight = pd.read_csv('weight.csv')
test = pd.read_csv('test.csv')
X = train.iloc[:, :-1]
Y = train.iloc[:, -1]
weightX = weight.iloc[:, :-1]
weightY = weight.iloc[:, -1]
testX = test.iloc[:, :-1]
testY = test.iloc[:, -1]

# Oversampling: resample the training data with SMOTE.
# NOTE(review): `kind=` and `fit_sample` look like the older imbalanced-learn
# API; newer releases appear to use BorderlineSMOTE and `fit_resample` --
# confirm against the installed version before upgrading.
sm = SMOTE(kind='borderline2')
X_resampled, Y_resampled = sm.fit_sample(X, Y)
# Wrap the resampled data back into DataFrames, reusing the original column
# names (Y is turned into a one-column frame first so it has `.columns`).
Y = pd.DataFrame(Y)
X_ = pd.DataFrame(X_resampled, columns=X.columns)
Y_ = pd.DataFrame(Y_resampled, columns=Y.columns)

# Hyper-parameter tuning: exhaustive 5-fold grid search scored on F1
# (5 * 99 * 49 * 100 = 2,425,500 combinations -- extremely long-running).
xg = XGBClassifier()
param_grid = {'learning_rate': np.arange(0.01, 0.5, 0.1),
              'n_estimators': range(1, 100),
              'min_child_weight': range(1, 50),
              'gamma': range(0, 100)}
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     cv=5, scoring='f1', n_jobs=-1)
CV_lr.fit(X=X_, y=Y_)
best_param = CV_lr.best_params_
print("Best Paramters for 50/50 Splits: ", best_param)

# Results: refit with fixed hyperparameters. These are hard-coded rather than
# read from best_param (presumably copied from an earlier search run --
# confirm they still match), then evaluated on the resampled training data
# and on the test file.
xg = XGBClassifier(n_estimators=80, learning_rate=0.31,
                   gamma=0, min_child_weight=2)
xg.fit(X_, Y_)
y_pred1 = xg.predict(X_)
y_pred2 = xg.predict(testX)
print("Score on train set is: ", accuracy_score(Y_, y_pred1))
print("Score for test data is", accuracy_score(testY, y_pred2))
print("Classification report for train set")
print(classification_report(Y_, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_, y_pred1))
# Fixed label: this matrix is computed on the test file, not the train data.
print("Confusion matrix for test set")
print(confusion_matrix(testY, y_pred2))
print("Classification report for test set")
print(classification_report(testY, y_pred2))
# Persist the test-set predictions as a single 'Class' column.
pd.DataFrame(y_pred2, columns=['Class']).to_csv('smoteXG.csv', index=False)

# Second evaluation, on the rows loaded from 'weight.csv'.
y_pred3 = xg.predict(weightX)
print('Classification report:')
print(classification_report(weightY, y_pred3))
print('Confusion matrix')
print(confusion_matrix(weightY, y_pred3))
pd.DataFrame(y_pred3, columns=['Class']).to_csv('WsmoteXG.csv', index=False)
