# In [None]:
# Importing required Packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,accuracy_score
from sklearn.metrics import auc,f1_score,roc_auc_score,roc_curve,recall_score
from sklearn.metrics import precision_score,recall_score,classification_report
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE

"""For Meta Ensemble Model-1 and Credit Card Fraud data"""

# Load the train / weight / test splits. In each file the last column is
# used as the target and every preceding column as a feature.
train, weight, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'weight.csv', 'test.csv')
)
X, Y = train.iloc[:, :-1], train.iloc[:, -1]
weightX, weightY = weight.iloc[:, :-1], weight.iloc[:, -1]
testX, testY = test.iloc[:, :-1], test.iloc[:, -1]

# Oversample the training split only, using SMOTE with kind='borderline2'.
sm = SMOTE(kind='borderline2')
X_resampled, Y_resampled = sm.fit_sample(X, Y)

# Wrap the resampled output in DataFrames that reuse the original column
# names; Y is turned into a one-column frame so its column label is available.
Y = pd.DataFrame(Y)
X_ = pd.DataFrame(X_resampled, columns=X.columns)
Y_ = pd.DataFrame(Y_resampled, columns=Y.columns)

# Test Set Performance
# Fit XGBoost (fixed hyper-parameters) on the SMOTE-resampled training data,
# then report accuracy, classification report and confusion matrix for both
# the resampled training data and the test split.
xg = XGBClassifier(n_estimators=80, learning_rate=0.31, gamma=0,
                   min_child_weight=2)
xg.fit(X_, Y_)
y_pred1 = xg.predict(X_)      # predictions on the resampled training data
y_pred2 = xg.predict(testX)   # predictions on the test split
print("Score on train set is: ", accuracy_score(Y_, y_pred1))
print("Score for test data is", accuracy_score(testY, y_pred2))
print("Classification report for train set")
print(classification_report(Y_, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_, y_pred1))
# This matrix is computed from testY / y_pred2, so it is labelled as the
# test set (the label previously read "train set").
print("Confusion matrix for test set")
print(confusion_matrix(testY, y_pred2))
print("Classification report for test set")
print(classification_report(testY, y_pred2))
# Persist the test-split predictions as a single 'Class' column.
pd.DataFrame(y_pred2, columns=['Class']).to_csv('smoteXG.csv',
                                                index=False)

# Weight set results: score the weight split with the already-fitted model,
# print both summaries, then save the predictions as a 'Class' column.
y_pred3 = xg.predict(weightX)
for heading, metric in (('Classification report:', classification_report),
                        ('Confusion matrix', confusion_matrix)):
    print(heading)
    print(metric(weightY, y_pred3))
pd.DataFrame(y_pred3, columns=['Class']).to_csv('WsmoteXG.csv', index=False)

"""For Meta Ensemble-2 and the Credit Card fraud data"""

# Load the train2 / test2 splits; the last column is the target and the
# remaining columns are the features.
train = pd.read_csv('train2.csv')
test = pd.read_csv('test2.csv')
X, Y = train.iloc[:, :-1], train.iloc[:, -1]
testX, testY = test.iloc[:, :-1], test.iloc[:, -1]

# Oversample the training split with SMOTE (kind='borderline2').
sm = SMOTE(kind='borderline2')
X_resampled, Y_resampled = sm.fit_sample(X, Y)

# Wrap the resampled output in DataFrames carrying the original column
# names; Y becomes a one-column frame so that its column label can be reused.
Y = pd.DataFrame(Y)
X_ = pd.DataFrame(X_resampled, columns=X.columns)
Y_ = pd.DataFrame(Y_resampled, columns=Y.columns)

# Fit the model for THIS dataset before scoring anything with it.
# Previously the training-set predictions below were produced while `xg`
# still referred to the classifier fitted on the earlier train.csv data, so
# 'tsmoteXG2.csv' did not come from the model trained on train2.csv.
xg = XGBClassifier(n_estimators=80, learning_rate=0.31, gamma=0,
                   min_child_weight=2)
xg.fit(X_, Y_)

# training set results
# Score the original (un-resampled) training rows and save the predictions.
y_pred3 = xg.predict(X)
print('Classification report:')
print(classification_report(Y, y_pred3))
print('Confusion matrix')
print(confusion_matrix(Y, y_pred3))
pd.DataFrame(y_pred3, columns=['Class']).to_csv('tsmoteXG2.csv',
                                                index=False)

# Test Set Performance
y_pred1 = xg.predict(X_)      # predictions on the resampled training data
y_pred2 = xg.predict(testX)   # predictions on the test split
print("Score on train set is: ", accuracy_score(Y_, y_pred1))
print("Score for test data is", accuracy_score(testY, y_pred2))
print("Classification report for train set")
print(classification_report(Y_, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_, y_pred1))
# Computed from testY / y_pred2, hence labelled as the test set (the label
# previously read "train set").
print("Confusion matrix for test set")
print(confusion_matrix(testY, y_pred2))
print("Classification report for test set")
print(classification_report(testY, y_pred2))
pd.DataFrame(y_pred2, columns=['Class']).to_csv('smoteXG2.csv',
                                                index=False)

"""For Meta Ensemble model-1 and the Abalone Data"""

# Load the train / test / weight1 splits. For each frame the final column
# is the target and all earlier columns are features.
train, test, weight1 = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "weight1.csv")
)
X, Y = train.iloc[:, :-1], train.iloc[:, -1]
weight1X, weight1Y = weight1.iloc[:, :-1], weight1.iloc[:, -1]
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]


# Baseline: no under-/over-sampling is applied to the training data here.
# Exhaustive grid search over four XGBoost hyper-parameters, using 5-fold
# cross-validation scored by F1 and all available cores (n_jobs=-1).
xg = XGBClassifier()
param_grid = dict(
    learning_rate=np.arange(0.01, 0.5, 0.1),
    n_estimators=range(1, 11),
    min_child_weight=range(1, 11),
    gamma=range(0, 11),
)
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     scoring='f1', cv=5, n_jobs=-1)
CV_lr.fit(X=X, y=Y)

# Keep and display the best combination found by the search.
best_param = CV_lr.best_params_
print("Best Paramters: ", best_param)

# Baseline results: fit XGBoost on the original (un-resampled) training data
# and report metrics for the training and test splits.
# NOTE(review): the hyper-parameters are hard-coded here rather than read
# from `best_param`; presumably copied from an earlier search run — confirm.
xg = XGBClassifier(n_estimators=1, learning_rate=0.01,
                   gamma=0, min_child_weight=1)
xg.fit(X, Y)
y_pred1 = xg.predict(X)        # predictions on the training split
y_pred2 = xg.predict(X_test)   # predictions on the test split
print("Score on train set is: ", accuracy_score(Y, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y, y_pred1))
# Computed from Y_test / y_pred2, hence labelled as the test set (the label
# previously read "train set").
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# Oversampled variant: resample the training split with SMOTE
# (kind='borderline2') and wrap the result in DataFrames that reuse the
# original column names.
sm = SMOTE(kind='borderline2')
X_resampled, Y_resampled = sm.fit_sample(X, Y)
Y = pd.DataFrame(Y)  # one-column frame, so Y.columns supplies the label name
X_ = pd.DataFrame(X_resampled, columns=X.columns)
Y_ = pd.DataFrame(Y_resampled, columns=Y.columns)

# Grid search on the resampled data: 5-fold CV, F1 scoring, all cores.
xg = XGBClassifier()
param_grid = dict(
    learning_rate=np.arange(0.01, 0.5, 0.1),
    n_estimators=range(1, 11),
    min_child_weight=range(1, 11),
    gamma=range(0, 11),
)
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     scoring='f1', cv=5, n_jobs=-1)
CV_lr.fit(X=X_, y=Y_)

# Keep and display the best combination found by the search.
best_param = CV_lr.best_params_
print("Best Paramters: ", best_param)

# Results
# Fit XGBoost on the SMOTE-resampled training data and report metrics for
# the resampled training data and the test split.
# NOTE(review): the hyper-parameters are hard-coded rather than taken from
# `best_param`; presumably copied from an earlier search run — confirm.
xg = XGBClassifier(n_estimators=10, learning_rate=0.41,
                   gamma=2, min_child_weight=5)
xg.fit(X_, Y_)
y_pred1 = xg.predict(X_)       # predictions on the resampled training data
y_pred2 = xg.predict(X_test)   # predictions on the test split
print("Score on train set is: ", accuracy_score(Y_, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_, y_pred1))
# Computed from Y_test / y_pred2, hence labelled as the test set (the label
# previously read "train set").
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))
# Persist the test-split predictions as a single 'Class' column.
pd.DataFrame(y_pred2, columns=['Class']).to_csv('smoteXG.csv',
                                                index=False)

# Weight Set Results
# Score the weight1 split with the fitted model. The labels below name the
# weight set; they previously said "test data" / "train set" / "test set"
# although every metric here is computed from weight1Y.
y_pred2 = xg.predict(weight1X)
print("Score for weight data is", accuracy_score(weight1Y, y_pred2))
print("Confusion matrix for weight set")
print(confusion_matrix(weight1Y, y_pred2))
print("Classification report for weight set")
print(classification_report(weight1Y, y_pred2))
# Persist the weight-split predictions as a single 'Class' column.
pd.DataFrame(y_pred2, columns=['Class']).to_csv('WsmoteXG.csv',
                                                index=False)

"""For Meta Ensemble model-2 and the Abalone Data"""

# Load the train2 / test2 splits; the final column of each frame is the
# target and the earlier columns are the features.
train = pd.read_csv("train2.csv")
test = pd.read_csv("test2.csv")
X, Y = train.iloc[:, :-1], train.iloc[:, -1]
X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]

# Baseline: no under-/over-sampling is applied to the training data here.
# Exhaustive grid search over four XGBoost hyper-parameters, using 5-fold
# cross-validation scored by F1 and all available cores (n_jobs=-1).
xg = XGBClassifier()
param_grid = dict(
    learning_rate=np.arange(0.01, 0.5, 0.1),
    n_estimators=range(1, 11),
    min_child_weight=range(1, 11),
    gamma=range(0, 11),
)
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     scoring='f1', cv=5, n_jobs=-1)
CV_lr.fit(X=X, y=Y)

# Keep and display the best combination found by the search.
best_param = CV_lr.best_params_
print("Best Paramters: ", best_param)

# Results
# Fit XGBoost on the original (un-resampled) training data and report
# metrics for the training and test splits.
# NOTE(review): the hyper-parameters are hard-coded rather than taken from
# `best_param`; presumably copied from an earlier search run — confirm.
xg = XGBClassifier(n_estimators=1, learning_rate=0.01,
                   gamma=0, min_child_weight=1)
xg.fit(X, Y)
y_pred1 = xg.predict(X)        # predictions on the training split
y_pred2 = xg.predict(X_test)   # predictions on the test split
print("Score on train set is: ", accuracy_score(Y, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y, y_pred1))
# Computed from Y_test / y_pred2, hence labelled as the test set (the label
# previously read "train set").
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))

# Oversampled variant: resample the training split with SMOTE
# (kind='borderline2') and wrap the result in DataFrames that reuse the
# original column names.
sm = SMOTE(kind='borderline2')
X_resampled, Y_resampled = sm.fit_sample(X, Y)
Y = pd.DataFrame(Y)  # one-column frame, so Y.columns supplies the label name
X_ = pd.DataFrame(X_resampled, columns=X.columns)
Y_ = pd.DataFrame(Y_resampled, columns=Y.columns)

# Grid search on the resampled data: 5-fold CV, F1 scoring, all cores.
xg = XGBClassifier()
param_grid = dict(
    learning_rate=np.arange(0.01, 0.5, 0.1),
    n_estimators=range(1, 11),
    min_child_weight=range(1, 11),
    gamma=range(0, 11),
)
CV_lr = GridSearchCV(estimator=xg, param_grid=param_grid,
                     scoring='f1', cv=5, n_jobs=-1)
CV_lr.fit(X=X_, y=Y_)

# Keep and display the best combination found by the search.
best_param = CV_lr.best_params_
print("Best Paramters: ", best_param)

# Results
# Fit XGBoost on the SMOTE-resampled training data, then predict on the
# resampled training data, the test split and the original training rows.
# NOTE(review): the hyper-parameters are hard-coded rather than taken from
# `best_param`; presumably copied from an earlier search run — confirm.
xg = XGBClassifier(n_estimators=10, learning_rate=0.41,
                   gamma=2, min_child_weight=5)
xg.fit(X_, Y_)
y_pred1 = xg.predict(X_)       # resampled training data
y_pred2 = xg.predict(X_test)   # test split
y_pred3 = xg.predict(X)        # original (un-resampled) training rows
print("Score on train set is: ", accuracy_score(Y_, y_pred1))
print("Score for test data is", accuracy_score(Y_test, y_pred2))
print("Classification report for train set")
print(classification_report(Y_, y_pred1))
print("Confusion matrix for train set")
print(confusion_matrix(Y_, y_pred1))
# Computed from Y_test / y_pred2, hence labelled as the test set (the label
# previously read "train set").
print("Confusion matrix for test set")
print(confusion_matrix(Y_test, y_pred2))
print("Classification report for test set")
print(classification_report(Y_test, y_pred2))
# Persist test-split and original-training-row predictions, each as a
# single 'Class' column.
pd.DataFrame(y_pred2, columns=['Class']).to_csv('smoteXG2.csv',
                                                index=False)
pd.DataFrame(y_pred3, columns=['Class']).to_csv('tsmoteXG2.csv',
                                                index=False)