# In [None]:
from joblib import dump,load
import math
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- Experiment-wide settings --------------------------------------------
scoring = 'roc_auc'
cv = 10
rand_state_ind = 42
train_df_size = 0.8
validation_df_size = 0.2

# --- Raw data -------------------------------------------------------------
df_raw_train = pd.read_csv('../data/raw/train.csv')
df_raw_test = pd.read_csv('../data/raw/test.csv')

# --- Training features: split off the label and the row identifier --------
df_cleaned_train = df_raw_train.copy()
target = df_cleaned_train.pop('TARGET_5Yrs')
IDlist_train = df_cleaned_train.pop('Id')
df_col_names = df_cleaned_train.columns
train_dataset_size = IDlist_train.size

# --- Test features: only the row identifier has to be removed -------------
df_cleaned_test = df_raw_test.copy()
IDlist_test = df_cleaned_test.pop('Id')

# Every negative entry in either feature frame is overwritten with 0
# (in place, on the copies made above).
for _frame in (df_cleaned_train, df_cleaned_test):
    _frame[_frame < 0] = 0

# Standardise both frames with statistics learned from the training frame
# only, then wrap the resulting arrays back into DataFrames that carry the
# original feature names.
scaler = StandardScaler()
array_cleaned_train = scaler.fit_transform(df_cleaned_train)
array_cleaned_test = scaler.transform(df_cleaned_test)
df_cleaned2_train = pd.DataFrame(array_cleaned_train, columns=df_col_names)
df_cleaned2_test = pd.DataFrame(array_cleaned_test, columns=df_col_names)
#df_cleaned2_train.insert(loc=0,column='Id',value=IDlist_train)
#df_cleaned2_train.set_index(keys='Id',drop=False,verify_integrity=True)

# Class counts of the training label (presumably coded 0/1 -- the positive
# count is simply the sum of the column).
y_train_pos_count = sum(target)
y_train_neg_count = target.size - y_train_pos_count

def BuildCSVforSubmission(IDlist, PredictionList, expected_size=3799, out_dir='../data/processed/'):
    """Write a two-column submission file ``submission.csv``.

    The file has the columns ``Id`` and ``TARGET_5Yrs`` and no index column.
    Ids and predictions are paired by position, never by pandas index, so a
    Series with a non-default index cannot shift or drop rows.

    Parameters
    ----------
    IDlist : array-like
        Row identifiers (pandas Series, NumPy array or list).
    PredictionList : array-like
        One prediction per identifier, in the same order as ``IDlist``.
    expected_size : int or None, optional
        Number of rows both inputs must contain. Defaults to 3799, the value
        this function has always checked against. ``None`` only requires the
        two inputs to have the same number of elements.
    out_dir : str, optional
        Directory the file is written to. It must already exist.

    Returns
    -------
    int
        0 when the file was written, 1 when the size check failed (in which
        case nothing is written).
    """
    import os  # local import: the module header does not import os

    fFileName = 'submission.csv'

    # Plain arrays drop any pandas index, so the two columns line up by position.
    ids = np.asarray(IDlist)
    predictions = np.asarray(PredictionList)

    if expected_size is None:
        size_ok = ids.size == predictions.size
    else:
        size_ok = ids.size == expected_size and predictions.size == expected_size
    if not size_ok:
        return 1

    df_out = pd.DataFrame({'Id': ids, 'TARGET_5Yrs': predictions})
    df_out.to_csv(path_or_buf=os.path.join(out_dir, fFileName), index=False)
    return 0

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Search space for the L2-penalised logistic regression: 21 log-spaced
# values of C between 1e-3 and 1, with and without balanced class weights,
# for each of the listed solvers. (Remove 'class_weight' from the dict to
# search over C and solver only.)
models = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
cRegStr = np.logspace(start=-3, stop=0, num=21)
class_weight = [None, 'balanced']
param_grid = {
    'C': cRegStr,
    'class_weight': class_weight,
    'solver': models,
}

logRegModel = LogisticRegression(penalty='l2', max_iter=10000, random_state=42)

# Exhaustive search over the whole grid on the scaled training features.
clf11 = GridSearchCV(
    estimator=logRegModel,
    param_grid=param_grid,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
)
clf11.fit(df_cleaned2_train, target)
# The bare expressions below are notebook-style inspections of the result.
clf11.best_estimator_
clf11.score(df_cleaned2_train, target)
clf11.cv_results_.keys()
# Second predict_proba column, computed for the scaled test features.
test_grid_1 = clf11.predict_proba(df_cleaned2_test)[:, 1]

# Same estimator and parameter space, but only 30 sampled candidates.
clf12 = RandomizedSearchCV(
    estimator=logRegModel,
    param_distributions=param_grid,
    n_iter=30,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
)
clf12.fit(df_cleaned2_train, target)
clf12.score(df_cleaned2_train, target)

import xgboost as xgb

# DMatrix wrappers around the scaled train/test features. They are not read
# again in this block; the sklearn-style XGBClassifier below is fed the
# DataFrames directly.
data_dmatrix_train = xgb.DMatrix(data=df_cleaned2_train,label=target)
data_dmatrix_test = xgb.DMatrix(data=df_cleaned2_test)

# Baseline binary classifier with default hyper-parameters.
XGBModel = xgb.XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='auc')

# Stratified hold-out split of the scaled training data. Without it the names
# X_data / y_data / X_val / y_val used below are undefined and this section
# fails with a NameError. The split is taken from df_cleaned2_train because
# that frame is scaled the same way as df_cleaned2_test, which the fitted
# model predicts on.
X_data, X_val, y_data, y_val = train_test_split(
    df_cleaned2_train, target,
    train_size=train_df_size, random_state=rand_state_ind, stratify=target,
)

# Fit on the training part, then report ROC AUC on both parts of the split.
clf6 = XGBModel.fit(X_data, y_data)
y_pred = XGBModel.predict_proba(df_cleaned2_test)[:,1]
roc_train_XGB = roc_auc_score(y_data, clf6.predict_proba(X_data)[:,1])
roc_val_XGB = roc_auc_score(y_val, clf6.predict_proba(X_val)[:,1])

# Hyper-parameter grid for the XGBoost search on the scaled training data.
# n_estimators and min_child_weight both take 5 values log-spaced (base 2)
# from 2**1 up to 2**round(log2(train_dataset_size/2)).
n_estimators = np.logspace(start=1, stop=round(math.log(train_dataset_size/2,2)),base=2,num=5).astype(int)
max_depth = np.arange(start=2,stop=5)
# The stop exponent used to be round(math.log(n/2), 2): a *natural* log
# rounded to two decimals, because the closing parenthesis was misplaced.
# It now mirrors the n_estimators line (base-2 log rounded to an integer).
# NOTE(review): XGBoost documents min_child_weight as a minimum hessian sum
# per child, so the top of this range may be too restrictive -- confirm.
min_child_weight = np.logspace(start=1, stop=round(math.log(train_dataset_size/2,2)),base=2,num=5).astype(int)
learning_rate = np.logspace(start=-3,stop=-0.1,num=7)
param_grid2 = {'max_depth':max_depth,'n_estimators':n_estimators,'min_child_weight':min_child_weight,'learning_rate':learning_rate}

# Exhaustive cross-validated search; the score call is a notebook-style
# inspection on the training data.
clf13 = GridSearchCV(estimator=XGBModel,param_grid=param_grid2,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf13.fit(df_cleaned2_train, target)
clf13.score(df_cleaned2_train, target)

# Persist the fitted search object and read it straight back.
dump(clf13,  '../models/XGB_10cv_gridsearch_basic.joblib')

clf_X11 = load('../models/XGB_10cv_gridsearch_basic.joblib')

# Despite its name, roc_test holds the second predict_proba column for the
# scaled test set (not a ROC score); it is written out as the submission.
roc_test = clf_X11.predict_proba(df_cleaned2_test)[:,1]
code = BuildCSVforSubmission(IDlist_test,roc_test)

from sklearn.utils import resample

# Rebalance the classes: keep every row with TARGET_5Yrs == 1 and draw, with
# replacement, as many TARGET_5Yrs == 0 rows as there are positive labels.
# The rows come straight from df_raw_train, so these features are neither
# scaled nor have their negative values reset to 0.
df_train_posclass = df_raw_train[df_raw_train['TARGET_5Yrs']==1]
df_train_negclass = df_raw_train[df_raw_train['TARGET_5Yrs']==0]
df_train_negclass_upsampled = resample(df_train_negclass, replace=True, n_samples=y_train_pos_count, random_state=42)

df_cleaned3_train = pd.concat([df_train_posclass,df_train_negclass_upsampled])

target_upsampled = df_cleaned3_train.pop('TARGET_5Yrs')
IDlist_train_upsampled = df_cleaned3_train.pop('Id')

# NOTE(review): the resampling happens before cross-validation, so a row
# drawn more than once can end up in both the fitting and the validation
# fold of the searches below -- CV scores may be optimistic.

# Grid for the resampled data: 5 base-2 log-spaced values from 2**1 up to
# 2**round(log2(train_dataset_size/4)) for n_estimators and min_child_weight.
n_estimators = np.logspace(start=1, stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
max_depth = np.arange(start=2,stop=5)
# The stop exponent used to be round(math.log(n/4), 2): a *natural* log
# rounded to two decimals, because the closing parenthesis was misplaced.
# It now mirrors the n_estimators line (base-2 log rounded to an integer).
# NOTE(review): XGBoost documents min_child_weight as a minimum hessian sum
# per child, so the top of this range may be too restrictive -- confirm.
min_child_weight = np.logspace(start=1, stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
learning_rate = np.logspace(start=-3,stop=-0.1,num=4)
param_grid3 = {'max_depth':max_depth,'n_estimators':n_estimators,'min_child_weight':min_child_weight,'learning_rate':learning_rate}

clf14 = GridSearchCV(estimator=XGBModel,param_grid=param_grid3,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf14.fit(df_cleaned3_train, target_upsampled)
clf14.score(df_cleaned3_train, target_upsampled)

dump(clf14,  '../models/XGB_10cv_gridsearch_upsampled.joblib')

# StandardScaler, no negative reset
# NOTE(review): run as a script, this search receives exactly the same inputs
# as clf14 above; the label suggests it was meant for differently
# preprocessed data -- confirm.
clf16 = GridSearchCV(estimator=XGBModel,param_grid=param_grid3,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf16.fit(df_cleaned3_train, target_upsampled)
clf16.score(df_cleaned3_train, target_upsampled)
clf16.best_estimator_

dump(clf16,  '../models/XGB_10cv_gridsearch_upsampled_includenegs.joblib')


# Untouched copies of the raw data: only the identifier (and, for the
# training set, the label) is split off -- no scaling, no negative reset.
df_cleaned_test4 = df_raw_test.copy()
IDlist_test4 = df_cleaned_test4.pop('Id')

df_cleaned4_train = df_raw_train.copy()
target4 = df_cleaned4_train.pop('TARGET_5Yrs')
IDlist_train4 = df_cleaned4_train.pop('Id')

# Same estimator and grid (param_grid3) as the resampled searches, fitted
# here on the raw, non-resampled training features.
clf15 = GridSearchCV(
    estimator=XGBModel,
    param_grid=param_grid3,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    verbose=3,
)
clf15.fit(df_cleaned4_train, target4)
clf15.score(df_cleaned4_train, target4)

# Persist the fitted search object.
dump(clf15, '../models/XGB_10cv_gridsearch_unscaled.joblib')


# More hyperparameters: adds L2 ('lambda') and L1 ('alpha') regularisation
# strengths to the search space.
# NOTE(review): per the XGBoost parameter docs, sampling_method only matters
# when rows are subsampled, and 'gradient_based' has device/tree_method
# requirements -- confirm this setting has any effect here.
XGBModel2 = xgb.XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='auc',
                             sampling_method='gradient_based')

# n_estimators and min_child_weight: 5 base-2 log-spaced values from
# 2**round(log2(train_dataset_size/20)) to 2**round(log2(train_dataset_size/4)).
n_estimators = np.logspace(start=round(math.log(train_dataset_size/20,2)), stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
max_depth = np.arange(start=2,stop=5)
# The stop exponent used to be round(math.log(n/4), 2): a *natural* log
# rounded to two decimals (misplaced parenthesis), paired with a base-2 start
# exponent, so the range was mis-scaled and could even run backwards.
# It now mirrors the n_estimators line.
# NOTE(review): XGBoost documents min_child_weight as a minimum hessian sum
# per child, so values this large may be too restrictive -- confirm.
min_child_weight = np.logspace(start=round(math.log(train_dataset_size/20,2)), stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
learning_rate = np.logspace(start=-2,stop=0,num=5)
lambda_reg = np.logspace(start=0,stop=2,num=3)
alpha_reg = np.logspace(start=0,stop=2,num=3)
param_grid4 = {'max_depth':max_depth,'n_estimators':n_estimators,'min_child_weight':min_child_weight,
               'learning_rate':learning_rate,'lambda':lambda_reg,'alpha':alpha_reg}

# Exhaustive search over the larger grid on the resampled (unscaled) data.
clf17 = GridSearchCV(estimator=XGBModel2,param_grid=param_grid4,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf17.fit(df_cleaned3_train, target_upsampled)
clf17.score(df_cleaned3_train, target_upsampled)

dump(clf17,  '../models/XGB_10cv_gridsearch_biggergrid_upsampled.joblib')

# Randomised search (100 sampled candidates) over the same space.
clf18 = RandomizedSearchCV(estimator=XGBModel2,param_distributions=param_grid4,n_iter = 100,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf18.fit(df_cleaned3_train, target_upsampled)
clf18.score(df_cleaned3_train, target_upsampled)
clf18.best_estimator_

dump(clf18,  '../models/XGB_10cv_randomsearch_biggergrid_upsampled.joblib')

# Classifier with max_depth fixed at 4; only n_estimators, min_child_weight
# and a finely resolved learning rate are searched.
XGBModel3 = xgb.XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='auc',
                             sampling_method='gradient_based',max_depth=4)

# 5 base-2 log-spaced values from 2**round(log2(train_dataset_size/20)) to
# 2**round(log2(train_dataset_size/4)).
n_estimators = np.logspace(start=round(math.log(train_dataset_size/20,2)), stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
# The stop exponent used to be round(math.log(n/4), 2): a *natural* log
# rounded to two decimals (misplaced parenthesis), paired with a base-2 start
# exponent. It now mirrors the n_estimators line.
# NOTE(review): XGBoost documents min_child_weight as a minimum hessian sum
# per child, so values this large may be too restrictive -- confirm.
min_child_weight = np.logspace(start=round(math.log(train_dataset_size/20,2)), stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
# 201 log-spaced learning rates between 1e-2 and 1.
learning_rate = np.logspace(start=-2,stop=0,num=201)
param_grid5 = {'n_estimators':n_estimators,'min_child_weight':min_child_weight,
               'learning_rate':learning_rate}

# Randomised search (100 sampled candidates) on the resampled, unscaled data.
clf19 = RandomizedSearchCV(estimator=XGBModel3,param_distributions=param_grid5,n_iter = 100,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf19.fit(df_cleaned3_train, target_upsampled)
clf19.score(df_cleaned3_train, target_upsampled)
clf19.best_estimator_

dump(clf19,  '../models/XGB_10cv_randomsearch_biggergrid_upsampled2.joblib')


# Standardise the resampled training features; this re-fits the module-level
# scaler, replacing the statistics it learned earlier.
array_cleaned4_train = scaler.fit_transform(df_cleaned3_train)
df_cleaned4_train = pd.DataFrame(array_cleaned4_train,columns=df_col_names)
# Scale the raw test features (df_cleaned_test4: raw test data without 'Id')
# with the scaler fitted just above. The previous code transformed the
# training frame a second time and then built df_cleaned4_test from
# array_cleaned_test, i.e. test data scaled by the earlier, different fit,
# so predictions were made on features the model had not been trained for.
array_cleaned4_test = scaler.transform(df_cleaned_test4)
df_cleaned4_test = pd.DataFrame(array_cleaned4_test,columns=df_col_names)

# Randomised search (100 sampled candidates) on the scaled, resampled data.
clf20 = RandomizedSearchCV(estimator=XGBModel3,param_distributions=param_grid5,n_iter = 100,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf20.fit(df_cleaned4_train, target_upsampled)
clf20.score(df_cleaned4_train, target_upsampled)
clf20.best_estimator_

dump(clf20,  '../models/XGB_10cv_randomsearch_biggergrid_upsampled3.joblib')

# Search space with the learning rate restricted to [0.5, 1.0].
# n_estimators and min_child_weight: 5 base-2 log-spaced values from
# 2**round(log2(train_dataset_size/20)) to 2**round(log2(train_dataset_size/4)).
n_estimators = np.logspace(start=round(math.log(train_dataset_size/20,2)), stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
# The stop exponent used to be round(math.log(n/4), 2): a *natural* log
# rounded to two decimals (misplaced parenthesis), paired with a base-2 start
# exponent. It now mirrors the n_estimators line.
# NOTE(review): XGBoost documents min_child_weight as a minimum hessian sum
# per child, so values this large may be too restrictive -- confirm.
min_child_weight = np.logspace(start=round(math.log(train_dataset_size/20,2)), stop=round(math.log(train_dataset_size/4,2)),base=2,num=5).astype(int)
# 201 evenly spaced learning rates between 0.5 and 1.0.
learning_rate = np.linspace(start=0.5,stop=1.0,num=201)
param_grid6 = {'n_estimators':n_estimators,'min_child_weight':min_child_weight,
               'learning_rate':learning_rate}

# Randomised search (100 sampled candidates) on the scaled, resampled data.
clf21 = RandomizedSearchCV(estimator=XGBModel3,param_distributions=param_grid6,n_iter = 100,cv=cv,scoring=scoring,return_train_score=True,verbose=3)
clf21.fit(df_cleaned4_train, target_upsampled)
clf21.score(df_cleaned4_train, target_upsampled)
clf21.best_estimator_

dump(clf21,  '../models/XGB_10cv_randomsearch_biggergrid_upsampled4.joblib')

# Despite its name, roc_test holds the second predict_proba column for the
# test features (not a ROC score). BuildCSVforSubmission always writes
# 'submission.csv', so this call replaces any submission written earlier.
roc_test = clf21.best_estimator_.predict_proba(df_cleaned4_test)[:,1]
code = BuildCSVforSubmission(IDlist_test,roc_test)
