# Initialise packages

In [None]:
from joblib import dump
import math
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw train and test CSV files into DataFrames (train first, test second).
df_raw_train, df_raw_test = map(
    pd.read_csv,
    ('../data/raw/train.csv', '../data/raw/test.csv'),
)

## Constant value definition

In [None]:
# Split proportions (train / validation).
train_df_size, validation_df_size = 0.8, 0.2

# Presumably the shared random seed for stochastic steps -- TODO confirm usage.
rand_state_ind = 42

# Cross-validation settings handed to the hyper-parameter search:
# number of folds and the metric used to rank candidates.
cv = 10
scoring = 'roc_auc'

## Data processing

In [None]:
# Work on a copy so the raw training frame is left untouched.
df_cleaned_train = df_raw_train.copy()

# Split off the row identifier and the label; what remains are the features.
IDlist_train = df_cleaned_train.pop('Id')
target = df_cleaned_train.pop('TARGET_5Yrs')
train_dataset_size = IDlist_train.size
df_col_names = df_cleaned_train.columns

# Replace every negative feature value with zero.
df_cleaned_train = df_cleaned_train.mask(df_cleaned_train < 0, 0)

# Fit the scaler on the training features, then standardise them.
# The fitted `scaler` is reused later for the test split.
scaler = StandardScaler()
array_cleaned_train = scaler.fit(df_cleaned_train).transform(df_cleaned_train)
df_cleaned2_train = pd.DataFrame(data=array_cleaned_train, columns=df_col_names)

In [None]:
# Copy the raw test frame and split off the row identifier.
df_cleaned_test = df_raw_test.copy()
IDlist_test = df_cleaned_test.pop('Id')

# Replace every negative feature value with zero, as done for the training split.
df_cleaned_test = df_cleaned_test.mask(df_cleaned_test < 0, 0)

# Apply the scaler fitted on the training features (no re-fitting here) and
# wrap the result in a DataFrame labelled with the training column names.
array_cleaned_test = scaler.transform(df_cleaned_test)
df_cleaned2_test = pd.DataFrame(data=array_cleaned_test, columns=df_col_names)

## Calculating size of majority and minority classes

In [None]:
# Sum of the label column; equals the number of positive rows when labels are 0/1.
y_train_pos_count = sum(target.tolist())
# Everything that is not counted as positive.
y_train_neg_count = len(target) - y_train_pos_count

## Upsample the minority class of the training dataset

In [None]:
from sklearn.utils import resample

In [None]:
# Rebuild a labelled frame from the cleaned, standardised training features so
# the balanced training set is on the same scale as df_cleaned2_test.
# Slicing df_raw_train here (as before) skipped both the negative-value clipping
# and the StandardScaler step applied to the test split.
df_train_prepared = df_cleaned2_train.copy()
# Attach by position: df_cleaned2_train was built from an array and carries a
# fresh index, so index-based alignment with the original Series is avoided.
df_train_prepared['Id'] = IDlist_train.to_numpy()
df_train_prepared['TARGET_5Yrs'] = target.to_numpy()

df_train_posclass = df_train_prepared[df_train_prepared['TARGET_5Yrs'] == 1]
df_train_negclass = df_train_prepared[df_train_prepared['TARGET_5Yrs'] == 0]

# Draw negative rows with replacement until there are as many as positive rows.
# NOTE(review): this assumes the negative class is the minority; if it were the
# majority, this call would shrink it instead -- confirm against the class counts.
df_train_negclass_upsampled = resample(
    df_train_negclass,
    replace=True,
    n_samples=y_train_pos_count,
    random_state=rand_state_ind,  # notebook-wide seed instead of a hard-coded 42
)

In [None]:
# Stack the positive rows on top of the resampled negative rows.
df_cleaned3_train = pd.concat(
    (df_train_posclass, df_train_negclass_upsampled),
    axis=0,
)

# Remove the identifier and label columns in place, keeping each as its own
# Series; df_cleaned3_train is left holding only the feature columns.
IDlist_train_upsampled = df_cleaned3_train.pop('Id')
target_upsampled = df_cleaned3_train.pop('TARGET_5Yrs')

## Set base model

In [None]:
import xgboost as xgb

In [None]:
# Base estimator whose hyper-parameters are tuned by the search further down.
# NOTE(review): the XGBoost docs describe `sampling_method='gradient_based'` as
# supported only for GPU hist training -- confirm it is valid/effective here.
XGBModel = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    sampling_method='gradient_based',
    use_label_encoder=False,
)

## Define grid for search

In [None]:
# Power-of-two exponents bracketing train_dataset_size/20 .. train_dataset_size/4.
log2_low = round(math.log(train_dataset_size / 20, 2))
log2_high = round(math.log(train_dataset_size / 4, 2))

# Five candidate tree counts, spaced evenly on a log2 scale.
n_estimators = np.logspace(start=log2_low, stop=log2_high, base=2, num=5).astype(int)

# Candidate depths 2, 3 and 4 (the stop value is exclusive).
max_depth = np.arange(start=2, stop=5)

# Same log2-spaced range as n_estimators.
# Fix: the upper bound used to be round(math.log(train_dataset_size/4), 2), i.e.
# the *natural* log rounded to two decimals because of a misplaced parenthesis,
# rather than the rounded base-2 log used for the lower bound.
# NOTE(review): values on the order of train_dataset_size/20 and above are very
# large for min_child_weight -- confirm this range is really intended.
min_child_weight = np.logspace(start=log2_low, stop=log2_high, base=2, num=5).astype(int)

# Five learning rates from 0.01 to 1 on a log10 scale.
learning_rate = np.logspace(start=-2, stop=0, num=5)

# Regularisation strengths 1, 10 and 100 for both penalties.
lambda_reg = np.logspace(start=0, stop=2, num=3)
alpha_reg = np.logspace(start=0, stop=2, num=3)

# Search space consumed by the randomised search.
param_grid = {
    'max_depth': max_depth,
    'n_estimators': n_estimators,
    'min_child_weight': min_child_weight,
    'learning_rate': learning_rate,
    'lambda': lambda_reg,
    'alpha': alpha_reg,
}

## Perform randomised search

In [None]:
# RandomizedSearchCV was used without being imported in the visible cells.
from sklearn.model_selection import RandomizedSearchCV

# Randomised hyper-parameter search: 100 sampled candidates, each evaluated
# with `cv`-fold cross-validation and ranked by `scoring`.
# NOTE(review): the training frame contains rows duplicated by resampling, so
# copies of the same row can land in both the fitting and validation folds and
# the cross-validated scores may be optimistic.
clf = RandomizedSearchCV(
    estimator=XGBModel,
    param_distributions=param_grid,
    n_iter=100,
    cv=cv,
    scoring=scoring,
    return_train_score=True,
    random_state=rand_state_ind,  # make the sampled candidates reproducible
    verbose=3,
)
clf.fit(df_cleaned3_train, target_upsampled)

# Score of the refitted best model on the data it was trained on. Printed
# explicitly because a non-final expression in a cell is otherwise discarded.
print(clf.score(df_cleaned3_train, target_upsampled))
clf.best_estimator_

## Save model

In [None]:
# Persist the whole fitted search object (not only its best estimator) to disk.
dump(
    value=clf,
    filename='../models/XGB_10cv_randomsearch_biggergrid_upsampled.joblib',
)