# XGBoost Classifier

In [None]:
# For browser notification
!pip install jupyternotify
%load_ext jupyternotify
!pip install xgboost
!pip install imblearn

# General
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# ML
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import Logisticclassifyion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb

# Custom
import sys,os
sys.path.append( '.' )
sys.path.append( '..' )
import Components.Outlier_Detection as Outlier_Detection
import Components.Feature_Selection as Feature_Selection
import Components.Normalisation as Normalisation
import Components.data_fetching as data_fetching
import Components.Data_Augmentation as Data_Augmentation
import Components.wrapper as wrapper

# CAREFUL:
# If you make changes to a custom module, you have to reload it, i.e rerun this cell
import importlib
importlib.reload(Data_Augmentation)
importlib.reload(Outlier_Detection)
importlib.reload(Feature_Selection)
importlib.reload(Normalisation)
importlib.reload(data_fetching)
importlib.reload(wrapper)


# Data Preprocessing

In [2]:
# Fetch the training features/labels and the test features from the project's
# data_fetching module.
x_train, y_train_raw = data_fetching.get_train_data()
x_test = data_fetching.get_test_data()

# Flatten the labels to 1-D before they are passed to `fit`.
y_train = np.ravel(y_train_raw)

# Model

In [3]:
%%notify

# X,x_test_final = Feature_Selection.rfe(X,y,x_test_final)
boost = xgb.XGBClassifier()
pipe = Pipeline(steps=[('scaling',StandardScaler()),
                        ('classify', boost)])

# Model
max_depth = [3]
learning_rate = [0.05]
n_estimators_model = [100]

parameters = dict(classify__max_depth=max_depth,
                    classify__learning_rate=learning_rate,
                    classify__n_estimators=n_estimators_model)

clf = GridSearchCV(pipe, parameters, cv=10, n_jobs=-1, verbose=10)
clf.fit(x_train, y_train)

# View The Best Parameters
print(clf.best_params_)
print(clf.best_score_)

# Initial
#--------
# max_depth = [3,5,7]
# learning_rate = [0.05]
# n_estimators_model = [100,200,300]
# {'classify__learning_rate': 0.1, 'classify__max_depth': 5, 'classify__n_estimators': 200}

# CV:              0.8096
# Private score:   0.5570

# Test
#-----
# max_depth = [3]
# learning_rate = [0.05]
# n_estimators_model = [100]

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:  1.8min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:  1.8min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  1.8min remaining:   46.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  2.1min finished


{'regress__learning_rate': 0.05, 'regress__max_depth': 3, 'regress__n_estimators': 100}
0.7987500000000001


<IPython.core.display.Javascript object>

In [14]:
# Predict labels for the test set with the fitted grid search.
y_pred = clf.predict(x_test)

# save them: one `y` column, with the row index written out as the `id` column.
predictions_file = '../../Predictions/Naive_XGBoost.csv'
# Create the output directory first so the export does not fail (after the
# expensive fit) just because the folder is missing.
os.makedirs(os.path.dirname(predictions_file), exist_ok=True)
y_pred_pd = pd.DataFrame(data=y_pred, columns=["y"])
y_pred_pd.to_csv(predictions_file, index_label='id')