XGBoost method using decision trees - 

In [None]:
import numpy as np 
import pandas as pd 
# import seaborn as sb 
# import matplotlib.pyplot as plt 

# from sklearn import tree 
# from sklearn.tree import DecisionTreeClassifier 
# from sklearn.ensemble import RandomForestClassifier 
 
import xgboost as xgb 
from xgboost import XGBClassifier 
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.metrics import accuracy_score, confusion_matrix 

import nbformat 
from IPython import get_ipython 

In [None]:
# Bring the cleaned movie data into this kernel by replaying code cells from the
# preprocessing notebook.
# (Magic-based alternative: %run "../Data_Preprocessing/data_preprocess_dtcls.ipynb")

with open("../Data_Preprocessing/data_preprocess_dtcls.ipynb", "r", encoding="utf-8") as f:
    ntb = nbformat.read(f, as_version=4)

ipython = get_ipython()

for cell in ntb.cells:
    # Markdown / raw cells carry nothing to execute.
    if cell.cell_type != "code":
        continue

    print(cell.source)

    # "mov_cls" is a substring of "mov_cls_cleaned", so this one check matches
    # every cell that mentions either name.
    if "mov_cls" in cell.source:
        ipython.run_cell(cell.source, silent=True)

# Confirm the replay actually defined the cleaned frame; report instead of raising if not.
try:
    print("Movies Clean Data : ")
    print(mov_cls_cleaned.head())   # type: ignore
except NameError as e:
    print(f"Variable not found: {e}")

In [None]:
# Display the cleaned movie frame defined by the replayed preprocessing cells.
mov_cls_cleaned     # type: ignore 

In [None]:
# Pairwise correlation between the columns of the cleaned frame.
# NOTE(review): presumably every column is numeric after preprocessing; recent pandas
# versions raise on non-numeric columns unless numeric_only=True is passed - confirm.
mov_cls_cleaned.corr()      # type: ignore 

Variable split (X, y) : 

In [None]:
# Feature matrix: every column except the target label 'Start_Tech_Oscar'.
is_feature_col = mov_cls_cleaned.columns != 'Start_Tech_Oscar'      # type: ignore
X = mov_cls_cleaned.loc[:, is_feature_col]      # type: ignore
X

In [None]:
# (rows, columns) of the feature matrix.
X.shape 

In [None]:
# Target vector: the 'Start_Tech_Oscar' column on its own.
y = mov_cls_cleaned.loc[:, 'Start_Tech_Oscar']      # type: ignore
y

In [None]:
# Length of the target vector; should match the row count of X.
y.shape 

Test - Train Split : 

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# shuffle (and therefore the split) the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Sizes of the four pieces first, then their contents.
print(f" X_train size : {X_train.shape} \n X_test size : {X_test.shape} \n y_train size : {y_train.shape} \n y_test size : {y_test.shape}")

print(f"X_train :\n{X_train}")
print(f"X_test :\n{X_test}")
print(f"y_train :\n{y_train}")
print(f"y_test :\n{y_test}")

In [None]:
# Re-wrap the training features as a DataFrame carrying the feature column names of X.
feature_names = X.columns.tolist()
X_train = pd.DataFrame(X_train, columns=feature_names)
X_train

In [None]:
# Re-wrap the training labels as a pandas Series.
y_train = pd.Series(data=y_train)
y_train

Training multiple classification trees using XGBoost - 

Parameter Tuning -  
-> General parameters : Guide the overall functioning.  
-> Booster parameters : Guide the individual booster (tree/regression) at each step.  
-> Learning task parameters : Guide the optimization performed.  

In [None]:
# First boosted model: depth-5 trees, 10000 boosting rounds, learning rate 0.3,
# trained on all available CPU cores (n_jobs=-1).
xgb_cls = XGBClassifier(
    max_depth=5,
    n_estimators=10000,
    learning_rate=0.3,
    n_jobs=-1,
)
xgb_cls.fit(X_train, y_train)

# Class predictions on both splits, consumed by the evaluation cells that follow.
y_train_prd = xgb_cls.predict(X_train)
y_test_prd = xgb_cls.predict(X_test)

In [None]:
# Predicted labels for the training rows.
y_train_prd 

In [None]:
# Predicted labels for the held-out test rows.
y_test_prd 

Model Performance - 

In [None]:
# Confusion matrices for the first model: rows are true classes, columns are predicted classes.
conf_mtx_trn = confusion_matrix(y_true=y_train, y_pred=y_train_prd)
conf_mtx_tst = confusion_matrix(y_true=y_test, y_pred=y_test_prd)

print(f"Train Confusion Matrix : \n{conf_mtx_trn}")
print(f"Test Confusion Matrix : \n{conf_mtx_tst}")

In [None]:
# Fraction of correctly classified rows on each split; a large train/test gap signals overfitting.
acc_sc_trn = accuracy_score(y_true=y_train, y_pred=y_train_prd)
acc_sc_tst = accuracy_score(y_true=y_test, y_pred=y_test_prd)

print(f"Train Accuracy Score : {acc_sc_trn}")
print(f"Test Accuracy Score : {acc_sc_tst}")

In [None]:
# Plot the feature importances of the first model (xgb_cls).
# NOTE(review): no importance_type is passed, so the library default is used - confirm
# which measure that is for the installed XGBoost version.
xgb.plot_importance(xgb_cls) 

Using more hyperparameters - 

In [None]:
# Second model: adds gamma (minimum loss reduction needed to split), uses deeper trees,
# a lower learning rate, fewer boosting rounds and a fixed seed.
xgb_cls2 = XGBClassifier(
    gamma=0.3,
    max_depth=7,
    learning_rate=0.2,
    n_estimators=5000,
    random_state=45,
)
xgb_cls2.fit(X_train, y_train)

In [None]:
# Confusion matrices for the second model (xgb_cls2) on the train and test splits.
conf_mtx_trn = confusion_matrix(y_true=y_train, y_pred=xgb_cls2.predict(X_train))
conf_mtx_tst = confusion_matrix(y_true=y_test, y_pred=xgb_cls2.predict(X_test))

print(f"Train Confusion Matrix : \n{conf_mtx_trn}")
print(f"Test Confusion Matrix : \n{conf_mtx_tst}")

In [None]:
# Accuracy of the second model (xgb_cls2) on the train and test splits.
acc_sc_trn = accuracy_score(y_true=y_train, y_pred=xgb_cls2.predict(X_train))
acc_sc_tst = accuracy_score(y_true=y_test, y_pred=xgb_cls2.predict(X_test))

print(f"Train Accuracy Score : {acc_sc_trn}")
print(f"Test Accuracy Score : {acc_sc_tst}")

In [None]:
# Plot the feature importances of the second model (xgb_cls2).
xgb.plot_importance(xgb_cls2) 

---------------

In [None]:
# Grid search over XGBoost hyperparameters, with early stopping on every fit.
#
# `early_stopping_rounds` only works when `fit` receives an `eval_set`: without one
# XGBoost has no validation metric to monitor and each fit raises, so the whole
# search fails. A validation slice is therefore carved out of the TRAINING data
# (never the test set, which must stay untouched for the final evaluation) and
# forwarded to every fit.
# NOTE: the later "train" metrics computed on X_train include these validation rows,
# which guide early stopping but are not used to grow the trees.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=50)

xgb_cls3 = XGBClassifier(random_state=50, early_stopping_rounds=10, eval_metric='logloss')

# 4 * 4 * 4 * 3 * 4 * 3 * 3 = 6912 combinations, each fitted on 5 folds
# (34,560 fits plus the final refit) - expect a long run even with n_jobs=-1.
params_grid = { "max_depth": [3, 5, 7, 10],
                "gamma": [0.1, 0.2, 0.3, 0.4],
                "learning_rate": [0.1, 0.2, 0.3, 0.5],
                "subsample": [0.8, 0.96, 0.9],
                "n_estimators": [100, 500, 1000, 2500],
                "colsample_bytree": [0.85, 0.95, 0.9],
                "reg_alpha": [1e-2, 0.1, 1]
              }

grid_search = GridSearchCV(xgb_cls3, param_grid=params_grid, scoring='accuracy', n_jobs=-1, cv=5)

# Extra keyword arguments given to GridSearchCV.fit are passed on to XGBClassifier.fit
# for every candidate and fold; verbose=False silences the per-round metric log.
grid_search.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Hyperparameter combination that scored best in the grid search (scoring='accuracy', cv=5).
grid_search.best_params_ 

In [None]:
# Keep the estimator selected by the grid search for evaluation below.
# NOTE(review): despite the "rf" in the name, this is an XGBClassifier (grid_search wraps
# xgb_cls3), not a random forest.
cv_rf_cls = grid_search.best_estimator_ 
cv_rf_cls 

In [None]:
# Confusion matrices for the grid-search winner (cv_rf_cls) on the train and test splits.
conf_mtx_trn = confusion_matrix(y_true=y_train, y_pred=cv_rf_cls.predict(X_train))
conf_mtx_tst = confusion_matrix(y_true=y_test, y_pred=cv_rf_cls.predict(X_test))

print(f"Train Confusion Matrix : \n{conf_mtx_trn}")
print(f"Test Confusion Matrix : \n{conf_mtx_tst}")

In [None]:
# Accuracy of the grid-search winner (cv_rf_cls) on the train and test splits.
acc_sc_trn = accuracy_score(y_true=y_train, y_pred=cv_rf_cls.predict(X_train))
acc_sc_tst = accuracy_score(y_true=y_test, y_pred=cv_rf_cls.predict(X_test))

print(f"Train Accuracy Score : {acc_sc_trn}")
print(f"Test Accuracy Score : {acc_sc_tst}")

In [None]:
# Plot the feature importances of the estimator chosen by the grid search.
xgb.plot_importance(cv_rf_cls) 