<h1 align="center">Case 3 - Retail Score Analysis</h1>

<div align="right">Machine Learning</div>

- Jose Abal Caamaño
- Jesús Platero Acevedo

# Index: 
## 1. Libraries
## 2. Creation of the functions
## 3. Read and load the data
## 4. XGBoost Optimized
## 5. Interpretability

# 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, accuracy_score, auc,ConfusionMatrixDisplay,\
                            silhouette_score, recall_score, precision_score, make_scorer, \
                            roc_auc_score, f1_score, precision_recall_curve, fbeta_score, r2_score
from datetime import datetime

from lightgbm import LGBMClassifier
import lightgbm as lgb

from sklearn.ensemble import RandomForestClassifier
from shap import TreeExplainer

import xgboost as xgb
import pickle
import warnings
warnings.filterwarnings('ignore')

# 2. Creation of the functions

In [2]:
# Functions to be used
def evaluate_model(yval, ypred, ypred_proba=None):
    """Print a set of classification metrics for a model's predictions.

    Parameters
    ----------
    yval : array-like
        True labels.
    ypred : array-like
        Predicted labels.
    ypred_proba : array-like, optional
        Predicted class probabilities. Column 1 is used as the score for
        ROC-AUC. When omitted, ROC-AUC is skipped and the label-based
        metrics are still reported.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Only ROC-AUC needs probabilities; every other metric works on labels alone.
    if ypred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(yval, ypred_proba[:, 1])))
    print(f'\nF2 Score: {fbeta_score(yval, ypred, beta=2, average="macro")}\n')
    # NOTE(review): R2 is a regression metric; kept so the printed report is unchanged.
    print(f'R2 Score: {r2_score(yval, ypred)}\n')
    print('Accuracy of the model: {}\n'.format(accuracy_score(yval, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(yval, ypred)))
        
def cargar_modelo(ruta):
    """Load and return a pickled object (e.g. a fitted model) from disk.

    Parameters
    ----------
    ruta : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        Whatever object was serialized in the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed, even on error.
    with open(ruta, 'rb') as fichero:
        return pickle.load(fichero)

def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started.

    Called without an argument, returns the current ``datetime`` to be used
    as the starting mark. Called with a previously returned mark, prints the
    elapsed hours, minutes and seconds and returns ``None``.
    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None
    # No mark supplied: hand back "now" as the starting point.
    return datetime.now()

# 3. Read and load the data

In [3]:
# Load the processed train/test splits from parquet: feature matrices first, then targets
xtrain = pd.read_parquet('../data/processed/xtrain.parquet')
xtest = pd.read_parquet('../data/processed/xtest.parquet')

ytrain = pd.read_parquet('../data/processed/ytrain.parquet')
ytest = pd.read_parquet('../data/processed/ytest.parquet')

In [4]:
# Load the pickled preprocessor stored next to the models
# NOTE(review): unpickling can execute arbitrary code; only load trusted files
preprocessor = cargar_modelo('../models/preprocessor.pickle')

# 4. XGBoost Optimized

In [12]:
# Load the pickled model from disk (presumably the trained XGBoost model, judging by the file name)
# NOTE(review): this binds the name `xgboost` to the loaded object, not to the xgboost package
xgboost = cargar_modelo('../models/XGBoost.pickle')

In [1]:
# Global parameters
# Random seed reused by the hyper-parameter search and the SHAP row sampling below
seed = 12345
# NOTE(review): presumably a hold-out fraction; not referenced by any cell visible in this notebook
test_size = 0.25

In [None]:
# RandomizedSearchCV is not covered by the top import cell, so it is imported here
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Search space sampled by the randomized search
params = {"subsample":[0.5, 0.75, 1], 
          "colsample_bytree":[0.5, 0.75, 1], 
          "max_depth":[2, 6, 12], 
          "min_child_weight":[1,5,15], 
          "learning_rate":[0.3, 0.1, 0.03],
          "n_estimators":[100]
         }

# Use the XGBClassifier class imported at the top of the notebook: the name
# `xgboost` is bound to the model loaded from disk, not to the package.
# A distinct variable name also avoids clobbering the `xgb` package alias.
xgb_clf = XGBClassifier(random_state=seed)
best_xgb = RandomizedSearchCV(xgb_clf, params, cv=3, scoring="f1_macro", random_state=seed)
# Fit on the training split loaded earlier (xtrain / ytrain)
best_xgb = best_xgb.fit(xtrain, ytrain)
print(best_xgb.best_params_)

In [None]:
# Predict on the test split loaded earlier (`xtest`): hard labels first ...
y_pred_xgb = best_xgb.predict(xtest)

# ... then the class probabilities
y_pred_xgb_prob = best_xgb.predict_proba(xtest)

In [None]:
# Report the metrics with the helper defined in section 2, against the test targets (`ytest`)
evaluate_model(ytest, y_pred_xgb, y_pred_xgb_prob)

In [None]:
# Display the estimator the search refitted with the best hyper-parameters it found
best_xgb.best_estimator_

# 5. Interpretability

In [None]:
import shap

# Explain the tuned model on the test split (`xtest`)
explainer = shap.TreeExplainer(best_xgb.best_estimator_)
shap_values = explainer.shap_values(xtest)
# Summary plot of the SHAP values against the same rows they were computed for
shap.summary_plot(shap_values, xtest)

In [None]:
# Bar-chart variant of the summary plot. At this point `shap_values` were computed
# on the test split, so they must be paired with `xtest` (same rows).
shap.summary_plot(shap_values, xtest, plot_type="bar")

In [None]:
shap.initjs()
# Use a random sample of the training data (`xtrain`) to keep computation time low;
# the sample size is capped at the number of available rows
x_sampled = xtrain.sample(min(100, len(xtrain)), random_state=seed)
# Explain the model's predictions using SHAP values
# (same syntax works for LightGBM, CatBoost, and scikit-learn models)
explainer = shap.TreeExplainer(best_xgb.best_estimator_)
shap_values = explainer.shap_values(x_sampled)
# Visualize the explanation of the first sampled prediction
shap.force_plot(explainer.expected_value, shap_values[0,:], x_sampled.iloc[0,:])

In [None]:
# Visualize the explanations for the sampled training rows.
# `shap_values` was computed on `x_sampled`, so the same rows are passed as features.
shap.force_plot(explainer.expected_value, shap_values, x_sampled, link='logit')