In [1]:
# Data management
# ------------------------------------------------------------------------------
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd

# Graphics
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt

# Preprocessing and modelling
# evaluate bagging ensemble for regression
# ------------------------------------------------------------------------------
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing


# Configuration of warnings
# ------------------------------------------------------------------------------
import warnings
#warnings.filterwarnings('once')
warnings.filterwarnings('ignore')

In [33]:
# Data Loading
# The Titanic dataset is available in Seaborn as the ‘titanic’ dataset. It consists of the following columns:

# Survived: Survival status (0 = No, 1 = Yes)
# Pclass: Passenger class (1 = 1st class, 2 = 2nd class, 3 = 3rd class)
# Sex: Passenger’s gender
# Age: Passenger’s age
# SibSp: Number of siblings/spouses aboard
# Parch: Number of parents/children aboard
# Fare: Fare paid for the ticket
# Embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
# Class: Equivalent to Pclass (1 = 1st class, 2 = 2nd class, 3 = 3rd class)
# Who: Passenger’s category (man, woman, child)
# Adult_male: Whether the passenger is an adult male or not (True or False)
# Deck: Cabin deck
# Embark_town: Port of embarkation (Cherbourg, Queenstown, Southampton)
# Alive: Survival status (yes or no)
# Alone: Whether the passenger is alone or not (True or False)
# Adult_male: Whether the passenger is an adult male or not (True or False)
# Alone: Whether the passenger is alone or not (True or False)
# Alive: Survival status (yes or no)
# Embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
# Class: Equivalent to Pclass (1 = 1st class, 2 = 2nd class, 3 = 3rd class)

import seaborn as sns    
data_train = sns.load_dataset('titanic')
data_train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [34]:
data_train= data_train.drop(columns = ["survived", "pclass", "embarked", "who", "adult_male", "deck"])

In [35]:
data_train=data_train.dropna()

In [36]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sex          712 non-null    object  
 1   age          712 non-null    float64 
 2   sibsp        712 non-null    int64   
 3   parch        712 non-null    int64   
 4   fare         712 non-null    float64 
 5   class        712 non-null    category
 6   embark_town  712 non-null    object  
 7   alive        712 non-null    object  
 8   alone        712 non-null    bool    
dtypes: bool(1), category(1), float64(2), int64(2), object(3)
memory usage: 46.0+ KB


In [37]:
# Configuring train and test datasets
# ------------------------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
                                        data_train.drop(columns = "alive"),
                                        data_train['alive'],
                                        train_size= 0.7, # by default 0.75 is the amunt of data for training
                                        random_state = 123
                                                            ) 

In [38]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 605 to 646
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   sex          498 non-null    object  
 1   age          498 non-null    float64 
 2   sibsp        498 non-null    int64   
 3   parch        498 non-null    int64   
 4   fare         498 non-null    float64 
 5   class        498 non-null    category
 6   embark_town  498 non-null    object  
 7   alone        498 non-null    bool    
dtypes: bool(1), category(1), float64(2), int64(2), object(2)
memory usage: 28.3+ KB


In [39]:
#Conversion of categorical values

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# One-hot-encoding of the categoric variables
# ------------------------------------------------------------------------------
# Identification of categoric and numerical variables/column
cat_cols = X_train.select_dtypes(include=['object', 'category', 'bool']).columns.to_list()
numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.to_list()


In [40]:
# One-hot-encoding of the categoric variables
# ------------------------------------------------------------------------------
# Application of one-hot-encoding only to the categorical variables
preprocessor = ColumnTransformer(
                    [('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                    remainder='passthrough'
               )

# Once the object ColumnTransformer was created, using the method fit()
# the transforms are applied to the taining and test datasets
# using transform(). Both operations at the same time using fit_transform().
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.fit_transform(X_test)

In [41]:
# Cheking NaN /null values
X_train.isnull().sum()*100/X_train.shape[0]

sex            0.0
age            0.0
sibsp          0.0
parch          0.0
fare           0.0
class          0.0
embark_town    0.0
alone          0.0
dtype: float64

In [42]:
# Obtaining names of columns for creation of a dataframe
encoded_cat = preprocessor.named_transformers_['onehot'].get_feature_names(cat_cols)
labels = np.concatenate([ encoded_cat, numeric_cols])

In [43]:
# Conversion to dataframe
X_train_prep = pd.DataFrame(X_train_prep, columns=labels)
X_test_prep  = pd.DataFrame(X_test_prep, columns=labels)
X_train_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498 entries, 0 to 497
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   sex_female               498 non-null    float64
 1   sex_male                 498 non-null    float64
 2   class_First              498 non-null    float64
 3   class_Second             498 non-null    float64
 4   class_Third              498 non-null    float64
 5   embark_town_Cherbourg    498 non-null    float64
 6   embark_town_Queenstown   498 non-null    float64
 7   embark_town_Southampton  498 non-null    float64
 8   alone_False              498 non-null    float64
 9   alone_True               498 non-null    float64
 10  age                      498 non-null    float64
 11  sibsp                    498 non-null    float64
 12  parch                    498 non-null    float64
 13  fare                     498 non-null    float64
dtypes: float64(14)
memory usag

In [44]:
X_train_prep

Unnamed: 0,sex_female,sex_male,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alone_False,alone_True,age,sibsp,parch,fare
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,36.0,1.0,0.0,15.5500
1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,42.0,0.0,1.0,8.4042
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,21.0,0.0,0.0,10.5000
3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,48.0,1.0,0.0,76.7292
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,22.0,0.0,1.0,55.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,12.0,1.0,0.0,11.2417
494,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,8.0500
495,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,9.0,5.0,2.0,46.9000
496,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,65.0,0.0,0.0,26.5500


In [45]:
# Removing repeated column
X_train_prep= X_train_prep.drop(columns = ["sex_male"])
X_test_prep= X_test_prep.drop(columns = ["sex_male"])

# updatig coulmns names
labels= X_train_prep.columns.values

In [46]:
X_train_prep

Unnamed: 0,sex_female,class_First,class_Second,class_Third,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alone_False,alone_True,age,sibsp,parch,fare
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,36.0,1.0,0.0,15.5500
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,42.0,0.0,1.0,8.4042
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,21.0,0.0,0.0,10.5000
3,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,48.0,1.0,0.0,76.7292
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,22.0,0.0,1.0,55.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,12.0,1.0,0.0,11.2417
494,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,8.0500
495,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,9.0,5.0,2.0,46.9000
496,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,65.0,0.0,0.0,26.5500


In [47]:
# Grid to find hiperparameters
# ==============================================================================


# Grid de hiperparámetros evaluados
# ==============================================================================
param_grid = {'n_estimators'  : [50, 100, 500, 1000],
              'max_features'  : [None, 'sqrt', 'log2'],
              'max_depth'     : [None, 1, 3, 5, 10, 20],
              'subsample'     : [0.5, 1],  # fraction of samples to be used for fitting the individual base learners
              'learning_rate' : [0.001, 0.01, 0.1]
             }

# Grid search using CV
# ==============================================================================
grid = GridSearchCV(
        estimator  = GradientBoostingClassifier(random_state=123),
        param_grid = param_grid,
        scoring    = 'accuracy',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=3, n_repeats=1, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train_prep, y = y_train)

# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(10)




Unnamed: 0,param_learning_rate,param_max_depth,param_max_features,param_n_estimators,param_subsample,mean_test_score,std_test_score,mean_train_score,std_train_score
346,0.1,3,sqrt,100,0.5,0.831325,0.013014,0.898594,0.008637
354,0.1,3,log2,100,0.5,0.831325,0.013014,0.898594,0.008637
237,0.01,5,log2,500,1.0,0.817269,0.007513,0.936747,0.002459
229,0.01,5,sqrt,500,1.0,0.817269,0.007513,0.936747,0.002459
118,0.001,10,log2,1000,0.5,0.815261,0.015027,0.962851,0.006189
110,0.001,10,sqrt,1000,0.5,0.815261,0.015027,0.962851,0.006189
214,0.01,3,log2,1000,0.5,0.813253,0.014756,0.905622,0.00142
196,0.01,3,,500,0.5,0.813253,0.017039,0.90261,0.005119
198,0.01,3,,1000,0.5,0.813253,0.008519,0.939759,0.002459
206,0.01,3,sqrt,1000,0.5,0.813253,0.014756,0.905622,0.00142


In [48]:
# Better hyperparameters using out-of-bag error
# ==============================================================================
print("--------------------------------------------------")
print("Better hyperparameters found (oob-accuracy)")
print("--------------------------------------------------")
print(grid.best_params_, ":", grid.best_score_, grid.scoring)

--------------------------------------------------
Better hyperparameters found (oob-accuracy)
--------------------------------------------------
{'learning_rate': 0.1, 'max_depth': 3, 'max_features': 'sqrt', 'n_estimators': 100, 'subsample': 0.5} : 0.8313253012048193 accuracy


In [49]:
model_final = grid.best_estimator_

In [50]:
# Model training error
#-------------------------------------------------------------------------------
predictions_train = model_final.predict(X = X_train_prep,)

print("Confussion Matrix -- TRAIN")
print("-------------------")
confusion_matrix(
    y_true    = y_train,
    y_pred    = predictions_train
)

Confussion Matrix -- TRAIN
-------------------


array([[282,  17],
       [ 40, 159]], dtype=int64)

In [51]:
accuracy_train = accuracy_score(
            y_true    = y_train,
            y_pred    = predictions_train,
            normalize = True
           )
#print(f"Accuracy in train : {100 * accuracy_train} %")
print("\n Accuracy in train 1s: {:.2f} %".format(100 * accuracy_train))


 Accuracy in train 1s: 88.55 %


In [52]:
# Model test error
#-------------------------------------------------------------------------------
predictions_test = model_final.predict(X = X_test_prep,)

print("Confussion Matrix -- TEST")
print("-------------------")
confusion_matrix(
    y_true    = y_test,
    y_pred    = predictions_test
)

Confussion Matrix -- TEST
-------------------


array([[112,  13],
       [ 29,  60]], dtype=int64)

In [53]:
accuracy_test = accuracy_score(
            y_true    = y_test,
            y_pred    = predictions_test,
            normalize = True
           )
#print(f"Accuracy in test 1s: {100 * accuracy_test} %")

print("\n Accuracy in test 1s: {:.2f} %".format(100 * accuracy_test))


 Accuracy in test 1s: 80.37 %


In [54]:
mat_conf = confusion_matrix(
                    y_true    = y_test,
                    y_pred    = predictions_test
                )

accuracy = accuracy_score(
            y_true    = y_test,
            y_pred    = predictions_test,
            normalize = True
           )

print("Confussion Matrix")
print("-------------------")
print(mat_conf)
print("")
print(f"Accuracy in test is: {100 * accuracy} %")

Confussion Matrix
-------------------
[[112  13]
 [ 29  60]]

Accuracy in test is: 80.37383177570094 %


In [55]:
from sklearn.metrics import classification_report
print(
    classification_report(
        y_true = y_test,
        y_pred = predictions_test
    )
)

              precision    recall  f1-score   support

          no       0.79      0.90      0.84       125
         yes       0.82      0.67      0.74        89

    accuracy                           0.80       214
   macro avg       0.81      0.79      0.79       214
weighted avg       0.81      0.80      0.80       214



In [56]:
# Prediction of probabilities
# ==============================================================================
predictions = model_final.predict_proba(X = X_test_prep)
predictions[:10, :]

array([[0.84775661, 0.15224339],
       [0.88862166, 0.11137834],
       [0.03374   , 0.96626   ],
       [0.27715322, 0.72284678],
       [0.41028145, 0.58971855],
       [0.09479478, 0.90520522],
       [0.85818783, 0.14181217],
       [0.62584621, 0.37415379],
       [0.41581655, 0.58418345],
       [0.58031329, 0.41968671]])

In [57]:
# Classification using the clas with higher probability
# ==============================================================================
df_predictions = pd.DataFrame(data=predictions, columns=['0', '1'])
df_predictions['classification_default_0.5'] = np.where(df_predictions['0'] > df_predictions['1'], 0, 1)
df_predictions.head(10)

Unnamed: 0,0,1,classification_default_0.5
0,0.847757,0.152243,0
1,0.888622,0.111378,0
2,0.03374,0.96626,1
3,0.277153,0.722847,1
4,0.410281,0.589719,1
5,0.094795,0.905205,1
6,0.858188,0.141812,0
7,0.625846,0.374154,0
8,0.415817,0.584183,1
9,0.580313,0.419687,0


In [58]:
# Classification using a threshold of 0.8 for class 1.
# ==============================================================================
df_predictions['classification_custom_0.8'] = np.where(df_predictions['1'] > 0.9, 1, 0)
df_predictions.iloc[4:20, :]

Unnamed: 0,0,1,classification_default_0.5,classification_custom_0.8
4,0.410281,0.589719,1,0
5,0.094795,0.905205,1,1
6,0.858188,0.141812,0,0
7,0.625846,0.374154,0,0
8,0.415817,0.584183,1,0
9,0.580313,0.419687,0,0
10,0.924867,0.075133,0,0
11,0.81536,0.18464,0,0
12,0.841956,0.158044,0,0
13,0.75603,0.24397,0,0


In [59]:
importance_predictors = pd.DataFrame(
                            {'predictor': X_train_prep.columns,
                             'importance': model_final.feature_importances_}
                            )
print("Importance  of the model predictors")
print("-------------------------------------------")
importance_predictors.sort_values('importance', ascending=False)

Importance  of the model predictors
-------------------------------------------


Unnamed: 0,predictor,importance
0,sex_female,0.271557
12,fare,0.208613
9,age,0.189711
10,sibsp,0.056409
3,class_Third,0.054163
1,class_First,0.051577
11,parch,0.048739
2,class_Second,0.035437
7,alone_False,0.03103
4,embark_town_Cherbourg,0.02074


In [60]:
# Conversion yes/no  of variable y_train and y_test 
y_train_prep = y_train.apply(lambda x: 0 if x=='no' else 1)
y_test_prep = y_test.apply(lambda x: 0 if x=='no' else 1)

In [62]:
# Algoritmo XGBoost
import xgboost as xgb

In [63]:
from sklearn.model_selection import GridSearchCV

clf = xgb.XGBClassifier(objective='binary:logistic')
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],  # Step size shrinkage used in update to prevents overfitting
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ], # def= 1 Minimum loss reduction required to make a further partition on a leaf node of the tree
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ], # Minimum loss reduction required to make a further partition on a leaf node of the tree
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ],  # It is a family of parameters for subsampling of columns.
     "n_estimators"     : [10, 20, 30]
     }

grid = GridSearchCV(clf, parameters, n_jobs=4, cv=3)

In [65]:
#grid.fit(X_train_prep, y_train_prep)

grid.fit(X = X_train_prep, y = y_train_prep)

In [66]:
# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(10)

Unnamed: 0,param_colsample_bytree,param_eta,param_gamma,param_max_depth,param_min_child_weight,param_n_estimators,mean_test_score,std_test_score
9998,0.7,0.15,0.4,4,1,30,0.827309,0.022718
9806,0.7,0.15,0.2,4,1,30,0.821285,0.015027
10093,0.7,0.2,0.0,4,1,20,0.821285,0.019879
7577,0.5,0.2,0.3,15,3,30,0.821285,0.020478
7565,0.5,0.2,0.3,12,3,30,0.821285,0.020478
7553,0.5,0.2,0.3,10,3,30,0.821285,0.020478
7457,0.5,0.2,0.2,10,3,30,0.819277,0.02144
10285,0.7,0.2,0.2,4,1,20,0.819277,0.013014
7445,0.5,0.2,0.2,8,3,30,0.819277,0.02144
9613,0.7,0.15,0.0,4,1,20,0.819277,0.009837


In [67]:
# Best hiperparameters using CV
# ==============================================================================
print("----------------------------------------")
print("Best hiperparameters found (cv)")
print("----------------------------------------")
print(grid.best_params_, ":", grid.best_score_, grid.scoring)

----------------------------------------
Best hiperparameters found (cv)
----------------------------------------
{'colsample_bytree': 0.7, 'eta': 0.15, 'gamma': 0.4, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 30} : 0.8273092369477912 None


In [68]:
# Number of trees of the final model (early stopping activated)
# ==============================================================================
n_trees_included = len(grid.best_estimator_.get_booster().get_dump())
print(f"Number of trees included in the model: {n_trees_included}")

Number of trees included in the model: 30


In [71]:
# Selecting Best model
model_xgb = grid.best_estimator_

In [76]:
# Model training error
#-------------------------------------------------------------------------------
predictions_train = model_xgb.predict(X = X_train_prep,)

print("Confussion Matrix -- TRAIN")
print("-------------------")
confusion_matrix(
    y_true    = y_train_prep,
    y_pred    = predictions_train
)

Confussion Matrix -- TRAIN
-------------------


array([[286,  13],
       [ 38, 161]], dtype=int64)

In [77]:
accuracy_train = accuracy_score(
            y_true    = y_train_prep,
            y_pred    = predictions_train,
            normalize = True
           )
#print(f"Accuracy in train : {100 * accuracy_train} %")
print("\n Accuracy in train 1s: {:.2f} %".format(100 * accuracy_train))


 Accuracy in train 1s: 89.76 %


In [78]:
# Model test error
#-------------------------------------------------------------------------------
predictions_test =  model_xgb.predict(X = X_test_prep,)

print("Confussion Matrix -- TEST")
print("-------------------")
confusion_matrix(
    y_true    = y_test_prep,
    y_pred    = predictions_test
)

Confussion Matrix -- TEST
-------------------


array([[114,  11],
       [ 30,  59]], dtype=int64)

In [79]:
accuracy_test = accuracy_score(
            y_true    = y_test_prep,
            y_pred    = predictions_test,
            normalize = True
           )
#print(f"Accuracy in test 1s: {100 * accuracy_test} %")

print("\n Accuracy in test 1s: {:.2f} %".format(100 * accuracy_test))


 Accuracy in test 1s: 80.84 %


In [80]:
importance_predictors = pd.DataFrame(
                            {'predictor': X_train_prep.columns,
                             'importance': model_xgb.feature_importances_}
                            )
print("Importance  of the model predictors")
print("-------------------------------------------")
importance_predictors.sort_values('importance', ascending=False)

Importance  of the model predictors
-------------------------------------------


Unnamed: 0,predictor,importance
0,sex_female,0.476843
3,class_Third,0.13026
2,class_Second,0.063242
10,sibsp,0.062221
1,class_First,0.055012
6,embark_town_Southampton,0.05051
9,age,0.041766
12,fare,0.036749
11,parch,0.025775
4,embark_town_Cherbourg,0.023889


In [81]:
from sklearn.inspection import permutation_importance

importance_per = permutation_importance(
                estimator    = model_xgb,
                X            = X_train_prep,
                y            = y_train_prep,
                n_repeats    = 5,
                scoring      = 'neg_root_mean_squared_error',
                n_jobs       = - 1,
                random_state = 123
             )

# Se almacenan los resultados (media y desviación) en un dataframe
df_importance = pd.DataFrame(
                    {k: importance_per[k] for k in ['importances_mean', 'importances_std']}
                 )
df_importance['feature'] = X_train_prep.columns
df_importance.sort_values('importances_mean', ascending=False)

Unnamed: 0,importances_mean,importances_std,feature
0,0.233841,0.008383,sex_female
9,0.15058,0.015064,age
3,0.123853,0.013033,class_Third
12,0.071079,0.007576,fare
10,0.053649,0.011594,sibsp
1,0.030542,0.001143,class_First
2,0.0117,0.002262,class_Second
7,0.008648,0.003541,alone_False
11,0.007428,0.003123,parch
4,0.005584,0.003034,embark_town_Cherbourg


In [84]:
# Algoritmo lightgbm
import lightgbm as lgb

In [90]:
from sklearn.model_selection import GridSearchCV

clf = lgb.LGBMClassifier(boosting_type='gbdt')
parameters = {
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ], # def= 1 Minimum loss reduction required to make a further partition on a leaf node of the tree
     "n_estimators"     : [10, 20, 30]
     }

grid = GridSearchCV(clf, parameters, cv=3)

In [110]:
#grid.fit(X_train_prep, y_train_prep)

grid.fit(X = X_train_prep, y = y_train_prep)

In [111]:
# Results
# ==============================================================================
results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param.*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(10)

Unnamed: 0,param_max_depth,param_min_child_weight,param_n_estimators,mean_test_score,std_test_score
50,8,1,30,0.811245,0.019879
77,12,3,30,0.809237,0.018622
89,15,3,30,0.809237,0.018622
26,5,1,30,0.809237,0.015027
65,10,3,30,0.809237,0.018622
29,5,3,30,0.809237,0.012378
38,6,1,30,0.807229,0.013014
7,3,5,20,0.807229,0.029919
73,12,1,20,0.805221,0.015811
49,8,1,20,0.805221,0.015811


In [112]:
# Best hiperparameters using CV
# ==============================================================================
print("----------------------------------------")
print("Best hiperparameters found (cv)")
print("----------------------------------------")
print(grid.best_params_, ":", grid.best_score_, grid.scoring)

----------------------------------------
Best hiperparameters found (cv)
----------------------------------------
{'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 30} : 0.8112449799196787 None


In [113]:
# Number of trees of the final model (early stopping activated)
# ==============================================================================
#n_trees_included = len(grid.best_estimator_.get_booster().get_dump())
n_trees_included = grid.best_estimator_.n_estimators
print(f"Number of trees included in the model: {n_trees_included}")

Number of trees included in the model: 30


In [114]:
# Selecting Best model
model_lgbm = grid.best_estimator_

In [115]:
# Model training error
#-------------------------------------------------------------------------------
predictions_train = model_lgbm.predict(X = X_train_prep,)

print("Confussion Matrix -- TRAIN")
print("-------------------")
confusion_matrix(
    y_true    = y_train_prep,
    y_pred    = predictions_train
)

Confussion Matrix -- TRAIN
-------------------


array([[283,  16],
       [ 39, 160]], dtype=int64)

In [116]:
accuracy_train = accuracy_score(
            y_true    = y_train_prep,
            y_pred    = predictions_train,
            normalize = True
           )
#print(f"Accuracy in train : {100 * accuracy_train} %")
print("\n Accuracy in train 1s: {:.2f} %".format(100 * accuracy_train))


 Accuracy in train 1s: 88.96 %


In [117]:
# Model test error
#-------------------------------------------------------------------------------
predictions_test =  model_lgbm.predict(X = X_test_prep,)

print("Confussion Matrix -- TEST")
print("-------------------")
confusion_matrix(
    y_true    = y_test_prep,
    y_pred    = predictions_test
)

Confussion Matrix -- TEST
-------------------


array([[114,  11],
       [ 27,  62]], dtype=int64)

In [119]:
accuracy_test = accuracy_score(
            y_true    = y_test_prep,
            y_pred    = predictions_test,
            normalize = True
           )
#print(f"Accuracy in test 1s: {100 * accuracy_test} %")

print("\n Accuracy in test 1s: {:.2f} %".format(100 * accuracy_test))


 Accuracy in test 1s: 82.24 %


In [120]:
importance_predictors = pd.DataFrame(
                            {'predictor': X_train_prep.columns,
                             'importance': model_lgbm.feature_importances_}
                            )
print("Importance  of the model predictors")
print("-------------------------------------------")
importance_predictors.sort_values('importance', ascending=False)

Importance  of the model predictors
-------------------------------------------


Unnamed: 0,predictor,importance
9,age,189
12,fare,172
10,sibsp,39
0,sex_female,31
3,class_Third,23
1,class_First,17
2,class_Second,10
7,alone_False,8
4,embark_town_Cherbourg,5
8,alone_True,4


In [121]:
from sklearn.inspection import permutation_importance

importance_per = permutation_importance(
                estimator    = model_lgbm,
                X            = X_train_prep,
                y            = y_train_prep,
                n_repeats    = 5,
                scoring      = 'neg_root_mean_squared_error',
                n_jobs       = - 1,
                random_state = 123
             )

# Se almacenan los resultados (media y desviación) en un dataframe
df_importance = pd.DataFrame(
                    {k: importance_per[k] for k in ['importances_mean', 'importances_std']}
                 )
df_importance['feature'] = X_train_prep.columns
df_importance.sort_values('importances_mean', ascending=False)

Unnamed: 0,importances_mean,importances_std,feature
0,0.238694,0.006217,sex_female
9,0.147254,0.011023,age
3,0.091224,0.011039,class_Third
10,0.055031,0.012396,sibsp
12,0.046817,0.004917,fare
1,0.038265,0.003273,class_First
7,0.010095,0.003972,alone_False
4,0.003598,0.002239,embark_town_Cherbourg
2,0.002401,0.002242,class_Second
8,0.002401,0.002242,alone_True
