
### Gradient Boosting with XGboost

In [1]:
from xgboost import XGBClassifier, XGBRegressor


import pandas as pd
import numpy as np


import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

sns.set_style('darkgrid')
sns.set_palette('husl')

In [2]:
data = pd.read_csv('insurance.csv')

In [3]:
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [4]:
data.head(4)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061


### Assignment 

Perform an extensive exploratory data analysis on this dataset

In [5]:
data_enc = pd.get_dummies(data, ['sex','region','smoker'])

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

num_cols = scaler.fit_transform(data_enc[['age','bmi','children']])
cat_cols = data_enc.drop(columns=['age','bmi','children','charges']).values

X = np.concatenate((num_cols,cat_cols), axis= 1)
y = data_enc['charges']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=23)

reg = XGBRegressor(random_state= 23, n_estimators = 50,learning_rate = 0.1,
                  eval_metric = 'rmse')

model = reg.fit(X_train, y_train)

test_pred = model.predict(X_test)
train_pred = model.predict(X_train)

from sklearn.metrics import mean_squared_error


print(f'test_rmse: {mean_squared_error(y_test, test_pred, squared= False)}')
print(f'train_rmse: {mean_squared_error(y_train, train_pred, squared=False)}')

test_rmse: 4573.38967835942
train_rmse: 2902.7569778724433


# Classification with Xgboost

In [8]:
from xgboost import XGBClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix

In [9]:
train_weather = pd.read_csv('weather_train.csv')
test_weather = pd.read_csv('weather_test.csv')

In [10]:
train_weather.columns

Index(['Unnamed: 0', 'Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall',
       'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am',
       'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
       'RainToday', 'RainTomorrow', 'WindGustDir_E', 'WindGustDir_ENE',
       'WindGustDir_ESE', 'WindGustDir_N', 'WindGustDir_NE', 'WindGustDir_NNE',
       'WindGustDir_NNW', 'WindGustDir_NW', 'WindGustDir_S', 'WindGustDir_SE',
       'WindGustDir_SSE', 'WindGustDir_SSW', 'WindGustDir_SW', 'WindGustDir_W',
       'WindGustDir_WNW', 'WindGustDir_WSW', 'WindDir9am_E', 'WindDir9am_ENE',
       'WindDir9am_ESE', 'WindDir9am_N', 'WindDir9am_NE', 'WindDir9am_NNE',
       'WindDir9am_NNW', 'WindDir9am_NW', 'WindDir9am_S', 'WindDir9am_SE',
       'WindDir9am_SSE', 'WindDir9am_SSW', 'WindDir9am_SW', 'WindDir9am_W',
       'WindDir9am_WNW', 'WindDir9am_WSW', 'WindDir3pm_E', 'WindDir3pm_ENE',
       'WindDir3pm_ESE', 'Wi

In [11]:
# split the data
xtrain = train_weather.drop(columns=['Unnamed: 0', 'Date', 'Location','RainTomorrow'])
xtest = test_weather.drop(columns=['Unnamed: 0', 'Date', 'Location','RainTomorrow'])
ytrain = np.array([0 if x.lower() == 'yes'else 1 for x in train_weather['RainTomorrow']])
ytest = np.array([0 if x.lower() == 'yes'else 1 for x in test_weather['RainTomorrow']])

In [12]:
classifier = XGBClassifier(random_state = 0, learning_rate = 0.1, n_estimators = 600)

model_classifier = classifier.fit(xtrain, ytrain)

test_prediction = model_classifier.predict(xtest)
train_prediction = model_classifier.predict(xtrain)

In [13]:
print(f'train accuracy: {accuracy_score(ytrain, train_prediction)}')
print(f'test accuracy: {accuracy_score(ytest, test_prediction)}')

train accuracy: 0.9044262970656753
test accuracy: 0.8490416839251421


In [14]:
f1_score(ytest,  test_prediction)

0.906600191039769

In [15]:
confusion_matrix(ytest, test_prediction)

array([[ 3091,  2855],
       [ 1154, 19457]], dtype=int64)

In [16]:
print(precision_score(ytest, test_prediction))
print(recall_score(ytest, test_prediction))

0.8720419505198996
0.9440104798408617


### cross validation

In [17]:
from sklearn.model_selection import KFold


In [18]:
frames = [train_weather, test_weather]

data = pd.concat(frames)

X = data.drop(columns=['RainTomorrow','Unnamed: 0', 'Date', 'Location'])
y = np.array([0 if x.lower() == 'yes'else 1 for x in data['RainTomorrow']])
y = pd.DataFrame(data = y, columns=['RainTomorrow'])

In [19]:
kfold = KFold(n_splits=10)


In [20]:
def train_and_evaluate(X_train, y_train, X_test, y_test, **params):
    model = XGBClassifier(random_state=42, n_jobs=-1, **params)
    model.fit(X_train, y_train)
    train_accuracy = accuracy_score(model.predict(X_train), y_train)
    test_accuracy = accuracy_score(model.predict(X_test), y_test)
    return model, train_accuracy, test_accuracy

In [21]:
models = []
accuracy_test = []
accuracy_train = []

for train_idxs, val_idxs in kfold.split(X):
    X_train, y_train = X.iloc[train_idxs], y.iloc[train_idxs]
    X_test, y_test = X.iloc[val_idxs], y.iloc[val_idxs]
    model, train_accuracy, test_accuracy = train_and_evaluate(X_train, 
                                                     y_train, 
                                                     X_test, 
                                                     y_test, 
                                                     max_depth=4, 
                                                     n_estimators=20)
    models.append(model)
    accuracy_test.append(test_accuracy)
    accuracy_train.append(train_accuracy)
    print('Train Accuracy: {}, Test Accuracy: {}'.format(train_accuracy, test_accuracy))

print(f'Overall test accuracy: {sum(accuracy_test)/len(accuracy_test)}')
print(f'Overall train accuracy: {sum(accuracy_train)/len(accuracy_train)}')

Train Accuracy: 0.8486181768183693, Test Accuracy: 0.8587927952701774
Train Accuracy: 0.8521930427608965, Test Accuracy: 0.8264127595215179
Train Accuracy: 0.8489466367233451, Test Accuracy: 0.8439433521242954
Train Accuracy: 0.8510319751898193, Test Accuracy: 0.8284064347586966
Train Accuracy: 0.8507111538872848, Test Accuracy: 0.8371373573490994
Train Accuracy: 0.8491834333990254, Test Accuracy: 0.8523305376048398
Train Accuracy: 0.8478772323815634, Test Accuracy: 0.8529492644025849
Train Accuracy: 0.8484348503597782, Test Accuracy: 0.8523305376048398
Train Accuracy: 0.8500618726797745, Test Accuracy: 0.8477244603327375
Train Accuracy: 0.8500924270895397, Test Accuracy: 0.8424996562628901
Overall test accuracy: 0.8442527155231678
Overall train accuracy: 0.8497150801289395


In [22]:
models

[XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=4, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=20, n_jobs=-1,
               num_parallel_tree=None, random_state=42, ...),
 XGBClassifier(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=Fa

## Hyperparameter Optimization

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import optuna

In [31]:
def train_model(xtrain, xtest, ytrain, ytest, **params):
    xgb = XGBClassifier(random_state =0,n_estimators = 5,max_depth = 7, **params)
    model = xgb.fit(xtrain, ytrain)
    test_pred = model.predict(xtest)
    train_pred = model.predict(xtrain)
    test_f1 = f1_score(ytest, test_pred)
    train_f1 = f1_score(ytrain, train_pred)
    
    return f'test f1_score: {test_f1}, train f1_score: {train_f1}'

In [33]:
params = {
    0.1:10,
    0.15:9,
    0.2:8,
    0.25:7,
    0.3:6,
    0.35:5,
    0.4:4,
    0.45:3
}

for i, v in params.items():
    print(f'learning_rate: {i}, max_leaves: {v}, 
          {train_model(X_train, X_test, y_train, y_test, learning_rate=i, max_leaves = v)}')

learning_rate: 0.1, max_leaves: 10, test f1_score: 0.8806011035886198, train f1_score: 0.8846875338459871
learning_rate: 0.15, max_leaves: 9, test f1_score: 0.8898085876875323, train f1_score: 0.8962480417061275
learning_rate: 0.2, max_leaves: 8, test f1_score: 0.8956006290576233, train f1_score: 0.9005387093440685
learning_rate: 0.25, max_leaves: 7, test f1_score: 0.8972853998532649, train f1_score: 0.9016731648353645
learning_rate: 0.3, max_leaves: 6, test f1_score: 0.8991425071457737, train f1_score: 0.9030622480803955
learning_rate: 0.35, max_leaves: 5, test f1_score: 0.8973638104551197, train f1_score: 0.9023969092126785
learning_rate: 0.4, max_leaves: 4, test f1_score: 0.8982369823698237, train f1_score: 0.9034485284691318
learning_rate: 0.45, max_leaves: 3, test f1_score: 0.8972543884774337, train f1_score: 0.9016637558599137


In [35]:
model.get_params


<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=20, n_jobs=-1,
              num_parallel_tree=None, random_state=42, ...)>

In [39]:
# randomized search
from sklearn.model_selection import StratifiedKFold
parameters = {
    
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [2,4,6,8],
    'gamma': [1,1.5,2,2.5],
    'subsample': [0.2,0.4,0.6,0.8],
    'colsample_bytree': [0.2,0.4,0.6,0.8]
}

xgb = XGBClassifier(random_state = 0, nthread = 1)

folds = 3

param_comb = 5

skfold = StratifiedKFold(n_splits= folds, shuffle=True, random_state=10)

random_search = RandomizedSearchCV(xgb,param_distributions=parameters,
                                   n_iter=param_comb,scoring='roc_auc', n_jobs=4,
                                   verbose = 3 , random_state = 10, cv=skfold.split(X,y))

In [40]:
random_search.fit(X,y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Parameters: { "silent" } are not used.



In [41]:
random_search.best_estimator_

In [42]:
random_search.best_params_

{'subsample': 0.4,
 'max_depth': 8,
 'learning_rate': 0.15,
 'gamma': 1.5,
 'colsample_bytree': 0.2}

In [43]:
random_search.best_score_

0.874509294914574

In [44]:
random_search.cv_results_

{'mean_fit_time': array([12.82262047, 11.01181372,  8.04400786, 16.25616201,  7.14250573]),
 'std_fit_time': array([0.11868309, 0.96461649, 0.16844972, 0.59169213, 1.90554699]),
 'mean_score_time': array([0.45864805, 0.65372181, 0.3986059 , 0.87028384, 0.23847405]),
 'std_score_time': array([0.01805493, 0.01399112, 0.00822027, 0.19517147, 0.0708033 ]),
 'param_subsample': masked_array(data=[0.4, 0.4, 0.2, 0.8, 0.2],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[6, 8, 4, 8, 2],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_learning_rate': masked_array(data=[0.01, 0.15, 0.1, 0.01, 0.01],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_gamma': masked_array(data=[1, 1.5, 2.5, 1, 1.5],
              mask=[False, False, False, False, False],
        fill_va