# Model Tunning

We have already chosen best model. It's time to tune it and get best out of it

# 1)- Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pickle
import pandas as pd 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# For model scores and tunning

from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

# 2)- Loading Dataset & trained model

In [3]:
data=pd.read_pickle('file_clean.pkl')

### Load pre-processed word2vec model

from clean and vectorization process

In [6]:
wordvec_df=pd.read_pickle('word2vec_model.pkl')

In [7]:
X=wordvec_df
y=data['class']

In [8]:
print(X.shape)
print(y.shape)

(8932, 200)
(8932,)


In [9]:
# splitting data into training and validation set
xtrain_word2vec, xvalid_word2vec, ytrain, yvalid = train_test_split(X, y,random_state=42,test_size=0.2)

In [10]:
print(xtrain_word2vec.shape)
print(xvalid_word2vec.shape)
print(ytrain.shape)
print(yvalid.shape)

(7145, 200)
(1787, 200)
(7145,)
(1787,)


### Loading saved model

In [11]:
filename = 'finalized_model.sav'

In [12]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

### Evaluation of results

In [13]:
prediction = loaded_model.predict_proba(xvalid_word2vec)
prediction 

array([[7.9391772e-01, 2.0608230e-01],
       [4.2826951e-02, 9.5717305e-01],
       [3.2508373e-04, 9.9967492e-01],
       ...,
       [9.8542094e-01, 1.4579064e-02],
       [6.4736211e-01, 3.5263789e-01],
       [9.9662358e-01, 3.3764120e-03]], dtype=float32)

In [14]:
# for standard threshold 0.5
prediction_class = prediction[:,1] >= 0.5
prediction_class

array([False,  True,  True, ..., False, False, False])

In [15]:
prediction_int = prediction_class.astype(np.int)
prediction_int

array([0, 1, 1, ..., 0, 0, 0])

In [16]:
accuracy_score(yvalid, prediction_int)

0.817011751538892

In [17]:
f1_score(yvalid, prediction_int)

0.819436775262286

In [18]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.82      0.81      0.81       889
         1.0       0.81      0.83      0.82       898

    accuracy                           0.82      1787
   macro avg       0.82      0.82      0.82      1787
weighted avg       0.82      0.82      0.82      1787



In [19]:
# for threshold 0.5

prediction_class = prediction[:,1] >= 0.3
prediction_class

array([False,  True,  True, ..., False,  True, False])

In [20]:
prediction_int = prediction_class.astype(np.int)
prediction_int

array([0, 1, 1, ..., 0, 1, 0])

In [21]:
accuracy_score(yvalid, prediction_int)

0.8047006155567991

In [22]:
f1_score(yvalid, prediction_int)

0.815636555731643

In [23]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.84      0.75      0.79       889
         1.0       0.78      0.86      0.82       898

    accuracy                           0.80      1787
   macro avg       0.81      0.80      0.80      1787
weighted avg       0.81      0.80      0.80      1787



# 3) FineTuning XGBoost + Word2Vec

In [24]:
import xgboost as xgb

### 3.1)- Creating DMatrices

Here we will use DMatrices. A DMatrix can contain both the features and the target.

In [25]:
dtrain = xgb.DMatrix(xtrain_word2vec, label=ytrain)

In [26]:
dvalid = xgb.DMatrix(xvalid_word2vec, label=yvalid)

In [27]:
# Parameters that we are going to tune 
params = {
    'objective':'binary:logistic',
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1
 }

### 3.2)- Scoring matric

So far, we have chosen vectorization method(word2vec), model(XGBoost) and it's time to choose evaluation matrix i.e f1=score as it is a binary classification problem

In [28]:
def custom_eval(preds, dtrain):
    labels = dtrain.get_label().astype(np.int)
    preds = (preds >= 0.3).astype(np.int)
    return [('f1_score', f1_score(labels, preds))]

# 4)- GridSearchCV

General Approach for Parameter Tuning

- Choose a relatively high learning rate. Usually a learning rate of 0.3 is used at this stage.
- Tune tree-specific parameters such as max_depth, min_child_weight, subsample, colsample_bytree keeping the learning rate fixed.
- Tune the learning rate.
- Finally tune gamma to avoid overfitting.

### 4.1)- Tuning max_depth and min_child_weight

In [29]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(6,10)
     for min_child_weight in range(5,8)
 ]
max_f1 = 0. # initializing with 0 
best_params = None 
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

# Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    
# Cross-validation
    cv_results = xgb.cv(        params,
        dtrain,        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )    
    
# Finding best F1 Score
    
mean_f1 = cv_results['test-f1_score-mean'].max()

boost_rounds = cv_results['test-f1_score-mean'].idxmax() 

print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))

if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (max_depth,min_child_weight) 
        
print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with max_depth=6, min_child_weight=5
CV with max_depth=6, min_child_weight=6
CV with max_depth=6, min_child_weight=7
CV with max_depth=7, min_child_weight=5
CV with max_depth=7, min_child_weight=6
CV with max_depth=7, min_child_weight=7
CV with max_depth=8, min_child_weight=5
CV with max_depth=8, min_child_weight=6
CV with max_depth=8, min_child_weight=7
CV with max_depth=9, min_child_weight=5
CV with max_depth=9, min_child_weight=6
CV with max_depth=9, min_child_weight=7
	F1 Score 0.8032527999999999 for 123 rounds
Best params: 9, 7, F1 Score: 0.8032527999999999


####  Updating max_depth and min_child_weight parameters

In [30]:
params['max_depth'] = 9 
params['min_child_weight'] = 7

### 4.2)-Tuning subsample and colsample

In [31]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(5,10)]
    for colsample in [i/10. for i in range(5,10)] ]
max_f1 = 0. 
best_params = None 
for subsample, colsample in gridsearch_params:
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # Update our parameters
    params['colsample'] = colsample
    params['subsample'] = subsample
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=200,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=10
    )
     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].idxmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = (subsample, colsample) 

print("Best params: {}, {}, F1 Score: {}".format(best_params[0], best_params[1], max_f1))

CV with subsample=0.5, colsample=0.5
	F1 Score 0.7889358000000001 for 88 rounds
CV with subsample=0.5, colsample=0.6
	F1 Score 0.7889358000000001 for 88 rounds
CV with subsample=0.5, colsample=0.7
	F1 Score 0.7889358000000001 for 88 rounds
CV with subsample=0.5, colsample=0.8
	F1 Score 0.7889358000000001 for 88 rounds
CV with subsample=0.5, colsample=0.9
	F1 Score 0.7889358000000001 for 88 rounds
CV with subsample=0.6, colsample=0.5
	F1 Score 0.7918453999999999 for 78 rounds
CV with subsample=0.6, colsample=0.6
	F1 Score 0.7918453999999999 for 78 rounds
CV with subsample=0.6, colsample=0.7
	F1 Score 0.7918453999999999 for 78 rounds
CV with subsample=0.6, colsample=0.8
	F1 Score 0.7918453999999999 for 78 rounds
CV with subsample=0.6, colsample=0.9
	F1 Score 0.7918453999999999 for 78 rounds
CV with subsample=0.7, colsample=0.5
	F1 Score 0.78279 for 43 rounds
CV with subsample=0.7, colsample=0.6
	F1 Score 0.78279 for 43 rounds
CV with subsample=0.7, colsample=0.7
	F1 Score 0.78279 for 43 

#### Updating subsample and colsample_bytree

In [33]:
params['subsample'] = .9 
params['colsample_bytree'] = .5

### 4.3)- tune the learning rate

In [34]:
max_f1 = 0. 
best_params = None 
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
     # Update ETA
    params['eta'] = eta

     # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        feval= custom_eval,
        num_boost_round=1000,
        maximize=True,
        seed=16,
        nfold=5,
        early_stopping_rounds=20
    )

     # Finding best F1 Score
    mean_f1 = cv_results['test-f1_score-mean'].max()
    boost_rounds = cv_results['test-f1_score-mean'].idxmax()
    print("\tF1 Score {} for {} rounds".format(mean_f1, boost_rounds))
    if mean_f1 > max_f1:
        max_f1 = mean_f1
        best_params = eta 
print("Best params: {}, F1 Score: {}".format(best_params, max_f1))

CV with eta=0.3
	F1 Score 0.8021998 for 107 rounds
CV with eta=0.2
	F1 Score 0.8037496000000001 for 229 rounds
CV with eta=0.1
	F1 Score 0.7991820000000001 for 136 rounds
CV with eta=0.05
	F1 Score 0.7924262000000001 for 168 rounds
CV with eta=0.01
	F1 Score 0.6751199999999999 for 0 rounds
CV with eta=0.005
	F1 Score 0.6751199999999999 for 0 rounds
Best params: 0.2, F1 Score: 0.8037496000000001


#### Update learning rate

In [35]:
params['eta'] = .2

### 4.4)- List of tunned parameters

In [36]:
params
{'colsample': 0.9,
 'colsample_bytree': 0.5, 'eta': 0.2,
 'max_depth': 9, 'min_child_weight': 7,
 'objective': 'binary:logistic',
 'subsample': 0.9}

{'colsample': 0.9,
 'colsample_bytree': 0.5,
 'eta': 0.2,
 'max_depth': 9,
 'min_child_weight': 7,
 'objective': 'binary:logistic',
 'subsample': 0.9}

# 5)- Train the Tunned model

In [37]:
xgb_model = xgb.train(
    params,
    dtrain,
    feval= custom_eval,
    num_boost_round= 1000,
    maximize=True,
    evals=[(dvalid, "Validation")],
    early_stopping_rounds=10
 )

[0]	Validation-error:0.341354	Validation-f1_score:0.668901
Multiple eval metrics have been passed: 'Validation-f1_score' will be used for early stopping.

Will train until Validation-f1_score hasn't improved in 10 rounds.
[1]	Validation-error:0.31953	Validation-f1_score:0.668901
[2]	Validation-error:0.298825	Validation-f1_score:0.684231
[3]	Validation-error:0.283156	Validation-f1_score:0.699686
[4]	Validation-error:0.278679	Validation-f1_score:0.709884
[5]	Validation-error:0.270285	Validation-f1_score:0.716564
[6]	Validation-error:0.26413	Validation-f1_score:0.728792
[7]	Validation-error:0.254617	Validation-f1_score:0.733192
[8]	Validation-error:0.253497	Validation-f1_score:0.733819
[9]	Validation-error:0.255176	Validation-f1_score:0.732189
[10]	Validation-error:0.247902	Validation-f1_score:0.740773
[11]	Validation-error:0.243984	Validation-f1_score:0.745837
[12]	Validation-error:0.245104	Validation-f1_score:0.751103
[13]	Validation-error:0.247342	Validation-f1_score:0.757402
[14]	Vali

This xgb_model can be tested on test set or out of sample data