# Model Tuning

# 1)- Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pickle
import pandas as pd 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# For model scores and tuning

from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

# 2)- Loading Dataset

In [3]:
# Load the cleaned dataset saved as a pickle by an earlier step.
# NOTE: unpickling can execute arbitrary code; only read files you trust.
data = pd.read_pickle('file_clean.pkl')

In [4]:
# (rows, columns) of the loaded dataset
data.shape

(8932, 4)

In [5]:
# Preview the first rows: raw text, class label, and the two cleaned text columns
data.head()

Unnamed: 0,text,class,clean,clean2
0,Supplier shall update the Documentation on a r...,1.0,Supplier shall update the Documentation on a r...,supplier shall update documentation regular ba...
1,"major release upgrades of Software, change of ...",1.0,major release upgrade of Software change of Eq...,major release upgrade software change equipmen...
2,Accept incident severity as set by E.ON Servic...,1.0,Accept incident severity a set by E ON Service...,accept incident severity set e service desk ce...
3,"Supplier shall provide all tools, documentatio...",1.0,Supplier shall provide all tool documentation ...,supplier shall provide tool documentation mate...
4,For smaller Projects a deviation can be agreed...,1.0,For smaller Projects a deviation can be agreed...,smaller project deviation agreed within projec...


# 2)-Load Model

### Load pre-processed files 

These files were produced by the earlier cleaning and vectorization steps.

In [6]:
# Despite the file name, this pickle is used below as the Word2Vec feature
# table (one row of features per document), not as a model object.
wordvec_df = pd.read_pickle('word2vec_model.pkl')

In [7]:
# Target is the 'class' column; features are the Word2Vec vectors.
y = data['class']
X = wordvec_df

In [8]:
# Features and target must have the same number of rows.
print(X.shape, y.shape, sep='\n')

(8932, 200)
(8932,)


In [9]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
xtrain_word2vec, xvalid_word2vec, ytrain, yvalid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Shapes of the train/validation features followed by the train/validation targets.
print(
    xtrain_word2vec.shape,
    xvalid_word2vec.shape,
    ytrain.shape,
    yvalid.shape,
    sep='\n',
)

(7145, 200)
(1787, 200)
(7145,)
(1787,)


### Loading saved model

In [11]:
# Path of the pickled model that is loaded in the next cell.
filename = "finalized_model.sav"

In [12]:
# Load the model from disk. The context manager closes the file handle as soon
# as unpickling finishes (the bare open() call left it open).
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

### Evaluation of results

In [13]:
# Predicted probabilities for the validation rows, one column per class.
# Later cells threshold column 1 (presumably the probability of class 1.0 - confirm
# against the model's class order).
prediction = loaded_model.predict_proba(xvalid_word2vec)
prediction 

array([[7.9391772e-01, 2.0608230e-01],
       [4.2826951e-02, 9.5717305e-01],
       [3.2508373e-04, 9.9967492e-01],
       ...,
       [9.8542094e-01, 1.4579064e-02],
       [6.4736211e-01, 3.5263789e-01],
       [9.9662358e-01, 3.3764120e-03]], dtype=float32)

In [14]:
# Standard cutoff: mark a row True when its column-1 probability is at least 0.5.
prediction_class = prediction[:, 1] >= 0.5
prediction_class

array([False,  True,  True, ..., False, False, False])

In [15]:
# Convert the boolean mask to 0/1 labels for the metric functions.
# Use the builtin int: the np.int alias was deprecated in NumPy 1.20 and removed
# in 1.24, where it raises AttributeError. The resulting dtype is the same.
prediction_int = prediction_class.astype(int)
prediction_int

array([0, 1, 1, ..., 0, 0, 0])

In [16]:
# Share of validation rows labelled correctly at the current threshold.
accuracy_score(y_true=yvalid, y_pred=prediction_int)

0.817011751538892

In [17]:
# F1 score of the positive class at the current threshold.
f1_score(y_true=yvalid, y_pred=prediction_int)

0.819436775262286

In [18]:
# Per-class precision, recall and F1 at the current threshold.
print(classification_report(y_true=yvalid, y_pred=prediction_int))

              precision    recall  f1-score   support

         0.0       0.82      0.81      0.81       889
         1.0       0.81      0.83      0.82       898

    accuracy                           0.82      1787
   macro avg       0.82      0.82      0.82      1787
weighted avg       0.82      0.82      0.82      1787



In [19]:
# for a lower threshold of 0.3 (the previous comment said 0.5, but the code uses 0.3);
# at least as many rows are marked True as with the 0.5 cutoff

prediction_class = prediction[:,1] >= 0.3
prediction_class

array([False,  True,  True, ..., False,  True, False])

In [20]:
# Convert the boolean mask to 0/1 labels for the metric functions.
# Use the builtin int: the np.int alias was deprecated in NumPy 1.20 and removed
# in 1.24, where it raises AttributeError. The resulting dtype is the same.
prediction_int = prediction_class.astype(int)
prediction_int

array([0, 1, 1, ..., 0, 1, 0])

In [21]:
# Share of validation rows labelled correctly at the current threshold.
accuracy_score(y_true=yvalid, y_pred=prediction_int)

0.8047006155567991

In [22]:
# F1 score of the positive class at the current threshold.
f1_score(y_true=yvalid, y_pred=prediction_int)

0.815636555731643

In [23]:
# Per-class precision, recall and F1 at the current threshold.
print(classification_report(y_true=yvalid, y_pred=prediction_int))

              precision    recall  f1-score   support

         0.0       0.84      0.75      0.79       889
         1.0       0.78      0.86      0.82       898

    accuracy                           0.80      1787
   macro avg       0.81      0.80      0.80      1787
weighted avg       0.81      0.80      0.80      1787



# 3) FineTuning XGBoost + Word2Vec