# Modeling

Modeling comes after the EDA, cleaning and vectorization procedures; their outputs are loaded in this notebook from pickle files.

# 1)- Importing key modules

In [1]:
#support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function
import warnings
# NOTE(review): this silences every warning for the whole notebook, which also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pickle
import pandas as pd 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

# 2)- Loading Dataset

In [3]:
data=pd.read_pickle('file_clean.pkl')

In [4]:
data.shape

(8932, 4)

In [5]:
data.head()

Unnamed: 0,text,class,clean,clean2
0,Supplier shall update the Documentation on a r...,1.0,Supplier shall update the Documentation on a r...,supplier shall update documentation regular ba...
1,"major release upgrades of Software, change of ...",1.0,major release upgrade of Software change of Eq...,major release upgrade software change equipmen...
2,Accept incident severity as set by E.ON Servic...,1.0,Accept incident severity a set by E ON Service...,accept incident severity set e service desk ce...
3,"Supplier shall provide all tools, documentatio...",1.0,Supplier shall provide all tool documentation ...,supplier shall provide tool documentation mate...
4,For smaller Projects a deviation can be agreed...,1.0,For smaller Projects a deviation can be agreed...,smaller project deviation agreed within projec...


In [6]:
data.tail()

Unnamed: 0,text,class,clean,clean2
8927,EnsurethatSupplier’sperformancerequirementsast...,0.0,EnsurethatSupplier sperformancerequirementsast...,ensurethatsupplier sperformancerequirementsast...
8928,Establishandexecutetheaccountmanagementdiscipl...,0.0,Establishandexecutetheaccountmanagementdiscipl...,establishandexecutetheaccountmanagementdiscipl...
8929,Reviewofconsolidatedforecast/demandreportscove...,0.0,Reviewofconsolidatedforecast demandreportscove...,reviewofconsolidatedforecast demandreportscove...
8930,OnceE.ON'sContractManagerdecidestoproceedwitha...,0.0,OnceE ON sContractManagerdecidestoproceedwitha...,oncee scontractmanagerdecidestoproceedwithaccn...
8931,"maymaketemporaryOperationalChanges,incaseitisa...",0.0,maymaketemporaryOperationalChanges incaseitisa...,maymaketemporaryoperationalchanges incaseitisa...


In [7]:
data['class'].value_counts()

1.0    4539
0.0    4393
Name: class, dtype: int64

**0 means other text and 1 means Deliverable and Obligations**

# 3)- Model

- Logistic Regression
- Support Vector Machine
- Naive Bayes
- RandomForest
- XGBoost


In [8]:
# Read the four pickled feature sets (bag-of-words, TF-IDF, word2vec, doc2vec)
# in one pass, in the same order as before, ahead of any training.
bow, tfidf, wordvec_df, docvec_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'bow_model.pkl',
        'tfidf_model.pkl',
        'word2vec_model.pkl',
        'doc2vec_model.pkl',
    )
)

### 3.1)- Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

#### 3.1.1 Logistic Regression using Bag-of-Words Features

In [10]:
X=bow
y=data['class']

In [11]:
print(X.shape)
print(y.shape)

(8932, 1000)
(8932,)


In [12]:
# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(bow, y,random_state=42,test_size=0.2)

In [13]:
print(xtrain_bow.shape)
print(xvalid_bow.shape)
print(ytrain.shape)
print(yvalid.shape)

(7145, 1000)
(1787, 1000)
(7145,)
(1787,)


In [14]:
lreg = LogisticRegression(solver='liblinear')

# training the model
lreg.fit(xtrain_bow, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_bow)
prediction

array([[0.57070774, 0.42929226],
       [0.06826315, 0.93173685],
       [0.05078176, 0.94921824],
       ...,
       [0.75704973, 0.24295027],
       [0.894826  , 0.105174  ],
       [0.97484251, 0.02515749]])

In [16]:
# prediction over classes

prediction_class=lreg.predict(xvalid_bow)
prediction_class

array([0., 1., 1., ..., 0., 0., 0.])

In [17]:
accuracy_score(yvalid, prediction_class)

0.8030218242865137

In [18]:
# NOTE(review): roc_auc_score expects scores/probabilities for the positive class.
# Hard 0/1 labels are passed here, so the value reflects a single operating point
# rather than the full ROC curve — consider passing prediction[:, 1] instead.
roc_auc_score(yvalid, prediction_class)

0.803208479786352

In [19]:
confusion_matrix(yvalid, prediction_class)

array([[747, 142],
       [210, 688]])

In [20]:
print(classification_report(yvalid, prediction_class))

              precision    recall  f1-score   support

         0.0       0.78      0.84      0.81       889
         1.0       0.83      0.77      0.80       898

    accuracy                           0.80      1787
   macro avg       0.80      0.80      0.80      1787
weighted avg       0.80      0.80      0.80      1787



**Changing threshold**

In [21]:
# if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False, False, False])

In [22]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 0, 0, 0])

In [23]:
# calculating f1 score for the validation set
f1_score(yvalid, prediction_int)

0.803680981595092

In [24]:
# calculating accuracy score for the validation set
accuracy_score(yvalid, prediction_int)

0.7851147174034695

In [25]:
# roc-auc score
# NOTE(review): prediction_int holds thresholded 0/1 labels, not probabilities, so this
# measures one operating point only — consider passing prediction[:, 1] instead.
roc_auc_score(yvalid, prediction_int)

0.7846583208279366

In [26]:
confusion_matrix(yvalid, prediction_int)

array([[617, 272],
       [112, 786]])

In [27]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.85      0.69      0.76       889
         1.0       0.74      0.88      0.80       898

    accuracy                           0.79      1787
   macro avg       0.79      0.78      0.78      1787
weighted avg       0.79      0.79      0.78      1787



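By lowering the threshold from 0.5 to 0.3, we see a decrease in accuracy (0.80 → 0.79). For class 1, i.e. O&B, recall rises (0.77 → 0.88) while precision falls (0.83 → 0.74): more of the true class 1 texts are caught, at the cost of more false positives. So, it is a trade-off between precision and recall.

For this exercise, we are more interested in capturing as many Class 1 texts as possible (high recall), so the 0.3 threshold is used for the rest of the notebook.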

#### 3.1.2 Logistic Regression using TF-IDF Features

In [28]:
X=tfidf
y=data['class']

In [29]:
print(X.shape)
print(y.shape)

(8932, 1000)
(8932,)


In [30]:
# splitting data into training and validation set
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(tfidf, y,random_state=42,test_size=0.2)

In [31]:
print(xtrain_tfidf.shape)
print(xvalid_tfidf.shape)
print(ytrain.shape)
print(yvalid.shape)

(7145, 1000)
(1787, 1000)
(7145,)
(1787,)


In [32]:
lreg = LogisticRegression(solver='liblinear')

# training the model
lreg.fit(xtrain_tfidf, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_tfidf)
prediction

array([[0.38226668, 0.61773332],
       [0.1795822 , 0.8204178 ],
       [0.12885721, 0.87114279],
       ...,
       [0.68508414, 0.31491586],
       [0.6982247 , 0.3017753 ],
       [0.90387319, 0.09612681]])

In [34]:
# Same rule : if prediction is greater than or equal to 0.3 than 1 else 0
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True, False])

In [35]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 0])

In [36]:
accuracy_score(yvalid, prediction_int)

0.7336317851147174

In [37]:
# calculating f1 score for the validation set
f1_score(yvalid, prediction_int)

0.7806451612903227

In [38]:
confusion_matrix(yvalid, prediction_int)

array([[464, 425],
       [ 51, 847]])

In [39]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.90      0.52      0.66       889
         1.0       0.67      0.94      0.78       898

    accuracy                           0.73      1787
   macro avg       0.78      0.73      0.72      1787
weighted avg       0.78      0.73      0.72      1787



#### 3.1.3 Logistic Regression using Word2Vec Features

In [40]:
X=wordvec_df
y=data['class']

In [41]:
print(X.shape)
print(y.shape)

(8932, 200)
(8932,)


In [42]:
# splitting data into training and validation set
xtrain_word2vec, xvalid_word2vec, ytrain, yvalid = train_test_split(X, y,random_state=42,test_size=0.2)

In [43]:
print(xtrain_word2vec.shape)
print(xvalid_word2vec.shape)
print(ytrain.shape)
print(yvalid.shape)

(7145, 200)
(1787, 200)
(7145,)
(1787,)


In [44]:
# training the model
lreg.fit(xtrain_word2vec, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_word2vec)
prediction

array([[0.36783016, 0.63216984],
       [0.1183427 , 0.8816573 ],
       [0.18988563, 0.81011437],
       ...,
       [0.56433758, 0.43566242],
       [0.49043159, 0.50956841],
       [0.76519438, 0.23480562]])

In [46]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True, False])

In [47]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 0])

In [48]:
f1_score(yvalid, prediction_int)

0.7405146096816398

In [49]:
accuracy_score(yvalid, prediction_int)

0.6670397313933968

In [50]:
confusion_matrix(yvalid, prediction_int)

array([[343, 546],
       [ 49, 849]])

In [51]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.88      0.39      0.54       889
         1.0       0.61      0.95      0.74       898

    accuracy                           0.67      1787
   macro avg       0.74      0.67      0.64      1787
weighted avg       0.74      0.67      0.64      1787



#### 3.1.4 Logistic Regression using Doc2Vec Features

In [52]:
X=docvec_df
y=data['class']

In [53]:
print(X.shape)
print(y.shape)

(8932, 200)
(8932,)


In [54]:
# splitting data into training and validation set
xtrain_doc2vec, xvalid_doc2vec, ytrain, yvalid = train_test_split(X, y,random_state=42,test_size=0.2)

In [55]:
print(xtrain_doc2vec.shape)
print(xvalid_doc2vec.shape)
print(ytrain.shape)
print(yvalid.shape)

(7145, 200)
(1787, 200)
(7145,)
(1787,)


In [56]:
# training the model
lreg.fit(xtrain_doc2vec, ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [57]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_doc2vec)
prediction

array([[0.40029179, 0.59970821],
       [0.15948637, 0.84051363],
       [0.41839632, 0.58160368],
       ...,
       [0.66038584, 0.33961416],
       [0.42146772, 0.57853228],
       [0.12542656, 0.87457344]])

In [58]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True,  True])

In [59]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 1])

In [60]:
f1_score(yvalid, prediction_int)

0.7512551346417161

In [61]:
accuracy_score(yvalid, prediction_int)

0.6950195858981534

In [62]:
confusion_matrix(yvalid, prediction_int)

array([[419, 470],
       [ 75, 823]])

In [63]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.85      0.47      0.61       889
         1.0       0.64      0.92      0.75       898

    accuracy                           0.70      1787
   macro avg       0.74      0.69      0.68      1787
weighted avg       0.74      0.70      0.68      1787



For a simple dataset like this one, bag-of-words gives the best overall score. However, we can see that TF-IDF and word2vec reach a very high recall for class 1 (0.94 and 0.95) but a poor recall for class 0 (0.52 and 0.39). So, we may use them if we are mainly interested in not missing class 1 texts.

### 3.2)-Support Vector Machine (SVM)

In [64]:
from sklearn import svm

#### 3.2.1 SVM using Bag-of-Words Features

In [65]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_bow, ytrain)

In [66]:
prediction = svc.predict_proba(xvalid_bow)
prediction

array([[0.62764233, 0.37235767],
       [0.15052447, 0.84947553],
       [0.15829892, 0.84170108],
       ...,
       [0.75634492, 0.24365508],
       [0.83553348, 0.16446652],
       [0.95204019, 0.04795981]])

In [67]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False, False, False])

In [68]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 0, 0, 0])

In [69]:
accuracy_score(yvalid, prediction_int)

0.7565752658086178

In [70]:
f1_score(yvalid, prediction_int)

0.783689706613625

In [71]:
confusion_matrix(yvalid, prediction_int)

array([[564, 325],
       [110, 788]])

In [72]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.84      0.63      0.72       889
         1.0       0.71      0.88      0.78       898

    accuracy                           0.76      1787
   macro avg       0.77      0.76      0.75      1787
weighted avg       0.77      0.76      0.75      1787



#### 3.2.2 SVM using TF-IDF Features

In [73]:
svc = svm.SVC(kernel='linear',C=1, probability=True).fit(xtrain_tfidf, ytrain)

In [74]:
prediction = svc.predict_proba(xvalid_tfidf)
prediction

array([[0.42282726, 0.57717274],
       [0.12759646, 0.87240354],
       [0.05459321, 0.94540679],
       ...,
       [0.80396909, 0.19603091],
       [0.85134327, 0.14865673],
       [0.96904258, 0.03095742]])

In [75]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False, False, False])

In [76]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 0, 0, 0])

In [77]:
accuracy_score(yvalid, prediction_int)

0.7588136541689984

In [78]:
f1_score(yvalid, prediction_int)

0.7884143348060874

In [79]:
confusion_matrix(yvalid, prediction_int)

array([[553, 336],
       [ 95, 803]])

In [80]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.85      0.62      0.72       889
         1.0       0.71      0.89      0.79       898

    accuracy                           0.76      1787
   macro avg       0.78      0.76      0.75      1787
weighted avg       0.78      0.76      0.75      1787



#### 3.2.3 SVM using word2vec Features

In [81]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_word2vec, ytrain)

In [82]:
prediction = svc.predict_proba(xvalid_word2vec)
prediction

array([[0.48041304, 0.51958696],
       [0.11418709, 0.88581291],
       [0.20920935, 0.79079065],
       ...,
       [0.64110967, 0.35889033],
       [0.5       , 0.5       ],
       [0.77104888, 0.22895112]])

In [83]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True, False])

In [84]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 0])

In [85]:
accuracy_score(yvalid, prediction_int)

0.668158925573587

In [86]:
f1_score(yvalid, prediction_int)

0.7395696091348264

In [87]:
print(confusion_matrix(yvalid, prediction_int))

[[352 537]
 [ 56 842]]


In [88]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.86      0.40      0.54       889
         1.0       0.61      0.94      0.74       898

    accuracy                           0.67      1787
   macro avg       0.74      0.67      0.64      1787
weighted avg       0.74      0.67      0.64      1787



#### 3.2.4 SVM using doc2vec Features

In [89]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_doc2vec, ytrain)

In [90]:
prediction = svc.predict_proba(xvalid_doc2vec)
prediction

array([[0.37773937, 0.62226063],
       [0.1607062 , 0.8392938 ],
       [0.43435575, 0.56564425],
       ...,
       [0.63707993, 0.36292007],
       [0.40103994, 0.59896006],
       [0.14039158, 0.85960842]])

In [91]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True,  True])

In [92]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 1])

In [93]:
accuracy_score(yvalid, prediction_int)

0.6804700615556799

In [94]:
f1_score(yvalid, prediction_int)

0.7445190156599553

In [95]:
print(confusion_matrix(yvalid, prediction_int))

[[384 505]
 [ 66 832]]


In [96]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.85      0.43      0.57       889
         1.0       0.62      0.93      0.74       898

    accuracy                           0.68      1787
   macro avg       0.74      0.68      0.66      1787
weighted avg       0.74      0.68      0.66      1787



### 3.3)- Naive Bayes

In [97]:
from sklearn.naive_bayes import MultinomialNB

#### 3.3.1 Naive Bayes using Bag-of-Words 

In [98]:
mnb = MultinomialNB()
mnb.fit(xtrain_bow, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [99]:
# Naive Bayes probability estimates tend to be poorly calibrated; they are nevertheless
# used here (rather than mnb.predict) so that the same 0.3 threshold as for the other
# models can be applied to them.
prediction = mnb.predict_proba(xvalid_bow)
prediction

array([[0.44420094, 0.55579906],
       [0.03296167, 0.96703833],
       [0.00234653, 0.99765347],
       ...,
       [0.72864983, 0.27135017],
       [0.24624521, 0.75375479],
       [0.98842928, 0.01157072]])

In [100]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False,  True, False])

In [None]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

In [101]:
accuracy_score(yvalid, prediction_int)

0.6793508673754897

In [102]:
f1_score(yvalid, prediction_int)

0.727790973871734

In [103]:
print(confusion_matrix(yvalid, prediction_int))

[[448 441]
 [132 766]]


In [104]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.77      0.50      0.61       889
         1.0       0.63      0.85      0.73       898

    accuracy                           0.68      1787
   macro avg       0.70      0.68      0.67      1787
weighted avg       0.70      0.68      0.67      1787



Not too good

#### 3.3.2 Naive Bayes using TF-IDF

In [105]:
mnb = MultinomialNB()
mnb.fit(xtrain_tfidf, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [106]:
prediction = mnb.predict_proba(xvalid_tfidf)
prediction

array([[0.45332712, 0.54667288],
       [0.31474078, 0.68525922],
       [0.1394017 , 0.8605983 ],
       ...,
       [0.61807525, 0.38192475],
       [0.42044278, 0.57955722],
       [0.82730073, 0.17269927]])

In [107]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True, False])

In [None]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

In [108]:
accuracy_score(yvalid, prediction_int)

0.6183547845551203

In [109]:
f1_score(yvalid, prediction_int)

0.7172470978441128

In [110]:
print(confusion_matrix(yvalid, prediction_int))

[[240 649]
 [ 33 865]]


In [111]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.88      0.27      0.41       889
         1.0       0.57      0.96      0.72       898

    accuracy                           0.62      1787
   macro avg       0.73      0.62      0.57      1787
weighted avg       0.72      0.62      0.57      1787



#### 3.3.3 Naive Bayes using word2vec

Multinomial Naive Bayes requires the input X to be non-negative. Word2vec and Doc2vec features can contain negative values, so Naive Bayes is not suitable for them and we will not use it here. It is not giving any exceptional results anyway.

As for results, we already have better outcomes from other models.

#### 3.3.4 Naive Bayes using doc2vec

### 3.4 Random Forest

In [113]:
from sklearn.ensemble import RandomForestClassifier

#### 3.4.1-RF with Bag-of-Words Features

In [114]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_bow, ytrain)

In [115]:
prediction = rf.predict_proba(xvalid_bow)
prediction

array([[0.74625   , 0.25375   ],
       [0.3675    , 0.6325    ],
       [0.0225    , 0.9775    ],
       ...,
       [0.73058333, 0.26941667],
       [0.93833333, 0.06166667],
       [1.        , 0.        ]])

In [116]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([False,  True,  True, ..., False, False, False])

In [None]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

In [117]:
f1_score(yvalid, prediction_int)

0.792360430950049

In [118]:
accuracy_score(yvalid, prediction_int)

0.7627308337996642

In [119]:
print(confusion_matrix(yvalid, prediction_int))

[[554 335]
 [ 89 809]]


In [120]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.86      0.62      0.72       889
         1.0       0.71      0.90      0.79       898

    accuracy                           0.76      1787
   macro avg       0.78      0.76      0.76      1787
weighted avg       0.78      0.76      0.76      1787



#### 3.4.2-RF with TF-IDF Features

In [121]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_tfidf, ytrain)

In [122]:
prediction = rf.predict_proba(xvalid_tfidf)
prediction

array([[0.60333333, 0.39666667],
       [0.38725   , 0.61275   ],
       [0.0225    , 0.9775    ],
       ...,
       [0.91291667, 0.08708333],
       [0.793125  , 0.206875  ],
       [0.95791667, 0.04208333]])

In [123]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False, False, False])

In [None]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

In [124]:
accuracy_score(yvalid, prediction_int)

0.7632904308897593

In [125]:
f1_score(yvalid, prediction_int)

0.7945604662457503

In [126]:
print(confusion_matrix(yvalid, prediction_int))

[[546 343]
 [ 80 818]]


In [127]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.87      0.61      0.72       889
         1.0       0.70      0.91      0.79       898

    accuracy                           0.76      1787
   macro avg       0.79      0.76      0.76      1787
weighted avg       0.79      0.76      0.76      1787



#### 3.4.3-RF with word2vec Features

In [141]:
print(xtrain_word2vec.shape)
print(xvalid_word2vec.shape)

(7145, 200)
(1787, 200)


In [142]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_word2vec, ytrain)

In [143]:
prediction= rf.predict_proba(xvalid_word2vec)
prediction

array([[0.52354167, 0.47645833],
       [0.23291667, 0.76708333],
       [0.12945833, 0.87054167],
       ...,
       [0.66191667, 0.33808333],
       [0.485     , 0.515     ],
       [0.79608333, 0.20391667]])

In [144]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True, False])

In [145]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 0])

In [146]:
accuracy_score(yvalid, prediction_int)

0.6385002797985451

In [147]:
f1_score(yvalid, prediction_int)

0.7229845626072041

In [148]:
print(confusion_matrix(yvalid, prediction_int))

[[298 591]
 [ 55 843]]


In [149]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.84      0.34      0.48       889
         1.0       0.59      0.94      0.72       898

    accuracy                           0.64      1787
   macro avg       0.72      0.64      0.60      1787
weighted avg       0.72      0.64      0.60      1787



#### 3.4.4-RF with doc2vec Features

In [140]:
print(xtrain_doc2vec.shape)
print(xvalid_doc2vec.shape)

(7145, 200)
(1787, 200)


In [132]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_doc2vec, ytrain)

In [133]:
prediction= rf.predict_proba(xvalid_doc2vec)
prediction

array([[0.4   , 0.6   ],
       [0.3225, 0.6775],
       [0.455 , 0.545 ],
       ...,
       [0.5575, 0.4425],
       [0.4   , 0.6   ],
       [0.3525, 0.6475]])

In [134]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ...,  True,  True,  True])

In [135]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 1, 1, 1])

In [136]:
accuracy_score(yvalid, prediction_int)

0.6424174594292109

In [137]:
f1_score(yvalid, prediction_int)

0.7318506084767099

In [138]:
print(confusion_matrix(yvalid, prediction_int))

[[276 613]
 [ 26 872]]


In [139]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.91      0.31      0.46       889
         1.0       0.59      0.97      0.73       898

    accuracy                           0.64      1787
   macro avg       0.75      0.64      0.60      1787
weighted avg       0.75      0.64      0.60      1787



### 3.5)-XGBoost 

Extreme Gradient Boosting (xgboost) is an advanced implementation of gradient boosting algorithm. It has both linear model solver and tree learning algorithms. Its ability to do parallel computation on a single machine makes it extremely fast. It also has additional features for doing cross validation and finding important variables. There are many parameters which need to be controlled to optimize the model.

Some key benefits of XGBoost are:

- Regularization - helps in reducing overfitting
- Parallel Processing - XGBoost implements parallel processing and is blazingly faster as compared to GBM.
- Handling Missing Values - It has an in-built routine to handle missing values.
- Built-in Cross-Validation - allows user to run a cross-validation at each iteration of the boosting process

In [151]:
from xgboost import XGBClassifier

#### 3.5.1) XGBoost using bag of words features

In [152]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_bow, ytrain)

In [153]:
prediction = xgb_model.predict_proba(xvalid_bow)
prediction 

array([[0.6994201 , 0.3005799 ],
       [0.16367435, 0.83632565],
       [0.02237719, 0.9776228 ],
       ...,
       [0.89265764, 0.10734234],
       [0.85022104, 0.14977898],
       [0.9595621 , 0.04043785]], dtype=float32)

In [154]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False, False, False])

In [155]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 0, 0, 0])

In [156]:
accuracy_score(yvalid, prediction_int)

0.7912702853945159

In [157]:
f1_score(yvalid, prediction_int)

0.8097909229984702

In [158]:
print(confusion_matrix(yvalid, prediction_int))

[[620 269]
 [104 794]]


In [159]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.86      0.70      0.77       889
         1.0       0.75      0.88      0.81       898

    accuracy                           0.79      1787
   macro avg       0.80      0.79      0.79      1787
weighted avg       0.80      0.79      0.79      1787



#### 3.5.2) XGBoost using tfidf features

In [160]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_tfidf, ytrain)

In [161]:
prediction = xgb_model.predict_proba(xvalid_tfidf)
prediction 

array([[0.3995415 , 0.6004585 ],
       [0.40370852, 0.5962915 ],
       [0.01535803, 0.98464197],
       ...,
       [0.9596097 , 0.0403903 ],
       [0.8704996 , 0.12950037],
       [0.9770459 , 0.02295412]], dtype=float32)

In [162]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False, False, False])

In [163]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 0, 0, 0])

In [164]:
accuracy_score(yvalid, prediction_int)

0.7806379406827084

In [165]:
f1_score(yvalid, prediction_int)

0.8012170385395536

In [166]:
print(confusion_matrix(yvalid, prediction_int))

[[605 284]
 [108 790]]


In [167]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.85      0.68      0.76       889
         1.0       0.74      0.88      0.80       898

    accuracy                           0.78      1787
   macro avg       0.79      0.78      0.78      1787
weighted avg       0.79      0.78      0.78      1787



#### 3.5.3) XGBoost using word2vec features

In [168]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)

In [169]:
prediction = xgb_model.predict_proba(xvalid_word2vec)
prediction 

array([[7.9391772e-01, 2.0608230e-01],
       [4.2826951e-02, 9.5717305e-01],
       [3.2508373e-04, 9.9967492e-01],
       ...,
       [9.8542094e-01, 1.4579064e-02],
       [6.4736211e-01, 3.5263789e-01],
       [9.9662358e-01, 3.3764120e-03]], dtype=float32)

In [170]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([False,  True,  True, ..., False,  True, False])

In [171]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([0, 1, 1, ..., 0, 1, 0])

In [172]:
accuracy_score(yvalid, prediction_int)

0.8047006155567991

In [173]:
f1_score(yvalid, prediction_int)

0.815636555731643

In [174]:
print(confusion_matrix(yvalid, prediction_int))

[[666 223]
 [126 772]]


In [175]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.84      0.75      0.79       889
         1.0       0.78      0.86      0.82       898

    accuracy                           0.80      1787
   macro avg       0.81      0.80      0.80      1787
weighted avg       0.81      0.80      0.80      1787



#### 3.5.4) XGBoost using doc2vec features

In [176]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_doc2vec, ytrain)

In [177]:
prediction = xgb_model.predict_proba(xvalid_doc2vec)
prediction 

array([[0.53582823, 0.46417174],
       [0.04373556, 0.95626444],
       [0.3479312 , 0.6520688 ],
       ...,
       [0.9962422 , 0.00375777],
       [0.03484458, 0.9651554 ],
       [0.09240466, 0.90759534]], dtype=float32)

In [178]:
prediction_int = prediction[:,1] >= 0.3
prediction_int

array([ True,  True,  True, ..., False,  True,  True])

In [179]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_int.astype(int)
prediction_int

array([1, 1, 1, ..., 0, 1, 1])

In [180]:
accuracy_score(yvalid, prediction_int)

0.7571348628987129

In [181]:
f1_score(yvalid, prediction_int)

0.7792472024415055

In [182]:
print(confusion_matrix(yvalid, prediction_int))

[[587 302]
 [132 766]]


In [183]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.82      0.66      0.73       889
         1.0       0.72      0.85      0.78       898

    accuracy                           0.76      1787
   macro avg       0.77      0.76      0.75      1787
weighted avg       0.77      0.76      0.75      1787



**XGBoost using word2vec gives us the best results on our chosen metrics, as shown in the classification report in section 3.5.3**

In [184]:
xgb_model_best = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)

In [186]:
# save model; the context manager makes sure the file is flushed and closed
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(xgb_model_best, model_file)

In [187]:
# load the model from disk; the context manager closes the file handle afterwards.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [188]:
prediction = loaded_model.predict_proba(xvalid_word2vec)
prediction 

array([[7.9391772e-01, 2.0608230e-01],
       [4.2826951e-02, 9.5717305e-01],
       [3.2508373e-04, 9.9967492e-01],
       ...,
       [9.8542094e-01, 1.4579064e-02],
       [6.4736211e-01, 3.5263789e-01],
       [9.9662358e-01, 3.3764120e-03]], dtype=float32)

In [189]:
# for standard threshold 0.5
prediction_class = prediction[:,1] >= 0.5
prediction_class

array([False,  True,  True, ..., False, False, False])

In [190]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_class.astype(int)
prediction_int

array([0, 1, 1, ..., 0, 0, 0])

In [191]:
accuracy_score(yvalid, prediction_int)

0.817011751538892

In [192]:
f1_score(yvalid, prediction_int)

0.819436775262286

In [194]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.82      0.81      0.81       889
         1.0       0.81      0.83      0.82       898

    accuracy                           0.82      1787
   macro avg       0.82      0.82      0.82      1787
weighted avg       0.82      0.82      0.82      1787



In [195]:
# for threshold 0.3 (the lower threshold used throughout the notebook)

prediction_class = prediction[:,1] >= 0.3
prediction_class

array([False,  True,  True, ..., False,  True, False])

In [196]:
# bool mask -> 0/1 labels; the builtin int replaces the np.int alias removed in NumPy 1.24
prediction_int = prediction_class.astype(int)
prediction_int

array([0, 1, 1, ..., 0, 1, 0])

In [197]:
accuracy_score(yvalid, prediction_int)

0.8047006155567991

In [198]:
f1_score(yvalid, prediction_int)

0.815636555731643

In [199]:
print(classification_report(yvalid, prediction_int))

              precision    recall  f1-score   support

         0.0       0.84      0.75      0.79       889
         1.0       0.78      0.86      0.82       898

    accuracy                           0.80      1787
   macro avg       0.81      0.80      0.80      1787
weighted avg       0.81      0.80      0.80      1787



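- Why is XGBoost the most suitable model?

The reason is that it maintains good accuracy and F1 scores at both threshold levels, i.e. 0.3 and 0.5. Other models such as Random Forest score noticeably lower at the 0.3 threshold on the same word2vec features (accuracy 0.64 vs 0.80). Note that the two classes are nearly balanced here (4539 vs 4393), so class imbalance is unlikely to be the cause. Boosting algorithms sometimes outperform bagging-based ensemble methods, and this is one of those examples.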