# Advanced ML Models

# 1)- Import key modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pandas as pd 
import string 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [3]:
#models and evaluation

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier # notice its from ntlk not sklearn
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Evaluation packages
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [4]:
#pip install version_information
%reload_ext version_information
%version_information pandas,numpy, nltk, seaborn, matplotlib

Software,Version
Python,3.7.7 64bit [MSC v.1916 64 bit (AMD64)]
IPython,7.13.0
OS,Windows 10 10.0.17763 SP0
pandas,1.0.3
numpy,1.18.1
nltk,3.5
seaborn,0.10.1
matplotlib,3.1.3
Fri Jun 26 17:05:33 2020 W. Europe Daylight Time,Fri Jun 26 17:05:33 2020 W. Europe Daylight Time


# 2)- Loading Data

In [5]:
data=pd.read_excel('clean_3655_eng.xlsx')
data=data.rename(columns={'Unnamed: 0':'random_columns'}) # a trick to tackle random index values
data.shape

(3655, 5)

In [6]:
data.head(2)

Unnamed: 0,random_columns,clean,firstmessage,dep,firstusedtextblock
0,0,helloi tri appli voucher order receiv mail did...,Hello:<br><br>I tried to apply a voucher to th...,Shipping issues,nichtkombiwb
1,1,wow wow wow im love acryl cover pro photo book...,WOW WOW WOW! I'm so in love with my acrylic co...,Customer feedback,feedback


### Keeping response of Bot as target variable

In [7]:
# select all samples that are above 100 atleast
#counts=data['firstusedtextblock'].value_counts()
#df = data.loc[data['firstusedtextblock'].isin(counts.index[counts > 30])]
#f.shape

In [8]:
#df.firstusedtextblock.value_counts()

# 3)- Vectorization

- bag of words
- tf-idf
- doc2vec
- word2vec

In [9]:
#filling any clean values in data with other

df=data.fillna('Other')

In [10]:
df.isnull().sum()

random_columns        0
clean                 0
firstmessage          0
dep                   0
firstusedtextblock    0
dtype: int64

In [11]:
features=df['clean']
labels=df['dep']
print(features.shape)
print(labels.shape)

(3655,)
(3655,)


### 3.1).Bag of Words

Bag-of-Words is a method to represent text into numerical features.

Let us understand this using a simple example. Suppose we have only 2 document

- D1: He is a lazy boy. She is also lazy.

- D2: Smith is a lazy person.

The list created would consist of all the unique tokens in the corpus C.

= [‘He’,’She’,’lazy’,’boy’,’Smith’,’person’]

Here, D=2, N=6



In [12]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import gensim

bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(df['clean'])
bow.shape

(3655, 1000)

### 3.2)-TF-IDF

This is another method which is based on the frequency method but it is different to the bag-of-words approach in the sense that it takes into account not just the occurrence of a word in a single document (or tweet) but in the entire corpus.

TF-IDF works by penalising the common words by assigning them lower weights while giving importance to words which are rare in the entire corpus but appear in good numbers in few documents.

Let’s have a look at the important terms related to TF-IDF:

- TF = (Number of times term t appears in a document)/(Number of terms in the document)

- IDF = log(N/n), where, N is the number of documents and n is the number of documents a term t has appeared in.

- TF-IDF = TF*IDF

In [13]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(df['clean'])
tfidf.shape

(3655, 1000)

### 3.3)- Doc2Vec Embedding

In [14]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import TaggedDocument

In [15]:
tokenized_text = df['clean'].apply(lambda x: x.split()) # tokenizing

In [16]:
def add_label(twt):
    output = []
    for i, s in zip(twt.index, twt):
        output.append(TaggedDocument(s, ["clean_" + str(i)]))
    return output
labeled_text = add_label(tokenized_text) # label all the tweets

##### 3.3.a.Train doc2vec model

In [17]:
model_d2v = gensim.models.Doc2Vec(dm=1,dm_mean=1,vector_size=200,window=5,negative=7,min_count=5,workers=3,alpha=0.1,seed=23)

In [18]:
model_d2v.build_vocab([i for i in tqdm(labeled_text)])

100%|█████████████████████████████████████████████████████████████████████████| 3655/3655 [00:00<00:00, 3579309.16it/s]


##### 3.3.b.Preparing doc2vec Feature Set

In [19]:
docvec_arrays = np.zeros((len(tokenized_text), 200))
for i in range(len(data)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200))

    
docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

(3655, 200)

In [20]:
docvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.002166,0.000939,-0.00098,5e-05,0.001281,6.1e-05,0.00144,0.0005,0.000581,-0.001539,...,-0.002178,-0.001576,0.001816,0.000959,-5.8e-05,0.00076,-0.000628,-0.000629,-0.000777,0.001153
1,5.8e-05,0.00183,0.001868,0.002268,0.000487,0.000579,-0.001489,-0.001988,-0.001501,0.001159,...,0.002264,0.002389,-0.000381,-0.000881,0.000729,-0.000314,-0.000399,1e-05,0.000101,-0.001789
2,-0.000939,0.000681,-0.001643,0.002111,-0.001158,-0.000852,0.001565,0.000319,-0.001806,-0.000717,...,-0.002187,-0.00219,-0.000274,0.000139,0.001936,0.001624,0.002444,0.001509,0.001125,-0.002286
3,-0.000846,-0.000962,0.001039,-0.001436,-0.000521,0.002297,-0.00223,-0.002242,-0.00017,-0.001664,...,0.001961,4.1e-05,-0.001898,-0.001061,0.001041,0.000436,-0.002331,-0.002375,-0.001548,-0.001167
4,-0.00232,0.000137,0.001443,-0.00221,-0.001233,-0.000493,0.00215,-0.001851,0.001271,-0.002494,...,0.000728,0.00038,-0.001385,-0.000499,-0.000883,-0.0004,0.0019,-0.000789,0.000151,0.001394


### 3.4.Word2Vec Embedding

In [21]:
tokenized_text = df['clean'].apply(lambda x: x.split()) # tokenizing

model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling i.e class with other types
            workers= 2, # no.of cores
            seed = 34) 

model_w2v.train(tokenized_text, total_examples= len(data['clean']), epochs=20)

(2167305, 2636780)

In [22]:
model_w2v.wv['saal']

array([-0.08594143, -0.1393568 , -0.01676735,  0.16414452, -0.05891037,
       -0.19802162,  0.10549298, -0.35362422,  0.03304493, -0.13664612,
        0.29476675,  0.27462107, -0.19522385,  0.6507489 , -0.35911494,
        0.41927707, -0.02259575, -0.02091164, -0.1625492 ,  0.17799832,
       -0.06042916, -0.2579778 , -0.18438585,  0.4082261 , -0.18838435,
       -0.42107424, -0.30793926,  0.28581512,  0.02010676,  0.00502999,
        0.31703722, -0.19147067, -0.08546429, -0.03063924,  0.28291166,
        0.08268694,  0.5034758 , -0.00929422, -0.06546594, -0.05180954,
        0.08905365, -0.30859813, -0.37141716, -0.05231638,  0.14325944,
       -0.12485892, -0.06324858,  0.00233839, -0.5763319 , -0.2960994 ,
        0.23777905, -0.08020502, -0.3752152 ,  0.41769078, -0.2881874 ,
        0.11470824,  0.44921073,  0.12638573,  0.30085662,  0.17457093,
       -0.18340655, -0.34110963,  0.01167548,  0.13790037, -0.35857105,
        0.6253741 , -0.55346495, -0.50293547,  0.19824928,  0.25

In [23]:
len(model_w2v.wv['saal'])

200

In [24]:
type(model_w2v)

gensim.models.word2vec.Word2Vec

##### 3.4.1.Preparing Vectors for text data

In [25]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary           
            continue
    if count != 0:
        vec /= count
    return vec

##### 3.4.2.Preparing word2vec feature set

In [26]:
wordvec_arrays = np.zeros((len(tokenized_text), 200)) 
for i in range(len(tokenized_text)):
    wordvec_arrays[i,:] = word_vector(tokenized_text[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.331766,-0.038644,-0.033001,0.274877,-0.208521,-0.017309,0.086353,-0.185794,0.080092,-0.157228,...,0.076795,0.133693,-0.112057,-0.009724,-0.074638,-0.152288,0.178477,0.070667,-0.032763,0.050453
1,0.286518,0.006342,0.102688,0.255977,-0.106073,-0.17722,-0.014828,-0.179581,0.04188,-0.187265,...,-0.001617,-0.132372,-0.011269,0.196126,0.003819,-0.105417,0.204926,-0.036961,-0.013739,0.071489
2,0.15794,0.017721,-0.114595,0.205522,-0.150483,0.072872,-0.102696,-0.037234,0.187812,-0.259148,...,0.122001,0.149522,0.092402,-0.087501,0.02501,0.150465,0.163956,-0.083834,-0.013065,0.090841
3,0.244558,-0.087245,0.035899,0.181079,-0.178954,0.080701,-0.02471,-0.198605,0.132769,-0.0093,...,0.01074,-0.035341,0.028652,0.120433,0.127101,-0.251632,0.099583,-0.002252,0.014713,0.101971
4,0.347166,0.011271,-0.032613,0.11429,-0.08629,-0.08034,0.003564,-0.15635,-0.106164,-0.152771,...,0.109793,0.087171,-0.220014,-0.001117,-0.008629,-0.187659,0.152867,0.060867,0.088029,0.201105


In [27]:
wordvec_df.shape

(3655, 200)

# 4)-Model Building

- Logistic Regression 
- Support Vector
- Random Forest
- XGBoost
- MLP

### 4.1.Logistic Regression Model

In [28]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

##### 4.1.a. Logistic Regression using Bag-of-Words Features

In [29]:
X=bow
y=df['dep']
print(X.shape)
print(y.shape)

(3655, 1000)
(3655,)


In [30]:
# splitting data into training and validation set
xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(bow, y,random_state=42,test_size=0.2)

In [31]:
print(xtrain_bow.shape)
print(xvalid_bow.shape)
print(ytrain.shape)
print(yvalid.shape)

(2924, 1000)
(731, 1000)
(2924,)
(731,)


In [32]:
lreg = LogisticRegression(solver='liblinear')

# training the model
lreg.fit(xtrain_bow, ytrain)

LogisticRegression(solver='liblinear')

In [33]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_bow)
prediction[:5]

array([[1.25114720e-03, 1.50902304e-05, 7.87329957e-06, 5.25325359e-05,
        7.38993811e-04, 3.97275231e-05, 7.44688145e-01, 5.81113732e-06,
        1.15586454e-01, 2.68178547e-04, 5.19969867e-02, 1.65308256e-04,
        2.17490933e-05, 3.33104954e-05, 5.06001688e-05, 3.35933472e-06,
        2.14868750e-05, 1.93266483e-02, 6.50283749e-02, 2.54745197e-04,
        2.20207170e-04, 2.23270811e-04],
       [1.15029640e-01, 4.54453468e-03, 3.22528561e-03, 1.66677191e-03,
        1.01256907e-02, 2.39948304e-03, 7.18337242e-01, 3.35708164e-03,
        1.11185083e-02, 1.07515490e-02, 4.98802807e-03, 1.38316589e-03,
        2.18079872e-03, 1.01478559e-03, 1.53424966e-03, 3.58630523e-03,
        4.50954637e-02, 8.82524788e-03, 6.72903832e-03, 1.50400977e-02,
        2.55420351e-02, 3.52499689e-03],
       [3.19660306e-03, 4.76074044e-04, 1.67482968e-03, 1.21733747e-03,
        1.28329399e-02, 1.53215581e-03, 8.13601706e-01, 1.40715347e-02,
        3.54192616e-03, 1.64081165e-03, 7.06608328e-03

In [34]:
# prediction over classes

prediction_class=lreg.predict(xvalid_bow)
prediction_class[:5]

array(['Order management', 'Order management', 'Order management',
       'Order management', 'Software/Webshop/App'], dtype=object)

In [35]:
accuracy_score(yvalid, prediction_class)

0.5485636114911081

##### 4.1.b.Logistic Regression using TF-IDF Features

In [36]:
X=tfidf
y=df['dep']
print(X.shape)
print(y.shape)

(3655, 1000)
(3655,)


In [37]:
# splitting data into training and validation set
xtrain_tfidf, xvalid_tfidf, ytrain, yvalid = train_test_split(tfidf, y,random_state=42,test_size=0.2)

In [38]:
print(xtrain_tfidf.shape)
print(xvalid_tfidf.shape)
print(ytrain.shape)
print(yvalid.shape)

(2924, 1000)
(731, 1000)
(2924,)
(731,)


In [39]:
lreg = LogisticRegression(solver='liblinear')

# training the model
lreg.fit(xtrain_tfidf, ytrain)

LogisticRegression(solver='liblinear')

In [40]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_tfidf)

In [41]:
prediction_class=lreg.predict(xvalid_tfidf)

In [42]:
accuracy_score(yvalid, prediction_class)

0.5554035567715458

##### 4.1.c. Logistic Regression using Word2Vec Features

In [43]:
X=wordvec_df
y=df['dep']
print(X.shape)
print(y.shape)

(3655, 200)
(3655,)


In [44]:
# splitting data into training and validation set
xtrain_word2vec, xvalid_word2vec, ytrain, yvalid = train_test_split(X, y,random_state=42,test_size=0.2)

In [45]:
print(xtrain_word2vec.shape)
print(xvalid_word2vec.shape)
print(ytrain.shape)
print(yvalid.shape)

(2924, 200)
(731, 200)
(2924,)
(731,)


In [46]:
# training the model
lreg.fit(xtrain_word2vec, ytrain)

LogisticRegression(solver='liblinear')

In [47]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_word2vec)

In [48]:
prediction_class=lreg.predict(xvalid_word2vec)

In [49]:
accuracy_score(yvalid, prediction_class)

0.5772913816689467

##### 4.1.d. Logistic Regression using Doc2Vec Features

In [50]:
X=docvec_df
y=df['dep']
print(X.shape)
print(y.shape)

(3655, 200)
(3655,)


In [51]:
# splitting data into training and validation set
xtrain_doc2vec, xvalid_doc2vec, ytrain, yvalid = train_test_split(X, y,random_state=42,test_size=0.2)
print(xtrain_doc2vec.shape)
print(xvalid_doc2vec.shape)
print(ytrain.shape)
print(yvalid.shape)

(2924, 200)
(731, 200)
(2924,)
(731,)


In [52]:
# training the model
lreg.fit(xtrain_doc2vec, ytrain)

LogisticRegression(solver='liblinear')

In [53]:
# predicting on the validation set
prediction = lreg.predict_proba(xvalid_doc2vec)

In [54]:
prediction_class=lreg.predict(xvalid_doc2vec)

In [55]:
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary:**

- bow=54%
- tfidf=55%
- word2vec=57%
- doc2vec=41%

### 4.2.Support Vector Machine (SVM)

In [56]:
from sklearn import svm

##### SVM using Bag-of-Words Features

In [57]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_bow, ytrain)
prediction = svc.predict_proba(xvalid_bow)
prediction_class = svc.predict(xvalid_bow)

In [58]:
accuracy_score(yvalid, prediction_class)

0.5471956224350205

##### SVM using TF-IDF Features

In [59]:
svc = svm.SVC(kernel='linear',C=1, probability=True).fit(xtrain_tfidf, ytrain)
prediction = svc.predict_proba(xvalid_tfidf)
prediction_class = svc.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.560875512995896

##### SVM using word2vec Features

In [60]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_word2vec, ytrain)
prediction = svc.predict_proba(xvalid_word2vec)
prediction_class = svc.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.585499316005472

##### SVM using doc2vec Features

In [61]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_doc2vec, ytrain)
prediction = svc.predict_proba(xvalid_doc2vec)
prediction_class = svc.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary**


- bow = 54%
- tfidf= 56%
- word2vec= 58% 
- doc2vec= 41%

### 4.3.Random Forest

In [62]:
from sklearn.ensemble import RandomForestClassifier

##### RF with Bag-of-Words Features

In [63]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_bow, ytrain)
prediction = rf.predict_proba(xvalid_bow)
prediction_class = rf.predict(xvalid_bow)
accuracy_score(yvalid, prediction_class)

0.5704514363885089

##### RF with TF-IDF Features

In [64]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_tfidf, ytrain)
prediction = rf.predict_proba(xvalid_tfidf)
prediction_class = rf.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.5677154582763337

##### RF with word2vec Features

In [65]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_word2vec, ytrain)
prediction= rf.predict_proba(xvalid_word2vec)
prediction_class = rf.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.5745554035567716

##### RF with doc2vec Feature

In [66]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_doc2vec, ytrain)
prediction= rf.predict_proba(xvalid_doc2vec)
prediction_class = rf.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary**

- bow = 57%
- tfidf = 56%
- word2vec = 57%
- doc2vec = 41%

# 4.4.XGBoost
Extreme Gradient Boosting (xgboost) is an advanced implementation of gradient boosting algorithm. It has both linear model solver and tree learning algorithms. Its ability to do parallel computation on a single machine makes it extremely fast. It also has additional features for doing cross validation and finding important variables. There are many parameters which need to be controlled to optimize the model.

Some key benefits of XGBoost are:

Regularization - helps in reducing overfitting
Parallel Processing - XGBoost implements parallel processing and is blazingly faster as compared to GBM.
Handling Missing Values - It has an in-built routine to handle missing values.
Built-in Cross-Validation - allows user to run a cross-validation at each iteration of the boosting process

**Notice there is no sklearn ready made model therefore; I needed to use XGBoost from its main librrary**

In [67]:
from xgboost import XGBClassifier

##### XGBoost using bag of words features

In [68]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_bow, ytrain)
prediction = xgb_model.predict_proba(xvalid_bow)
prediction_class = xgb_model.predict(xvalid_bow)
accuracy_score(yvalid, prediction_class)

0.5444596443228454

##### XGBoost using tfidf features

In [69]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_tfidf, ytrain)
prediction = xgb_model.predict_proba(xvalid_tfidf)
prediction_class = xgb_model.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.5554035567715458

##### XGBoost using word2vecfeatures

In [70]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)
prediction = xgb_model.predict_proba(xvalid_word2vec)
prediction_class = xgb_model.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.5991792065663475

##### XGBoost using doc2vec features

In [71]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_doc2vec, ytrain)
prediction = xgb_model.predict_proba(xvalid_doc2vec)
prediction_class = xgb_model.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)


0.3679890560875513

**Summary**

- bow = 54%
- tfidf = 55%
- word2vec = 58%
- doc2vec = 37%

### 4.5.MLPClassifier

A multilayer perceptron (MLP) is a class of feedforward artificial neural network

In [72]:
from sklearn.neural_network import MLPClassifier

##### MLP using bag of words features

In [73]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_bow, ytrain)
prediction = mlp_model.predict_proba(xvalid_bow)
prediction_class = mlp_model.predict(xvalid_bow)
accuracy_score(yvalid, prediction_class)

0.518467852257182

##### MLP using tfidf features

In [74]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_tfidf, ytrain)
prediction = mlp_model.predict_proba(xvalid_tfidf)
prediction_class = mlp_model.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.5280437756497948

##### MLP using word2vecfeatures

In [75]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_word2vec, ytrain)
prediction = mlp_model.predict_proba(xvalid_word2vec)
prediction_class = mlp_model.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.5595075239398085

##### MLP using doc2vec features

In [76]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_doc2vec, ytrain)
prediction = mlp_model.predict_proba(xvalid_doc2vec)
prediction_class = mlp_model.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary**

- bow = 51%
- tfidf = 52%
- word2vec = 54%
- doc2vec = 41%

**XGBoost using word2vec gives us the best results with our given matrics i.e 58%.**

## Saving model

In [77]:
import pickle
xgb_model_best = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)
# save model
filename = 'xgb_model.sav'
pickle.dump(xgb_model_best, open(filename, 'wb'))

In [78]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [79]:
prediction = loaded_model.predict_proba(xvalid_word2vec)
prediction_class = loaded_model.predict(xvalid_word2vec)

In [80]:
accuracy_score(yvalid, prediction_class)

0.5991792065663475

In [81]:
print(classification_report(yvalid, prediction_class))

                                                      precision    recall  f1-score   support

                                   Customer feedback       0.61      0.53      0.57        78
                       Data protection (Datenschutz)       1.00      0.50      0.67         4
                                   Discovery voucher       0.00      0.00      0.00         4
                                           Marketing       0.71      0.52      0.60        23
                                    Order management       0.65      0.83      0.73       301
                                 Payment (Bezahlung)       0.00      0.00      0.00        12
                                   Product (Produkt)       0.71      0.26      0.38        19
                                   Production delays       0.00      0.00      0.00         9
                    Professional area (Profibereich)       0.60      0.18      0.27        17
                                   Reseller workflow       

In [82]:
print(labels.nunique())
print(yvalid.nunique())
print(ytrain.nunique())

22
18
22


Other values are also very consistent.

- accuracy = 57.8%
- precision = 58%
- recall = 58%
- f-score = 55%
- (test samples=731)
- No. of classes in test data = 18
- No. of classes in train data = 22
- Total Classes = 22

# END OF NOTEBOOK CODE