# Advanced ML Models

# 1)- Import key modules

In [3]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [4]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pandas as pd 
import string 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [5]:
#models and evaluation

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier # notice its from ntlk not sklearn
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Evaluation packages
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [6]:
!pip install version_information

Collecting version_information
  Downloading https://files.pythonhosted.org/packages/ff/b0/6088e15b9ac43a08ccd300d68e0b900a20cf62077596c11ad11dd8cc9e4b/version_information-1.0.3.tar.gz
Building wheels for collected packages: version-information
  Building wheel for version-information (setup.py) ... [?25l[?25hdone
  Created wheel for version-information: filename=version_information-1.0.3-cp36-none-any.whl size=3881 sha256=44d29efe59f9c47608b85a9ec3c7386a47cc545f025f1a97653e126a86cc0a20
  Stored in directory: /root/.cache/pip/wheels/1f/4c/b3/1976ac11dbd802723b564de1acaa453a72c36c95827e576321
Successfully built version-information
Installing collected packages: version-information
Successfully installed version-information-1.0.3


In [7]:
#pip install version_information
%reload_ext version_information
%version_information pandas,numpy, nltk, seaborn, matplotlib

Software,Version
Python,3.6.9 64bit [GCC 8.4.0]
IPython,5.5.0
OS,Linux 4.19.112+ x86_64 with Ubuntu 18.04 bionic
pandas,1.0.5
numpy,1.18.5
nltk,3.2.5
seaborn,0.10.1
matplotlib,3.2.2
Fri Sep 25 11:48:12 2020 UTC,Fri Sep 25 11:48:12 2020 UTC


In [8]:
# testing GPU on colab
import tensorflow as tf
tf.test.gpu_device_name()

''

# 2)- Loading Data

In [9]:
data=pd.read_csv('train_data_clean.csv')
#data=data.rename(columns={'Unnamed: 0':'random_columns'}) # a trick to tackle random index values
data.shape

(10000, 3)

In [10]:
data.head(2)

Unnamed: 0,news,category,clean
0,Top 5 Reasons Why 'Divergent' Star Kate Winsle...,e,top 5 reason diverg star kate winslet deserv s...
1,Vessyl Bottle Tracks Your Drink And Its Health...,t,vessyl bottl track drink health benefitsgadget...


In [11]:
data.isnull().sum()

news        0
category    0
clean       0
dtype: int64

In [13]:
#loading test feature and label data saved from previous notebooks
feature_test=pd.read_csv('test_data.csv')
label_test=pd.read_csv('test_label.csv')

In [14]:
print(feature_test.shape)
print(label_test.shape)

(84484, 2)
(84484, 1)


# 3)- Vectorization

- bag of words
- tf-idf
- doc2vec
- word2vec

In [12]:
features=data['clean']
labels=data['category']
print(features.shape)
print(labels.shape)

(10000,)
(10000,)


### 3.1).Bag of Words

Bag-of-Words is a method to represent text into numerical features.

Let us understand this using a simple example. Suppose we have only 2 document

- D1: He is a lazy boy. She is also lazy.

- D2: Smith is a lazy person.

The list created would consist of all the unique tokens in the corpus C.

= [‘He’,’She’,’lazy’,’boy’,’Smith’,’person’]

Here, D=2, N=6



In [15]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import gensim

bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(features)
bow.shape

(10000, 1000)

#### 3.1.a. Transform test data

In [58]:
feature_test.head(2)

Unnamed: 0.1,Unnamed: 0,title
0,153245,iPhone 6 Release Date Pushed Back Due to Issue...
1,308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


In [59]:
bow_test = bow_vectorizer.transform(feature_test["title"])

In [60]:
print(bow.shape)
print(bow_test.shape)

(10000, 1000)
(84484, 1000)


### 3.2)-TF-IDF

This is another method which is based on the frequency method but it is different to the bag-of-words approach in the sense that it takes into account not just the occurrence of a word in a single document (or tweet) but in the entire corpus.

TF-IDF works by penalising the common words by assigning them lower weights while giving importance to words which are rare in the entire corpus but appear in good numbers in few documents.

Let’s have a look at the important terms related to TF-IDF:

- TF = (Number of times term t appears in a document)/(Number of terms in the document)

- IDF = log(N/n), where, N is the number of documents and n is the number of documents a term t has appeared in.

- TF-IDF = TF*IDF

In [16]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(features)
tfidf.shape

(10000, 1000)

#### 3.2.a. Transform Test data

In [61]:
tfidf_test = tfidf_vectorizer.transform(feature_test["title"])

In [62]:
print(tfidf.shape)
print(tfidf_test.shape)

(10000, 1000)
(84484, 1000)


### 3.3)- Doc2Vec Embedding

In [17]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import TaggedDocument

In [26]:
tokenized_text = data['clean'].apply(lambda x: x.split()) # tokenizing

In [27]:
def add_label(twt):
    output = []
    for i, s in zip(twt.index, twt):
        output.append(TaggedDocument(s, ["clean_" + str(i)]))
    return output
labeled_text = add_label(tokenized_text) # label all the news

##### 3.3.a.Train doc2vec model

In [28]:
model_d2v = gensim.models.Doc2Vec(dm=1,dm_mean=1,vector_size=200,window=5,negative=7,min_count=5,workers=3,alpha=0.1,seed=23)

In [29]:
model_d2v.build_vocab([i for i in tqdm(labeled_text)])

100%|██████████| 10000/10000 [00:00<00:00, 1460920.93it/s]


##### 3.3.b.Preparing doc2vec Feature Set

In [31]:
docvec_arrays = np.zeros((len(tokenized_text), 200))
for i in range(len(data)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200))

    
docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

(10000, 200)

In [32]:
docvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.00066,-0.001548,-0.001803,0.001017,0.000443,0.002277,-0.001327,0.000648,-0.001661,-0.002412,-0.001085,-0.00139,0.001621,0.000651,-0.001689,0.000301,0.002134,-0.001802,-0.000452,-0.002162,0.00209,0.001161,-0.002306,0.002419,0.000466,-0.00242,0.001162,-0.000503,0.002183,-0.001155,-0.002186,-0.000369,-0.001165,-0.000774,0.000329,-0.000395,0.000179,0.00097,-0.000976,-0.001376,...,0.000464,0.000956,-0.000348,0.00098,-0.001252,0.001497,0.000941,0.000353,-0.001991,0.002098,-0.00109,0.000263,0.001332,-0.000861,-0.001793,-0.001753,-0.000463,0.000376,-0.000585,-0.002241,-0.001413,-0.000529,0.002321,-0.002379,-0.00232,0.000869,-0.001165,-0.001717,0.0017,-0.000151,0.001759,0.001252,-0.000431,-0.002377,-0.000306,0.001925,0.001344,-0.001755,0.002436,0.000104
1,-0.000643,-0.001514,0.00206,0.001986,0.001788,-0.002209,0.002283,0.001975,0.002193,-0.001416,-0.002301,0.000521,-0.00212,0.000802,0.00053,-0.001273,0.000675,0.001101,0.002246,-0.001894,0.001171,-0.002045,0.00222,0.001133,-0.00077,-0.000855,0.000913,-0.000234,0.002405,-0.001193,-0.001934,0.001232,-0.0012,0.001148,-0.00068,-0.000302,0.000863,0.001703,-0.002086,-0.000216,...,0.001424,-0.000645,0.002287,0.001414,0.001195,0.000979,-9.9e-05,-0.00173,-0.001844,-0.000229,0.002394,-0.00156,-0.002325,-0.000574,-0.001903,0.001161,-0.001391,0.000984,-0.000118,-0.001652,0.001745,0.001118,0.002353,-0.001233,0.001241,-0.002389,-0.001753,0.001194,-0.001765,-0.000146,0.001008,0.000385,-0.000687,0.001901,-0.000519,0.001515,-0.000411,0.001046,-0.001441,-0.001738
2,0.001979,0.001849,-0.001995,-0.001026,-0.000852,-0.001799,0.000509,0.00086,-0.001535,0.001565,0.0011,0.001211,0.001209,-0.001482,0.001661,-0.00142,-0.001368,0.000574,-0.001618,-0.00135,0.002393,0.000925,-0.000395,0.000259,0.001515,0.001906,0.001472,0.000751,-0.001331,0.000112,0.000217,-0.000464,0.001808,0.00103,-0.001323,-0.002057,0.000434,-0.000212,-0.00117,0.002322,...,2e-06,0.001776,-0.0019,0.001152,-0.002029,0.001571,-0.000752,-0.002377,-0.000784,0.002026,0.001351,0.000444,0.000966,8.5e-05,-0.0004,0.001452,-0.001162,0.000999,0.000947,0.001177,0.002333,0.0018,0.000455,0.001037,0.000276,-0.002273,0.002066,-0.001888,-0.001703,0.002245,-0.001162,-0.000733,-0.000553,-0.000359,0.000163,0.000528,-0.001528,-0.000377,0.001272,-0.001139
3,0.000833,0.001087,-0.000411,-0.000663,-0.001789,-0.001146,-0.001027,-0.002483,7e-05,0.00117,-0.001924,0.000602,-0.000384,0.001695,0.001532,-0.000288,-0.001623,0.0008,-0.000955,0.0011,0.001058,-9e-06,0.001272,-0.001759,-0.001806,0.001676,0.000563,-0.002048,-0.001953,0.001511,0.000782,-0.000603,-0.00056,-0.002205,2e-06,0.000474,0.002141,0.002306,-0.000497,0.00225,...,0.000151,-0.000505,0.002163,0.000279,0.002288,0.001558,-0.001581,-0.002436,-0.000155,-0.001032,-0.002245,0.002293,0.000564,0.002472,0.000732,-0.002029,-0.000661,0.002025,0.000142,-0.001471,-0.000774,0.000375,0.000222,0.000909,0.000706,0.001184,-0.000113,0.000377,0.000201,-0.000262,-0.000498,-0.002466,0.000778,0.000853,-0.000561,0.002186,0.001363,0.002281,0.000959,0.001521
4,-0.000257,-0.00142,0.000425,0.001791,-0.001414,-3.4e-05,-0.001541,0.001569,-0.002313,0.002408,-6e-06,0.001897,0.002358,-0.00035,0.001947,-0.001627,-0.001613,-0.002186,-0.000157,0.000643,0.001653,-0.000728,0.002204,-0.001091,0.001301,-0.00214,0.000378,-0.000617,0.001685,0.000145,-0.002145,-0.001471,0.001601,2.6e-05,-0.001206,0.001618,0.00173,0.000245,-0.002497,0.001613,...,0.00027,0.000291,-0.002008,-0.000504,0.002242,-0.002224,0.00104,-0.00205,0.000228,0.002462,0.002483,-0.000502,0.001079,-0.001243,-0.000257,-0.001698,-0.002161,0.002076,-0.000724,-0.000359,-0.000807,0.001816,-0.000223,0.000381,-0.002461,-0.002221,-0.000591,-0.002397,-0.001409,0.000707,0.001782,-0.00219,-0.002089,0.001718,-0.002062,-4.8e-05,0.000422,-0.001455,-0.000906,-0.001383


#### 3.3.3.transform test data

In [49]:
feature_test.head(2)

Unnamed: 0.1,Unnamed: 0,title
0,153245,iPhone 6 Release Date Pushed Back Due to Issue...
1,308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


In [50]:
tokenized_text_test = feature_test['title'].apply(lambda x: x.split()) # tokenizing

In [51]:
labeled_text_test = add_label(tokenized_text_test) # label all the news

In [56]:
docvec_arrays_test = np.zeros((len(tokenized_text_test), 200))
for i in range(len(data)):
    docvec_arrays_test[i,:] = model_d2v.docvecs[i].reshape((1,200))

    
docvec_df_test = pd.DataFrame(docvec_arrays_test)


(84484, 200)

In [57]:
print(docvec_df.shape) # training
print(docvec_df_test.shape) # for testing

(10000, 200)
(84484, 200)


Notice I have only transformed test set and didn't train. So only my train set learns about vocab of corpus. My test model has only been transformed and learnt nothing.

### 3.4.Word2Vec Embedding

In [33]:
tokenized_text = data['clean'].apply(lambda x: x.split()) # tokenizing

model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling i.e class with other types
            workers= 2, # no.of cores
            seed = 34) 

model_w2v.train(tokenized_text, total_examples= len(data['clean']), epochs=20)

(1329685, 1633380)

In [36]:
model_w2v.wv['nasdaq']

array([-4.19158429e-01, -2.29767099e-01, -2.00829958e-03,  4.46986109e-01,
        2.00698435e-01,  9.71860439e-02,  4.15311158e-02,  2.58581072e-01,
        3.71628642e-01,  3.26173812e-01, -8.91381223e-03,  1.84910446e-01,
        2.54351079e-01,  1.06504513e-02, -6.44920617e-02, -3.30742985e-01,
       -6.48779571e-02,  1.23355500e-01,  4.43420932e-02,  1.63406491e-01,
        3.49484593e-01, -1.41000241e-01, -4.32326406e-01, -1.62403971e-01,
        9.17873859e-01,  1.92707494e-01, -2.93042153e-01,  1.03319466e-01,
        2.02471361e-01,  9.48425606e-02,  4.04774070e-01, -4.50274954e-03,
        4.01588649e-01, -3.88885550e-02, -7.54421204e-02, -1.67875737e-01,
       -4.73084062e-01, -9.19232517e-03,  2.94244200e-01,  4.66713347e-02,
        3.12463224e-01, -7.01050907e-02,  2.46901587e-02, -2.88366258e-01,
       -2.06862837e-01, -2.26560801e-01,  5.84619462e-01,  4.43947613e-01,
        6.38083816e-02, -4.46176529e-01,  8.23984519e-02,  8.29339400e-03,
       -1.95765402e-02,  

In [37]:
len(model_w2v.wv['nasdaq'])

200

In [38]:
type(model_w2v)

gensim.models.word2vec.Word2Vec

##### 3.4.1.Preparing Vectors for text data

In [39]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary           
            continue
    if count != 0:
        vec /= count
    return vec

##### 3.4.2.Preparing word2vec feature set

In [40]:
wordvec_arrays = np.zeros((len(tokenized_text), 200)) 
for i in range(len(tokenized_text)):
    wordvec_arrays[i,:] = word_vector(tokenized_text[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.112036,0.117529,0.214865,-0.020733,0.261518,0.107048,0.11626,-0.103589,0.39861,0.037055,0.262092,0.081014,0.009041,0.068648,-0.076937,-0.189877,0.096075,0.175835,0.05669,0.218068,0.127369,-0.100208,-0.288216,-0.004495,0.055956,-0.296565,-0.01357,-0.068814,0.00623,0.17226,0.215402,0.00842,0.393125,-0.000484,-0.026228,0.203838,0.034258,-0.043373,-0.089856,-0.004687,...,0.075618,0.123927,-0.085901,-0.130596,-0.066989,0.216751,-0.012993,-0.12095,-0.020062,-0.012617,0.139313,0.034025,-0.010441,-0.164775,0.214455,0.229792,-0.141525,-0.074406,0.065473,-0.097259,0.088345,-0.070987,0.107051,-0.219546,0.131598,-0.04046,-0.189088,-0.070738,-0.327408,0.138552,-0.161466,0.357012,-0.042573,-0.184069,0.227112,-0.004313,-0.108996,-0.163471,-0.103295,0.17976
1,0.289337,-0.201637,-0.040488,0.01267,0.131444,0.172174,-0.124476,0.232219,0.263581,-0.041276,0.126665,0.166228,0.139223,-0.113769,0.155543,-0.156425,0.042678,0.173135,0.17874,0.095331,0.141841,0.007733,-0.170668,-0.16926,0.118065,0.129832,-0.003639,-0.086812,0.069391,0.176566,0.20702,0.069319,0.293605,0.016283,0.049575,0.106987,-0.0183,-0.189745,-0.030911,0.001348,...,0.11268,0.00873,0.079051,0.071825,-0.126326,0.294807,0.033628,-0.181058,-0.104755,-0.021393,-0.018796,-0.045801,0.034394,-0.044872,0.115539,0.12614,-0.09029,-0.262139,-0.014969,-0.100925,0.407028,-0.041355,-0.290562,0.073295,0.363646,0.107311,-0.276164,0.222163,-0.071557,0.131636,-0.058818,0.144118,0.134564,-0.228925,0.245931,-0.189699,-0.115568,-0.015011,0.015346,0.047497
2,0.274414,0.008228,0.077097,0.177192,0.153487,0.133675,-0.036129,0.174917,0.179359,0.315468,0.087836,0.28191,0.078622,-0.041186,-0.280356,-0.019932,-0.172325,0.054286,-0.145681,-0.012326,0.064129,0.025044,-0.234353,-0.144449,0.155626,-0.202906,-0.010311,-0.140571,0.042945,0.284544,0.084359,0.191488,0.290815,-0.05389,0.170024,-0.017354,0.089014,-0.298681,0.193136,-0.103332,...,0.164183,0.128467,0.03975,-0.156851,-0.034757,0.059165,-0.207961,-0.266814,0.116116,0.065962,0.099356,-0.157603,-0.11638,-0.264511,0.176312,0.269113,0.071685,-0.171419,0.201537,0.05339,0.432342,0.049735,0.138381,-0.114735,0.022935,-0.030383,-0.222716,0.209642,-0.013833,-0.013324,0.047018,0.168587,0.147671,-0.200164,0.035668,0.072092,-0.089347,0.113562,0.012131,0.077801
3,0.204454,0.129999,0.002591,0.184851,0.236623,0.098258,-0.07227,0.144204,0.332627,0.06082,0.353714,0.066546,0.333712,-0.096286,-0.243998,-0.075111,-0.012867,0.028415,-0.201648,0.01899,0.110832,-0.103295,-0.172666,-0.243262,-0.095308,-0.138136,0.084609,-0.03733,0.115271,0.224204,0.212095,0.118677,0.095768,-0.167019,0.237548,0.250463,-0.065403,-0.105358,0.00277,0.038685,...,0.274231,0.022653,-0.1238,-0.016609,0.07686,-0.146305,0.166027,-0.067783,-0.004471,0.029937,0.023908,-0.079308,0.028808,-0.196193,0.175206,0.216767,-0.231853,0.12433,0.077149,-0.076163,0.254819,0.201637,0.255593,-0.027024,0.210464,-0.293771,-0.232411,-0.047866,-0.052124,-0.062217,-0.167599,0.196851,0.022663,-0.309182,0.072871,-0.074297,-0.053123,-0.041199,0.149606,0.121862
4,0.040696,-0.140997,0.048742,0.166069,0.012763,0.383296,-0.145714,0.233924,0.275225,0.10604,0.157928,-0.048313,0.547881,-0.317503,0.1749,-0.249315,0.020912,0.133484,-0.143742,0.225078,-0.103739,-0.202568,-0.218794,-0.214367,0.042929,-0.056102,-0.039667,-0.092803,-0.08132,0.141056,0.140662,0.103239,0.33518,-0.223149,0.064554,0.00019,-0.048593,-0.160125,0.044306,0.06282,...,0.141752,0.032901,0.292937,-0.123679,-0.079778,0.150132,-0.216592,-0.132141,-0.03594,0.136982,0.328401,0.000825,-0.218181,-0.134768,0.268109,0.197843,-0.102123,-0.187827,0.14386,-0.143092,0.382291,0.040996,-0.085234,0.042325,0.223079,-0.0627,-0.371686,0.094494,-0.011818,-0.064072,-0.232719,0.374637,-0.139446,-0.209834,0.046198,-0.163259,-0.051753,-0.061007,-0.185041,0.006535


In [41]:
wordvec_df.shape

(10000, 200)

#### 3.4.3.transform test data

In [52]:
feature_test.head(2)

Unnamed: 0.1,Unnamed: 0,title
0,153245,iPhone 6 Release Date Pushed Back Due to Issue...
1,308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


In [53]:
tokenized_text_test = feature_test['title'].apply(lambda x: x.split()) # tokenizing

In [54]:
wordvec_arrays = np.zeros((len(tokenized_text_test), 200)) 
for i in range(len(tokenized_text_test)):
    wordvec_arrays[i,:] = word_vector(tokenized_text_test[i], 200)
    wordvec_df_test = pd.DataFrame(wordvec_arrays)
wordvec_df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.058814,0.142585,0.162008,-0.131452,-0.038308,-0.093332,-0.295874,0.167939,0.437889,0.11509,0.536151,-0.226375,0.260178,-0.662223,0.517701,-0.15432,0.458935,0.054735,0.195445,0.134831,-0.054053,0.289312,-0.200386,0.586942,-0.012638,-0.562773,0.079707,-0.084097,0.307388,0.477308,0.110977,0.456154,0.755784,-0.264319,0.678109,0.115447,-0.200939,-1.109471,0.225328,-0.234797,...,0.249021,0.311836,0.29292,-0.136811,0.250913,-0.215157,-0.238375,-0.111332,-0.066764,0.463247,-0.335175,0.477282,0.25013,-0.698363,0.900268,0.402433,-0.209631,0.033896,-0.09508,-0.042442,0.332289,0.28467,-0.8167,0.140234,0.058792,0.313469,0.344483,-0.53062,-0.636006,-0.136761,-0.928272,0.75792,0.142069,0.069877,0.267336,0.061147,-0.06097,-0.430171,0.295875,0.204263
1,-0.043115,-0.974044,-0.206173,0.020041,0.107864,0.707798,0.023119,-0.241604,1.079093,-0.468236,0.138944,0.008469,-0.334753,-0.061533,0.272005,-0.344717,0.118476,0.517618,-0.425236,0.244206,0.491671,0.659521,-0.299715,0.288081,0.19881,-0.910492,-0.063048,-0.279178,-0.348123,0.156782,0.85136,-0.178772,0.745497,-0.229396,0.103883,0.048155,0.025691,0.10942,-0.368924,-0.19804,...,0.351654,-0.243385,0.447427,-0.110816,0.137341,0.149611,-0.04109,-0.342503,0.033549,-0.384458,0.425065,0.242884,-0.264038,-0.588206,1.10259,0.71511,0.036484,-0.450524,-0.442744,-0.242947,0.060046,0.14141,0.628185,-0.264588,0.198128,-0.347974,0.325655,0.507832,-0.161697,0.04685,-0.756813,0.462182,-0.082657,-0.189085,0.400665,0.242772,0.349228,-0.438436,0.276026,-0.106309
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.172886,0.193834,-0.374816,-0.039309,-0.043196,-0.047114,-0.291135,0.436828,0.234712,-0.302049,0.18818,0.735383,-0.22691,0.278506,0.016267,-0.153429,-0.303333,-0.314978,-0.145325,0.002189,0.426099,-0.145265,-0.191935,-0.266227,0.45896,-0.091921,-0.11859,-0.475493,-0.014878,0.408133,-0.077138,0.267566,0.121506,0.199233,0.431362,-0.457671,-0.188031,-0.110527,0.312198,-0.060574,...,0.635516,0.193468,0.258531,0.035396,-0.047896,-0.085285,-0.652828,-0.477882,0.115442,-0.070614,0.167695,0.014712,-0.07017,-0.715051,0.170109,0.449853,-0.030339,-0.415414,0.3918,-0.171774,0.502936,0.046668,-0.02792,-0.059437,0.058433,-0.220778,-0.244991,0.402897,-0.147593,0.069189,-0.559469,0.830084,-0.00319,0.12991,0.35259,-0.120139,-0.24068,-0.458383,-0.236813,-0.09824


In [55]:
print(wordvec_df.shape)
print(wordvec_df_test.shape)


(10000, 200)
(84484, 200)


It may look weird as we have less data for train and more to test. But, training is computing intense. So, this will help us. Plus eventually I will train my best performing model to whole data.

# 4)-Model Building

- Logistic Regression 
- Support Vector
- Random Forest
- XGBoost
- MLP

### 4.1.Logistic Regression Model

In [63]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

##### 4.1.a. Logistic Regression using Bag-of-Words Features

In [64]:
X=bow
y=data['category']
print(X.shape)
print(y.shape)
print(bow_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [None]:
# splitting data into training and validation set
#xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(bow, y,random_state=42,test_size=0.2)

In [66]:
lreg_bow = LogisticRegression(solver='liblinear')

# training the model
lreg_bow.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [68]:
# predicting on the validation set
prediction_bow = lreg_bow.predict_proba(bow_test)
prediction_bow[0]

array([0.16087434, 0.6072539 , 0.04823555, 0.1836362 ])

In [69]:
# prediction over classes

prediction_bow_class=lreg_bow.predict(bow_test)
prediction_bow_class[0]

'e'

In [70]:
accuracy_score(label_test, prediction_bow_class)

0.7044765872827992

In [75]:
from sklearn import metrics
print(metrics.classification_report(label_test, prediction_bow_class))

              precision    recall  f1-score   support

           b       0.73      0.65      0.69     23367
           e       0.65      0.92      0.76     30300
           m       0.75      0.47      0.58      9207
           t       0.80      0.56      0.66     21610

    accuracy                           0.70     84484
   macro avg       0.73      0.65      0.67     84484
weighted avg       0.72      0.70      0.70     84484



##### 4.1.b.Logistic Regression using TF-IDF Features

In [71]:
X=tfidf
y=data['category']
print(X.shape)
print(y.shape)
print(tfidf_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [72]:
lreg_tfidf = LogisticRegression(solver='liblinear')

# training the model
lreg_tfidf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [73]:
# predicting on the validation set
prediction_tfidf = lreg_tfidf.predict_proba(tfidf_test)

In [74]:
prediction_tfidf_class=lreg_tfidf.predict(tfidf_test)

In [76]:
accuracy_score(label_test, prediction_tfidf_class)

0.707542256521945

In [77]:
print(metrics.classification_report(label_test, prediction_tfidf_class))

              precision    recall  f1-score   support

           b       0.73      0.66      0.69     23367
           e       0.66      0.92      0.77     30300
           m       0.78      0.46      0.58      9207
           t       0.79      0.57      0.66     21610

    accuracy                           0.71     84484
   macro avg       0.74      0.65      0.67     84484
weighted avg       0.72      0.71      0.70     84484



##### 4.1.c. Logistic Regression using Word2Vec Features

In [78]:
X=wordvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(wordvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [79]:
lreg_word2vec = LogisticRegression(solver='liblinear')
# training the model
lreg_word2vec.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [81]:
# predicting on the validation set
prediction_word2vec = lreg_word2vec.predict_proba(wordvec_df_test)

In [82]:
prediction_word2vec_class=lreg_word2vec.predict(wordvec_df_test)

In [83]:
accuracy_score(label_test, prediction_word2vec_class)

0.3385374745513943

In [84]:
print(metrics.classification_report(label_test, prediction_word2vec_class))

              precision    recall  f1-score   support

           b       0.63      0.33      0.43     23367
           e       0.58      0.30      0.40     30300
           m       0.14      0.69      0.24      9207
           t       0.45      0.25      0.32     21610

    accuracy                           0.34     84484
   macro avg       0.45      0.39      0.35     84484
weighted avg       0.51      0.34      0.37     84484



##### 4.1.d. Logistic Regression using Doc2Vec Features

In [85]:
X=docvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(docvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [86]:
lreg_doc2vec = LogisticRegression(solver='liblinear')
# training the model
lreg_doc2vec.fit(docvec_df, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [87]:
# predicting on the validation set
prediction_doc2vec = lreg_doc2vec.predict_proba(docvec_df_test)

In [88]:
prediction_doc2vec_class=lreg_doc2vec.predict(docvec_df_test)

In [89]:
accuracy_score(label_test, prediction_doc2vec_class)

0.35864779129776053

In [90]:
print(metrics.classification_report(label_test, prediction_doc2vec_class))

              precision    recall  f1-score   support

           b       0.00      0.00      0.00     23367
           e       0.36      1.00      0.53     30300
           m       0.00      0.00      0.00      9207
           t       0.00      0.00      0.00     21610

    accuracy                           0.36     84484
   macro avg       0.09      0.25      0.13     84484
weighted avg       0.13      0.36      0.19     84484



**Summary:**

- bow=70%
- tfidf=70%
- word2vec=33%
- doc2vec=35%

### 4.2.Support Vector Machine (SVM)

In [None]:
from sklearn import svm

##### SVM using Bag-of-Words Features

In [None]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_bow, ytrain)
prediction = svc.predict_proba(xvalid_bow)
prediction_class = svc.predict(xvalid_bow)

In [None]:
accuracy_score(yvalid, prediction_class)

0.5471956224350205

##### SVM using TF-IDF Features

In [None]:
svc = svm.SVC(kernel='linear',C=1, probability=True).fit(xtrain_tfidf, ytrain)
prediction = svc.predict_proba(xvalid_tfidf)
prediction_class = svc.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.560875512995896

##### SVM using word2vec Features

In [None]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_word2vec, ytrain)
prediction = svc.predict_proba(xvalid_word2vec)
prediction_class = svc.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.585499316005472

##### SVM using doc2vec Features

In [None]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(xtrain_doc2vec, ytrain)
prediction = svc.predict_proba(xvalid_doc2vec)
prediction_class = svc.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary**


- bow = 54%
- tfidf= 56%
- word2vec= 58% 
- doc2vec= 41%

### 4.3.Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

##### RF with Bag-of-Words Features

In [None]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_bow, ytrain)
prediction = rf.predict_proba(xvalid_bow)
prediction_class = rf.predict(xvalid_bow)
accuracy_score(yvalid, prediction_class)

0.5704514363885089

##### RF with TF-IDF Features

In [None]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_tfidf, ytrain)
prediction = rf.predict_proba(xvalid_tfidf)
prediction_class = rf.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.5677154582763337

##### RF with word2vec Features

In [None]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_word2vec, ytrain)
prediction= rf.predict_proba(xvalid_word2vec)
prediction_class = rf.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.5745554035567716

##### RF with doc2vec Feature

In [None]:
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(xtrain_doc2vec, ytrain)
prediction= rf.predict_proba(xvalid_doc2vec)
prediction_class = rf.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary**

- bow = 57%
- tfidf = 56%
- word2vec = 57%
- doc2vec = 41%

# 4.4.XGBoost
Extreme Gradient Boosting (xgboost) is an advanced implementation of gradient boosting algorithm. It has both linear model solver and tree learning algorithms. Its ability to do parallel computation on a single machine makes it extremely fast. It also has additional features for doing cross validation and finding important variables. There are many parameters which need to be controlled to optimize the model.

Some key benefits of XGBoost are:

Regularization - helps in reducing overfitting
Parallel Processing - XGBoost implements parallel processing and is blazingly faster as compared to GBM.
Handling Missing Values - It has an in-built routine to handle missing values.
Built-in Cross-Validation - allows user to run a cross-validation at each iteration of the boosting process

**Notice there is no sklearn ready made model therefore; I needed to use XGBoost from its main librrary**

In [None]:
from xgboost import XGBClassifier

##### XGBoost using bag of words features

In [None]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_bow, ytrain)
prediction = xgb_model.predict_proba(xvalid_bow)
prediction_class = xgb_model.predict(xvalid_bow)
accuracy_score(yvalid, prediction_class)

0.5444596443228454

##### XGBoost using tfidf features

In [None]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_tfidf, ytrain)
prediction = xgb_model.predict_proba(xvalid_tfidf)
prediction_class = xgb_model.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.5554035567715458

##### XGBoost using word2vecfeatures

In [None]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)
prediction = xgb_model.predict_proba(xvalid_word2vec)
prediction_class = xgb_model.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.5991792065663475

##### XGBoost using doc2vec features

In [None]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_doc2vec, ytrain)
prediction = xgb_model.predict_proba(xvalid_doc2vec)
prediction_class = xgb_model.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)


0.3679890560875513

**Summary**

- bow = 54%
- tfidf = 55%
- word2vec = 58%
- doc2vec = 37%

### 4.5.MLPClassifier

A multilayer perceptron (MLP) is a class of feedforward artificial neural network

In [None]:
from sklearn.neural_network import MLPClassifier

##### MLP using bag of words features

In [None]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_bow, ytrain)
prediction = mlp_model.predict_proba(xvalid_bow)
prediction_class = mlp_model.predict(xvalid_bow)
accuracy_score(yvalid, prediction_class)

0.518467852257182

##### MLP using tfidf features

In [None]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_tfidf, ytrain)
prediction = mlp_model.predict_proba(xvalid_tfidf)
prediction_class = mlp_model.predict(xvalid_tfidf)
accuracy_score(yvalid, prediction_class)

0.5280437756497948

##### MLP using word2vecfeatures

In [None]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_word2vec, ytrain)
prediction = mlp_model.predict_proba(xvalid_word2vec)
prediction_class = mlp_model.predict(xvalid_word2vec)
accuracy_score(yvalid, prediction_class)

0.5595075239398085

##### MLP using doc2vec features

In [None]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(xtrain_doc2vec, ytrain)
prediction = mlp_model.predict_proba(xvalid_doc2vec)
prediction_class = mlp_model.predict(xvalid_doc2vec)
accuracy_score(yvalid, prediction_class)

0.4117647058823529

**Summary**

- bow = 51%
- tfidf = 52%
- word2vec = 54%
- doc2vec = 41%

**XGBoost using word2vec gives us the best results with our given matrics i.e 58%.**

## Saving model

In [None]:
import pickle
xgb_model_best = XGBClassifier(max_depth=6, n_estimators=1000).fit(xtrain_word2vec, ytrain)
# save model
filename = 'xgb_model.sav'
pickle.dump(xgb_model_best, open(filename, 'wb'))

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [None]:
prediction = loaded_model.predict_proba(xvalid_word2vec)
prediction_class = loaded_model.predict(xvalid_word2vec)

In [None]:
accuracy_score(yvalid, prediction_class)

0.5991792065663475

In [None]:
print(classification_report(yvalid, prediction_class))

                                                      precision    recall  f1-score   support

                                   Customer feedback       0.61      0.53      0.57        78
                       Data protection (Datenschutz)       1.00      0.50      0.67         4
                                   Discovery voucher       0.00      0.00      0.00         4
                                           Marketing       0.71      0.52      0.60        23
                                    Order management       0.65      0.83      0.73       301
                                 Payment (Bezahlung)       0.00      0.00      0.00        12
                                   Product (Produkt)       0.71      0.26      0.38        19
                                   Production delays       0.00      0.00      0.00         9
                    Professional area (Profibereich)       0.60      0.18      0.27        17
                                   Reseller workflow       

In [None]:
print(labels.nunique())
print(yvalid.nunique())
print(ytrain.nunique())

22
18
22


Other values are also very consistent.

- accuracy = 57.8%
- precision = 58%
- recall = 58%
- f-score = 55%
- (test samples=731)
- No. of classes in test data = 18
- No. of classes in train data = 22
- Total Classes = 22

# END OF NOTEBOOK CODE