# Advanced ML Models

# 1)- Import key modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
import string 
import numpy as np 
import pandas as pd 
import string 

#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [3]:
#models and evaluation

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier # notice its from ntlk not sklearn
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
# Evaluation packages
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [4]:
!pip install version_information



In [5]:
#pip install version_information
%reload_ext version_information
%version_information pandas,numpy, nltk, seaborn, matplotlib

Software,Version
Python,3.6.9 64bit [GCC 8.4.0]
IPython,5.5.0
OS,Linux 4.19.112+ x86_64 with Ubuntu 18.04 bionic
pandas,1.0.5
numpy,1.18.5
nltk,3.2.5
seaborn,0.10.1
matplotlib,3.2.2
Fri Sep 25 13:40:47 2020 UTC,Fri Sep 25 13:40:47 2020 UTC


In [6]:
# testing GPU on colab
import tensorflow as tf
tf.test.gpu_device_name()

''

# 2)- Loading Data

In [7]:
data=pd.read_csv('train_data_clean.csv')
#data=data.rename(columns={'Unnamed: 0':'random_columns'}) # a trick to tackle random index values
data.shape

(10000, 3)

In [8]:
data.head(2)

Unnamed: 0,news,category,clean
0,Top 5 Reasons Why 'Divergent' Star Kate Winsle...,e,top 5 reason diverg star kate winslet deserv s...
1,Vessyl Bottle Tracks Your Drink And Its Health...,t,vessyl bottl track drink health benefitsgadget...


In [9]:
data.isnull().sum()

news        0
category    0
clean       0
dtype: int64

In [10]:
#loading test feature and label data saved from previous notebooks
feature_test=pd.read_csv('test_data.csv')
label_test=pd.read_csv('test_label.csv')

In [11]:
print(feature_test.shape)
print(label_test.shape)

(84484, 2)
(84484, 1)


# 3)- Vectorization

- bag of words
- tf-idf
- doc2vec
- word2vec

In [12]:
features=data['clean']
labels=data['category']
print(features.shape)
print(labels.shape)

(10000,)
(10000,)


### 3.1).Bag of Words

Bag-of-Words is a method to represent text into numerical features.

Let us understand this using a simple example. Suppose we have only 2 document

- D1: He is a lazy boy. She is also lazy.

- D2: Smith is a lazy person.

The list created would consist of all the unique tokens in the corpus C.

= [‘He’,’She’,’lazy’,’boy’,’Smith’,’person’]

Here, D=2, N=6



In [13]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import gensim

bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(features)
bow.shape

(10000, 1000)

#### 3.1.a. Transform test data

In [14]:
feature_test.head(2)

Unnamed: 0.1,Unnamed: 0,title
0,153245,iPhone 6 Release Date Pushed Back Due to Issue...
1,308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


In [15]:
bow_test = bow_vectorizer.transform(feature_test["title"])

In [16]:
print(bow.shape)
print(bow_test.shape)

(10000, 1000)
(84484, 1000)


### 3.2)-TF-IDF

This is another method which is based on the frequency method but it is different to the bag-of-words approach in the sense that it takes into account not just the occurrence of a word in a single document (or tweet) but in the entire corpus.

TF-IDF works by penalising the common words by assigning them lower weights while giving importance to words which are rare in the entire corpus but appear in good numbers in few documents.

Let’s have a look at the important terms related to TF-IDF:

- TF = (Number of times term t appears in a document)/(Number of terms in the document)

- IDF = log(N/n), where, N is the number of documents and n is the number of documents a term t has appeared in.

- TF-IDF = TF*IDF

In [17]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
# TF-IDF feature matrix
tfidf = tfidf_vectorizer.fit_transform(features)
tfidf.shape

(10000, 1000)

#### 3.2.a. Transform Test data

In [18]:
tfidf_test = tfidf_vectorizer.transform(feature_test["title"])

In [19]:
print(tfidf.shape)
print(tfidf_test.shape)

(10000, 1000)
(84484, 1000)


### 3.3)- Doc2Vec Embedding

In [20]:
from tqdm import tqdm 
tqdm.pandas(desc="progress-bar") 
from gensim.models.doc2vec import TaggedDocument

In [21]:
tokenized_text = data['clean'].apply(lambda x: x.split()) # tokenizing

In [22]:
def add_label(twt):
    output = []
    for i, s in zip(twt.index, twt):
        output.append(TaggedDocument(s, ["clean_" + str(i)]))
    return output
labeled_text = add_label(tokenized_text) # label all the news

##### 3.3.a.Train doc2vec model

In [23]:
model_d2v = gensim.models.Doc2Vec(dm=1,dm_mean=1,vector_size=200,window=5,negative=7,min_count=5,workers=3,alpha=0.1,seed=23)

In [24]:
model_d2v.build_vocab([i for i in tqdm(labeled_text)])

100%|██████████| 10000/10000 [00:00<00:00, 1565856.79it/s]


##### 3.3.b.Preparing doc2vec Feature Set

In [25]:
docvec_arrays = np.zeros((len(tokenized_text), 200))
for i in range(len(data)):
    docvec_arrays[i,:] = model_d2v.docvecs[i].reshape((1,200))

    
docvec_df = pd.DataFrame(docvec_arrays)
docvec_df.shape

(10000, 200)

In [26]:
docvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.002366,0.000339,0.001979,-0.001805,-0.002123,-0.000421,0.002377,-0.000688,-0.001438,-0.001871,-0.000374,0.001587,0.001149,0.002127,-0.002418,0.000411,-0.000381,-0.001502,-0.000897,0.002436,0.001844,-0.002053,0.000721,-0.002106,0.002469,-0.002324,-0.002378,0.001189,-0.001385,0.001828,-0.000792,0.002368,-0.000378,0.002183,-0.001297,-0.001243,-0.001983,0.000398,0.000177,-0.002199,...,-0.002043,-0.001246,-0.000738,0.001034,-0.000118,-0.002086,0.001213,0.001872,0.000925,0.002165,0.001009,0.001001,-0.001959,0.002161,-0.001329,0.001841,0.002246,0.001914,-0.002029,-0.001149,0.001282,0.000296,0.002333,-0.000172,-0.002198,-0.002247,-0.000705,-0.001203,-0.002135,-0.002291,0.001395,-0.000762,-0.002241,0.002103,0.002445,-0.002202,-0.00138,-0.001185,0.001691,-0.000495
1,0.002128,0.001104,-0.000378,-0.000623,0.000313,0.002049,0.000831,-0.002358,-0.001693,0.002137,-0.001951,0.001041,0.000403,-0.001987,-0.002405,-0.000736,0.002039,0.001095,-0.001677,-0.000918,-0.002424,-0.001823,-0.000268,0.001134,0.000434,-0.001661,0.002291,0.000445,-0.000541,-0.001965,-0.000471,0.002207,-0.000544,0.001223,-0.002421,0.001224,0.001014,-0.001414,0.001643,-0.001124,...,-0.001507,-0.001089,-0.000736,0.001829,0.00085,-0.000214,0.001046,0.002488,-0.002163,0.000531,-0.001981,0.000709,-0.001329,0.001299,-0.000887,0.000157,0.000851,-0.000823,-0.002031,0.000386,-0.000545,-0.002038,0.00247,0.001149,0.000142,-0.001933,-0.000588,-0.001018,-0.001215,0.001221,0.001176,-0.001778,0.00194,0.000452,-0.001516,-0.001557,-0.000675,-0.00152,0.001158,0.002369
2,6.5e-05,-0.000689,0.001343,0.000533,-0.001908,-0.000405,-0.00181,0.001767,-0.001856,-0.000623,0.00228,0.001935,0.00147,0.002421,-0.002261,-0.001744,0.002102,-0.00124,0.001056,-0.000818,-0.001231,0.002062,-0.000701,-0.002208,0.000649,0.001145,-0.00227,0.001294,-0.000701,-0.000889,-0.000976,-0.002186,-0.002323,0.002071,-0.00033,0.001056,-0.000102,-0.001344,-0.001852,-0.000376,...,0.001373,-0.00153,0.000928,0.000566,-0.001577,0.001878,0.00139,0.000932,-0.001076,-0.000898,0.000778,-0.001273,-0.001925,-0.001208,-0.002446,0.00058,-0.002137,-0.001458,0.00182,0.000492,-0.000329,0.000863,-0.001396,0.00168,-0.000694,-8.6e-05,0.002172,0.000933,0.002309,0.000198,-0.001654,-0.000968,0.00096,-0.001756,-0.001751,-0.001123,0.000363,0.000314,0.001539,-0.001834
3,0.000201,-0.000361,0.001026,0.001459,-0.000964,-0.000134,-0.002101,0.000875,0.001847,-0.001982,-5.3e-05,0.001899,-0.001629,0.001419,0.001246,-0.001224,-0.001636,-0.001737,0.000567,-0.002167,-0.001904,-0.000717,-0.002062,-0.000256,0.002052,0.002191,-0.00022,0.001064,-0.001673,-0.002367,0.000574,0.002202,-0.001786,-0.00181,-0.001121,-0.000466,0.002178,-0.000745,-0.001178,-0.00198,...,-0.001113,-0.001873,-0.00128,-0.000493,0.002181,0.001525,0.00146,0.000856,-0.000972,0.002169,5.4e-05,0.000992,-0.00063,-0.000629,-0.001868,0.001371,-0.002051,-0.001815,0.000115,-0.002097,-0.001085,-1e-06,-0.000588,0.002337,0.002203,1.4e-05,-0.002169,-0.000261,-0.001585,0.000481,0.000493,0.000274,-0.001928,-0.000314,-0.00183,-0.001554,-0.002397,-0.001592,-0.001095,-0.002108
4,-0.00108,0.001557,-0.000543,0.001735,0.000248,-0.001405,0.000292,-0.001002,-0.001321,-0.000316,-0.001278,-0.001844,-0.002068,-0.002425,0.000711,0.000497,-0.001007,-0.002499,-0.000929,-0.002368,0.001957,0.00093,0.001415,-0.001256,-0.002102,0.00112,-0.000944,0.001774,-0.001859,-0.002065,0.001516,0.000272,0.00042,-0.001943,0.00236,-0.000249,0.0001,0.000546,0.000405,-0.000781,...,0.001204,-0.002027,0.00208,8.2e-05,0.002183,0.0023,-0.00043,-0.001808,0.002177,-0.000963,-0.002259,-0.000531,-3.8e-05,0.000837,-0.001778,-0.000956,-0.000592,-6.7e-05,0.001143,-0.001351,0.001456,-0.000255,-0.001258,0.001502,0.000837,-0.001609,0.00089,-0.001653,0.000429,-0.000914,0.000332,0.000802,0.0012,0.001257,-0.00139,-0.000465,-0.001989,-0.002416,0.002157,0.000595


#### 3.3.3.transform test data

In [27]:
feature_test.head(2)

Unnamed: 0.1,Unnamed: 0,title
0,153245,iPhone 6 Release Date Pushed Back Due to Issue...
1,308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


In [28]:
tokenized_text_test = feature_test['title'].apply(lambda x: x.split()) # tokenizing

In [29]:
labeled_text_test = add_label(tokenized_text_test) # label all the news

In [30]:
docvec_arrays_test = np.zeros((len(tokenized_text_test), 200))
for i in range(len(data)):
    docvec_arrays_test[i,:] = model_d2v.docvecs[i].reshape((1,200))

    
docvec_df_test = pd.DataFrame(docvec_arrays_test)


In [31]:
print(docvec_df.shape) # training
print(docvec_df_test.shape) # for testing

(10000, 200)
(84484, 200)


Notice I have only transformed test set and didn't train. So only my train set learns about vocab of corpus. My test model has only been transformed and learnt nothing.

### 3.4.Word2Vec Embedding

In [32]:
tokenized_text = data['clean'].apply(lambda x: x.split()) # tokenizing

model_w2v = gensim.models.Word2Vec(
            tokenized_text,
            size=200, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2,
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling i.e class with other types
            workers= 2, # no.of cores
            seed = 34) 

model_w2v.train(tokenized_text, total_examples= len(data['clean']), epochs=20)

(1329944, 1633380)

In [33]:
model_w2v.wv['nasdaq']

array([-0.02183259, -0.4366339 ,  0.06510326,  0.72783786,  0.24485947,
        0.33850783,  0.3416138 , -0.50301486,  0.29840896,  0.12920056,
        0.09041848, -0.50774693,  0.34069255, -0.43587837, -0.4792716 ,
        0.1502633 , -0.16197556,  0.3467329 , -0.26033154, -0.22837308,
       -0.2995486 ,  0.00264726,  0.32378572, -0.42868358,  0.03162241,
       -0.03673098, -0.34075603,  0.09162065, -0.15626894, -0.29774606,
        0.15991998, -0.05255641,  0.45747536, -0.31701863, -0.01571083,
       -0.13146812,  0.42375457, -0.31759384, -0.12677263,  0.25047785,
        0.44083124,  0.34026486,  0.02389309,  0.14316827,  0.26641372,
       -0.01980806, -0.14764051, -0.00484273,  0.03565803,  0.2957805 ,
       -0.00778876, -0.40688795, -0.72026014,  0.0653047 ,  0.13537495,
        0.16425957, -0.48244882, -0.5117245 ,  0.24898359, -0.30187702,
        0.29498556,  0.3112112 ,  0.10090353,  0.5491042 , -0.03915667,
        0.08010899, -0.2108901 , -0.051441  ,  0.5548677 ,  0.18

In [34]:
len(model_w2v.wv['nasdaq'])

200

In [35]:
type(model_w2v)

gensim.models.word2vec.Word2Vec

##### 3.4.1.Preparing Vectors for text data

In [36]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary           
            continue
    if count != 0:
        vec /= count
    return vec

##### 3.4.2.Preparing word2vec feature set

In [37]:
wordvec_arrays = np.zeros((len(tokenized_text), 200)) 
for i in range(len(tokenized_text)):
    wordvec_arrays[i,:] = word_vector(tokenized_text[i], 200)
    wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.201549,-0.05888,0.098633,0.223172,0.180056,0.188836,0.038142,-0.237002,0.175908,0.168333,0.400366,-0.097706,0.339135,-0.493795,-0.143372,0.136877,-0.15549,-0.005183,0.23946,-0.241012,0.049904,-0.058397,-0.036771,-0.039228,0.077104,0.060888,-0.138675,0.189417,-0.366141,0.193656,0.105065,-0.137805,0.044918,-0.063056,0.005776,0.025565,0.145569,-0.053161,-0.098633,0.118543,...,0.10819,-0.039113,0.203187,-0.069502,-0.126691,0.050342,-0.292074,0.20855,0.06103,-0.105535,0.419986,-0.170998,-0.188886,-0.254052,-0.097026,0.107523,0.005968,0.167282,0.111037,0.13101,-0.007265,-0.17276,-0.145243,0.177934,0.041374,0.040244,-0.189848,0.27411,0.189838,-0.19038,-0.160189,0.084203,-0.043201,-0.130171,-0.337717,-0.1023,0.38971,0.029564,-0.295826,-0.001734
1,0.219597,-0.160597,0.16634,0.089833,0.047616,0.276412,0.218447,-0.017701,0.23518,-0.197354,0.054539,-0.00713,0.027033,-0.296322,-0.056736,0.069059,-0.071355,0.288428,-0.243389,-0.167503,0.079687,0.129679,-0.125811,-0.115843,0.03842,-0.087136,-0.05244,0.092082,-0.242383,0.319137,0.226937,0.042859,0.00353,-0.019998,0.137297,-0.35774,0.05841,0.098637,0.152728,0.084178,...,-0.2881,-0.109103,0.391574,-0.122902,-0.120003,0.052285,0.161362,-0.001153,-0.215084,0.34357,0.131715,0.045053,0.123937,0.04827,-0.401386,0.019845,-0.167026,-0.114807,0.143618,0.099672,0.009963,-0.122294,-0.180833,0.10729,0.169298,0.074593,0.20064,0.230538,0.060314,-0.057462,-0.048102,-0.198468,0.216565,-0.100224,-0.426578,-0.105106,0.609833,0.150369,0.085569,-0.130088
2,0.068557,-0.153593,0.196505,0.279166,0.115757,0.131893,0.287359,-0.22362,0.352287,0.065734,-0.060602,-0.144413,0.13402,-0.25904,0.005957,0.096336,-0.27219,0.223377,-0.325389,-0.318443,0.049387,0.105697,0.09377,0.089876,0.102705,-0.060413,0.033723,0.244376,-0.123585,0.021627,-0.000558,0.021347,0.060062,-0.132571,0.147042,-0.119347,0.089759,0.0156,0.084882,0.058693,...,-0.029265,-0.14418,0.171024,-0.126562,-0.079471,0.301216,0.088441,0.062065,-0.028333,0.035766,0.24998,-0.244441,-0.149104,-0.466875,-0.100066,-0.073692,-0.088428,0.155025,-0.012406,0.199835,0.101945,0.037484,0.083273,0.122808,0.151421,0.048715,-0.102749,0.283339,0.009009,0.020612,-0.139026,-0.223437,0.168517,-0.05223,-0.197929,-0.002978,0.081422,-0.033059,-0.080726,0.033157
3,0.145885,-0.114159,0.145472,0.203797,0.180099,0.255026,0.194909,-0.146707,0.382357,-0.038965,0.035106,0.122333,0.326634,-0.423494,-0.132386,0.107922,-0.198404,0.141435,-0.005548,-0.151942,-0.052321,-0.096661,-0.060486,0.144787,0.044767,-0.055905,0.026712,0.113796,-0.00629,0.059798,0.000675,0.123884,0.13243,0.050783,-0.061047,-0.176002,0.208373,0.035253,0.127397,0.145189,...,-0.170562,-0.017938,-0.0446,-0.294726,0.009306,0.154933,-0.027945,0.356192,-0.120101,-0.017419,0.340788,-0.251561,-0.222637,-0.180434,0.009217,0.071818,-0.104818,0.193944,0.158525,0.081381,0.044801,0.16117,-0.137656,0.194172,0.148202,-0.054651,-0.041976,0.098264,-0.084219,-0.11189,-0.085854,-0.260783,0.091261,-0.200442,-0.179743,-0.076632,0.139646,0.103295,-0.073316,-0.070356
4,0.188864,-0.343754,0.201325,0.165398,0.301113,0.339103,0.172763,-0.16595,0.477747,0.123165,0.145546,0.092101,-0.016997,-0.467255,-0.07021,-0.07231,-0.111554,0.170839,-0.193572,-0.324694,-0.111662,-0.114044,0.16913,-0.066418,0.176054,-0.247058,0.135439,0.100032,0.009467,0.228698,0.092868,0.189334,0.095383,-0.099921,0.094894,-0.181975,0.026453,0.096115,-0.156544,0.054113,...,-0.252864,0.020232,0.156052,-0.256226,0.040098,0.153394,-0.084524,-0.013671,-0.039235,-0.04207,0.325697,0.011849,0.003295,-0.212633,-0.211698,0.166788,-0.330244,-0.020508,0.446825,0.167402,-0.106501,-0.003399,0.207154,0.32742,0.148758,-0.051584,0.017852,0.276817,0.231715,0.089713,0.28881,0.003133,0.177812,-0.111687,-0.192715,-0.357444,0.129014,0.360462,-0.056897,-0.303038


In [38]:
wordvec_df.shape

(10000, 200)

#### 3.4.3.transform test data

In [39]:
feature_test.head(2)

Unnamed: 0.1,Unnamed: 0,title
0,153245,iPhone 6 Release Date Pushed Back Due to Issue...
1,308611,Samsung Galaxy S4 vs Galaxy S3: Budget-Friendl...


In [40]:
tokenized_text_test = feature_test['title'].apply(lambda x: x.split()) # tokenizing

In [41]:
wordvec_arrays = np.zeros((len(tokenized_text_test), 200)) 
for i in range(len(tokenized_text_test)):
    wordvec_arrays[i,:] = word_vector(tokenized_text_test[i], 200)
    wordvec_df_test = pd.DataFrame(wordvec_arrays)
wordvec_df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199
0,0.208947,-0.076844,0.179379,0.280777,0.046792,0.867512,0.257228,-0.461147,0.173479,-0.048893,0.307089,-0.242034,0.031046,-0.498594,-0.15975,0.281571,-0.055472,0.101018,-0.111137,-0.14803,-0.360053,-0.225036,0.153846,0.505878,-0.036454,-0.1438,-0.410052,-0.226629,0.13998,0.242914,0.445334,0.01467,-0.137722,-0.345805,-0.33567,-0.043771,0.398432,0.164009,-0.01723,0.185658,...,0.063325,-0.179066,0.041801,-0.036524,-0.411031,0.187748,-0.151222,0.010793,-0.25038,-0.122431,0.05957,0.046637,-0.592476,-0.817895,-0.336659,-0.215695,-0.259888,-0.079524,0.241328,-0.026151,0.025903,-0.475204,-0.033057,0.405606,-0.379209,-0.070627,-0.1858,0.637822,-0.074722,-0.289431,-0.214507,-0.586476,0.319888,-0.370352,-0.313744,-0.162246,0.262717,0.124041,0.060021,0.161705
1,0.289732,-0.248723,0.023323,0.056374,0.239139,0.127363,-0.248794,-0.509878,-0.13323,0.472773,0.436935,0.317511,0.46044,-0.116106,-0.495276,-0.184468,-0.469791,-0.162628,0.214155,-0.369042,0.422644,-0.356284,-0.009062,-0.297467,0.590991,0.304082,-0.333202,-0.220233,-0.025756,-0.160637,-0.329871,-0.376956,0.285357,-0.287015,0.112305,0.037366,0.548521,-0.483419,-0.246519,0.081826,...,0.38725,0.164938,0.267385,-0.28019,-0.463488,0.129127,0.160741,0.221796,0.207474,-0.090803,0.765732,0.084728,-0.436326,-0.407389,0.102193,0.175556,-0.261573,-0.091003,-0.062443,0.19406,0.388039,0.403039,-0.073053,-0.009276,0.509792,0.045316,-0.396843,-0.104715,-0.163059,0.095767,-0.602258,-0.000553,0.122325,-0.290399,-0.441995,-0.590126,0.364988,0.422009,-0.721807,0.37295
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.020771,-0.480825,0.128506,0.213706,0.300395,0.319023,0.249774,-0.214976,0.145893,-0.21284,0.058646,-0.452445,0.015384,-0.124101,0.074724,0.123463,-0.244625,0.308892,-0.045261,-0.020364,0.085151,0.137208,0.311494,-0.036514,-0.098276,-0.112019,0.137699,0.015872,-0.036999,0.055497,0.180919,0.042164,0.128892,-0.306811,0.075289,0.011943,0.015532,-0.106327,-0.184,-0.018959,...,-0.124567,-0.104154,0.084601,-0.079162,0.178241,0.225159,0.300026,0.148079,0.030213,0.21316,0.175331,0.074395,-0.347344,-0.187774,-0.15973,-0.003938,-0.157141,0.024359,0.085069,0.041583,0.00196,-0.178259,0.225099,-0.03196,-0.021985,0.123214,0.129897,0.170347,0.235304,0.161928,-0.154995,-0.033598,0.22482,-0.091931,-0.324775,0.112336,0.217432,0.14583,-0.117171,-0.132957


In [42]:
print(wordvec_df.shape)
print(wordvec_df_test.shape)


(10000, 200)
(84484, 200)


It may look weird as we have less data for train and more to test. But, training is computing intense. So, this will help us. Plus eventually I will train my best performing model to whole data.

# 4)-Model Building

- Logistic Regression 
- Support Vector
- Random Forest
- XGBoost
- MLP

### 4.1.Logistic Regression Model

In [43]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score, roc_curve,roc_auc_score,confusion_matrix, classification_report

##### 4.1.a. Logistic Regression using Bag-of-Words Features

In [44]:
X=bow
y=data['category']
print(X.shape)
print(y.shape)
print(bow_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [45]:
# splitting data into training and validation set
#xtrain_bow, xvalid_bow, ytrain, yvalid = train_test_split(bow, y,random_state=42,test_size=0.2)

In [46]:
lreg_bow = LogisticRegression(solver='liblinear')

# training the model
lreg_bow.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
# predicting on the validation set
prediction_bow = lreg_bow.predict_proba(bow_test)
prediction_bow[0]

array([0.16087434, 0.6072539 , 0.04823555, 0.1836362 ])

In [48]:
# prediction over classes

prediction_bow_class=lreg_bow.predict(bow_test)
prediction_bow_class[0]

'e'

In [49]:
accuracy_score(label_test, prediction_bow_class)

0.7044765872827992

In [50]:
from sklearn import metrics
print(metrics.classification_report(label_test, prediction_bow_class))

              precision    recall  f1-score   support

           b       0.73      0.65      0.69     23367
           e       0.65      0.92      0.76     30300
           m       0.75      0.47      0.58      9207
           t       0.80      0.56      0.66     21610

    accuracy                           0.70     84484
   macro avg       0.73      0.65      0.67     84484
weighted avg       0.72      0.70      0.70     84484



##### 4.1.b.Logistic Regression using TF-IDF Features

In [51]:
X=tfidf
y=data['category']
print(X.shape)
print(y.shape)
print(tfidf_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [52]:
lreg_tfidf = LogisticRegression(solver='liblinear')

# training the model
lreg_tfidf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
# predicting on the validation set
prediction_tfidf = lreg_tfidf.predict_proba(tfidf_test)

In [54]:
prediction_tfidf_class=lreg_tfidf.predict(tfidf_test)

In [55]:
accuracy_score(label_test, prediction_tfidf_class)

0.707542256521945

In [56]:
print(metrics.classification_report(label_test, prediction_tfidf_class))

              precision    recall  f1-score   support

           b       0.73      0.66      0.69     23367
           e       0.66      0.92      0.77     30300
           m       0.78      0.46      0.58      9207
           t       0.79      0.57      0.66     21610

    accuracy                           0.71     84484
   macro avg       0.74      0.65      0.67     84484
weighted avg       0.72      0.71      0.70     84484



##### 4.1.c. Logistic Regression using Word2Vec Features

In [57]:
X=wordvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(wordvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [58]:
lreg_word2vec = LogisticRegression(solver='liblinear')
# training the model
lreg_word2vec.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [59]:
# predicting on the validation set
prediction_word2vec = lreg_word2vec.predict_proba(wordvec_df_test)

In [60]:
prediction_word2vec_class=lreg_word2vec.predict(wordvec_df_test)

In [61]:
accuracy_score(label_test, prediction_word2vec_class)

0.4234884711898111

In [62]:
print(metrics.classification_report(label_test, prediction_word2vec_class))

              precision    recall  f1-score   support

           b       0.35      0.79      0.48     23367
           e       0.61      0.31      0.41     30300
           m       0.43      0.33      0.38      9207
           t       0.56      0.22      0.32     21610

    accuracy                           0.42     84484
   macro avg       0.49      0.41      0.40     84484
weighted avg       0.51      0.42      0.40     84484



##### 4.1.d. Logistic Regression using Doc2Vec Features

In [63]:
X=docvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(docvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [64]:
lreg_doc2vec = LogisticRegression(solver='liblinear')
# training the model
lreg_doc2vec.fit(docvec_df, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
# predicting on the validation set
prediction_doc2vec = lreg_doc2vec.predict_proba(docvec_df_test)

In [66]:
prediction_doc2vec_class=lreg_doc2vec.predict(docvec_df_test)

In [67]:
accuracy_score(label_test, prediction_doc2vec_class)

0.35864779129776053

In [68]:
print(metrics.classification_report(label_test, prediction_doc2vec_class))

              precision    recall  f1-score   support

           b       0.00      0.00      0.00     23367
           e       0.36      1.00      0.53     30300
           m       0.00      0.00      0.00      9207
           t       0.00      0.00      0.00     21610

    accuracy                           0.36     84484
   macro avg       0.09      0.25      0.13     84484
weighted avg       0.13      0.36      0.19     84484



**Summary:**

- bow=70%
- tfidf=70%
- word2vec=33%
- doc2vec=35%

### 4.2.Support Vector Machine (SVM)

In [69]:
X=bow
y=data['category']
print(X.shape)
print(y.shape)
print(bow_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [70]:
from sklearn import svm

##### SVM using Bag-of-Words Features

In [71]:
# we need to input arrays to our model.
bow_test = bow_vectorizer.transform(feature_test["title"]).toarray()

In [72]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(X,y)

In [101]:
#prediction = svc.predict_proba(bow_test)

In [73]:
prediction_class = svc.predict(bow_test)

In [74]:
accuracy_score(label_test, prediction_class)

0.6848752426494957

In [75]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.71      0.64      0.67     23367
           e       0.63      0.92      0.75     30300
           m       0.72      0.45      0.56      9207
           t       0.81      0.50      0.62     21610

    accuracy                           0.68     84484
   macro avg       0.72      0.63      0.65     84484
weighted avg       0.71      0.68      0.67     84484



##### SVM using TF-IDF Features

In [76]:
X=tfidf
y=data['category']
print(X.shape)
print(y.shape)
print(tfidf_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [77]:
tfidf_test = tfidf_test.toarray()

In [78]:
svc = svm.SVC(kernel='linear',C=1, probability=True).fit(X,y)
#prediction = svc.predict_proba(xvalid_tfidf)
prediction_class = svc.predict(tfidf_test)
accuracy_score(label_test, prediction_class)

0.6975521992329908

In [79]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.72      0.65      0.68     23367
           e       0.65      0.91      0.76     30300
           m       0.74      0.46      0.57      9207
           t       0.79      0.55      0.65     21610

    accuracy                           0.70     84484
   macro avg       0.72      0.64      0.66     84484
weighted avg       0.71      0.70      0.69     84484



##### SVM using word2vec Features

In [80]:
X=wordvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(wordvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [81]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(X,y)
#prediction = svc.predict_proba(wordvec_df_test)

In [82]:
prediction_class = svc.predict(wordvec_df_test)
accuracy_score(label_test, prediction_class)

0.42191420860754697

In [83]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.34      0.80      0.48     23367
           e       0.63      0.30      0.40     30300
           m       0.43      0.35      0.39      9207
           t       0.55      0.22      0.32     21610

    accuracy                           0.42     84484
   macro avg       0.49      0.42      0.40     84484
weighted avg       0.51      0.42      0.40     84484



##### SVM using doc2vec Features

In [84]:
X=docvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(docvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [85]:
svc = svm.SVC(kernel='linear', C=1, probability=True).fit(X,y)

In [86]:
#prediction = svc.predict_proba(xvalid_doc2vec)
prediction_class = svc.predict(docvec_df_test)
accuracy_score(label_test, prediction_class)

0.35864779129776053

In [87]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.00      0.00      0.00     23367
           e       0.36      1.00      0.53     30300
           m       0.00      0.00      0.00      9207
           t       0.00      0.00      0.00     21610

    accuracy                           0.36     84484
   macro avg       0.09      0.25      0.13     84484
weighted avg       0.13      0.36      0.19     84484



**Summary**


- bow = 69%
- tfidf= 68%
- word2vec= 42% 
- doc2vec= 41%

### 4.3.Random Forest

In [88]:
from sklearn.ensemble import RandomForestClassifier

##### RF with Bag-of-Words Features

In [89]:
X=bow
y=data['category']
print(X.shape)
print(y.shape)
print(bow_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [90]:
rf = RandomForestClassifier().fit(X, y)

In [91]:
#prediction = rf.predict_proba(xvalid_bow)
prediction_class = rf.predict(bow_test)
accuracy_score(label_test, prediction_class)

0.654088348089579

##### RF with TF-IDF Features

In [92]:
X=tfidf
y=data['category']
print(X.shape)
print(y.shape)
print(tfidf_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [93]:
rf = RandomForestClassifier().fit(X, y)

In [94]:
#prediction = rf.predict_proba(xvalid_tfidf)
prediction_class = rf.predict(tfidf_test)
accuracy_score(label_test, prediction_class)

0.6657591970077175

In [95]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.70      0.61      0.65     23367
           e       0.64      0.87      0.73     30300
           m       0.61      0.47      0.53      9207
           t       0.73      0.52      0.61     21610

    accuracy                           0.67     84484
   macro avg       0.67      0.62      0.63     84484
weighted avg       0.67      0.67      0.66     84484



##### RF with word2vec Features

In [96]:
X=wordvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(wordvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [97]:
rf = RandomForestClassifier().fit(X, y)
#prediction= rf.predict_proba(wordvec_df_test)
prediction_class = rf.predict(wordvec_df_test)
accuracy_score(label_test, prediction_class)

0.5087235452866815

In [98]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.61      0.38      0.47     23367
           e       0.47      0.87      0.61     30300
           m       0.55      0.27      0.36      9207
           t       0.55      0.25      0.34     21610

    accuracy                           0.51     84484
   macro avg       0.55      0.44      0.45     84484
weighted avg       0.54      0.51      0.48     84484



##### RF with doc2vec Feature

In [99]:
X=docvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(docvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [100]:
rf = RandomForestClassifier().fit(X, y)
prediction_class = rf.predict(docvec_df_test)
accuracy_score(label_test, prediction_class)

0.3488826286634156

In [101]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.27      0.03      0.06     23367
           e       0.36      0.92      0.52     30300
           m       0.11      0.01      0.02      9207
           t       0.26      0.03      0.06     21610

    accuracy                           0.35     84484
   macro avg       0.25      0.25      0.16     84484
weighted avg       0.28      0.35      0.22     84484



**Summary**

- bow = 57%
- tfidf = 56%
- word2vec = 57%
- doc2vec = 41%

# 4.4.XGBoost
Extreme Gradient Boosting (xgboost) is an advanced implementation of gradient boosting algorithm. It has both linear model solver and tree learning algorithms. Its ability to do parallel computation on a single machine makes it extremely fast. It also has additional features for doing cross validation and finding important variables. There are many parameters which need to be controlled to optimize the model.

Some key benefits of XGBoost are:

Regularization - helps in reducing overfitting
Parallel Processing - XGBoost implements parallel processing and is blazingly faster as compared to GBM.
Handling Missing Values - It has an in-built routine to handle missing values.
Built-in Cross-Validation - allows user to run a cross-validation at each iteration of the boosting process

**Notice there is no sklearn ready made model therefore; I needed to use XGBoost from its main librrary**

In [102]:
from xgboost import XGBClassifier

##### XGBoost using bag of words features

In [103]:
X=bow
y=data['category']
print(X.shape)
print(y.shape)
print(bow_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [104]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(X, y)
prediction = xgb_model.predict_proba(bow_test)
prediction_class = xgb_model.predict(bow_test)
accuracy_score(label_test, prediction_class)

0.35864779129776053

In [105]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.00      0.00      0.00     23367
           e       0.36      1.00      0.53     30300
           m       0.00      0.00      0.00      9207
           t       0.00      0.00      0.00     21610

    accuracy                           0.36     84484
   macro avg       0.09      0.25      0.13     84484
weighted avg       0.13      0.36      0.19     84484



##### XGBoost using tfidf features

In [106]:
X=tfidf
y=data['category']
print(X.shape)
print(y.shape)
print(tfidf_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [107]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(X, y)
prediction = xgb_model.predict_proba(tfidf_test)
prediction_class = xgb_model.predict(tfidf_test)
accuracy_score(label_test, prediction_class)

0.2557880782159936

In [108]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.00      0.00      0.00     23367
           e       0.00      0.00      0.00     30300
           m       0.00      0.00      0.00      9207
           t       0.26      1.00      0.41     21610

    accuracy                           0.26     84484
   macro avg       0.06      0.25      0.10     84484
weighted avg       0.07      0.26      0.10     84484



##### XGBoost using word2vecfeatures

In [109]:
X=wordvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(wordvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [110]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(X, y)
prediction_class = xgb_model.predict(wordvec_df_test)
accuracy_score(label_test, prediction_class)

0.43125325505421147

In [111]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.61      0.39      0.48     23367
           e       0.60      0.33      0.43     30300
           m       0.51      0.32      0.40      9207
           t       0.30      0.66      0.41     21610

    accuracy                           0.43     84484
   macro avg       0.51      0.43      0.43     84484
weighted avg       0.52      0.43      0.44     84484



##### XGBoost using doc2vec features

In [112]:
X=docvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(docvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [113]:
xgb_model = XGBClassifier(max_depth=6, n_estimators=1000).fit(X, y)
prediction_class = xgb_model.predict(docvec_df_test)
accuracy_score(label_test, prediction_class)

0.3488826286634156

In [114]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.27      0.03      0.06     23367
           e       0.36      0.92      0.52     30300
           m       0.11      0.01      0.02      9207
           t       0.26      0.03      0.06     21610

    accuracy                           0.35     84484
   macro avg       0.25      0.25      0.16     84484
weighted avg       0.28      0.35      0.22     84484



**Summary**

- bow = 54%
- tfidf = 55%
- word2vec = 58%
- doc2vec = 37%

### 4.5.MLPClassifier

A multilayer perceptron (MLP) is a class of feedforward artificial neural network

In [115]:
from sklearn.neural_network import MLPClassifier

##### MLP using bag of words features

In [122]:
X=bow
y=data['category']
print(X.shape)
print(y.shape)
print(bow_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [123]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(X, y)
prediction = mlp_model.predict_proba(bow_test)
prediction_class = mlp_model.predict(bow_test)
accuracy_score(label_test, prediction_class)

0.6599356091094172

In [124]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.69      0.59      0.64     23367
           e       0.68      0.81      0.74     30300
           m       0.49      0.53      0.51      9207
           t       0.68      0.58      0.63     21610

    accuracy                           0.66     84484
   macro avg       0.63      0.63      0.63     84484
weighted avg       0.66      0.66      0.66     84484



##### MLP using tfidf features

In [119]:
X=tfidf
y=data['category']
print(X.shape)
print(y.shape)
print(tfidf_test.shape)
print(label_test.shape)

(10000, 1000)
(10000,)
(84484, 1000)
(84484, 1)


In [120]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(X, y)
prediction_class = mlp_model.predict(tfidf_test)
accuracy_score(label_test, prediction_class)

0.6671085649353724

In [121]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.69      0.61      0.65     23367
           e       0.67      0.83      0.74     30300
           m       0.57      0.49      0.53      9207
           t       0.68      0.57      0.62     21610

    accuracy                           0.67     84484
   macro avg       0.65      0.63      0.63     84484
weighted avg       0.67      0.67      0.66     84484



##### MLP using word2vecfeatures

In [125]:
X=wordvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(wordvec_df_test.shape)
print(label_test.shape)

(10000, 200)
(10000,)
(84484, 200)
(84484, 1)


In [126]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(X, y)
prediction_class = mlp_model.predict(wordvec_df_test)
accuracy_score(label_test, prediction_class)

0.4867548884995976

In [127]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.53      0.39      0.45     23367
           e       0.47      0.81      0.59     30300
           m       0.44      0.32      0.37      9207
           t       0.55      0.21      0.30     21610

    accuracy                           0.49     84484
   macro avg       0.50      0.43      0.43     84484
weighted avg       0.50      0.49      0.45     84484



##### MLP using doc2vec features

In [None]:
X=docvec_df
y=data['category']
print(X.shape)
print(y.shape)
print(docvec_df_test.shape)
print(label_test.shape)

In [None]:
mlp_model = MLPClassifier(random_state=1, max_iter=300,learning_rate_init=0.001).fit(X, y)
prediction_class = mlp_model.predict(docvec_df_test)
accuracy_score(label_test, prediction_class)

0.4117647058823529

In [128]:
print(metrics.classification_report(label_test, prediction_class))

              precision    recall  f1-score   support

           b       0.53      0.39      0.45     23367
           e       0.47      0.81      0.59     30300
           m       0.44      0.32      0.37      9207
           t       0.55      0.21      0.30     21610

    accuracy                           0.49     84484
   macro avg       0.50      0.43      0.43     84484
weighted avg       0.50      0.49      0.45     84484



**Summary**

- bow = 65%
- tfidf = 66%
- word2vec = 48%
- doc2vec = 41%

# END OF NOTEBOOK CODE