# Converting the Text reviews to numeric values using Document embedding.

In [41]:
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [8]:
df = pd.read_csv("https://spotleai.sgp1.digitaloceanspaces.com/course/data/movie_review_data.csv", encoding='utf-8')

                                               review  sentiment
0   Based on an actual story, John Boorman shows t...          1
1   This is a gem. As a Film Four production - the...          1
2   I really like this show. It has drama, romance...          1
3   This is the best 3-D experience Disney has at ...          1
4   Of the Korean movies I've seen, only three had...          1
5   this movie is funny funny funny my favorite qu...          1
6   I'm just starting to explore the so far wonder...          1
7   There is no need for me to repeat the synopsis...          1
8   I got this movie with my BBC "Jane Austen Coll...          1
9   This was a great movie, I would compare it to ...          1
10  I absolutely fell in love with this girls. let...          1
11  It started off weird, the middle was weird, an...          1
12  If you like silly comedies like Airplane you'l...          1
13  The Italian Job requires daylight hours and no...          1
14  I watch a lot of movi

In [9]:
print(df.head(100))

                                               review  sentiment
0   Based on an actual story, John Boorman shows t...          1
1   This is a gem. As a Film Four production - the...          1
2   I really like this show. It has drama, romance...          1
3   This is the best 3-D experience Disney has at ...          1
4   Of the Korean movies I've seen, only three had...          1
5   this movie is funny funny funny my favorite qu...          1
6   I'm just starting to explore the so far wonder...          1
7   There is no need for me to repeat the synopsis...          1
8   I got this movie with my BBC "Jane Austen Coll...          1
9   This was a great movie, I would compare it to ...          1
10  I absolutely fell in love with this girls. let...          1
11  It started off weird, the middle was weird, an...          1
12  If you like silly comedies like Airplane you'l...          1
13  The Italian Job requires daylight hours and no...          1
14  I watch a lot of movi

In [12]:
X_train = list(df.loc[:25000, 'review'].values)
X_test = list(df.loc[25001:, 'review'].values)

In [17]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(X_train)]

In [19]:
max_epochs = 50
vec_size = 100
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
        alpha=alpha, 
        min_alpha=0.00025,
        min_count=1,
        dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
  model.train(tagged_data,
        total_examples=model.corpus_count,
        epochs=model.epochs)
  model.alpha -= 0.0002
  model.min_alpha = model.alpha

model.save("doc.vec")
print("Model Saved")

Model Saved


In [23]:
model= Doc2Vec.load("doc.vec")

In [25]:
narr = np.append([], [])
for text in X_train:
  txt = word_tokenize(text.lower())
  vect = model.infer_vector(txt)
  narr = np.append(narr, vect)

In [32]:
for text in X_test:
  txt = word_tokenize(text.lower())
  vect = model.infer_vector(txt)
  narr = np.append(narr, vect)
narr = np.resize(narr, (50000, vec_size))
narr.shape

(50000, 100)

In [33]:
df_trans = pd.DataFrame(narr, columns=['col_'+ str(i+1) for i in range(narr.shape[1])])
df_trans['sentiment'] = df['sentiment']
df_trans.to_csv('review_doc2_vectors.csv')
df_trans.tail()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,...,col_92,col_93,col_94,col_95,col_96,col_97,col_98,col_99,col_100,sentiment
49995,0.202095,0.537234,-1.088663,-0.525027,0.248526,-0.261927,-0.196288,0.268774,0.37192,-0.008234,...,-0.462995,-0.002993,-0.633289,0.76851,0.038059,0.278316,-0.195076,0.173932,-0.299198,0
49996,-0.233895,-0.109597,-0.701388,0.007983,0.742177,-0.513126,-0.523896,0.23632,0.447604,-0.190016,...,-0.438865,-0.530374,-0.028351,0.483884,-0.445169,-0.13079,-0.480975,-0.397844,-0.108381,0
49997,0.397097,0.799458,-1.116067,0.564277,1.447332,-0.513052,0.1875,-0.071688,0.551531,0.918141,...,-0.193167,-0.307577,-0.473804,0.491824,-0.391194,-0.142803,0.101859,0.214777,0.182484,0
49998,-0.110396,0.391524,-0.196285,-0.352988,0.760605,-0.506514,-0.614894,0.178324,0.361245,0.483047,...,0.038585,-0.218119,0.162602,-7.8e-05,-0.256859,-0.010412,-0.475135,-0.09334,-0.195824,0
49999,0.245743,-0.094019,-0.383576,-0.839952,0.926735,-0.441442,-1.484607,0.394144,-0.510404,0.469206,...,0.37956,-0.358242,-0.665567,0.980332,-0.770728,-1.56629,-1.366231,-0.75936,0.090646,0


# Document Embeddings followed by Decision Tree Classifier


In [35]:
df1 = pd.read_csv("review_doc2_vectors.csv")

In [36]:
y = df1.pop('sentiment')
X = df1.values
X_train = df1.loc[:25000].values
X_test = df1.loc[25001:].values
y_train = y.loc[:25000:].values
y_test = y.loc[25001:].values

In [39]:
model = DecisionTreeClassifier()

In [40]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
      max_features=None, max_leaf_nodes=None,
      min_impurity_decrease=0.0, min_impurity_split=None,
      min_samples_leaf=1, min_samples_split=2,
      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
      splitter='best')

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [42]:
predicted = model.predict(X_test)
acc = round(accuracy_score(y_test, predicted)*100,2)
print("Accuracy of the model with test dataset: ", acc)

Accuracy of the model with test dataset:  50.0
