In [12]:
!pip install -q pyyaml h5py

In [13]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import seaborn as sns
import matplotlib.pyplot as plt
import json
import tensorflow as tf
from tensorflow import keras

In [14]:
DATA_LOC = 'data/'
FILE_NAME = 'data_train.json'

# Load in JSON files and extract from JSON
reviews_json = open(DATA_LOC + FILE_NAME, 'r')
reviews = json.load(reviews_json)

# Analyze the distribution of stars and reviews as seperate lists
texts = [review['text'] for review in reviews] # Features
stars = [review['stars'] for review in reviews] # Labels

df = {'Text' : texts, 'Stars' : stars}
df = pd.DataFrame(df)

In [15]:
stemmer = PorterStemmer()
words = stopwords.words("english")
df['Text'] = df['Text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [57]:
df['Text'] = df['Text'].apply(lambda x: x.split(' '))

In [58]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [59]:
train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['Text'], tags=[r.Stars]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['Text'], tags=[r.Stars]), axis=1)

In [60]:
model = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)
model.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 234006/234006 [00:00<00:00, 2813580.69it/s]


In [62]:
%%time
for epoch in range(30):
    model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model.alpha -= 0.002
    model.min_alpha = model.alpha

100%|██████████| 234006/234006 [00:00<00:00, 2864633.83it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3123831.70it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3397977.82it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3458260.26it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3112489.07it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3007253.32it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3238756.83it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3258141.44it/s]
100%|██████████| 234006/234006 [00:00<00:00, 2968192.77it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3289084.12it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3428973.75it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3376527.20it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3273037.50it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3270300.18it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3346913.08it/s]
100%|██████████| 234006/234006 [00:00<00:00, 3465537.85it/s]
100%|██████████| 234006/

CPU times: user 35min 57s, sys: 3min 24s, total: 39min 21s
Wall time: 18min 5s


In [63]:
def teach_vectors(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [64]:
y_train, X_train = teach_vectors(model, train_tagged)
y_test, X_test = teach_vectors(model, test_tagged)
lr = LogisticRegression(n_jobs=1, C=1e5)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)



In [68]:
# confusion matrix and classification report(precision, recall, F1-score)
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Testing accuracy 0.566363210322169
Testing F1 score: 0.5048505185038825
              precision    recall  f1-score   support

         1.0       0.56      0.64      0.60     14754
         2.0       0.38      0.09      0.15      8231
         3.0       0.41      0.15      0.22     10972
         4.0       0.42      0.20      0.27     22039
         5.0       0.60      0.92      0.73     44293

    accuracy                           0.57    100289
   macro avg       0.48      0.40      0.39    100289
weighted avg       0.52      0.57      0.50    100289

[[ 9441   403   331   553  4026]
 [ 3210   755   879   873  2514]
 [ 1742   534  1618  2394  4684]
 [ 1264   211   876  4354 15334]
 [ 1173    81   226  2181 40632]]


In [69]:
from sklearn.externals import joblib
filename = 'nltk_model.sav'
joblib.dump(lr, filename)
# Load the model
# loaded_model = joblib.load(filename)
# result = loaded_model.score(X_test, Y_test)
# print(result)



['nltk_model.sav']