In [1]:
%matplotlib inline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json

In [2]:
#hyperparameters
SIZE = 25000 #the combined size of the training + test set (max 230k)
TRAININGPERCENT = 0.9 #the ratio of the total dataset used for training
MEMORIZE = 10000 #the amount of words to memorize 
POSITIVE = 4 #reviews with stars at or above this value (out of five) are considered positive, CAN BE CHANGED 

In [3]:
trainset = {} #dictionary of texts to positivity for training
testset = {} #dictionary of texts to positivity for testing

reviews = [] #temporary storage for unprocessed json reviews
description = [] #customer review (text)
scores = [] #positivity of review (binary)
    
jsondata = open('videogamereviews.json')

for line in jsondata:
    reviews.append(json.loads(line))
for review in reviews:
    description.append(review.get('reviewText'))
    if(review.get('overall')>=POSITIVE):
        scores.append(1)
    else:
        scores.append(0)
#print(scores[3])

In [4]:
#creating a training set with an equal number of positive/negative cases
marker = 0 
numnegative = 0
numpositive = 0
i=0
trainingscores = []

endtrain = int(SIZE*TRAININGPERCENT) #first test example at this index
while marker<endtrain:
    sentiment = scores[i]
    if(sentiment==1):
        if(numpositive<int(endtrain/2)):
            trainingscores.append(1)
            trainset[description[i]]=1
            numpositive+=1
            marker+=1
    else:
        if(numnegative<endtrain-int(endtrain/2)):
            trainingscores.append(0)
            trainset[description[i]]=0
            numnegative+=1
            marker+=1
    i+=1
len(trainset)
#print(trainset.get("I bought this and the key didn't work.  It was a gift, and the recipient wasn't able to solve the problem.  It might have been a good game, but I never found out because the key failed."))

22497

In [5]:
#check for successful creation of training set
ones = 0
zeros = 0
for sentiment in trainingscores:
    if(sentiment==1):
        ones+=1
    else:
        zeros+=1
ones/zeros

1.0

In [6]:
#creating a test set with an equal number of positive/negative cases
marker = 0 
numnegative = 0
numpositive = 0
testsize = SIZE-endtrain
i=endtrain
testscores = []
while marker<testsize:
    sentiment = scores[i]
    if(sentiment==1):
        if(numpositive<int(testsize/2)):
            testscores.append(1)
            testset[description[i]]=1
            numpositive+=1
            marker+=1
    else:
        if(numnegative<testsize-int(testsize/2)):
            testscores.append(0)
            testset[description[i]]=0
            numnegative+=1
            marker+=1
    i+=1
len(testset)

2500

In [7]:
#check for successful creation of test set
ones = 0
zeros = 0
for sentiment in testscores:
    if(sentiment==1):
        ones+=1
    else:
        zeros+=1
len(testscores), ones/zeros

(2500, 1.0)

In [8]:
#check for correct input size
print("Size of training set: "+str(len(trainset)))
print("Size of test set: "+str(len(testset)))

Size of training set: 22497
Size of test set: 2500


In [9]:
#we find that around 50% of people give a 5 in their review, making classification between 5 and not 5 an unbiased problem
percentage = 0
for score in scores:
    percentage+=score
percentage/=len(scores)
percentage        

0.7549788592630943

In [10]:
#representation of 0s in relevant test set
percentage = 0
for score in testscores:
    percentage+=score
percentage/=len(testscores)
percentage        

0.5

In [11]:
tokenizer = Tokenizer(num_words=MEMORIZE)
tokenizer.fit_on_texts(trainset)
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'a': 4,
 'of': 5,
 'game': 6,
 'you': 7,
 'is': 8,
 'i': 9,
 'it': 10,
 'this': 11,
 'in': 12,
 'that': 13,
 'for': 14,
 'are': 15,
 'but': 16,
 'with': 17,
 'have': 18,
 'on': 19,
 'as': 20,
 'was': 21,
 'not': 22,
 'if': 23,
 'all': 24,
 'be': 25,
 'one': 26,
 'like': 27,
 'your': 28,
 'can': 29,
 'or': 30,
 'games': 31,
 'so': 32,
 'just': 33,
 'get': 34,
 'there': 35,
 "it's": 36,
 'at': 37,
 'out': 38,
 'they': 39,
 'more': 40,
 'from': 41,
 'play': 42,
 'my': 43,
 'time': 44,
 'good': 45,
 'up': 46,
 'an': 47,
 'some': 48,
 'will': 49,
 'has': 50,
 'very': 51,
 'when': 52,
 'graphics': 53,
 'first': 54,
 'really': 55,
 'fun': 56,
 'only': 57,
 'great': 58,
 'even': 59,
 'about': 60,
 'which': 61,
 'no': 62,
 'what': 63,
 'do': 64,
 'by': 65,
 'much': 66,
 'me': 67,
 'other': 68,
 'also': 69,
 'than': 70,
 "don't": 71,
 'them': 72,
 '2': 73,
 'would': 74,
 'well': 75,
 'because': 76,
 'had': 77,
 'then': 78,
 'its': 79,
 'quot': 80,
 'still': 81,
 

In [12]:
traintokens = tokenizer.texts_to_sequences(trainset)
testtokens = tokenizer.texts_to_sequences(testset)

In [13]:
len(testtokens)

2500

In [14]:
np.array(traintokens[342])

array([1004,    8,   26,    5,  160,   31,   13,   50,  705,   17,   67,
        124,  191,    9,   54,   97,   10,   12,  255,    9,  158,  125,
          5,   96,    6,   13,    9,  186,   27,   20,   66,   20, 1004,
          1,  890,   15,  603,    2, 2435,    1,  461,    8,  710, 2520,
          2,    1,  863,    8,  325, 7333,   12,    1, 6727, 1213,    7,
         15, 6726,   28, 3746,  733,   50, 2589,    2,  106,    7,   18,
          3, 1379,   52,    7,  844,    1,  733,    7,   34, 4998,   12,
          1,  641,   17, 3717, 4540,  890,  380,  863,  603, 2659,    2,
       4540,  137,  268,    5,  288,  990,   30, 3174, 1004, 1672,   79,
         86,  107, 5258,    9, 1398,   10,   13,   88,  990,   74,   18,
       1244,   12,    1,   88, 1004,    8,   69,   58,   14,    1,  255,
         13,   10, 3944, 8213,   46,    3, 1120,  272,    1,  890, 3755,
         16,   24,    5,   72,   15, 1338,   51,   75,  265, 1045,   41,
       8213, 1004,   69,   50, 2848,    2,  440, 28

In [15]:
num_tokens = [len(tokens) for tokens in traintokens + testtokens]
num_tokens = np.array(num_tokens)
#np.mean(num_tokens)
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
#max_tokens

In [16]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9536344361323359

In [17]:
pad = 'pre'

In [18]:
paddedtrain = pad_sequences(traintokens, maxlen=max_tokens, padding=pad, truncating=pad)
paddedtest = pad_sequences(testtokens, maxlen=max_tokens, padding=pad, truncating=pad)

In [19]:
paddedtrain[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [20]:
paddedtrain.shape

(22497, 651)

In [21]:
paddedtest.shape

(2500, 651)

In [22]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [23]:
def tokens_to_string(tokens):
    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]
    # Concatenate all words.
    text = " ".join(words)
    return text

In [24]:
#check
tokens_to_string(traintokens[1])

"if you like rally cars get this game you will have fun it is more oriented to 34 european market 34 since here in america there isn't a huge rally fan party music it is very european and even the voices from the game very 34 english 34 accent the multiplayer isn't the best but it works just ok"

In [25]:
#check
description[1]

"If you like rally cars get this game you will have fun.It is more oriented to &#34;European market&#34; since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very &#34;English&#34; accent.The multiplayer isn't the best but it works just ok."

In [26]:
model = Sequential()
embedding_size = 8
model.add(Embedding(input_dim=MEMORIZE,output_dim=embedding_size, input_length=max_tokens, name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 651, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 651, 16)           1248      
_________________________________________________________________
gru_1 (GRU)                  (None, 651, 8)            624       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 168       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 82,045
Trainable params: 82,045
Non-trainable params: 0
_________________________________________________________________


In [27]:
%%time
history=model.fit(np.array(paddedtrain), np.array(trainingscores), validation_split=0.05, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 59min 8s, sys: 8min 6s, total: 1h 7min 14s
Wall time: 34min 42s


In [28]:
#testset

In [29]:
#testscores

In [30]:
%%time
result = model.evaluate(paddedtest, np.array(testscores))

CPU times: user 10.8 s, sys: 1.54 s, total: 12.4 s
Wall time: 8.04 s


In [53]:
#user input
#0.5 is the threshold value, and values closer to either 0 or 1 signal greater confidence in the prediction
#write full sentence reviews, short ones seem to be off (presumably because it shows lack of passion to the RNN)
reviews = []
reviewone = "I thought this was going to be good, but it ended up being a terrible game."
reviewtwo = "The game was awesome!"

reviews.append(reviewone)
reviews.append(reviewtwo)

tokens = tokenizer.texts_to_sequences(reviews)
tokens_pad = pad_sequences(tokens, maxlen=max_tokens, padding=pad, truncating=pad)
prediction = model.predict(tokens_pad)
classification = [] 
for predict in prediction:
    if(predict>=0.5):  
        classification.append(1)
    else: 
        classification.append(0)
print("Sigmoid predictions: "+str(prediction))
print("Predicted classifications, in order: "+str(classification))

Sigmoid predictions: [[0.09295633]
 [0.86828345]]
Predicted classifications, in order: [0, 1]
