In [116]:
%matplotlib inline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json

In [117]:
#hyperparameters
SIZE = 5000 #the combined size of the training + test set (max 230k)
TRAININGPERCENT = 0.8 #the ratio of the total dataset used for training
MEMORIZE = 10000 #the amount of words to memorize 
POSITIVE = 5 #reviews with stars at or above this value (out of five) are considered positive

In [118]:
trainset = {} #dictionary of texts to positivity for training
testset = {} #dictionary of texts to positivity for testing
totalset = {} #total dictionary of texts to positivity

reviews = [] #temporary storage for unprocessed json reviews
description = [] #customer review (text)
scores = [] #positivity of review (binary)
    
jsondata = open('videogamereviews.json')

for line in jsondata:
    reviews.append(json.loads(line))
for review in reviews:
    description.append(review.get('reviewText'))
    if(review.get('overall')>=POSITIVE):
        scores.append(1)
    else:
        scores.append(0)
#print(scores[3])

In [119]:
endtrain = int(SIZE*TRAININGPERCENT) #first test example at this index
for i in range(0, endtrain): 
    trainset[description[i]]=scores[i]
    totalset[description[i]]=scores[i]
len(trainset)
#print(trainset.get("I bought this and the key didn't work.  It was a gift, and the recipient wasn't able to solve the problem.  It might have been a good game, but I never found out because the key failed."))

4000

In [120]:
#creating a test set with an equal number of positive/negative cases
marker = 0 
numnegative = 0
numpositive = 0
testsize = SIZE-endtrain
i=endtrain+1
testscores = []
while marker<testsize:
    sentiment = scores[i]
    if(sentiment==1):
        if(numpositive<int(testsize/2)):
            testscores.append(1)
            testset[description[i]]=1
            totalset[description[i]]=1
            numpositive+=1
            marker+=1
    else:
        if(numnegative<testsize-int(testsize/2)):
            testscores.append(0)
            testset[description[i]]=0
            totalset[description[i]]=0
            numnegative+=1
            marker+=1
    i+=1
#testset

In [121]:
#check for successful creation of test set
ones = 0
zeros = 0
for sentiment in testscores:
    if(sentiment==1):
        ones+=1
    else:
        zeros+=1
len(testscores)

1000

In [122]:
#check for correct input size
print("Size of training set: "+str(len(trainset)))
print("Size of test set: "+str(len(testset)))
print("Combined size: "+str(len(totalset)))

Size of training set: 4000
Size of test set: 1000
Combined size: 5000


In [123]:
#we find that around 50% of people give a 5 in their review, making classification between 5 and not 5 an unbiased problem
percentage = 0
for score in scores:
    percentage+=score
percentage/=len(scores)
percentage        

0.5185305030632497

In [124]:
#representation of 0s in relevant test set
percentage = 0
for score in testscores:
    percentage+=score
percentage/=len(testscores)
percentage        

0.5

In [125]:
tokenizer = Tokenizer(num_words=MEMORIZE)
tokenizer.fit_on_texts(totalset)
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'a': 4,
 'of': 5,
 'game': 6,
 'is': 7,
 'you': 8,
 'i': 9,
 'it': 10,
 'this': 11,
 'in': 12,
 'that': 13,
 'for': 14,
 'are': 15,
 'but': 16,
 'with': 17,
 'as': 18,
 'have': 19,
 'on': 20,
 'was': 21,
 'games': 22,
 'one': 23,
 'not': 24,
 'all': 25,
 'can': 26,
 'if': 27,
 'be': 28,
 'like': 29,
 'your': 30,
 'so': 31,
 'or': 32,
 'get': 33,
 "it's": 34,
 'there': 35,
 'just': 36,
 'out': 37,
 'play': 38,
 'my': 39,
 'time': 40,
 'more': 41,
 'at': 42,
 'from': 43,
 'has': 44,
 'they': 45,
 'will': 46,
 'good': 47,
 'great': 48,
 'an': 49,
 'some': 50,
 'up': 51,
 'graphics': 52,
 'when': 53,
 'very': 54,
 'first': 55,
 'fun': 56,
 'really': 57,
 'which': 58,
 'even': 59,
 'about': 60,
 'only': 61,
 'by': 62,
 'best': 63,
 'also': 64,
 'still': 65,
 'what': 66,
 'me': 67,
 'other': 68,
 'do': 69,
 'well': 70,
 'than': 71,
 'much': 72,
 'its': 73,
 'them': 74,
 '2': 75,
 'no': 76,
 'story': 77,
 'quot': 78,
 'many': 79,
 'had': 80,
 'because': 81,
 '

In [126]:
traintokens = tokenizer.texts_to_sequences(trainset)
testtokens = tokenizer.texts_to_sequences(testset)

In [127]:
len(testtokens)

1000

In [128]:
np.array(traintokens[342])

array([1145,    7,   23,    5,  147,   22,   13,   44,  771,   17,   67,
         96,  188,    9,   55,   85,   10,   12,  278,    9,  184,  130,
          5,  100,    6,   13,    9,  211,   29,   18,   72,   18, 1145,
          1,  858,   15,  493,    2, 2056,    1,  705,    7,  793, 3055,
          2,    1,  664,    7,  344, 4716,   12,    1, 4988, 1017,    8,
         15, 6123,   30, 3588,  983,   44, 2419,    2,   97,    8,   19,
          3, 1015,   53,    8,  851,    1,  983,    8,   33, 6683,   12,
          1,  577,   17, 3142, 3562,  858,  293,  664,  493, 3043,    2,
       3562,  120,  325,    5,  298, 1047,   32, 2630, 1145, 1551,   73,
         77,  105, 4514,    9, 1377,   10,   13,   94, 1047,   87,   19,
       1266,   12,    1,   94, 1145,    7,   64,   48,   14,    1,  278,
         13,   10, 3254, 7223,   51,    3, 1387,  345,    1,  858, 4989,
         16,   25,    5,   74,   15, 1897,   54,   70,  290,  969,   43,
       7223, 1145,   64,   44, 1933,    2,  410, 19

In [129]:
num_tokens = [len(tokens) for tokens in traintokens + testtokens]
num_tokens = np.array(num_tokens)
#np.mean(num_tokens)
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
#max_tokens

In [130]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9502

In [131]:
pad = 'pre'

In [132]:
paddedtrain = pad_sequences(traintokens, maxlen=max_tokens, padding=pad, truncating=pad)
paddedtest = pad_sequences(testtokens, maxlen=max_tokens, padding=pad, truncating=pad)

In [133]:
paddedtrain[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [134]:
paddedtrain.shape

(4000, 620)

In [135]:
paddedtest.shape

(1000, 620)

In [136]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [137]:
def tokens_to_string(tokens):
    # Map from tokens back to words.
    words = [inverse_map[token] for token in tokens if token != 0]
    # Concatenate all words.
    text = " ".join(words)
    return text

In [138]:
tokens_to_string(traintokens[1])

"if you like rally cars get this game you will have fun it is more oriented to 34 european market 34 since here in america there isn't a huge rally fan party music it is very european and even the voices from the game very 34 english 34 accent the multiplayer isn't the best but it works just ok"

In [139]:
description[1]

"If you like rally cars get this game you will have fun.It is more oriented to &#34;European market&#34; since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very &#34;English&#34; accent.The multiplayer isn't the best but it works just ok."

In [140]:
model = Sequential()
embedding_size = 8
model.add(Embedding(input_dim=MEMORIZE,output_dim=embedding_size, input_length=max_tokens, name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))
optimizer = Adam(learning_rate=1e-3)
model.compile(loss='binary_crossentropy',optimizer=optimizer, metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 620, 8)            80000     
_________________________________________________________________
gru_6 (GRU)                  (None, 620, 16)           1248      
_________________________________________________________________
gru_7 (GRU)                  (None, 620, 8)            624       
_________________________________________________________________
gru_8 (GRU)                  (None, 4)                 168       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 82,045
Trainable params: 82,045
Non-trainable params: 0
_________________________________________________________________


In [141]:
%%time
history=model.fit(np.array(paddedtrain), np.array(scores), validation_split=0.05, epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 5min 11s, sys: 44.2 s, total: 5min 55s
Wall time: 2min 50s


In [142]:
%%time
result = model.evaluate(paddedtest, np.array(testscores))

CPU times: user 4.7 s, sys: 712 ms, total: 5.41 s
Wall time: 2.55 s
