In [1]:
%matplotlib inline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from scipy.spatial.distance import cdist
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json

In [2]:
#hyperparameters
SIZE = 1000 #combined number of reviews in the training + test sets (the dataset offers at most ~230k)
TRAININGPERCENT = 0.8 #fraction (not a percentage) of SIZE used for training; the rest becomes the test set
MEMORIZE = 10000 #vocabulary size handed to the tokenizer (num_words) and to the embedding layer (input_dim)
POSITIVE = 5 #reviews with stars at or above this value (out of five) are labelled positive (1), all others 0

In [3]:
trainset = {} #dictionary of texts to positivity for training
testset = {} #dictionary of texts to positivity for testing
totalset = {} #total dictionary of texts to positivity

reviews = [] #parsed json reviews, one dict per line of the input file
description = [] #customer review (text)
scores = [] #positivity of review (binary)

# The input holds one JSON object per line. The context manager closes the
# handle even when a line fails to parse, and the encoding is stated explicitly
# so the result does not depend on the platform's default locale.
with open('videogamereviews.json', encoding='utf-8') as jsondata:
    for line in jsondata:
        if not line.strip():
            continue #tolerate blank lines instead of failing inside json.loads
        reviews.append(json.loads(line))

# description[k] and scores[k] always describe the same review.
for review in reviews:
    description.append(review.get('reviewText'))
    scores.append(1 if review.get('overall')>=POSITIVE else 0)

In [4]:
# Index of the first review that is not part of the training slice.
endtrain = int(SIZE*TRAININGPERCENT)
# Register the leading `endtrain` reviews both as training data and in the
# combined dictionary. Both dicts are keyed by the review text.
for idx in range(endtrain):
    text, label = description[idx], scores[idx]
    trainset[text] = label
    totalset[text] = label
len(trainset)
#print(trainset.get("I bought this and the key didn't work.  It was a gift, and the recipient wasn't able to solve the problem.  It might have been a good game, but I never found out because the key failed."))

800

In [5]:
#creating a test set with an equal number of positive/negative cases
# Scan forward through the reviews that follow the training slice and accept a
# review only while its class still has room, so the finished test set holds
# testsize//2 positives and the remaining slots are negatives.
marker = 0 #number of test examples accepted so far
numnegative = 0
numpositive = 0
testsize = SIZE-endtrain
maxpositive = testsize//2
maxnegative = testsize-maxpositive
i=endtrain
testscores = []
chosen = set() #texts accepted during this run of the cell
while marker<testsize:
    if i>=len(scores):
        # Fail with an explanation instead of an IndexError from scores[i].
        raise ValueError("ran out of reviews while building the test set: found "
                         +str(numpositive)+" positive and "+str(numnegative)
                         +" negative examples, needed "+str(testsize)+" in total")
    text = description[i]
    sentiment = 1 if scores[i]==1 else 0
    i+=1
    # The dictionaries are keyed by review text. A repeated text would collapse
    # into an existing key and leave testset shorter than testscores, and a text
    # that is already a training example would leak into the test set.
    if text in trainset or text in chosen:
        continue
    if sentiment==1:
        if numpositive>=maxpositive:
            continue
        numpositive+=1
    else:
        if numnegative>=maxnegative:
            continue
        numnegative+=1
    chosen.add(text)
    testscores.append(sentiment)
    testset[text]=sentiment
    totalset[text]=sentiment
    marker+=1

In [6]:
#check for successful creation of test set
ones = sum(1 for sentiment in testscores if sentiment==1)
zeros = len(testscores)-ones
# Assert the invariants so that a malformed test set stops the notebook here
# instead of surfacing later as a shape mismatch during evaluation.
assert len(testscores)==SIZE-endtrain, "test set has the wrong number of labels"
assert len(testset)==len(testscores), "testset keys and testscores are out of step (duplicate review text?)"
assert abs(ones-zeros)<=1, "test set is not balanced between positive and negative reviews"
len(testscores)

200

In [7]:
#check for correct input size
# One line per dictionary: a caption followed by the number of entries.
for caption, subset in (("Size of training set: ", trainset),
                        ("Size of test set: ", testset),
                        ("Combined size: ", totalset)):
    print(caption+str(len(subset)))

Size of training set: 800
Size of test set: 200
Combined size: 1000


In [8]:
# Share of all loaded reviews that are labelled positive (1). It comes out close
# to one half, so "five stars" versus "anything else" is a roughly balanced
# binary problem.
percentage = sum(scores)/len(scores)
percentage

0.5185305030632497

In [9]:
# Share of positive (1) labels in the test set. Because the labels are 0/1 this
# is also one minus the share of 0s.
percentage = sum(testscores)/len(testscores)
percentage

0.5

In [10]:
# Iterating a dict yields its keys, so the tokenizer is fitted on the review
# texts of the combined (training + test) dictionary.
# NOTE(review): this means the vocabulary is built from test text as well --
# confirm that this is intended.
vocabulary_texts = list(totalset)
tokenizer = Tokenizer(num_words=MEMORIZE)
tokenizer.fit_on_texts(vocabulary_texts)
# word -> integer index mapping learned from the texts
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'a': 4,
 'of': 5,
 'game': 6,
 'is': 7,
 'you': 8,
 'i': 9,
 'it': 10,
 'this': 11,
 'in': 12,
 'that': 13,
 'for': 14,
 'but': 15,
 'are': 16,
 'have': 17,
 'with': 18,
 'was': 19,
 'as': 20,
 'on': 21,
 'all': 22,
 'one': 23,
 'not': 24,
 'games': 25,
 'like': 26,
 'if': 27,
 'time': 28,
 'can': 29,
 'be': 30,
 'get': 31,
 'so': 32,
 "it's": 33,
 'or': 34,
 'just': 35,
 'there': 36,
 'out': 37,
 'play': 38,
 'my': 39,
 'from': 40,
 'more': 41,
 'at': 42,
 'your': 43,
 'good': 44,
 'great': 45,
 'graphics': 46,
 'fun': 47,
 'has': 48,
 'will': 49,
 'when': 50,
 'mario': 51,
 'an': 52,
 'first': 53,
 'they': 54,
 'some': 55,
 'up': 56,
 'very': 57,
 'best': 58,
 'even': 59,
 'really': 60,
 '64': 61,
 'still': 62,
 'do': 63,
 'which': 64,
 'zelda': 65,
 'about': 66,
 'only': 67,
 'by': 68,
 'no': 69,
 'what': 70,
 'me': 71,
 'other': 72,
 'also': 73,
 'much': 74,
 'well': 75,
 'its': 76,
 'than': 77,
 'then': 78,
 'many': 79,
 'because': 80,
 'had': 81,


In [11]:
# Convert every review text (the dict keys, in insertion order) into a list of
# integer token ids.
traintokens = tokenizer.texts_to_sequences(list(trainset.keys()))
testtokens = tokenizer.texts_to_sequences(list(testset.keys()))

In [12]:
# Sanity check: number of tokenized test reviews.
len(testtokens)

200

In [13]:
# Inspect the token ids of one arbitrarily chosen training review (index 342).
np.array(traintokens[342])

array([ 591,    7,   23,    5,  161,   25,   13,   48,  587,   18,   71,
         89,  188,    9,   53,   84,   10,   12,  241,    9,  191,  148,
          5,  106,    6,   13,    9,  260,   26,   20,   74,   20,  591,
          1,  844,   16,  426,    2, 7602,    1,  861,    7,  950, 3111,
          2,    1,  925,    7,  420, 4915,   12,    1, 7603, 1192,    8,
         16, 3164, 7604,   43, 2123,  728,   48, 1855,    2,   88,    8,
         17,    3, 1890,   50,    8,  724,    1,  728,    8,   31, 3792,
         12,    1,  480,   18, 2713, 2041,  844,  301,  925,  426, 3648,
          2, 2041,  115,  451,    5,  337, 2395,   34, 4719,  591, 1001,
         76,  101,  116, 7605, 4916,    9, 1628,   10,   13,   95, 2395,
         93,   17, 1565,   12,    1,   95,  591,    7,   73,   45,   14,
          1,  241,   13,   10, 4917, 3407,   56,    3,  967,  459,    1,
        844, 4918,   15,   22,    5,   83,   16, 4919,   57,   75,  320,
        949,   40, 3407,  591,   73,   48, 2743,   

In [14]:
# Length (in tokens) of every review, training reviews first, then test reviews.
num_tokens = np.array([len(sequence) for sequence in traintokens + testtokens])
# Sequence length used for padding/truncation: mean length plus two standard
# deviations, cut down to an integer.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))

In [15]:
# Fraction of reviews that are strictly shorter than max_tokens.
np.mean(num_tokens < max_tokens)

0.964

In [16]:
# Used below for both `padding` and `truncating` of pad_sequences, i.e. zeros
# are added and surplus tokens are removed at the start of a sequence.
pad = 'pre'

In [17]:
# Bring every token sequence to exactly max_tokens entries. Both sets share the
# same settings so that they can be fed to the same model.
pad_options = dict(maxlen=max_tokens, padding=pad, truncating=pad)
paddedtrain = pad_sequences(traintokens, **pad_options)
paddedtest = pad_sequences(testtokens, **pad_options)

In [18]:
# Show one padded training sequence; with pad='pre' the padding zeros come first.
paddedtrain[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [19]:
# (number of training reviews, max_tokens)
paddedtrain.shape

(800, 673)

In [20]:
# (number of test reviews, max_tokens)
paddedtest.shape

(200, 673)

In [21]:
# word -> index mapping produced by the tokenizer ...
idx = tokenizer.word_index
# ... and the reverse index -> word lookup used to decode token ids.
inverse_map = {token: word for word, token in idx.items()}

In [22]:
def tokens_to_string(tokens):
    """Decode a sequence of token ids into a space-separated string.

    Ids equal to 0 (the value used for padding) are skipped. Every other id is
    looked up in the module-level ``inverse_map``; an id that is missing from
    that mapping raises ``KeyError``.
    """
    return " ".join(inverse_map[token] for token in tokens if token != 0)

In [23]:
# Round-trip check: decode the token ids of training review 1 back into words.
tokens_to_string(traintokens[1])

"if you like rally cars get this game you will have fun it is more oriented to 34 european market 34 since here in america there isn't a huge rally fan party music it is very european and even the voices from the game very 34 english 34 accent the multiplayer isn't the best but it works just ok"

In [24]:
# Raw text at the same index for comparison (indices line up with traintokens
# only while the earlier review texts are unique, since trainset is keyed by text).
description[1]

"If you like rally cars get this game you will have fun.It is more oriented to &#34;European market&#34; since here in America there isn't a huge rally fan party. Music it is very European and even the voices from the game very &#34;English&#34; accent.The multiplayer isn't the best but it works just ok."

In [25]:
# Width of the learned word vectors.
embedding_size = 8
# Embedding -> three stacked GRUs -> one sigmoid unit. The first two GRUs return
# their full output sequence so that the next recurrent layer receives one
# vector per time step; the last GRU returns only its final state.
model = Sequential([
    Embedding(input_dim=MEMORIZE,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='layer_embedding'),
    GRU(units=16, return_sequences=True),
    GRU(units=8, return_sequences=True),
    GRU(units=4),
    Dense(1, activation='sigmoid'),
])
optimizer = Adam(learning_rate=1e-3)
# Binary labels, so binary cross-entropy on the sigmoid output.
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer_embedding (Embedding)  (None, 673, 8)            80000     
_________________________________________________________________
gru (GRU)                    (None, 673, 16)           1248      
_________________________________________________________________
gru_1 (GRU)                  (None, 673, 8)            624       
_________________________________________________________________
gru_2 (GRU)                  (None, 4)                 168       
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 82,045
Trainable params: 82,045
Non-trainable params: 0
_________________________________________________________________


In [26]:
%%time
history=model.fit(np.array(paddedtrain), np.array(scores), validation_split=0.05, epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 1min 8s, sys: 9.65 s, total: 1min 18s
Wall time: 38.4 s


In [27]:
%%time
result = model.evaluate(paddedtest, np.array(testscores))

CPU times: user 966 ms, sys: 140 ms, total: 1.11 s
Wall time: 629 ms
