In [2]:
import tensorflow as tf
import pandas as pd
import nltk, re, time
from langdetect import detect
from contractions import get_contractions
from sqlalchemy import create_engine
from pprint import pprint
from nltk.corpus import stopwords
import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt
import itertools
import string
import re
from keras.models import model_from_json
from keras.models import load_model
import gensim
import os
import numpy as np
from sklearn.utils import resample
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from sklearn.metrics import roc_auc_score
from keras.initializers import Constant

# We download stopwords package
# nltk.download('stopwords')
contractions = get_contractions()


In [7]:
'''
1. Load the Steam review data into Pandas dataframe
'''

engine = create_engine('mysql://root:@localhost:3306/steam')
# steam_data_query = """SELECT url AS reviewid, content, CAST(recommend AS SIGNED) AS recommend
#     FROM latest_review"""

steam_data_query = """SELECT url AS reviewid, content, CAST(recommend AS SIGNED) AS recommend, hours_all, compensation
  FROM latest_review;
"""

# 	WHERE (content IS NOT NULL 
# 	OR content != ''
#     OR content != ' ');
df_steam_reviews_copy = pd.read_sql(steam_data_query, engine)
# pprint(df_steam_reviews)


In [8]:
len(df_steam_reviews_copy)

91321

In [10]:
# First we process the data with the following steps
# '''
# 1. We lower case all the words
# 2. We tokenize the sentence
# 3. We expands words to a natural form based on a dictionary of known contractions
# 4. We remove hyperlinks, html tags, delimiters, and symbols
# 5. Then, we remove stopwords
# '''

def clean_text(text):
    text = text.lower().strip().rstrip('\n')
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags = re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    text_tokens = word_tokenize(text)
    new_text = []
    
    for word in text_tokens:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    
    text = " ".join(new_text)
    
    # Remove punctuation and number tokens
    text_tokens = word_tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    stripped = [word.translate(table) for word in text_tokens]
    words = [word for word in stripped if word.isalpha()]
   
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if not word in stop_words]
    
    text = " ".join(words)
    
    return text;

def pad_text_samples(max_length, test_lines):
    review_lines = list()
    
    for line in test_lines:
        tokens = word_tokenize(line)
        review_lines.append(tokens)
        
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(review_lines)

    new_tokens = tokenizer_obj.texts_to_sequences(review_lines)
    padded_samples = pad_sequences(new_tokens, maxlen = max_length)
    return padded_samples;

def get_language(text):
    try:
        language = detect(text)
        return language;
    except:
        pass
    
def convert_to_int(compensation):
    return int.from_bytes(compensation, 'big');



In [11]:
df_steam_reviews_copy.head(5)

Unnamed: 0,reviewid,content,recommend,hours_all,compensation
0,http://steamcommunity.com/id/--C10H15N/recomme...,bardzo fajna gra jest fajna lubie jom,1,237.8,b'\x00'
1,http://steamcommunity.com/id/--Candy--/recomme...,"Hace 7 años vicie este juego, ahora que lo vi ...",1,58.5,b'\x00'
2,http://steamcommunity.com/id/--HeLp---MePLiiS/...,G0Ty,1,766.6,b'\x00'
3,http://steamcommunity.com/id/--mihawk--/recomm...,"O jogo em si é bom, mas tem muitos problemas d...",1,2.4,b'\x00'
4,http://steamcommunity.com/id/--u/recommended/3...,It's subjectively a compilation of mediocre mi...,0,1.1,b'\x00'


In [12]:
#### This block pre-processes the data ####

# Make a copy of pandas dataframe, so we do not change the original
# df_steam_reviews_copy = df_steam_reviews.copy()

# We pre-process the reviews
# df_steam_reviews_copy['content'] = df_steam_reviews_copy['content'].map(clean_text)
# pprint(df_steam_reviews_copy['content'].head(5))

# We remove non-english reviews, as it could bias our results
sentences = df_steam_reviews_copy['content']

# Create a language column
df_steam_reviews_copy['lang'] = 'null'

# We only want English reviews

df_steam_reviews_copy['lang'] = df_steam_reviews_copy['content'].map(get_language)
# empty_reviews = []
# non_english_reviews = []
# for ind, sentence in sentences.items():
    
#     if sentence == None:
#         empty_reviews.append(df_steam_reviews_copy['reviewid'].iloc[ind])
#         continue;
        
#     if sentence == '':
#         empty_reviews.append(df_steam_reviews_copy['reviewid'].iloc[ind])
#         continue;
        
#     if sentence == ' ':
#         empty_reviews.append(df_steam_reviews_copy['reviewid'].iloc[ind])
#         continue;
        
#     try:
#         if detect(sentence) == 'en':
#             language = detect(sentence)
#             df_steam_reviews_copy.at[ind, 'lang'] = language
            
#         else:
#             non_english_reviews.append(df_steam_reviews_copy['reviewid'].iloc[ind])
#     except:
#         non_english_reviews.append(df_steam_reviews_copy['reviewid'].iloc[ind])

        


In [13]:
print(df_steam_reviews_copy.head(10))
# data = pd.read_csv('/Users/admin/Documents/Queens_Masters_Courses/Deep_Learning/course_project/best/pre_processed_steam_reviews2.csv')


                                            reviewid  \
0  http://steamcommunity.com/id/--C10H15N/recomme...   
1  http://steamcommunity.com/id/--Candy--/recomme...   
2  http://steamcommunity.com/id/--HeLp---MePLiiS/...   
3  http://steamcommunity.com/id/--mihawk--/recomm...   
4  http://steamcommunity.com/id/--u/recommended/3...   
5  http://steamcommunity.com/id/-7656119804976061...   
6  http://steamcommunity.com/id/-ANDREA-/recommen...   
7  http://steamcommunity.com/id/-Andrealphus-/rec...   
8  http://steamcommunity.com/id/-Artifex-/recomme...   
9  http://steamcommunity.com/id/-asymmetry/recomm...   

                                             content  recommend  hours_all  \
0              bardzo fajna gra jest fajna lubie jom          1      237.8   
1  Hace 7 años vicie este juego, ahora que lo vi ...          1       58.5   
2                                               G0Ty          1      766.6   
3  O jogo em si é bom, mas tem muitos problemas d...          1        

In [14]:
# The main dataset that we will use
# data = df_steam_reviews_copy[df_steam_reviews_copy['lang'] == 'en']
# data['compensation'] = data['compensation'].map(convert_to_int)

# data = data.reset_index(drop = True)
# data.recommend.value_counts()
df_steam_reviews_copy.to_csv('/Users/admin/Documents/Queens_Masters_Courses/Deep_Learning/course_project/best/pre_processed_steam_reviews_final3.csv')
                   

In [147]:
print(len(data))

45472


In [10]:
# IGNORE this: Balance classes by up/downsampling 
df_minority = data[data.recommend == 0]
df_majority = data[data.recommend == 1]

# n_samples_minority = len(df_minority)
n_samples_majority = len(df_majority)

# df_majority_downsampled = resample(df_majority,
#                                   replace = False,
#                                   n_samples = n_samples_minority,
#                                   random_state = 123)

df_minority_upsampled = resample(df_minority,
                                replace = True,
                                n_samples = n_samples_majority,
                                random_state = 123)

data = pd.concat([df_majority, df_minority_upsampled])
# data = pd.concat([df_majority_downsampled, df_minority])
data = data.reset_index(drop = True)
data.recommend.value_counts()


1    33703
0    33703
Name: recommend, dtype: int64

In [56]:
# Optional step IGNORE for now
# print(data['content'])

# flattened_data = data['content'].values.flatten().tolist()
# words = list(set(list(itertools.chain.from_iterable(flattened_data))))
# Load Inutition Engineering pretrained model
# Models names: 'eng_50', 'eng_100', 'eng_150'
# c2v_model = chars2vec.load_model('eng_50')
# word_embeddings = c2v_model.vectorize_words(words)

# projection_2d = sklearn.decomposition.PCA(n_components=2).fit_transform(word_embeddings)

# f = plt.figure(figsize=(8, 6))



<Figure size 576x432 with 0 Axes>

In [21]:
#### MODEL 1 IGNORE####
# Split data 80/20 train and test
X_train = data.sample(frac = 0.8, random_state = 200)['content']
y_train = data.loc[X_train.index]['recommend']
X_test = data.drop(X_train.index)['content']
y_test = data.loc[X_test.index]['recommend']

## BACKLOG
# X_train = data.loc[:449, 'content'].values
# y_train = data.loc[:449, 'recommend'].values
# X_test = data.loc[450:, 'content'].values
# y_test = data.loc[450:, 'recommend'].values

In [24]:
len(X_test)

5224

In [356]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
total_reviews = pd.DataFrame(np.hstack((X_train.values, X_test.values)))
total_reviews = total_reviews[0].tolist()
# print(total_reviews[0].tolist())
tokenizer_obj.fit_on_texts(total_reviews)

# Pad sequences
max_length = max([len(s.split()) for s in total_reviews])
print(max_length)
# Define vocab size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length, padding = 'post')

1582


In [361]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

print("Building model..")

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length = max_length))
model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))

# Optimizer / minimizer/ cost function
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])


Building model..


In [362]:
print("Training model...")
model.fit(X_train_pad, y_train, batch_size = 128, epochs = 50, validation_data = (X_test_pad, y_test), verbose = 2)



Training model...
Train on 41806 samples, validate on 10451 samples
Epoch 1/50
 - 680s - loss: 0.5656 - acc: 0.7476 - val_loss: 0.5724 - val_acc: 0.7406
Epoch 2/50
 - 675s - loss: 0.5602 - acc: 0.7527 - val_loss: 0.5746 - val_acc: 0.7406
Epoch 3/50
 - 671s - loss: 0.5601 - acc: 0.7528 - val_loss: 0.5725 - val_acc: 0.7406
Epoch 4/50
 - 680s - loss: 0.5601 - acc: 0.7528 - val_loss: 0.5727 - val_acc: 0.7406
Epoch 5/50
 - 721s - loss: 0.5598 - acc: 0.7528 - val_loss: 0.5726 - val_acc: 0.7406
Epoch 6/50
 - 725s - loss: 0.5598 - acc: 0.7528 - val_loss: 0.5729 - val_acc: 0.7406
Epoch 7/50
 - 649s - loss: 0.5602 - acc: 0.7528 - val_loss: 0.5725 - val_acc: 0.7406
Epoch 8/50
 - 651s - loss: 0.5598 - acc: 0.7528 - val_loss: 0.5724 - val_acc: 0.7406
Epoch 9/50
 - 684s - loss: 0.5599 - acc: 0.7528 - val_loss: 0.5724 - val_acc: 0.7406
Epoch 10/50
 - 713s - loss: 0.5599 - acc: 0.7528 - val_loss: 0.5726 - val_acc: 0.7406
Epoch 11/50
 - 685s - loss: 0.5600 - acc: 0.7528 - val_loss: 0.5727 - val_acc: 0.

<keras.callbacks.History at 0x1a4984aac8>

In [363]:
# Evaluate the model
# scores = model.evaluate(X_test, y_test, verbose = 2)
# print('%s: %.2f%%' % (model.metrics_names[1], scores[1] * 100))

#Serialize model to JSON
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)
    
#Serialize weights to HDF5
model.save_weights('model.h5')
print('Saved model to disk')



Saed model to disk


In [83]:
#### MODEL 3 ####

# Split data 70/20/10 (train, val, test)
X_train = data.sample(frac = 0.7, random_state = 200)['content']
y_train = data.loc[X_train.index]['recommend']

X_val = data.drop(X_train.index)['content'].sample(frac = 0.80, random_state = 200)
y_val = data.loc[X_val.index]['recommend']

X_test = data.drop(X_train.index).drop(X_val.index)['content']
y_test = data.loc[X_test.index]['recommend']

print(len(data))
print(len(X_train))
print(len(X_val))
print(len(X_test))


45613
31929
10947
2737


In [84]:
import numpy as np
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
total_reviews = pd.DataFrame(np.hstack((X_train.values, X_val.values)))
total_reviews = total_reviews[0].tolist()
# print(total_reviews[0].tolist())
tokenizer_obj.fit_on_texts(total_reviews)

# Pad sequences by maximum length sentence
max_length = max([len(s.split()) for s in total_reviews])
print(max_length)

# Define vocab size
vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_val_tokens = tokenizer_obj.texts_to_sequences(X_val)
print(len(X_val_tokens))
X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length, padding = 'post')
X_val_pad = pad_sequences(X_val_tokens, maxlen = max_length, padding = 'post')
print(len(X_val_pad))


796
10947
10947


In [151]:
print(total_reviews[1])

bought game felines disappointed


In [85]:


# print(len(total_reviews))
# print(len(X_train_pad))

# Class weights
class_weights = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train),
                                                y_train)
class_weight_dict = dict(enumerate(class_weights))
print(np.unique(y_train))
print(class_weights)
print(class_weight_dict)




[0 1]
[1.91237422 0.67700691]
{0: 1.912374221370388, 1: 0.6770069123446842}


In [80]:


print("Building model 3...")

# Class weights
class_weights = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train),
                                                y_train)
print(class_weights)
class_weight_dict = dict(enumerate(class_weights))
print(class_weight_dict)

callbacks = [EarlyStopping(monitor = 'val_loss', patience = 10),
            ModelCheckpoint(filepath = 'review_model3.h5', monitor = 'val_loss', save_best_only = True)]

EMBEDDING_DIM = 64

model3 = Sequential()
model3.add(Embedding(vocab_size, EMBEDDING_DIM, input_length = max_length))
model3.add(GlobalMaxPool1D())
model3.add(GRU(units = 64, dropout = 0.3, recurrent_dropout = 0.3))
model3.add(Dense(1, activation = 'sigmoid'))

# Stochastic gradient descent optimizer or can use adam for dynamic lr
# adam = optimizers.Adam(lr = 0.0001)
# sgd = optimizers.sgd(lr = 0.0001)

# Optimizer with cost function
model3.compile(loss = 'binary_crossentropy',
               optimizer = 'adam', 
               metrics = ['accuracy'])
model3.summary()


Building model 3...
[1.91237422 0.67700691]
{0: 1.912374221370388, 1: 0.6770069123446842}


In [None]:
print("Training model 3...")
model3.fit(X_train_pad, y_train, batch_size = 128, epochs = 30, validation_data = (X_val_pad, y_val), verbose = 2, callbacks = callbacks, class_weight = class_weight_dict, shuffle = True)


In [417]:
print(len(X_test))
print(len(y_test))
model3.summary()

10451
10451
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 1582, 64)          3052800   
_________________________________________________________________
gru_16 (GRU)                 (None, 64)                24768     
_________________________________________________________________
dense_31 (Dense)             (None, 1)                 65        
Total params: 3,077,633
Trainable params: 3,077,633
Non-trainable params: 0
_________________________________________________________________


In [None]:
data.plot(kind = 'bar', title = ' Count(target)')


<matplotlib.axes._subplots.AxesSubplot at 0x1a7c87dac8>

In [435]:
### Evaluate the model
# scores = model2.evaluate(X_test, y_test, verbose = 2)
# print('%s: %.2f%%' % (model2.metrics_names[1], scores[1] * 100))

test_samples_tokens = tokenizer_obj.texts_to_sequences(X_test)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen = max_length)

#predict
result = model.predict(x = test_samples_tokens_pad)

#Serialize model to JSON
# model3_json = model2.to_json()
# with open('model3.json', 'w') as json_file:
#     json_file.write(model3_json)
# #Serialize weights to HDF5
# model3.save_weights('model3.h5')
# print('Saved model to disk')

In [436]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, result)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# for ind, i in enumerate(result):
#     print(i)
#     print(y_test.tolist()[ind])
# y_test.tolist()

[0.6679652]
1
[0.7406281]
1
[0.6898999]
1
[0.7158276]
1
[0.6934513]
1
[0.74072444]
1
[0.68493694]
1
[0.7325671]
1
[0.7412317]
1
[0.7421388]
1
[0.7406555]
1
[0.74593854]
1
[0.7400028]
1
[0.73871356]
1
[0.73883337]
0
[0.68338555]
0
[0.6934463]
1
[0.70826757]
0
[0.675733]
1
[0.7435258]
1
[0.7401847]
0
[0.74344975]
1
[0.7412765]
0
[0.7409642]
1
[0.73802596]
1
[0.7400106]
1
[0.7337381]
0
[0.73719275]
1
[0.6947951]
1
[0.7414872]
1
[0.7417125]
1
[0.7385872]
1
[0.74677247]
1
[0.72671854]
1
[0.735161]
1
[0.7413735]
1
[0.74163896]
1
[0.72904885]
1
[0.7316165]
1
[0.69609517]
1
[0.7446429]
0
[0.7320243]
1
[0.74125355]
1
[0.66525686]
1
[0.73987305]
1
[0.7336175]
1
[0.73763895]
0
[0.7156405]
1
[0.74014086]
1
[0.68983525]
1
[0.74064255]
0
[0.7335622]
0
[0.74012196]
1
[0.73239166]
1
[0.74037623]
1
[0.7401241]
1
[0.73945093]
1
[0.740994]
1
[0.7374503]
1
[0.74410254]
1
[0.74148077]
1
[0.7393488]
1
[0.74322313]
1
[0.74035174]
1
[0.7241383]
1
[0.7418246]
0
[0.7441298]
0
[0.73735297]
1
[0.74077547]
0
[0.73

[0.74200743]
1
[0.7408025]
1
[0.73993105]
1
[0.73858094]
0
[0.7411561]
1
[0.68758]
1
[0.74163574]
1
[0.73137116]
1
[0.7398047]
1
[0.71565825]
1
[0.7322807]
1
[0.7385252]
1
[0.74191946]
1
[0.7394123]
1
[0.6864797]
0
[0.7271397]
0
[0.6864046]
1
[0.7411875]
1
[0.7397883]
1
[0.7411627]
1
[0.68894076]
1
[0.74024]
1
[0.7377701]
1
[0.7421527]
1
[0.7386093]
1
[0.7401228]
1
[0.69505674]
1
[0.7252169]
1
[0.7397353]
0
[0.74049896]
1
[0.74297607]
0
[0.6915799]
1
[0.67625284]
1
[0.7325885]
0
[0.6966799]
1
[0.7406734]
1
[0.7413974]
1
[0.7408414]
0
[0.7398648]
1
[0.7376035]
1
[0.7363793]
1
[0.7355085]
0
[0.74031603]
0
[0.7399968]
1
[0.74295944]
1
[0.74442065]
0
[0.7390194]
1
[0.74087036]
1
[0.67837983]
0
[0.6893668]
1
[0.7410222]
1
[0.7383144]
1
[0.74151975]
1
[0.73939306]
0
[0.6865012]
1
[0.7403032]
1
[0.73835605]
1
[0.71794486]
1
[0.7392035]
0
[0.7383925]
1
[0.7399255]
1
[0.741439]
0
[0.73711336]
1
[0.73864686]
0
[0.7269964]
1
[0.7363799]
1
[0.67282623]
1
[0.73742706]
1
[0.69431037]
0
[0.7338071]
0

[0.742599]
1
[0.73887485]
0
[0.7403504]
0
[0.7250303]
1
[0.7398847]
1
[0.7368364]
1
[0.7432118]
1
[0.714562]
1
[0.73609143]
1
[0.7405337]
1
[0.6929513]
0
[0.7327227]
1
[0.73158044]
1
[0.74272496]
1
[0.6880805]
1
[0.745517]
1
[0.7172236]
1
[0.7275856]
0
[0.74026775]
1
[0.6899013]
1
[0.73783666]
1
[0.7388722]
1
[0.74048215]
0
[0.6943158]
1
[0.73524296]
1
[0.67257464]
0
[0.7403216]
1
[0.73898065]
1
[0.7392471]
1
[0.74140215]
1
[0.7420883]
1
[0.7431169]
0
[0.7350856]
1
[0.6947951]
1
[0.7373071]
1
[0.7338989]
0
[0.7360411]
0
[0.7373405]
1
[0.72862476]
1
[0.7390881]
1
[0.71593237]
1
[0.7311897]
1
[0.7380362]
1
[0.72451735]
1
[0.74020857]
1
[0.7381918]
1
[0.74044657]
0
[0.6890655]
1
[0.7380344]
1
[0.7396378]
1
[0.7386774]
1
[0.73806095]
1
[0.74063367]
0
[0.74006253]
1
[0.7331455]
1
[0.7405311]
1
[0.74204886]
1
[0.72490096]
1
[0.72490096]
1
[0.72490096]
1
[0.7397018]
0
[0.71960986]
1
[0.73926795]
1
[0.7405468]
1
[0.72937083]
1
[0.6830213]
1
[0.74174696]
1
[0.74040043]
1
[0.68806535]
1
[0.73947

0
[0.7386163]
1
[0.7413953]
1
[0.6965672]
0
[0.69863683]
1
[0.74207073]
0
[0.73868525]
1
[0.7394812]
1
[0.74044377]
0
[0.7201936]
1
[0.73771554]
1
[0.7391239]
1
[0.7432532]
1
[0.7411157]
1
[0.74217695]
1
[0.7409017]
1
[0.738087]
1
[0.74073046]
1
[0.7347803]
0
[0.7421684]
1
[0.7231097]
1
[0.7416802]
1
[0.7376787]
0
[0.74178344]
0
[0.7139596]
1
[0.66131127]
1
[0.7394114]
1
[0.7390093]
0
[0.68556976]
1
[0.74023867]
0
[0.6645356]
1
[0.72520226]
1
[0.73340666]
1
[0.73768455]
1
[0.740257]
1
[0.7257112]
1
[0.73788273]
0
[0.74394554]
0
[0.73990375]
0
[0.7397942]
1
[0.6947951]
1
[0.70169455]
1
[0.74450016]
0
[0.7350454]
1
[0.73794794]
1
[0.7111933]
0
[0.6891204]
0
[0.73362213]
1
[0.7417382]
1
[0.7405731]
1
[0.7391443]
1
[0.71167684]
1
[0.68818057]
1
[0.731144]
1
[0.74319935]
0
[0.74094176]
1
[0.7388314]
1
[0.688335]
1
[0.6874226]
1
[0.6881617]
1
[0.6693371]
0
[0.7395031]
1
[0.737401]
1
[0.7049548]
1
[0.7333648]
0
[0.7188821]
1
[0.7419635]
0
[0.7275287]
1
[0.69398224]
1
[0.7401733]
1
[0.74066085

1
[0.73883086]
0
[0.7174462]
1
[0.6974946]
1
[0.742025]
1
[0.6317999]
1
[0.6974952]
1
[0.6759783]
0
[0.7421602]
0
[0.74065024]
1
[0.7403825]
0
[0.74160445]
1
[0.7158253]
1
[0.7399597]
0
[0.73964435]
1
[0.73569566]
1
[0.7422195]
1
[0.74065304]
1
[0.74119335]
1
[0.7404343]
1
[0.7291088]
1
[0.7159862]
0
[0.6654806]
1
[0.7359278]
1
[0.7211198]
1
[0.73925465]
1
[0.7402469]
1
[0.73945284]
1
[0.73959196]
1
[0.74198]
1
[0.7377371]
1
[0.74962306]
1
[0.73604864]
1
[0.7391078]
0
[0.7424501]
1
[0.7457577]
1
[0.73629946]
1
[0.7401398]
0
[0.6514732]
1
[0.69004697]
0
[0.7413876]
1
[0.74118227]
1
[0.71034634]
1
[0.7407771]
0
[0.7365664]
1
[0.74177575]
0
[0.7283873]
0
[0.73890793]
0
[0.73915786]
0
[0.7177029]
1
[0.7401445]
1
[0.74648124]
0
[0.71432024]
1
[0.74081534]
1
[0.73701704]
0
[0.7407218]
0
[0.7372456]
1
[0.73495466]
1
[0.7420396]
0
[0.7417401]
0
[0.74400294]
1
[0.7427142]
0
[0.7318794]
1
[0.7391481]
1
[0.7394313]
1
[0.72827053]
0
[0.7173813]
1
[0.7402056]
1
[0.73168546]
1
[0.7378383]
1
[0.73972

[0.7352116]
0
[0.73674715]
1
[0.6595171]
1
[0.7364868]
1
[0.6976287]
1
[0.7408813]
0
[0.7411109]
1
[0.7402892]
1
[0.7414364]
0
[0.73726004]
1
[0.73165274]
0
[0.7402072]
1
[0.7342587]
1
[0.7242175]
1
[0.7395345]
1
[0.71541667]
1
[0.7387907]
0
[0.7426781]
0
[0.73891306]
1
[0.74176615]
1
[0.7375607]
1
[0.7376626]
0
[0.74014944]
0
[0.7422197]
0
[0.65936846]
1
[0.7415982]
0
[0.7386987]
0
[0.6787571]
1
[0.73225516]
0
[0.7170644]
1
[0.7341657]
1
[0.68705136]
1
[0.7171008]
1
[0.7374644]
0
[0.74333787]
0
[0.7262274]
1
[0.7445898]
1
[0.73984]
1
[0.7431344]
0
[0.74818045]
0
[0.7366168]
1
[0.7181934]
1
[0.72481024]
0
[0.73870003]
1
[0.73989195]
1
[0.68322533]
0
[0.74994695]
0
[0.7423858]
1
[0.7365689]
0
[0.73974746]
1
[0.7350961]
1
[0.7374408]
1
[0.7421017]
0
[0.74182713]
1
[0.73874295]
1
[0.74335533]
0
[0.74076694]
1
[0.7361528]
1
[0.69735426]
1
[0.7158838]
1
[0.74180484]
1
[0.7372692]
1
[0.741226]
1
[0.7466965]
1
[0.7471351]
1
[0.74009025]
1
[0.7414261]
0
[0.7410528]
0
[0.70793766]
1
[0.7207759]

0
[0.74009115]
1
[0.73901635]
1
[0.7405393]
1
[0.7279522]
0
[0.7360495]
1
[0.68513465]
0
[0.73713833]
0
[0.74252003]
1
[0.7385731]
1
[0.7403736]
0
[0.7422935]
0
[0.74487555]
0
[0.74146724]
0
[0.6947951]
1
[0.7384107]
1
[0.73877287]
0
[0.7399022]
0
[0.7405849]
0
[0.7403242]
0
[0.73872554]
1
[0.7409086]
1
[0.7406708]
1
[0.7388848]
1
[0.7427137]
1
[0.6903255]
0
[0.67062587]
1
[0.7406331]
0
[0.6947951]
1
[0.69745165]
0
[0.7390344]
0
[0.7363897]
1
[0.74315804]
0
[0.7367878]
1
[0.7402227]
0
[0.7386149]
1
[0.73942685]
1
[0.7427101]
1
[0.74156773]
0
[0.7408409]
1
[0.7408672]
1
[0.7380159]
0
[0.73896736]
0
[0.73674786]
1
[0.74262035]
1
[0.7149698]
1
[0.7383444]
0
[0.6993833]
1
[0.7390828]
0
[0.7433391]
1
[0.7414957]
1
[0.74002975]
1
[0.73715156]
0
[0.73910993]
1
[0.7400861]
0
[0.7406969]
1
[0.74100286]
1
[0.6987663]
1
[0.74024755]
0
[0.7377419]
1
[0.7396108]
0
[0.7281374]
1
[0.74319947]
1
[0.72235656]
1
[0.7425495]
1
[0.7325353]
0
[0.716083]
0
[0.7375311]
1
[0.73939025]
1
[0.7407331]
1
[0.73746

[0.73574615]
1
[0.74038553]
0
[0.73920083]
1
[0.7063268]
1
[0.7400446]
1
[0.7386986]
1
[0.74094677]
1
[0.738913]
1
[0.7363895]
1
[0.7420118]
1
[0.7423416]
0
[0.68755007]
1
[0.7443216]
1
[0.7368475]
0
[0.71462816]
0
[0.6858029]
0
[0.7068342]
0
[0.73464036]
1
[0.74238306]
0
[0.7407448]
0
[0.69088155]
0
[0.74323124]
1
[0.7147378]
1
[0.7391652]
0
[0.74057615]
1
[0.7408011]
0
[0.7331516]
1
[0.7398354]
1
[0.7392819]
1
[0.736783]
0
[0.72897536]
0
[0.74371666]
0
[0.741559]
0
[0.74063957]
1
[0.72865933]
1
[0.7273223]
1
[0.7421479]
1
[0.739874]
1
[0.683932]
0
[0.73894465]
0
[0.7400521]
1
[0.69148564]
1
[0.7400777]
0
[0.7399515]
1
[0.7166397]
1
[0.7405458]
1
[0.7372308]
1
[0.7372274]
1
[0.67911017]
1
[0.74310106]
1
[0.71793175]
1
[0.736785]
1
[0.73613733]
1
[0.72738016]
1
[0.7409566]
1
[0.73839325]
0
[0.7373449]
0
[0.74025494]
0
[0.71071935]
1
[0.73943675]
0
[0.739095]
1
[0.7395785]
1
[0.7397146]
1
[0.7330434]
1
[0.738924]
1
[0.6885754]
1
[0.73145485]
1
[0.7432199]
1
[0.7415431]
0
[0.68743855]
0


1
[0.67903405]
0
[0.73446393]
0
[0.70713615]
1
[0.74036765]
0
[0.73983455]
1
[0.6917789]
1
[0.72451574]
1
[0.73993945]
1
[0.7370736]
1
[0.72412306]
1
[0.7421036]
1
[0.7412579]
1
[0.68552953]
1
[0.73959637]
0
[0.7267263]
1
[0.7419728]
1
[0.74268305]
0
[0.7259721]
1
[0.6962361]
1
[0.7468974]
1
[0.70329833]
1
[0.74436426]
1
[0.7406078]
0
[0.73588604]
1
[0.73965555]
1
[0.7381547]
1
[0.7392588]
1
[0.7294149]
0
[0.74003094]
1
[0.7193149]
1
[0.73959583]
1
[0.73955053]
1
[0.7369968]
1
[0.6803109]
1
[0.7391507]
1
[0.74192834]
1
[0.7422529]
1
[0.74030954]
1
[0.7396074]
0
[0.7351288]
1
[0.740911]
1
[0.72921014]
1
[0.6864961]
1
[0.7405214]
1
[0.7384748]
0
[0.70593804]
1
[0.74222225]
1
[0.6587856]
0
[0.73856646]
1
[0.7432076]
1
[0.7378529]
0
[0.7414459]
0
[0.68304294]
1
[0.7470577]
0
[0.6947951]
1
[0.7375665]
0
[0.7400366]
1
[0.73934627]
1
[0.6645308]
0
[0.7414688]
1
[0.7316953]
1
[0.7411518]
1
[0.7378565]
1
[0.74052703]
0
[0.7398696]
1
[0.685624]
1
[0.685624]
1
[0.73990387]
1
[0.73730093]
1
[0.739

[0.7441898]
1
[0.73752236]
1
[0.7419667]
1
[0.68811935]
1
[0.7257822]
1
[0.73844475]
1
[0.7405168]
1
[0.73668396]
0
[0.7409227]
1
[0.73939025]
1
[0.73864365]
1
[0.7389896]
1
[0.74379283]
0
[0.74081564]
0
[0.72508216]
1
[0.73854256]
0
[0.7349268]
1
[0.7436654]
1
[0.7402021]
1
[0.73904574]
0
[0.7388584]
0
[0.66537386]
0
[0.7395558]
1
[0.7069765]
1
[0.748562]
1
[0.73903286]
1
[0.7411836]
1
[0.68431866]
1
[0.7349167]
1
[0.72168016]
1
[0.7356899]
1
[0.6795912]
1
[0.70902133]
1
[0.7396518]
1
[0.7388456]
1
[0.73963857]
1
[0.7376626]
1
[0.68936485]
1
[0.7362714]
1
[0.74044603]
1
[0.6402693]
1
[0.73581606]
0
[0.70770884]
1
[0.736875]
0
[0.71187246]
1
[0.68517286]
0
[0.74096614]
1
[0.6859516]
1
[0.739245]
0
[0.72421104]
0
[0.73924774]
1
[0.6920091]
0
[0.7406509]
1
[0.73909146]
1
[0.7396822]
1
[0.7397946]
1
[0.71778446]
1
[0.742296]
0
[0.73875767]
1
[0.68407476]
1
[0.73781264]
1
[0.74253327]
1
[0.7414396]
1
[0.7423035]
0
[0.7289849]
1
[0.7311556]
1
[0.7428719]
0
[0.73844403]
0
[0.7431646]
1
[0.74

0
[0.6886319]
1
[0.6947951]
1
[0.74041164]
1
[0.7409002]
1
[0.69764435]
1
[0.7429802]
1
[0.7454449]
1
[0.74055135]
0
[0.7141102]
1
[0.74042135]
1
[0.7384483]
0
[0.70613694]
0
[0.6947951]
1
[0.71459097]
1
[0.7406958]
1
[0.7393066]
1
[0.7392964]
0
[0.73778635]
1
[0.7374428]
1
[0.7382797]
1
[0.69139147]
1
[0.7408467]
1
[0.7316323]
1
[0.74060696]
1
[0.6875472]
1
[0.740179]
1
[0.7403039]
1
[0.6685577]
1
[0.741622]
1
[0.6947951]
1
[0.7451396]
1
[0.6975]
1
[0.7030349]
1
[0.7421019]
0
[0.73549366]
0
[0.74099636]
1
[0.74017733]
1
[0.7421471]
1
[0.73991233]
1
[0.73344797]
1
[0.74021626]
1
[0.69764435]
1
[0.6958892]
1
[0.72721153]
1
[0.73986393]
1
[0.6555825]
1
[0.74399066]
1
[0.73967326]
1
[0.6870874]
1
[0.72470385]
1
[0.7423936]
1
[0.7392329]
1
[0.7007187]
1
[0.7384443]
1
[0.7400896]
1
[0.6876447]
1
[0.7383515]
0
[0.73854357]
1
[0.74076235]
1
[0.6665805]
0
[0.728069]
0
[0.73235226]
0
[0.7259736]
1
[0.6927427]
0
[0.7350031]
1
[0.73183435]
1
[0.7041396]
0
[0.71694356]
1
[0.72007453]
1
[0.7140923]

In [148]:
#### MODEL 4 ####
# Split data 60/20/20 (train/test/val)
# X_train, X_test, y_train, y_test = train_test_split(
#     data['content'], data['recommend'], test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(
#     X_train, y_train, test_size=0.25, random_state=1)
# Split data 70/20/10 (train, val, test)
X_train = data.sample(frac = 0.7, random_state = 200)['content']
y_train = data.loc[X_train.index]['recommend']

X_val = data.drop(X_train.index)['content'].sample(frac = 0.7, random_state = 200)
y_val = data.loc[X_val.index]['recommend']

X_test = data.drop(X_train.index).drop(X_val.index)['content']
y_test = data.loc[X_test.index]['recommend']

print(len(data))
print(len(X_train))
print(len(X_val))
print(len(X_test))

45472
31830
9549
4093


In [150]:
review_lines = []
lines = data.drop(X_test.index)['content'].values.tolist()

for line in lines:
    words = word_tokenize(line)
    review_lines.append(words)
print(review_lines[1:5])

[['game', 'made', 'lose', 'hope', 'played', 'game', 'open', 'beta', 'came', 'na', 'eu', 'excited', 'beacuse', 'jake', 'song', 'former', 'developer', 'lineage', 'game', 'director', 'game', 'much', 'fun', 'pvp', 'system', 'update', 'came', 'resisted', 'playing', 'game', 'quitted', 'around', 'came', 'back', 'end', 'good', 'erenor', 'update', 'came', 'erenor', 'update', 'destroyed', 'game', 'even', 'tho', 'dead', 'update', 'came', 'erenor', 'update', 'litterally', 'destroyed', 'might', 'ask', 'well', 'grind', 'new', 'levels', 'normal', 'think', 'player', 'trying', 'kill', 'faction', 'players', 'players', 'everyone', 'quitted', 'wait', 'update', 'included', 'new', 'armor', 'believe', 'needed', 'obsidian', 'ayanand', 'gear', 'costed', 'reccomend', 'garbageage', 'play', 'lineage', 'prelude', 'interlude', 'private', 'server', 'try', 'black', 'desert', 'online', 'fan', 'open', 'world', 'pvp'], ['extremely', 'visually', 'pleasing', 'enticing', 'story', 'solid', 'good', 'fps'], ['battlefield', 's

In [130]:
### MODEL 4: USING a pre-trained embedding with Word2Vec model

review_lines = []
lines = data.drop(X_test.index)['content'].values.tolist()

for line in lines:
    words = word_tokenize(line)
    review_lines.append(words)

train_review_lines = []
# lines =  data['content'].values.tolist()
train_lines = X_train.values.tolist()
# .drop(X_test.index)
for line in train_lines:
    words = word_tokenize(line)
    train_review_lines.append(words)
    
val_review_lines = []
val_lines = X_val.values.tolist()
for line in val_lines:
    words = word_tokenize(line)
    val_review_lines.append(words)

print(len(review_lines))

EMBEDDING_DIM = 128

word2vec_model = gensim.models.Word2Vec(sentences = review_lines,
                              size = EMBEDDING_DIM,
                              window = 5,
                              workers = 6,
                              min_count = 1)
words = list(word2vec_model.wv.vocab)
print('Vocab size: ', len(words))

40102
Vocab size:  49007


In [131]:
### Test word2vec model
# model.wv.most_similar_cosmul(positive=['buy','good'], negative = ['noobs'])
word2vec_model.wv.most_similar('excellent')


[('incredible', 0.958478569984436),
 ('fantastic', 0.9417787790298462),
 ('gorgeous', 0.9370279312133789),
 ('stunning', 0.9283438920974731),
 ('wonderful', 0.9257760047912598),
 ('breathtaking', 0.9176409244537354),
 ('wise', 0.9163503646850586),
 ('beautiful', 0.9148737192153931),
 ('immersive', 0.9140902161598206),
 ('atmospheric', 0.9082704782485962)]

In [132]:
### Save word2vec model ###
filename = 'test/steam_reviews_embedding_word2vec2.txt'
word2vec_model.wv.save_word2vec_format(filename, binary = False)


In [133]:
### Load word2vec model and get embedding index ###
embeddings_index = {}
file_path = os.path.join('', 'test/steam_reviews_embedding_word2vec2.txt')

with open(file_path, encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:])
        embeddings_index[word] = coeffs


In [134]:
# Get maximum length of all reviews to pad
total_reviews = data['content'].values.tolist()
# total_reviews = total_reviews[0].tolist()
# print(total_reviews[3])
max_length = max([len(s.split()) for s in total_reviews])

print(max_length)
print(len(total_reviews))


796
45570


In [135]:
## Create sequence and pad training set
train_tokenizer_obj = Tokenizer()
train_tokenizer_obj.fit_on_texts(train_review_lines)
train_sequences = train_tokenizer_obj.texts_to_sequences(train_review_lines)
train_word_index = train_tokenizer_obj.word_index
print('Found %s unique tokens.' % len(train_word_index))

X_train_pad = pad_sequences(train_sequences, maxlen = max_length)

## Create sequence and pad validation set
val_tokenizer_obj = Tokenizer()
val_tokenizer_obj.fit_on_texts(val_review_lines)
val_sequences = val_tokenizer_obj.texts_to_sequences(val_review_lines)
val_word_index = val_tokenizer_obj.word_index
print('Found %s unique tokens.' % len(val_word_index))

X_val_pad = pad_sequences(val_sequences, maxlen = max_length)
# VALIDATION_SPLIT = 0.2


# pad sequences
# word_index = tokenizer_obj.word_index
# print('Found %s unique tokens.' % len(word_index))

# X_train_pad = pad_sequences(sequences, maxlen = max_length)
# y_recommend = data.drop(X_test.index)['recommend'].values
# print('Shape of review tensor:', review_pad.shape)
# print('Shape of recommend tensor:', recommend.shape)\

# indices = np.arange(review_pad.shape[0])
# np.random.shuffle(indices)
# review_pad = review_pad[indices]
# recommend = recommend[indices]
# num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

# X_train_pad = review_pad[:-num_validation_samples]
# y_train = recommend[:-num_validation_samples]
# X_val_pad = review_pad[-num_validation_samples:]
# y_val = recommend[-num_validation_samples:]

print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

print('Shape of X_val_pad tensor:', X_val_pad.shape)
print('Shape of y_val tensor:', y_val.shape)


Found 43635 unique tokens.
Found 21099 unique tokens.
Shape of X_train_pad tensor: (31899, 796)
Shape of y_train tensor: (31899,)
Shape of X_val_pad tensor: (8203, 796)
Shape of y_val tensor: (8203,)


In [138]:
EMBEDDING_DIM = 128
num_words = len(train_word_index) + len(val_word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
print(num_words)

64735


In [139]:
# Build GRU model

rnn_model = Sequential()

embedding_layer = Embedding(num_words,
                           EMBEDDING_DIM,
                           embeddings_initializer = Constant(embedding_matrix),
                           input_length = max_length,
                           trainable = False)

rnn_model.add(embedding_layer)
rnn_model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
rnn_model.add(Dense(1, activation = 'sigmoid'))

rnn_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

print('Summary of the built model...')
print(rnn_model.summary())


Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 796, 128)          8286080   
_________________________________________________________________
gru_3 (GRU)                  (None, 32)                15456     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 8,301,569
Trainable params: 15,489
Non-trainable params: 8,286,080
_________________________________________________________________
None


In [140]:
# Class weights and early stopping
class_weights = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train),
                                                y_train)

class_weight_dict = dict(enumerate(class_weights))
print(np.unique(y_train))
print(class_weights)
print(class_weight_dict)

callbacks = [EarlyStopping(monitor = 'val_loss', patience = 10),
            ModelCheckpoint(filepath = 'checkpoint_model_recent.h5', monitor = 'val_loss', save_best_only = True)]


[0 1]
[1.9309322  0.67471128]
{0: 1.9309322033898304, 1: 0.6747112822031389}


In [141]:
print("Training model 4...")
rnn_model.fit(X_train_pad, 
              y_train, 
              batch_size = 128,
              epochs = 50, 
              validation_data = (X_val_pad, y_val), 
              verbose = 2, 
              callbacks = callbacks, 
              class_weight = class_weight_dict, 
              shuffle = True)


Training model 4...
Train on 31899 samples, validate on 8203 samples
Epoch 1/50
 - 227s - loss: 0.5522 - acc: 0.7067 - val_loss: 0.6939 - val_acc: 0.5933
Epoch 2/50
 - 216s - loss: 0.4663 - acc: 0.7703 - val_loss: 0.7408 - val_acc: 0.5764
Epoch 3/50


KeyboardInterrupt: 

In [220]:
### Save model architecture and weights separately
# Serialize model to JSON
rnn_model4_json = rnn_model.to_json()
with open('rnn_model4.json', 'w') as json_file:
    json_file.write(rnn_model4_json)
# Serialize weights to HDF5
rnn_model.save_weights('rnn_model4.h5')
print('Saved model 4 to disk')

Saved RNN model 4 to disk


In [83]:
### Save and load whole Keras model to disk
rnn_model.save('best/rnn_model_2019-03-20.h5')  # creates a HDF5 file 'my_model.h5'
# del rnn_model  # deletes the existing model

# returns a compiled model
# identical to the previous one
# rnn_model = load_model('rnn_model_2019-03-19.h5')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 770, 128)          6077824   
_________________________________________________________________
gru_4 (GRU)                  (None, 32)                15456     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 6,093,313
Trainable params: 15,489
Non-trainable params: 6,077,824
_________________________________________________________________


In [85]:
### Evaluate model on test dataset
print(len(X_test))
print(len(test_lines))
print(len(y_test))
test_lines =  X_test.values.tolist()
padded_test_samples = pad_text_samples(770, test_lines)
# print(padded_test_samples)

print('Testing...')
score, acc = rnn_model.evaluate(padded_test_samples, y_test, batch_size = 128)

print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: {0:.2%}".format(acc))

2726
2726
2726
Testing...
Test score: 0.8336969167103421
Test accuracy: 0.5638297871465816
Accuracy: 56.38%


In [86]:
y_pred = rnn_model.predict(x = padded_test_samples)
roc_auc_score(y_test, y_pred)

0.5859509029999537

In [89]:
test_model = load_model('best/rnn_model_2019-03-19.h5')
test_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 770, 128)          6077824   
_________________________________________________________________
gru_2 (GRU)                  (None, 64)                37056     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 6,114,945
Trainable params: 37,121
Non-trainable params: 6,077,824
_________________________________________________________________


In [75]:
score, acc = test_model.evaluate(padded_test_samples, y_test, batch_size = 128)
print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: {0:.2%}".format(acc))




Test score: 0.785690209442259
Test accuracy: 0.5928099775961542
Accuracy: 59.28%


In [23]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences


def pre_process_samples(test_lines):
    review_lines = list()
    
    for line in test_lines:
        test_tokens = word_tokenize(line)
        test_tokens = [w.lower() for w in test_tokens]
        test_table = str.maketrans('', '', string.punctuation)
        test_stripped = [w.translate(test_table) for w in test_tokens]
        test_words = [word for word in test_stripped if word.isalpha()]
        stop_words = set(stopwords.words('english'))
        test_words = [w for w in test_words if not w in stop_words]
        review_lines.append(test_words)
        
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(review_lines)
    
    test_samples_tokens = tokenizer_obj.texts_to_sequences(review_lines)
    test_samples_pad = pad_sequences(test_samples_tokens, maxlen = 796)
    return test_samples_pad;


In [79]:
# Load model
# loaded_model = None
# with open('best/rnn_model4.json') as f:
#     loaded_model_json = f.read()

# loaded_model = model_from_json(loaded_model_json)
# loaded_model.load_weights('best/rnn_model4.h5')
loaded_model.summary()
print('Loaded model from disk')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 796, 128)          6055424   
_________________________________________________________________
gru_16 (GRU)                 (None, 32)                15456     
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 33        
Total params: 6,070,913
Trainable params: 15,489
Non-trainable params: 6,055,424
_________________________________________________________________
Loaded model from disk


In [70]:
test_lines2 =  X_test.values.tolist()
padded_test_samples2 = pad_text_samples(796, test_lines2)

loaded_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
score = loaded_model.evaluate(padded_test_samples2, y_test, verbose = 0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))


acc: 53.93%


In [38]:
from sklearn.metrics import roc_auc_score


test_sample1 = 'Dude this game is so bad'
test_sample2 = 'I think this game is not worth'
test_sample3 = 'Love this game'
test_sample4 = 'Wow so great-_-'
test_sample5 = 'Wow so great that I forgot to laugh'
test_sample6 = 'I fuckin love this game man'
test_sample7 = 'This game is hella mediocre'
test_sample8 = 'LAMEEEEE'
test_samples = [test_sample1, test_sample2, test_sample3, test_sample4, 
                test_sample5, test_sample6, test_sample7, test_sample8
               ]
test_samples_pad = pre_process_samples(test_samples)

# y_test_samples = [[0],[0],[1],[0],[0],[1],[0],[0]]
y_test_samples = [0,0,1,0,0,1,0,0]
y_pred_samples = rnn_model.predict(x = test_samples_pad)

loaded_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
score = loaded_model.evaluate(test_samples_pad, y_test_samples, verbose = 0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
# roc_auc_score(y_test_samples, y_pred_samples)
# print(y_pred_samples)


[['dude', 'game', 'bad'], ['think', 'game', 'worth'], ['love', 'game'], ['wow', 'great'], ['wow', 'great', 'forgot', 'laugh'], ['fuckin', 'love', 'game', 'man'], ['game', 'hella', 'mediocre'], ['lameeeee']]
acc: 37.50%


In [25]:
result = pre_process_samples(test_samples)
y_pred = rnn_model.predict(x = result)
print(y_pred)

[['dude', 'game', 'bad'], ['think', 'game', 'worth'], ['love', 'game'], ['wow', 'great'], ['wow', 'great', 'forgot', 'laugh'], ['fuckin', 'love', 'game', 'man'], ['game', 'hella', 'mediocre'], ['lameeeee']]
[[0.9182093 ]
 [0.8216222 ]
 [0.8091505 ]
 [0.91073906]
 [0.98938596]
 [0.87950206]
 [0.5120096 ]
 [0.40626955]]


In [328]:
y
roc_auc_score(y_test_samples, y_pred_samples)

0.9

In [303]:

test_data = df_steam_reviews_copy.loc[df_steam_reviews_copy['reviewid'].isin(data['reviewid'][X_test.index].values)]['content'].values.tolist()

test_samples_tokens = tokenizer_obj.texts_to_sequences(test_data)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=796)

res = rnn_model.predict(x = test_samples_tokens_pad)
                     

In [307]:
print(res[1:10])
print(y_test[1:10])

[[0.7714907 ]
 [0.7716196 ]
 [0.31901607]
 [0.05989276]
 [0.7656045 ]
 [0.891559  ]
 [0.98777676]
 [0.95926505]
 [0.02286744]]
5      0
17     1
56     1
57     0
72     1
74     1
108    1
129    1
151    0
Name: recommend, dtype: int64


In [9]:
from keras.models import load_model
rnn_model = load_model('/Users/admin/Documents/Queens_Masters_Courses/Deep_Learning/course_project/best/review_model4.h5')

score, acc = rnn_model.evaluate(test_samples_tokens_pad, y_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: {0:.2%}".format(acc))


NameError: name 'test_samples_tokens_pad' is not defined

In [270]:
print('Testing...')

score, acc = rnn_model.evaluate(X_test_pad, y_test, batch_size=128)
print('Test score:', score)
print('Test accuracy:', acc)
print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.8931745234788401
Test accuracy: 0.5374497627750295
Accuracy: 53.74%


In [269]:
y_pred = rnn_model.predict(x = X_test_pad)

In [None]:
#### Build CNN Model 4
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.initializers import Constant

# define CNN model
cnn_model = Sequential()
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

cnn_model.add(embedding_layer)
cnn_model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(1, activation='sigmoid'))
print(cnn_model.summary())

# compile network
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model
print("Training CNN model...")
cnn_model.fit(X_train_pad, 
              y_train, 
              batch_size = 128, 
              epochs = 30, 
              validation_data = (X_val_pad, y_val), 
              verbose = 2, 
              callbacks = callbacks, 
              class_weight = class_weight_dict, 
              shuffle = True)




In [2]:
def get_model():
    model = Sequential()

    embedding_layer = Embedding(num_words,
                               EMBEDDING_DIM,
                               embeddings_initializer = Constant(embedding_matrix),
                               input_length = max_length,
                               trainable = False)

    model.add(embedding_layer)
    model.add(GRU(units = 32, dropout = 0.2, recurrent_dropout = 0.2))
    model.add(Dense(1, activation = 'sigmoid'))

    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])


In [None]:
# Using generator and 10-fold cross validation

for j, ( )

# NEXT STEPS: USE generator to speed up
# 10 fold cross validation
# Concatenate hours of gameplay feature to word2vec
