In [75]:
%matplotlib inline

import tensorflow as tf
import pandas as pd
import nltk, re, time
from langdetect import detect
from contractions import get_contractions
from sqlalchemy import create_engine
from pprint import pprint
from nltk.corpus import stopwords
import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt
import itertools
import string
import re
from keras.models import model_from_json
from keras.models import load_model
import gensim
import os
import numpy as np
from sklearn.utils import resample
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from sklearn.metrics import roc_auc_score
from keras.initializers import Constant

# We download stopwords package
# nltk.download('stopwords')
contractions = get_contractions()


In [76]:
# Load data
engine = create_engine('mysql://root:@localhost:3306/steam')
# steam_data_query = """SELECT url AS reviewid, content, CAST(recommend AS SIGNED) AS recommend
#     FROM latest_review"""
train_val_data_query = """SELECT gameid, url, CAST(recommend AS SIGNED) AS recommend, hours_2w, hours_all, posttime, updatetime, EAG, compensation, content, initial_release_date, lang 
FROM latest_review 
WHERE url NOT IN (SELECT DISTINCT url FROM test_set) 
AND lang = 'en';
"""

test_data_query = """SELECT gameid, url, CAST(recommend AS SIGNED) AS recommend, hours_2w, hours_all, posttime, updatetime, EAG, compensation, content, initial_release_date, lang
FROM test_set;"""

df_train_val = pd.read_sql(train_val_data_query, engine)
df_test = pd.read_sql(test_data_query, engine)


In [77]:
# Total English Steam reviews
len(df_train_val) + len(df_test)

53764

In [78]:
review_lines = []

lines = df_train_val['content'].values.tolist()


for line in lines:
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    
    new_text = []
    for word in tokens:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    
    text = " ".join(new_text)
    # remove punctuation from each word    
#     table = str.maketrans('', '', string.punctuation)
#     stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
#     words = [word for word in stripped if word.isalpha()]

    words = word_tokenize(text)
    # filter out stop words    
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    review_lines.append(words)



In [80]:
print(len(review_lines))


48388


In [82]:
EMBEDDING_DIM = 128
# train word2vec model
model = gensim.models.Word2Vec(sentences=review_lines, size=EMBEDDING_DIM, 
                               window=5, workers=4, min_count=1)
# vocab size
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 65500


In [83]:
# save model in ASCII (word2vec) format
filename = 'recent_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [84]:
# model.wv.most_similar('bad')#, topn =1)
model.wv.similar_by_word("love")


[('amazing', 0.7361174821853638),
 ('awesome', 0.7357403635978699),
 ('hate', 0.7297787666320801),
 ('loved', 0.6969842910766602),
 ('adore', 0.6915671229362488),
 ('wonderful', 0.6890279650688171),
 ('liked', 0.6842567920684814),
 ('enjoy', 0.6815904378890991),
 ('fantastic', 0.6786534190177917),
 ('great', 0.6610466241836548)]

In [85]:
embeddings_index = {}
f = open(os.path.join('', 'recent_embedding_word2vec.txt'),  encoding = "utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [86]:
X_train = df_train_val.sample(frac = 0.8)['content']
y_train = df_train_val.loc[X_train.index]['recommend']

X_val = df_train_val.drop(X_train.index)['content']
y_val = df_train_val.loc[X_val.index]['recommend']

print(len(df_train_val))
print(len(X_train))
print(len(X_val))

X_train = X_train.values
y_train = y_train.values
X_val = X_val.values
y_val = y_val.values


48388
38710
9678


In [87]:
print(X_train.shape)
print(X_val.shape)

(38710,)
(9678,)


In [88]:
total_reviews = np.concatenate((X_train, X_val), axis = 0)
max_length = max([len(s.split()) for s in total_reviews])

In [89]:
VALIDATION_SPLIT = 0.2

# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

# pad sequences
word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

review_pad = pad_sequences(sequences, maxlen=max_length)
sentiment =  df_train_val['recommend'].values
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)

# split the data into a training set and a validation set
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])

X_train_pad = review_pad[:-num_validation_samples]
y_train = sentiment[:-num_validation_samples]
X_val_pad = review_pad[-num_validation_samples:]
y_val = sentiment[-num_validation_samples:]

Found 65500 unique tokens.
Shape of review tensor: (48388, 1511)
Shape of sentiment tensor: (48388,)


In [90]:
print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

print('Shape of X_test_pad tensor:', X_val_pad.shape)
print('Shape of y_test tensor:', y_val.shape)

Shape of X_train_pad tensor: (38711, 1511)
Shape of y_train tensor: (38711,)
Shape of X_test_pad tensor: (9677, 1511)
Shape of y_test tensor: (9677,)


In [91]:
EMBEDDING_DIM = 128
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [92]:
print(num_words)


65501


In [93]:
# define model
model = Sequential()
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
print(model.summary())

Summary of the built model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1511, 128)         8384128   
_________________________________________________________________
gru_2 (GRU)                  (None, 32)                15456     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 8,399,617
Trainable params: 15,489
Non-trainable params: 8,384,128
_________________________________________________________________
None


In [94]:
print(y_train)

[1 1 1 ... 1 1 1]


In [95]:
# Class weights and early stopping
class_weights = class_weight.compute_class_weight('balanced',
                                                np.unique(y_train),
                                                y_train)

class_weight_dict = dict(enumerate(class_weights))
print(np.unique(y_train))
print(class_weights)
print(class_weight_dict)

callbacks = [EarlyStopping(monitor = 'val_loss', patience = 10),
            ModelCheckpoint(filepath = 'checkpoint_model_recent.h5', monitor = 'val_loss', save_best_only = True)]


[0 1]
[2.04322812 0.66199808]
{0: 2.0432281220310355, 1: 0.661998084684315}


In [96]:
print('Train...')

model.fit(X_train_pad, 
              y_train, 
              batch_size = 128,
              epochs = 25, 
              validation_data = (X_val_pad, y_val), 
              verbose = 2, 
              callbacks = callbacks, 
              class_weight = class_weight_dict, 
              shuffle = True)


Train...
Train on 38711 samples, validate on 9677 samples
Epoch 1/25
 - 514s - loss: 0.5344 - acc: 0.7148 - val_loss: 0.4903 - val_acc: 0.7572
Epoch 2/25
 - 503s - loss: 0.4526 - acc: 0.7763 - val_loss: 0.4504 - val_acc: 0.7788
Epoch 3/25
 - 478s - loss: 0.4332 - acc: 0.7857 - val_loss: 0.4178 - val_acc: 0.8038
Epoch 4/25
 - 485s - loss: 0.4188 - acc: 0.7995 - val_loss: 0.3943 - val_acc: 0.8145
Epoch 5/25
 - 501s - loss: 0.4102 - acc: 0.8018 - val_loss: 0.4103 - val_acc: 0.8042
Epoch 6/25
 - 544s - loss: 0.4039 - acc: 0.8056 - val_loss: 0.3656 - val_acc: 0.8318
Epoch 7/25
 - 525s - loss: 0.3973 - acc: 0.8098 - val_loss: 0.3849 - val_acc: 0.8179
Epoch 8/25
 - 528s - loss: 0.3926 - acc: 0.8125 - val_loss: 0.3820 - val_acc: 0.8221
Epoch 9/25
 - 500s - loss: 0.3883 - acc: 0.8151 - val_loss: 0.3783 - val_acc: 0.8237
Epoch 10/25
 - 490s - loss: 0.3827 - acc: 0.8193 - val_loss: 0.3866 - val_acc: 0.8169
Epoch 11/25
 - 488s - loss: 0.3817 - acc: 0.8186 - val_loss: 0.3575 - val_acc: 0.8356
Epoch

<keras.callbacks.History at 0x1a3f877a20>

In [115]:
def pre_process(data):
    review_lines = []

    lines = data['content'].values.tolist()


    for line in lines:
        tokens = word_tokenize(line)
        # convert to lower case
        tokens = [w.lower() for w in tokens]

        new_text = []
        for word in tokens:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)

        text = " ".join(new_text)
        # remove punctuation from each word    
    #     table = str.maketrans('', '', string.punctuation)
    #     stripped = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
    #     words = [word for word in stripped if word.isalpha()]

        words = word_tokenize(text)
        # filter out stop words    
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        review_lines.append(words)
    return review_lines;

def padding_test(df_test, review_lines, max_length = 1511):

    # vectorize the text samples into a 2D integer tensor
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(review_lines)
    sequences = tokenizer_obj.texts_to_sequences(review_lines)


    review_pad = pad_sequences(sequences, maxlen = max_length)
    sentiment =  df_test['recommend'].values
    print('Shape of review tensor:', review_pad.shape)
    print('Shape of sentiment tensor:', sentiment.shape)

    # split the data into a training set and a validation set
 
    X_test_pad = review_pad
    y_test = sentiment
    
    return X_test_pad, y_test;




In [116]:
# Process test
# df_test['content'].values
review_lines = pre_process(df_test)


In [118]:
X_test_pad, y_test = padding_test(df_test, review_lines)

Shape of review tensor: (5376, 1511)
Shape of sentiment tensor: (5376,)


In [120]:
print('Testing...')
score, acc = model.evaluate(X_test_pad, y_test, batch_size=128)

print('Test score:', score)
print('Test accuracy:', acc)

print("Accuracy: {0:.2%}".format(acc))

Testing...
Test score: 0.7366692139988854
Test accuracy: 0.6155133928571429
Accuracy: 61.55%
