In [None]:
%matplotlib inline

import tensorflow as tf
import pandas as pd
import nltk, re, time
from langdetect import detect
from contractions import get_contractions
from sqlalchemy import create_engine
from pprint import pprint
from nltk.corpus import stopwords
import chars2vec
import sklearn.decomposition
import matplotlib.pyplot as plt
import itertools
import string
import re
from keras.models import model_from_json
from keras.models import load_model
import gensim
import os
import numpy as np
from sklearn.utils import resample
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from sklearn.utils import class_weight
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, GlobalMaxPool1D
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from sklearn.metrics import roc_auc_score
from keras.initializers import Constant

# The NLTK stop-word corpus is read by the review-cleaning cell further down.
# Uncomment the next line once on a machine that does not have the corpus yet.
# nltk.download('stopwords')
# Contraction lookup returned by get_contractions() from the `contractions` module;
# the review-cleaning cell tests tokens for membership in it and replaces matches.
contractions = get_contractions()


In [None]:
# Load data
# The connection URL is read from the STEAM_DB_URL environment variable so that
# credentials do not have to be stored in the notebook. The previous local URL
# is kept as the fallback, so existing setups keep working unchanged.
engine = create_engine(os.environ.get('STEAM_DB_URL', 'mysql://root:@localhost:3306/steam'))

# Training/validation pool: English rows of latest_review whose url does not
# appear in test_set.
train_val_data_query = """SELECT gameid, url, CAST(recommend AS SIGNED) AS recommend, hours_2w, hours_all, posttime, updatetime, EAG, compensation, content, initial_release_date, lang 
FROM latest_review 
WHERE url NOT IN (SELECT DISTINCT url FROM test_set) 
AND lang = 'en';
"""

# Held-out rows: everything stored in test_set, same columns as above.
test_data_query = """SELECT gameid, url, CAST(recommend AS SIGNED) AS recommend, hours_2w, hours_all, posttime, updatetime, EAG, compensation, content, initial_release_date, lang
FROM test_set;"""

df_train_val = pd.read_sql(train_val_data_query, engine)
df_test = pd.read_sql(test_data_query, engine)


In [None]:
# Build one cleaned token list per training/validation review.

# The stop-word set is the same for every review, so it is built once here
# rather than on every loop iteration.
stop_words = set(stopwords.words('english'))


def _clean_review(text):
    """Turn one raw review into a list of lower-cased, stop-word-free tokens.

    Steps: NLTK-tokenize, lower-case, replace every token that is a key of
    ``contractions`` by its mapped value, re-tokenize the joined result and
    drop English stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    lowered = [token.lower() for token in word_tokenize(text)]
    # NOTE(review): the lookup is done per token; confirm that the keys of
    # `contractions` match what word_tokenize emits for contracted forms.
    expanded = [contractions[token] if token in contractions else token
                for token in lowered]
    # Re-tokenize the joined text, presumably because a replacement can
    # contain more than one word.
    retokenized = word_tokenize(" ".join(expanded))
    return [token for token in retokenized if token not in stop_words]


lines = df_train_val['content'].values.tolist()
review_lines = [_clean_review(line) for line in lines]



In [None]:
# Dimensionality of the word vectors learned below.
EMBEDDING_DIM = 128

# Fit a Word2Vec model on the cleaned token lists.
w2v_settings = dict(size=EMBEDDING_DIM, window=5, workers=4, min_count=1)
model = gensim.models.Word2Vec(sentences=review_lines, **w2v_settings)

# Report how many distinct tokens the model holds.
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

In [None]:
# Persist the learned word vectors as a plain-text (non-binary) word2vec file.
filename = 'recent_embedding_word2vec_with_features.txt'
word_vectors = model.wv
word_vectors.save_word2vec_format(filename, binary=False)

In [None]:
def _read_word2vec_text(path):
    """Parse a text-format word2vec file into a ``{word: vector}`` dict.

    Each data line is expected to look like ``<word> <v1> <v2> ...``. A first
    line made of exactly two integers is treated as the
    ``<vocab size> <dimension>`` header and skipped, so it is not stored as a
    word. Lines without at least one value after the word are ignored.

    Parameters
    ----------
    path : str
        Location of the UTF-8 encoded embedding file.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array.
    """
    vectors = {}
    # The context manager closes the file even if parsing raises.
    with open(path, encoding="utf-8") as handle:
        for line_no, raw_line in enumerate(handle):
            # Split on the single-space separator only, so that a token
            # containing other Unicode whitespace is not cut apart.
            fields = raw_line.rstrip().split(' ')
            if line_no == 0 and len(fields) == 2 and all(f.isdigit() for f in fields):
                continue  # header line, not a word vector
            if len(fields) < 2:
                continue  # blank or malformed line
            vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return vectors


embeddings_index = _read_word2vec_text(
    os.path.join('', 'recent_embedding_word2vec_with_features.txt'))

In [None]:
# Draw a random 80% of the rows for training; every remaining row goes to validation.
train_rows = df_train_val.sample(frac = 0.8)
val_rows = df_train_val.drop(train_rows.index)

# Sizes of the full pool, the training part and the validation part.
print(len(df_train_val))
print(len(train_rows))
print(len(val_rows))

# Keep only the review text and the label, as plain numpy arrays.
X_train, y_train = train_rows['content'].values, train_rows['recommend'].values
X_val, y_val = val_rows['content'].values, val_rows['recommend'].values


In [None]:
# Raw text of every training and validation review.
total_reviews = np.concatenate((X_train, X_val), axis = 0)

# Padding length for the model input. It is measured on the cleaned token
# lists in `review_lines`, because those are what get converted to integer
# sequences and padded later on. Counting whitespace-separated words of the
# raw text (as before) can disagree with the real sequence lengths, which
# makes pad_sequences either truncate long reviews or add needless padding.
max_length = max(len(tokens) for tokens in review_lines)

In [None]:
# Fraction of the shuffled samples held out for validation.
VALIDATION_SPLIT = 0.2

# vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

# word -> integer id mapping learned by the tokenizer
word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

# pad sequences to a common length
review_pad = pad_sequences(sequences, maxlen=max_length)
sentiment =  df_train_val['recommend'].values
print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)

# split the data into a training set and a validation set
# NOTE(review): the shuffle uses numpy's global random state and no seed is set
# in this cell, so the split differs between runs.
indices = np.arange(review_pad.shape[0])
np.random.shuffle(indices)
review_pad = review_pad[indices]
sentiment = sentiment[indices]
num_validation_samples = int(VALIDATION_SPLIT * review_pad.shape[0])
# Use an explicit boundary: slicing with [:-num_validation_samples] would give
# an empty training set whenever num_validation_samples is 0.
num_training_samples = review_pad.shape[0] - num_validation_samples

X_train_pad = review_pad[:num_training_samples]
y_train = sentiment[:num_training_samples]
X_val_pad = review_pad[num_training_samples:]
y_val = sentiment[num_training_samples:]

In [None]:
# Sanity check: shapes of the training and validation arrays.
print('Shape of X_train_pad tensor:', X_train_pad.shape)
print('Shape of y_train tensor:', y_train.shape)

# These are the validation arrays, so label them as such (the labels used to say "test").
print('Shape of X_val_pad tensor:', X_val_pad.shape)
print('Shape of y_val tensor:', y_val.shape)

In [None]:
# Must equal the vector size the embeddings were trained with.
EMBEDDING_DIM = 128
# One row per tokenizer id plus one extra row; the "+ 1" presumably accounts
# for ids starting at 1 with row 0 left for padding — confirm against the tokenizer.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid row indices are 0 .. num_words - 1, so i == num_words is already out of range.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # words not found in embedding index keep their all-zeros row
    if embedding_vector is not None:
        # Convert explicitly so that vectors stored as strings become floats here.
        embedding_matrix[i] = np.asarray(embedding_vector, dtype='float32')

In [None]:
# define model: frozen pre-trained embedding -> GRU -> single sigmoid output
model = Sequential()
# The embedding weights start from embedding_matrix and are not updated
# during training (trainable=False).
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)
model.add(embedding_layer)


model.add(GRU(units=32,  dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Summary of the built model...')
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that added a stray "None" line to the output).
model.summary()

In [None]:
# Class weights and early stopping
# Distinct labels present in the training targets, in sorted order.
classes = np.unique(y_train)
# Keyword arguments work on both old and new scikit-learn releases; newer
# releases no longer accept these parameters positionally.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=classes,
                                                  y=y_train)

# Key each weight by its actual label instead of its position, so the mapping
# stays correct even when the labels are not exactly 0..n-1.
class_weight_dict = dict(zip(classes.tolist(), class_weights.tolist()))
print(classes)
print(class_weights)
print(class_weight_dict)

# Stop after 10 epochs without val_loss improvement and keep only the best
# checkpoint (by val_loss) on disk.
callbacks = [EarlyStopping(monitor = 'val_loss', patience = 10),
            ModelCheckpoint(filepath = 'checkpoint_model_recent.h5', monitor = 'val_loss', save_best_only = True)]


In [None]:
print('Train...')

# Keyword options for the training run, gathered in one place.
fit_options = dict(
    batch_size = 128,
    epochs = 25,
    validation_data = (X_val_pad, y_val),
    verbose = 2,
    callbacks = callbacks,
    class_weight = class_weight_dict,
    shuffle = True,
)

# Train on the padded training sequences with the options above.
model.fit(X_train_pad, y_train, **fit_options)
