In [None]:
import os
import pickle
import gzip
import json
import pandas as pd
import numpy as np

from config import RAW_DIR, PRE_DIR, RES_DIR
from utils.data_porter import read_from_csv, save_to_csv

In [None]:
# Load the pickled review table stored under PRE_DIR.
# NOTE(review): pickle.load can execute arbitrary code, so only load files you trust.
review_pickle_path = os.path.join(PRE_DIR, 'review_data.pkl')
with open(review_pickle_path, 'rb') as pickle_file:
    review_data = pickle.load(pickle_file)

print(review_data.shape)
review_data.head()

(484214, 12)


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,4.0,False,2000-01-01,A261TLAGXR52NH,B00002CF8V,THOR (Global Gamer Reviewer/Previewer),"Take Resident Evil,jack its graphics up alot,s...",D*MN NEAR PERFECT!,946684800,6.0,,
1,4.0,False,2000-01-01,A261TLAGXR52NH,B00002CF96,THOR (Global Gamer Reviewer/Previewer),Half-Life:Opposing Force takes place as your c...,Better than the first?,946684800,,,
2,5.0,False,2000-01-01,A261TLAGXR52NH,B00002CF8U,THOR (Global Gamer Reviewer/Previewer),GTA2 is set in a futuristic city where you try...,Just read it!,946684800,2.0,{'Format:': ' Video Game'},
3,5.0,False,2000-01-02,A2KG4CXNXJVPTK,B00001ZT9E,Cavan,"TR:LR is excellent! The story, the graphics, ...",Excellent game!,946771200,8.0,{'Format:': ' Video Game'},
4,4.0,False,2000-01-02,A2KG4CXNXJVPTK,B00000K1VE,Cavan,Dino Crisis is a great game -- lots of excitem...,"Great game, though it can be short",946771200,,,


In [None]:
# Largest value in the 'reviewTime' column (most recent review date).
review_data['reviewTime'].max()

Timestamp('2017-12-09 00:00:00')

In [None]:
# Smallest value in the 'reviewTime' column (earliest review date).
review_data['reviewTime'].min()

Timestamp('2000-01-01 00:00:00')

# Key word

In [None]:
# TODO: the "Key word" section has no implementation yet. This cell previously held
# only a dangling `import` statement, which is a SyntaxError and stops "Run All".

# NLP data processing

In [None]:
import re
import nltk
# One-time setup: the NLTK stopword corpus must be available locally.
# Either run `nltk.download('stopwords')` once, or add the corpus manually.
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Shared English Snowball stemmer, created once and reused by the text-cleaning code.
english_stemmer = nltk.stem.SnowballStemmer(language='english')

In [None]:
_STOPWORD_SETS = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built once and cached."""
    if "english" not in _STOPWORD_SETS:
        _STOPWORD_SETS["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_SETS["english"]


def data_clean(rev, remove_stopwords=True):
    """Normalise one piece of review text into a string of stemmed tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stopwords are
    optionally dropped, and each remaining token is stemmed with the
    module-level ``english_stemmer``.

    Parameters
    ----------
    rev : str
        Raw text. Any non-string value (``None``, ``NaN``, bytes, ...) is
        printed for diagnosis and treated as empty text.
    remove_stopwords : bool, default True
        Whether to drop NLTK English stopwords before stemming.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(rev, str):
        # The previous fallback assigned a list and then called .lower() on it,
        # which raised AttributeError. Report the value and return empty text.
        print(rev)
        return ''

    words = re.sub("[^a-zA-Z]", " ", rev).lower().split()
    if not words:
        return ''

    if remove_stopwords:
        # Cached set: rebuilding it per call re-read the corpus for every row.
        stop_set = _english_stopwords()
        words = [w for w in words if w not in stop_set]

    return ' '.join(english_stemmer.stem(w) for w in words)

In [None]:
# (rows, columns) of the full review table, shown before it is sampled.
review_data.shape

(484214, 12)

In [None]:
# Reproducible 10% sample of the reviews: missing values become empty strings,
# columns not used later are dropped, and rows are ordered by review timestamp.
part_data = (
    review_data
    .sample(frac=0.1, random_state=1)
    .fillna('')
    .drop(columns=['vote', 'style', 'image', 'reviewTime', 'reviewerName', 'verified'])
    .sort_values(by='unixReviewTime')
)
print(part_data.shape)
part_data.head()

(48421, 6)


Unnamed: 0,overall,reviewerID,asin,reviewText,summary,unixReviewTime
20,4.0,A29R3O02O0PON4,B00002EPZ2,"Well, if you haven't figured it out yet from t...",Are you a dedd'er too?,947635200
25,3.0,A2QOK3KRHQKGSG,B00002NDRY,"This is an excellent ""product"" from a guy who ...","This is an excellent ""product"" from a guy who...",947808000
31,5.0,A1LWYVLGA3N2R4,B00001XDKO,"Absolutely stunning graphics, incredible gamep...",Best game of 1999,948499200
28,5.0,A2QML92353G4GG,B00002NDSI,After countless hours of frustration and disap...,A breath of fresh air!,948499200
33,5.0,A1UF6GIMK5U1Z5,B00001KUII,"This is without a doubt, one of the most fanta...","Great, but use caution!",948585600


In [None]:
# Build one cleaned document per review: the cleaned review text followed by the
# cleaned summary. The parts are joined with a space; plain `+` glued the last
# review token onto the first summary token, creating a bogus merged word.
# Empty parts are skipped so no leading/trailing space is introduced.
part_data['docs'] = [
    ' '.join(part for part in (data_clean(body), data_clean(title)) if part)
    for body, title in zip(part_data['reviewText'], part_data['summary'])
]

In [None]:
# Preview the sample with the new 'docs' column.
part_data.head()

Unnamed: 0,overall,reviewerID,asin,reviewText,summary,unixReviewTime,docs
20,4.0,A29R3O02O0PON4,B00002EPZ2,"Well, if you haven't figured it out yet from t...",Are you a dedd'er too?,947635200,well figur yet review player assum role immort...
25,3.0,A2QOK3KRHQKGSG,B00002NDRY,"This is an excellent ""product"" from a guy who ...","This is an excellent ""product"" from a guy who...",947808000,excel product guy admit casual gamer actual ad...
31,5.0,A1LWYVLGA3N2R4,B00001XDKO,"Absolutely stunning graphics, incredible gamep...",Best game of 1999,948499200,absolut stun graphic incred gameplay best bot ...
28,5.0,A2QML92353G4GG,B00002NDSI,After countless hours of frustration and disap...,A breath of fresh air!,948499200,countless hour frustrat disappoint ultima onli...
33,5.0,A1UF6GIMK5U1Z5,B00001KUII,"This is without a doubt, one of the most fanta...","Great, but use caution!",948585600,without doubt one fantast game avili ever pc b...


In [None]:
# Number of sampled reviews per value of the 'overall' rating column.
part_data['overall'].value_counts()

5.0    28897
4.0     9238
3.0     4844
1.0     3091
2.0     2351
Name: overall, dtype: int64

In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Raw review text, one document per sampled row, in part_data's row order.
# NOTE(review): the cleaned 'docs' column is not used here — confirm that
# vectorising the raw 'reviewText' is intended.
corpus = list(part_data['reviewText'])

In [None]:
# TF-IDF features: a term must occur in at least 20 documents and in at most
# half of them, and the vocabulary is capped at 2000 terms.
vectorizer = TfidfVectorizer(min_df=20, max_df=0.5, max_features=2000)

# Learn the vocabulary and IDF weights from the leading 80% of rows only — the
# same rows that are later sliced off as the training split — then transform
# every row. Fitting on the full corpus leaked validation/test statistics into
# the features.
n_fit_rows = int(len(corpus) * 0.8)
vectorizer.fit(corpus[:n_fit_rows])
X = vectorizer.transform(corpus)
print(X.shape)

(48421, 2000)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding

In [None]:
# Positional 80% / 10% / 10% split of the feature rows into train / validation / test.
n_rows = X.shape[0]
train_end = int(n_rows * 0.8)
val_end = int(n_rows * 0.9)

train_x = X[:train_end]
val_x = X[train_end:val_end]
test_x = X[val_end:]

for split_name, split_matrix in (("train_x", train_x), ("val_x", val_x), ("test_x", test_x)):
    print(f"{split_name}: {split_matrix.shape}")

train_x: (38736, 2000)
val_x: (4842, 2000)
test_x: (4843, 2000)


In [None]:
# One-hot vectors for the five ratings: rating r sets position r - 1 to 1.
y_encode = {
    stars: [1 if slot == stars else 0 for slot in range(1, 6)]
    for stars in range(1, 6)
}
part_data['y'] = part_data['overall'].apply(lambda rating: y_encode[rating])

# Slice the labels at the same positional 80% / 90% boundaries as the features.
label_train_end = int(X.shape[0] * 0.8)
label_val_end = int(X.shape[0] * 0.9)
encoded_labels = part_data['y']

train_y = np.array(encoded_labels.iloc[:label_train_end].tolist())
val_y = np.array(encoded_labels.iloc[label_train_end:label_val_end].tolist())
test_y = np.array(encoded_labels.iloc[label_val_end:].tolist())

In [None]:
# Feed-forward classifier over the TF-IDF features: two hidden layers with
# dropout, then a 5-way softmax over the one-hot rating targets.
# The hidden layers now use ReLU. Without an activation, Dense is purely linear,
# so the stacked layers collapsed into a single linear map before the softmax.
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=train_x.shape[1]))
model.add(Dropout(rate=0.2))
# Only the first layer needs the input size; later layers infer it.
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# Stop training once validation loss has not improved for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the best-val_loss epoch;
# with False, the weights kept for evaluation were the ones from 10 epochs
# after the best point.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    mode='auto',
    restore_best_weights=True,
)

In [None]:
# Train with early stopping on the validation split.
history = model.fit(
    train_x,
    train_y,
    validation_data=(val_x, val_y),
    epochs=100,
    batch_size=64,
    verbose=1,
    callbacks=[callback],
)

# evaluate() returns [loss, accuracy] for the compiled metrics; show the accuracy.
test_loss, test_accuracy = model.evaluate(test_x, test_y)
test_accuracy

Train on 38736 samples, validate on 4842 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


0.70865166