In [None]:
import os
import pickle
import gzip
import json
import pandas as pd
import numpy as np

from config import RAW_DIR, PRE_DIR, RES_DIR
from utils.data_porter import read_from_csv, save_to_csv

In [None]:
# Load the pickled review table stored under PRE_DIR.
# pickle.load can execute arbitrary code embedded in the file, so only point
# this at pickles produced by this project.
review_pickle_path = os.path.join(PRE_DIR, 'review_data.pkl')
with open(review_pickle_path, 'rb') as pickle_file:
    review_data = pickle.load(pickle_file)

# Quick sanity check: overall size plus the first few rows.
print(review_data.shape)
review_data.head()

In [None]:
# Largest value in the reviewTime column.
# NOTE(review): if reviewTime holds raw strings rather than datetimes, max()
# compares lexicographically and is not the latest date — confirm the dtype.
review_data['reviewTime'].max()

In [None]:
# Smallest value in the reviewTime column.
# NOTE(review): if reviewTime holds raw strings rather than datetimes, min()
# compares lexicographically and is not the earliest date — confirm the dtype.
review_data['reviewTime'].min()

# Keywords

In [None]:
# TODO: keyword extraction is not implemented yet.
# A dangling "import" with no module name was removed from this cell: it is a
# SyntaxError and stops the notebook from running top to bottom.

# NLP data processing

In [None]:
import re
import nltk
# One-time setup (left disabled): the stop-word corpus can be fetched with
#   nltk.download('stopwords')
# NOTE(review): the original comment said "Add manually" — presumably the corpus
# was installed by hand on this machine; confirm.
from nltk.corpus import stopwords
# NOTE(review): PorterStemmer is imported but not referenced in the visible cells.
from nltk.stem.porter import PorterStemmer
# Shared English Snowball stemmer; data_clean() uses it to stem every token.
english_stemmer=nltk.stem.SnowballStemmer('english')

In [None]:
def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        _english_stopwords._cache = cached
    return cached


def data_clean(rev, remove_stopwords=True, stemmer=None):
    """Normalise one piece of review text into a space-joined string of stems.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case, split on whitespace, optionally drop English stop words, stem
    each remaining token, and join the stems with single spaces.

    Parameters
    ----------
    rev : str
        Raw text. Any non-string value (None, NaN, bytes, numbers, ...) is
        printed for diagnosis and cleaned to the empty string.
    remove_stopwords : bool, default True
        When True, tokens found in NLTK's English stop-word list are removed
        before stemming.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each token. Defaults to the module-level
        ``english_stemmer``.

    Returns
    -------
    str
        The cleaned text; ``''`` when nothing survives cleaning.
    """
    if not isinstance(rev, str):
        # The regex substitution only accepts str. Previously this path fell
        # through to ``[].lower()`` and raised AttributeError; report the
        # offending value and return an empty document instead.
        print(rev)
        return ''

    words = re.sub("[^a-zA-Z]", " ", rev).lower().split()

    if remove_stopwords:
        # The stop-word set is cached so it is not re-read for every review.
        stop_words = _english_stopwords()
        words = [word for word in words if word not in stop_words]

    if stemmer is None:
        stemmer = english_stemmer

    return ' '.join(stemmer.stem(word) for word in words)

In [None]:
# (rows, columns) of the full review table, shown for reference before a
# fraction of it is sampled in the next cell.
review_data.shape

In [None]:
# Work on a reproducible 10% sample of the reviews:
#   - missing values in every column are replaced with '',
#   - columns that the later cells do not use are dropped,
#   - rows are ordered by unixReviewTime.
unused_columns = ['vote', 'style', 'image', 'reviewTime', 'reviewerName', 'verified']
part_data = (
    review_data
    .sample(frac=0.1, random_state=1)
    .fillna('')
    .drop(columns=unused_columns)
    .sort_values(by='unixReviewTime')
)
print(part_data.shape)
part_data.head()

In [None]:
# Build one cleaned document per review from its body and its summary.
# The two cleaned parts are joined with a single space (empty parts are
# skipped) so the last token of the review body does not fuse with the first
# token of the summary.
part_data['docs'] = part_data.apply(
    lambda row: ' '.join(
        part
        for part in (data_clean(row.reviewText), data_clean(row.summary))
        if part
    ),
    axis=1,
)

In [None]:
# Preview the first rows of the sample, which now include the derived 'docs' column.
part_data.head()

In [None]:
# Number of sampled reviews per distinct 'overall' value (the column used as
# the class label further down).
part_data['overall'].value_counts()

In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Documents fed to the TF-IDF vectorizer, one string per sampled review.
# NOTE(review): this uses the raw 'reviewText' column, so the cleaned and
# stemmed 'docs' column computed earlier is never used in the visible cells —
# confirm whether part_data['docs'] was intended here.
corpus = part_data['reviewText'].tolist()

In [None]:
# Turn the corpus into a TF-IDF matrix with a bounded vocabulary.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/validation/test split, so document-frequency statistics of the
# held-out rows influence the features — confirm this is acceptable.
vectorizer = TfidfVectorizer(
    max_features=2000,  # keep at most the 2000 most frequent terms
    min_df=20,          # ignore terms that occur in fewer than 20 documents
    max_df=0.5,         # ignore terms that occur in more than half the documents
)
X = vectorizer.fit_transform(corpus)
print(X.shape)

(48421, 2000)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding

In [None]:
# Positional 80% / 10% / 10% split of the feature matrix rows.
n_rows = X.shape[0]
train_end = int(n_rows * 0.8)
val_end = int(n_rows * 0.9)

train_x = X[:train_end]
val_x = X[train_end:val_end]
test_x = X[val_end:]

for split_label, split_matrix in (("train_x", train_x), ("val_x", val_x), ("test_x", test_x)):
    print(f"{split_label}: {split_matrix.shape}")

In [None]:
# One-hot encoding of the 1-5 'overall' rating: rating r maps to a length-5
# list with a 1 at position r - 1 and 0 elsewhere.
y_encode = {
    rating: [1 if position == rating else 0 for position in range(1, 6)]
    for rating in range(1, 6)
}
part_data['y'] = part_data['overall'].apply(lambda rating: y_encode[rating])

# Cut the labels at the same 80% / 90% row positions used for the feature matrix X.
n_rows = X.shape[0]
train_end = int(n_rows * 0.8)
val_end = int(n_rows * 0.9)
encoded_labels = part_data['y']

train_y = np.array(encoded_labels[:train_end].tolist())
val_y = np.array(encoded_labels[train_end:val_end].tolist())
test_y = np.array(encoded_labels[val_end:].tolist())

In [None]:
# Feed-forward classifier over the TF-IDF features: two hidden layers with
# dropout, then a 5-way softmax matching the one-hot rating labels.
#
# The hidden layers use ReLU. With Keras' default (no activation) a Dense layer
# is purely linear, so the stacked hidden layers would collapse into a single
# linear map and add no modelling capacity.
model = Sequential()
model.add(Dense(256, input_dim=train_x.shape[1], activation='relu'))
model.add(Dropout(rate=0.2))
# Only the first layer needs the input size; later layers infer it.
model.add(Dense(128, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(5, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Stop training once val_loss has failed to improve for 10 epochs in a row.
# verbose is left at its default.
# NOTE(review): with restore_best_weights=False the model keeps the weights
# from the last epoch run, not from the epoch with the best val_loss — confirm
# that this is what should be evaluated on the test split.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    patience=10,
    restore_best_weights=False,
)

In [None]:
# Train for at most 100 epochs in mini-batches of 64, tracking the validation
# split each epoch; the early-stopping callback may end training sooner.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=64,
    epochs=100,
    verbose=1,
    callbacks=[callback],
    validation_data=(val_x, val_y),
)

# evaluate() returns [loss, accuracy] for a model compiled with
# metrics=['accuracy']; index 1 is the accuracy on the test split.
model.evaluate(x=test_x, y=test_y)[1]