In [62]:
import nltk
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa

from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# preprocessing documents
def preprocess(document):
    # 1. Lowercase Sentence
    document = document.lower()

    # 2. Sentence Splitter
    bag_of_document = document.split(' ')

    # 3. Punctuation Remove
    for i, word in enumerate(bag_of_document):
            bag_of_document[i] = word.translate(str.maketrans('', '', string.punctuation))

    # 4. Stop Word Remove
    #print(stopwords.words('english'))
    bag_of_document = [word for word in bag_of_document if word not in stopwords.words('english')]

    # 5. Lemmatize
    for i, word in enumerate(bag_of_document):
            bag_of_document[i] = lemmatizer.lemmatize(word)

    bag_of_document = ' '.join(bag_of_document)

    return bag_of_document

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kilometers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kilometers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kilometers/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [63]:
df = pd.read_csv('Reviews.csv')
df.drop(['UserId', 'ProductId', 'Id', 'ProfileName','HelpfulnessDenominator', 'HelpfulnessNumerator', 'Time', 'Summary'], axis=1, inplace=True)

In [64]:
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    df.at[idx, 'Text'] = preprocess(row['Text'])

100%|██████████| 568454/568454 [1:53:13<00:00, 83.68it/s]  


In [65]:
df.to_csv('Output.csv')

In [21]:
df = pd.read_csv('Output.csv')

df['Rating'] = df.Score.apply(lambda x: 1 if x in [3, 4, 5] else 0)
df.head()

Unnamed: 0.1,Unnamed: 0,Score,Text,Rating
0,0,5,bought several vitality canned dog food produc...,1
1,1,1,product arrived labeled jumbo salted peanutsth...,0
2,2,4,confection around century light pillowy citru...,1
3,3,2,looking secret ingredient robitussin believe f...,0
4,4,5,great taffy great price wide assortment yummy...,1


In [16]:
dataset_split = round(len(df)*0.7)

train_text = df['Text'][:dataset_split]
train_rating = df['Rating'][:dataset_split]

test_text = df['Text'][dataset_split:]
test_rating = df['Rating'][dataset_split:]

token = Tokenizer(num_words=30000)
token.fit_on_texts(train_text)

padded = pad_sequences(token.texts_to_sequences(train_text), maxlen=100, truncating='post')
testing_padded = pad_sequences(token.texts_to_sequences(test_text), maxlen=100)

In [17]:
# RNN Model
rnn_model = tf.keras.Sequential([tf.keras.layers.Embedding(30000, 15, input_length=100), tf.keras.layers.GlobalAveragePooling1D(), tf.keras.layers.Dense(7, activation='relu'), tf.keras.layers.Dense(1, activation='sigmoid')])
rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tfa.metrics.F1Score(num_classes=1)])
rnn_model.summary()
rnn_model.fit(padded, train_rating, epochs=20, steps_per_epoch=1000, validation_data=(testing_padded, test_rating))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: rnn model/assets


INFO:tensorflow:Assets written to: rnn model/assets


In [25]:
print(classification_report(test_rating, (rnn_model.predict(testing_padded)>=0.5).astype(np.uint8)))

              precision    recall  f1-score   support

           0       0.79      0.66      0.72     24068
           1       0.95      0.97      0.96    146468

    accuracy                           0.93    170536
   macro avg       0.87      0.81      0.84    170536
weighted avg       0.92      0.93      0.92    170536



In [19]:
# LSTM Model
lstm_model = tf.keras.Sequential([tf.keras.layers.Embedding(30000, 15, input_length=100), tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(30)), tf.keras.layers.Dense(7, activation='relu'), tf.keras.layers.Dense(1, activation='sigmoid')])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tfa.metrics.F1Score(num_classes=1)])
lstm_model.summary()
lstm_model.fit(padded, train_rating, epochs=20, steps_per_epoch=1000, validation_data=(testing_padded, test_rating))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20




INFO:tensorflow:Assets written to: lstm model/assets


INFO:tensorflow:Assets written to: lstm model/assets


In [24]:
print(classification_report(test_rating, (lstm_model.predict(testing_padded)>=0.5).astype(np.uint8)))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74     24068
           1       0.96      0.96      0.96    146468

    accuracy                           0.93    170536
   macro avg       0.85      0.85      0.85    170536
weighted avg       0.93      0.93      0.93    170536

