In [62]:
import nltk
import string

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# preprocessing documents
def preprocess(document):
    # 1. Lowercase Sentence
    document = document.lower()

    # 2. Sentence Splitter
    bag_of_document = document.split(' ')

    # 3. Punctuation Remove
    for i, word in enumerate(bag_of_document):
            bag_of_document[i] = word.translate(str.maketrans('', '', string.punctuation))

    # 4. Stop Word Remove
    #print(stopwords.words('english'))
    bag_of_document = [word for word in bag_of_document if word not in stopwords.words('english')]

    # 5. Lemmatize
    for i, word in enumerate(bag_of_document):
            bag_of_document[i] = lemmatizer.lemmatize(word)

    bag_of_document = ' '.join(bag_of_document)

    return bag_of_document

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kilometers/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kilometers/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kilometers/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [63]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv('Reviews.csv')
df.drop(['UserId', 'ProductId', 'Id', 'ProfileName','HelpfulnessDenominator', 'HelpfulnessNumerator', 'Time', 'Summary'], axis=1, inplace=True)

In [64]:
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    df.at[idx, 'Text'] = preprocess(row['Text'])

100%|██████████| 568454/568454 [1:53:13<00:00, 83.68it/s]  


In [65]:
df.to_csv('Output.csv')

In [3]:
import pandas as pd

df = pd.read_csv('Output.csv')

df['Rating'] = df.Score.apply(lambda x: 1 if x in [3, 4, 5] else 0)
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df.head()

Unnamed: 0,Score,Text,Rating
0,5,bought several vitality canned dog food produc...,1
1,1,product arrived labeled jumbo salted peanutsth...,0
2,4,confection around century light pillowy citru...,1
3,2,looking secret ingredient robitussin believe f...,0
4,5,great taffy great price wide assortment yummy...,1


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 40000
embedding_dim = 16
max_length = 120

split = round(len(df)*0.7)

train_text = df['Text'][:split]
train_rating = df['Rating'][:split]

test_text = df['Text'][split:]
test_rating = df['Rating'][split:]

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_text)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(train_text)
padded = pad_sequences(sequences, maxlen=max_length, truncating='post')

testing_sentences = tokenizer.texts_to_sequences(test_text)
testing_padded = pad_sequences(testing_sentences, maxlen=max_length)


In [8]:
import tensorflow as tf
import tensorflow_addons as tfa

rnn_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

rnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tfa.metrics.F1Score(num_classes=1)])

#rnn_model.summary()

num_epochs = 20
history = rnn_model.fit(padded, train_rating, epochs=num_epochs, steps_per_epoch=1000,validation_data=(testing_padded, test_rating))

rnn_model.save('rnn model')

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 120, 16)           640000    
                                                                 
 global_average_pooling1d_3   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_6 (Dense)             (None, 6)                 102       
                                                                 
 dense_7 (Dense)             (None, 1)                 7         
                                                                 
Total params: 640,109
Trainable params: 640,109
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epo

In [11]:
y_pred = rnn_model.predict(testing_padded)

from sklearn.metrics import classification_report

y_pred[y_pred>=0.5] = 1
y_pred[y_pred<0.5] = 0

print(classification_report(test_rating, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.69      0.73     24068
           1       0.95      0.97      0.96    146468

    accuracy                           0.93    170536
   macro avg       0.86      0.83      0.84    170536
weighted avg       0.93      0.93      0.93    170536



In [14]:
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall(), tfa.metrics.F1Score(num_classes=1)])

num_epochs = 5
history = lstm_model.fit(padded, train_rating, epochs=num_epochs, steps_per_epoch=1000,validation_data=(testing_padded, test_rating))

lstm_model.save('lstm model')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




INFO:tensorflow:Assets written to: lstm model/assets


INFO:tensorflow:Assets written to: lstm model/assets


In [15]:
y_pred = lstm_model.predict(testing_padded)

from sklearn.metrics import classification_report

y_pred[y_pred>=0.5] = 1
y_pred[y_pred<0.5] = 0

print(classification_report(test_rating, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.65      0.72     24068
           1       0.94      0.98      0.96    146468

    accuracy                           0.93    170536
   macro avg       0.88      0.81      0.84    170536
weighted avg       0.93      0.93      0.93    170536

