In [55]:
from google.colab import drive
drive.mount('/content/drive')
# drive/MyDrive/Colab Notebooks/..

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Dataset
load 'amazon_reviews' dataset and discover it

In [56]:
import pandas as pd

In [57]:
amazon_reviews = pd.read_csv('drive/MyDrive/Colab Notebooks/amazon_reviews.csv')

In [58]:
print('Amazon Reviews Data Head:')
print('-------------------------')
print(amazon_reviews.head().to_markdown(tablefmt="github", index=False))

Amazon Reviews Data Head:
-------------------------
| sentiments   | cleaned_review                                                                                                                                                                                                                                                                                                                                                                                                                                                              |   cleaned_review_length |   review_score |
|--------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [59]:
print("amazon reviews information:")
print(amazon_reviews.info())

amazon reviews information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17340 entries, 0 to 17339
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sentiments             17340 non-null  object 
 1   cleaned_review         17337 non-null  object 
 2   cleaned_review_length  17340 non-null  int64  
 3   review_score           17340 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 542.0+ KB
None


In [60]:
print(amazon_reviews['sentiments'].value_counts())

sentiments
positive    9503
neutral     6303
negative    1534
Name: count, dtype: int64


In [61]:
# get max review length to use it in the padding
max_review_length = amazon_reviews['cleaned_review_length'].max()
print(f'max cleaned review length: {max_review_length}')

max cleaned review length: 571


In [62]:
# Remove rows where the 'cleaned_review_length' column is 0
print(f"amazon reviews shape before remove 0 length: {amazon_reviews.shape}")
amazon_reviews = amazon_reviews[amazon_reviews['cleaned_review_length'] != 0]
print(f"amazon reviews shape before after 0 length: {amazon_reviews.shape}")

amazon reviews shape before remove 0 length: (17340, 4)
amazon reviews shape before after 0 length: (17321, 4)


# Data Pre-processing (if needed)
to clean your data and provide a valid dataset for the models to be trained, like removing stopwords using NLTK



In [63]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [64]:
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

we add **'data cleaning'** to ensure that data will come from the input part (bonus part) will be cleaned as our amazon reviews data.



In [65]:
def text_cleaning(text):
    # remove email address
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', text)
    # remove puctuations and numbers
    text = re.sub(r'[^A-Za-z\s]', ' ', text)

    return text

In [66]:
def preprocessing(text):
    # perform text cleaning (removing irrelevant words, symbols, etc.)
    clean_text = text_cleaning(text)
    # tokenization for lowercase words
    text_tokens = word_tokenize(clean_text.lower())
    # remove all stopwords
    stopwrds = set(stopwords.words('english'))
    text_rmstop = [i for i in text_tokens if i not in stopwrds]
    # limmatize all words
    lemmatizer = WordNetLemmatizer()
    text_lemm = [lemmatizer.lemmatize(w) for w in text_rmstop]

    return ' '.join(text_lemm)

In [67]:
# apply preprocessing step
amazon_reviews['cleaned_review'] = amazon_reviews['cleaned_review'] .apply(preprocessing)

In [68]:
print('Amazon Reviews Data Head after Preprocessing:')
print('---------------------------------------------')
print(amazon_reviews.head().to_markdown(tablefmt="github", index=False))

Amazon Reviews Data Head after Preprocessing:
---------------------------------------------
| sentiments   | cleaned_review                                                                                                                                                                                                                                                                                          |   cleaned_review_length |   review_score |
|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------|----------------|
| positive     | wish would gotten one earlier love make working laptop much easier                                                                                                             

 # Data Splitting
 apply data splitting for your; 80% as training set and 20% as
 validation set.

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
# map the sentiments to numbers to be used in the models
amazon_reviews['sentiments'] = amazon_reviews['sentiments'].map({'negative': 0, 'neutral': 1, 'positive': 2})

In [71]:
# Split the data
def data_splitting(training_ratio):
    return train_test_split(amazon_reviews['cleaned_review'], amazon_reviews['sentiments'], train_size=training_ratio, random_state=42)

x_train, x_validation, y_train, y_validation = data_splitting(0.8)

 # Word Embedding
 build your vocabulary by extracting and indexing unique words,
 convert each review to a sequence of indices, then apply sequence padding to
 have all sequences of the same length in preparation for input to the embedding
 layer.

In [72]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [73]:
def train_tokinizer(x_train):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x_train)

  vocab_size = len(tokenizer.word_index) + 1

  return tokenizer, vocab_size

def build_vocab(tokenizer, text, padding_length):
  # Convert text to sequences
  text_seq = tokenizer.texts_to_sequences(text)
  # Padding sequences
  text_pad = pad_sequences(text_seq, maxlen=padding_length)

  return text_pad

In [74]:
tokenizer, vocab_size = train_tokinizer(x_train)
x_train_pad, x_validation_pad= build_vocab(tokenizer, x_train, max_review_length), build_vocab(tokenizer, x_validation, max_review_length)

In [75]:
print('Shape of x_train:', x_train_pad.shape)
print('Shape of x_validation:', x_validation_pad.shape)

Shape of x_train: (13856, 571)
Shape of x_validation: (3465, 571)


In [76]:
print(f'Vocabulary size: {vocab_size} unique tokens.')

Vocabulary size: 7846 unique tokens.


 # Model Training
 You will train two models simpleRNN and LSTM and print the
 accuracy for each model on testing data.

## 1. Simple RNN model

In [77]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, Dense, SpatialDropout1D

In [78]:
def create_rnn_model(vocab_size, embedding_dim, padding_length, units):
    model = Sequential([
      # Embedding layer to convert words to vectors of fixed size (embedding_dim)
      Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=padding_length),
      # recurrent layer with 'units' hidden neurans
      SimpleRNN(units), # default activation='tanh'
      Dense(3, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [79]:
# Train RNN model
rnn_model = create_rnn_model(vocab_size, 200, max_review_length, 32)
rnn_model.fit(x_train_pad, y_train, epochs=5, batch_size=64, validation_data=(x_validation_pad, y_validation))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d5588ca2080>

In [80]:
print(rnn_model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 571, 200)          1569200   
                                                                 
 simple_rnn_3 (SimpleRNN)    (None, 32)                7456      
                                                                 
 dense_4 (Dense)             (None, 3)                 99        
                                                                 
Total params: 1576755 (6.01 MB)
Trainable params: 1576755 (6.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [81]:
# Evaluate RNN
rnn_loss, rnn_accuracy = rnn_model.evaluate(x_validation_pad, y_validation)
print(f'RNN Accuracy: {rnn_accuracy*100.0:.2f}%')

RNN Accuracy: 86.03%


## 2. LSTM model

In [82]:
def create_lstm_model(vocab_size, embedding_dim, padding_length, units, dropout):
    model = Sequential([
        # Embedding layer to convert words to vectors of fixed size (embedding_dim)
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=padding_length),
        # SpatialDropout1D layer applies random dropout to the input (1D feature vectors) of the LSTM layer to prevent overfitting
        SpatialDropout1D(dropout),
        # number of hidden neurans, dropout rate for input units
        LSTM(units, dropout=dropout),
        Dense(3, activation='softmax')
    ])

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [83]:
# Train LSTM model
lstm_model = create_lstm_model(vocab_size, 200, max_review_length, 32, 0.2)
lstm_model.fit(x_train_pad, y_train, epochs=5, batch_size=64, validation_data=(x_validation_pad, y_validation))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7d5517508880>

In [84]:
print(lstm_model.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 571, 200)          1569200   
                                                                 
 spatial_dropout1d_1 (Spati  (None, 571, 200)          0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 32)                29824     
                                                                 
 dense_5 (Dense)             (None, 3)                 99        
                                                                 
Total params: 1599123 (6.10 MB)
Trainable params: 1599123 (6.10 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [85]:
# Evaluate LSTM
lstm_loss, lstm_accuracy = lstm_model.evaluate(x_validation_pad, y_validation)
print(f'LSTM Accuracy: {lstm_accuracy*100:.2f}%')

LSTM Accuracy: 87.47%


 # Bonus:


 ### 1. Input
 Allow the user to input a new review and predict the result.

In [86]:
sentiments = {0: 'negative', 1: 'neutral', 2: 'positive'}

In [87]:
def predict_review(model, review):
  review = preprocessing(review)
  review_pad = build_vocab(tokenizer, [review], max_review_length)

  y_pred = model.predict(review_pad)
  sentiment_pred = np.argmax(y_pred)
  return sentiments[sentiment_pred]

In [89]:
while True:
  review = input('Enter your review (-1 to stop): ')
  if review == "-1":
    break

  rnn_prediction = predict_review(rnn_model, review)
  print(f'predicted sentiment by RNN model: {rnn_prediction}')

  lstm_prediction = predict_review(lstm_model, review)
  print(f'predicted sentiment by LSTM model: {lstm_prediction}')
  print()

Enter your review (-1 to stop): I love it
predicted sentiment by RNN model: positive
predicted sentiment by LSTM model: positive

Enter your review (-1 to stop): Bad! so bad
predicted sentiment by RNN model: negative
predicted sentiment by LSTM model: negative

Enter your review (-1 to stop): It's ok
predicted sentiment by RNN model: neutral
predicted sentiment by LSTM model: neutral

Enter your review (-1 to stop): -1


### 2. Report
Provide a report that shows model summary of each model and the best
 hyperparameters for each model (splitting ratio, sequence padding length
 … ) with a table showing the accuracy against each parameter (i.e. 80% 20%
 ratio, 70% 30% ratio, and same for sequence padding length).



> Provided as PDF in the submitted folder.

