In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, concatenate, Input
from tensorflow.keras.callbacks import ModelCheckpoint

# Fetch the NLTK resources the preprocessing step relies on
# (stop-word list, WordNet for lemmatization, punkt for tokenization).
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Load the reviews. Cells that hold exactly one space are treated as missing,
# then every row that has any missing value is dropped.
data = pd.read_csv("amazon_reviews.csv").replace(' ', pd.NA).dropna()

# Shared text-cleaning helpers used by data_preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# data preprocessing
def data_preprocessing(row):
    """Return a cleaned, space-joined version of one review string.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, and the remainder is split with ``word_tokenize``.
    Tokens present in the module-level ``stop_words`` set are discarded; the
    surviving tokens are passed through the module-level ``lemmatizer`` and
    joined back together with single spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', row.lower())
    kept_lemmas = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(letters_only)
        if word not in stop_words
    ]
    return ' '.join(kept_lemmas)

# Overwrite the review text with its cleaned form, then recount the words so
# cleaned_review_length describes the text that will actually be tokenized.
data['cleaned_review'] = data['cleaned_review'].map(data_preprocessing)
data['cleaned_review_length'] = data['cleaned_review'].map(lambda review: len(review.split()))

# Hyperparameter grid: test-set fractions and padded sequence lengths.
# The last maxlen candidate is the longest cleaned review, i.e. no truncation.
split_ratios = [0.2, 0.3, 0.4]
maxlength = max(data['cleaned_review_length'])
maxlen_values = [100, 150, maxlength]

# One [maxlen, split_ratio, val_accuracy] row is appended per trained model
results_SimpleRNN, results_LSTM = [], []

# Running best score and the grid point that produced it, per architecture
best_accuracy_simple_rnn = best_accuracy_lstm = 0
best_parameters_simple_rnn, best_parameters_lstm = {}, {}

def _train_recurrent_classifier(recurrent_layer, vocab_size, maxlen, num_classes,
                                train_set, validation_set,
                                checkpoint_path, accuracy_to_beat):
    """Build, compile and fit one Embedding -> recurrent -> softmax classifier.

    Parameters
    ----------
    recurrent_layer : Keras layer class
        ``SimpleRNN`` or ``LSTM``; it is instantiated with 100 units.
    vocab_size : int
        Number of rows in the embedding table.
    maxlen : int
        Length of every padded input sequence.
    num_classes : int
        Width of the softmax output layer.
    train_set, validation_set : tuple
        ``(padded_sequences, encoded_labels)`` pairs.
    checkpoint_path : str
        File that holds the best model found so far for this architecture.
    accuracy_to_beat : float
        Best validation accuracy reached by earlier runs of this architecture.

    Returns
    -------
    tuple
        ``(model, history)`` as produced by Keras.
    """
    inputs = Input(shape=(maxlen,))
    # The Input layer already fixes the sequence length, so the redundant
    # (and, in recent Keras, deprecated) ``input_length`` argument is omitted.
    embedded = Embedding(vocab_size, 100)(inputs)
    encoded = recurrent_layer(100)(embedded)
    outputs = Dense(num_classes, activation='softmax')(encoded)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # The score recorded for a run is the best val_accuracy over its epochs,
    # so the model written to disk must come from that same epoch.  Saving
    # after fit() would store the final-epoch weights instead.  Seeding the
    # checkpoint with the best score of earlier runs means the file is only
    # overwritten by an epoch that beats everything seen so far.
    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        initial_value_threshold=accuracy_to_beat,
        verbose=0,
    )
    history = model.fit(
        train_set[0], train_set[1],
        validation_data=validation_set,
        epochs=5, batch_size=64, verbose=1,
        callbacks=[checkpoint],
    )
    return model, history


for maxlen in maxlen_values:
    for split_ratio in split_ratios:
        # Split data (fixed seed: the same split_ratio always yields the same split)
        X_train, X_test, y_train, y_test = train_test_split(data['cleaned_review'], data['sentiments'], test_size=split_ratio, random_state=42)

        # Tokenization: the vocabulary is learned from the training texts only
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)
        X_train_text = tokenizer.texts_to_sequences(X_train)
        X_test_text = tokenizer.texts_to_sequences(X_test)

        vocab_size = len(tokenizer.word_index) + 1

        # Encoding the target variable
        encoder = LabelEncoder()
        y_train_encoded = encoder.fit_transform(y_train)
        y_test_encoded = encoder.transform(y_test)
        num_classes = len(encoder.classes_)

        # Padding (same length)
        X_train_text_padded = pad_sequences(X_train_text, maxlen=maxlen)
        X_test_text_padded = pad_sequences(X_test_text, maxlen=maxlen)

        # NOTE(review): the held-out split doubles as the validation set and as
        # the model-selection criterion, so the reported accuracies are
        # optimistic estimates of performance on unseen data.
        train_set = (X_train_text_padded, y_train_encoded)
        validation_set = (X_test_text_padded, y_test_encoded)

        # ----- SimpleRNN -----
        model_simple_rnn, history_simple_rnn = _train_recurrent_classifier(
            SimpleRNN, vocab_size, maxlen, num_classes,
            train_set, validation_set,
            "best_model_simple_rnn.h5", best_accuracy_simple_rnn,
        )

        # Check if the current model is better than the previous best
        # (the checkpoint callback has already written the model file if so)
        val_accuracy_simple_rnn = max(history_simple_rnn.history['val_accuracy'])
        if val_accuracy_simple_rnn > best_accuracy_simple_rnn:
            best_accuracy_simple_rnn = val_accuracy_simple_rnn
            best_parameters_simple_rnn = {'maxlen': maxlen, 'split_ratio': split_ratio}

        results_SimpleRNN.append([maxlen, split_ratio, val_accuracy_simple_rnn])

        # accuracy for SimpleRNN
        print(f"SimpleRNN - maxlen: {maxlen}, split_ratio: {split_ratio}, val_accuracy: {val_accuracy_simple_rnn}")

        # ----- LSTM -----
        model_lstm, history_lstm = _train_recurrent_classifier(
            LSTM, vocab_size, maxlen, num_classes,
            train_set, validation_set,
            "best_model_lstm.h5", best_accuracy_lstm,
        )

        # Check if the current model is better than the previous best
        # (the checkpoint callback has already written the model file if so)
        val_accuracy_lstm = max(history_lstm.history['val_accuracy'])
        if val_accuracy_lstm > best_accuracy_lstm:
            best_accuracy_lstm = val_accuracy_lstm
            best_parameters_lstm = {'maxlen': maxlen, 'split_ratio': split_ratio}

        results_LSTM.append([maxlen, split_ratio, val_accuracy_lstm])

        # accuracy for LSTM
        print(f"LSTM - maxlen: {maxlen}, split_ratio: {split_ratio}, val_accuracy: {val_accuracy_lstm}")

# Per-run summary tables: one row per (maxlen, split_ratio) grid point
summary_columns = ['maxlen', 'split_ratio', 'Accuracy']
results_SimpleRNN_df = pd.DataFrame(results_SimpleRNN, columns=summary_columns)
results_LSTM_df = pd.DataFrame(results_LSTM, columns=summary_columns)

# Winning grid point for each architecture, followed by the score it reached
headline_rows = (
    ("Best parameters for SimpleRNN:", best_parameters_simple_rnn),
    ("Best parameters for LSTM:", best_parameters_lstm),
    ("Best accuracy for SimpleRNN:", best_accuracy_simple_rnn),
    ("Best accuracy for LSTM:", best_accuracy_lstm),
)
for caption, reported_value in headline_rows:
    print(caption, reported_value)

# Full result table of every trained model, SimpleRNN first
for heading, summary_table in (
    ('SimpleRNN summary', results_SimpleRNN_df),
    ('LSTM summary', results_LSTM_df),
):
    print(heading)
    print(summary_table)




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!




Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


SimpleRNN - maxlen: 100, split_ratio: 0.2, val_accuracy: 0.8730158805847168
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM - maxlen: 100, split_ratio: 0.2, val_accuracy: 0.8724386692047119
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
SimpleRNN - maxlen: 100, split_ratio: 0.3, val_accuracy: 0.8524148464202881
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM - maxlen: 100, split_ratio: 0.3, val_accuracy: 0.8560708165168762
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
SimpleRNN - maxlen: 100, split_ratio: 0.4, val_accuracy: 0.847597062587738
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM - maxlen: 100, split_ratio: 0.4, val_accuracy: 0.8491845726966858
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
SimpleRNN - maxlen: 150, split_ratio: 0.2, val_accuracy: 0.86753249168396
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM - maxlen: 150, split_ratio: 0.2, val_accuracy: 0.8701298832893372
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
SimpleRNN - max

  saving_api.save_model(


LSTM - maxlen: 313, split_ratio: 0.2, val_accuracy: 0.8750360608100891
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
SimpleRNN - maxlen: 313, split_ratio: 0.3, val_accuracy: 0.8449105024337769
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM - maxlen: 313, split_ratio: 0.3, val_accuracy: 0.8579949736595154
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
SimpleRNN - maxlen: 313, split_ratio: 0.4, val_accuracy: 0.8507721424102783
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
LSTM - maxlen: 313, split_ratio: 0.4, val_accuracy: 0.8487516045570374
Best parameters for SimpleRNN: {'maxlen': 100, 'split_ratio': 0.2}
Best parameters for LSTM: {'maxlen': 313, 'split_ratio': 0.2}
Best accuracy for SimpleRNN: 0.8730158805847168
Best accuracy for LSTM: 0.8750360608100891
SimpleRNN summary
   maxlen  split_ratio  Accuracy
0     100          0.2  0.873016
1     100          0.3  0.852415
2     100          0.4  0.847597
3     150          0.2  0.867532
4     150          0.3  0.855878

In [2]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

def _rebuild_text_pipeline(split_ratio):
    """Re-create the tokenizer and label encoder fitted for one split ratio.

    The training loop refits ``tokenizer`` and ``encoder`` on every grid
    iteration, so the globals left behind belong to the *last* run, not to the
    run that produced a saved best model.  A tokenizer fitted on a different
    training split assigns different integer ids to words, and feeding those
    ids to the saved model would look up the wrong embedding rows.

    The split is repeated here with the same arguments the training cell uses
    (same columns, same ``random_state=42``), so as long as ``data`` has not
    changed since training the resulting vocabulary is the one the model saw.

    Parameters
    ----------
    split_ratio : float
        ``test_size`` recorded in the best-parameters dict of the model.

    Returns
    -------
    tuple
        ``(fitted_tokenizer, fitted_encoder)``.
    """
    train_reviews, _, train_labels, _ = train_test_split(
        data['cleaned_review'], data['sentiments'],
        test_size=split_ratio, random_state=42,
    )
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(train_reviews)
    fitted_encoder = LabelEncoder()
    fitted_encoder.fit(train_labels)
    return fitted_tokenizer, fitted_encoder


# Load the best SimpleRNN model
best_model_simple_rnn = tf.keras.models.load_model("best_model_simple_rnn.h5")
# Load the best LSTM model
best_model_lstm = tf.keras.models.load_model("best_model_lstm.h5")

# User input for a new review
new_review = input("Enter your review: ")
# Preprocess the new review exactly like the training texts
cleaned_review = data_preprocessing(new_review)

# ----- SimpleRNN -----
# Tokenize with the vocabulary the best SimpleRNN model was trained on
tokenizer_simple_rnn, encoder_simple_rnn = _rebuild_text_pipeline(best_parameters_simple_rnn['split_ratio'])
new_review_sequence = tokenizer_simple_rnn.texts_to_sequences([cleaned_review])
# Pad to the sequence length that model expects
new_review_padded = pad_sequences(new_review_sequence, maxlen=best_parameters_simple_rnn['maxlen'])
# Predict using the best SimpleRNN model
predicted_sentiment_SimpleRNN = best_model_simple_rnn.predict(new_review_padded)
# Decode the predicted sentiment (single review, so the flat argmax is the class index)
predicted_sentiment_label_SimpleRNN = encoder_simple_rnn.classes_[np.argmax(predicted_sentiment_SimpleRNN)]
print("Predicted sentiment SimpleRNN:", predicted_sentiment_label_SimpleRNN)

# ----- LSTM -----
# Tokenize with the vocabulary the best LSTM model was trained on
tokenizer_lstm, encoder_lstm = _rebuild_text_pipeline(best_parameters_lstm['split_ratio'])
new_review_sequence = tokenizer_lstm.texts_to_sequences([cleaned_review])
# Pad to the sequence length that model expects
new_review_padded = pad_sequences(new_review_sequence, maxlen=best_parameters_lstm['maxlen'])
# Predict the sentiment of the new review using the best LSTM model
predicted_sentiment_LSTM = best_model_lstm.predict(new_review_padded)
# Decode the predicted sentiment
predicted_sentiment_label_LSTM = encoder_lstm.classes_[np.argmax(predicted_sentiment_LSTM)]
print("Predicted sentiment LSTM:", predicted_sentiment_label_LSTM)

Enter your review:  perfect little mouse this mouse is so easy to use and to charge up it lightweight love the little colors and it fits my hand perfectly


Predicted sentiment SimpleRNN: positive
Predicted sentiment LSTM: positive
