In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Fetch the NLTK resources one by one, then load the small English spaCy pipeline.
for nltk_package in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\noran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\noran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\noran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\noran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Read the review dataset (path is relative to the working directory) and display it.
reviews_csv = "amazon_reviews.csv"
data = pd.read_csv(reviews_csv)
data

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0
...,...,...,...,...
17335,positive,i love this speaker and love can take it anywh...,30,5.0
17336,positive,i use it in my house easy to connect and loud ...,13,4.0
17337,positive,the bass is good and the battery is amazing mu...,41,5.0
17338,positive,love it,2,5.0


In [4]:
# Remove the two columns that are dropped before modelling and display the result.
unused_columns = ['cleaned_review_length', 'review_score']
data = data.drop(columns=unused_columns)
data

Unnamed: 0,sentiments,cleaned_review
0,positive,i wish would have gotten one earlier love it a...
1,neutral,i ve learned this lesson again open the packag...
2,neutral,it is so slow and lags find better option
3,neutral,roller ball stopped working within months of m...
4,neutral,i like the color and size but it few days out ...
...,...,...
17335,positive,i love this speaker and love can take it anywh...
17336,positive,i use it in my house easy to connect and loud ...
17337,positive,the bass is good and the battery is amazing mu...
17338,positive,love it


In [5]:
# Count the missing entries in each column
missing_per_column = data.isnull().sum()
missing_per_column

sentiments        0
cleaned_review    3
dtype: int64

In [6]:
# Discard every row that contains a missing value, then show the class balance
data = data.dropna(axis=0, how='any')
data['sentiments'].value_counts()

sentiments
positive    9503
neutral     6300
negative    1534
Name: count, dtype: int64

In [7]:
def remove_negated_stopwords(stop_words):
    """Return a copy of ``stop_words`` that no longer filters out negations.

    Two adjustments are made so negation survives stop-word removal:

    * Explicit negation words ("no", "not", "never", ...) are left out.
    * Contractions ending in "n't" are replaced by their stem
      (e.g. "don't" -> "do", "won't" -> "wo"), so the contraction itself
      is no longer treated as a stop word while its stem still is.

    Parameters
    ----------
    stop_words : iterable of str
        The original stop words. Any iterable is accepted (set, list, ...);
        it is not modified.

    Returns
    -------
    set of str
        A new set containing the adjusted stop words.
    """
    # Explicit negation words that must stay in the text
    negation = {"no", "not", "never", "none", "nobody", "nowhere", "nothing", "neither", "nor", "noone"}

    # Anchored with "$" so only a trailing "n't" is stripped, never one in the
    # middle of a word.
    suffix_pattern = re.compile(r"n't$")

    modified_stop_words = set()
    for word in stop_words:
        if word in negation:
            continue

        # Words without the suffix pass through unchanged; contractions are
        # reduced to their stem.
        main_word = suffix_pattern.sub('', word)

        # A bare "n't" would reduce to an empty string, which is not a useful
        # stop word, so it is skipped.
        if main_word:
            modified_stop_words.add(main_word)

    return modified_stop_words


# Build the NLTK English stop-word set and its negation-preserving variant,
# then print both for comparison.
english_words = stopwords.words('english')
stop_words = set(english_words)
modified_stop_words = remove_negated_stopwords(stop_words)

print("Stop words before filtering:\n", stop_words, "\n")
print("Stop words after filtering negations:\n", modified_stop_words)

Stop words before filtering:
 {"hadn't", 'our', 'while', "isn't", 'each', 'will', 'didn', "don't", "haven't", 'they', 'don', 'there', 'but', 'we', "you'd", 'their', 'why', 'ain', 'doesn', "mustn't", "weren't", 'yourself', 'having', 'hers', 'about', 'against', 'its', 'yourselves', "shouldn't", 'ourselves', 'she', 'some', 'o', 'm', 'theirs', 'those', "hasn't", 'into', 'both', 'he', 'until', 'have', 'same', 'me', "you've", 'any', 'myself', 'more', 'be', 'how', 'should', 'what', 'such', 'very', 'ours', 'that', 'further', 've', 'in', 'wouldn', "didn't", "doesn't", 'can', 'y', 'of', "wouldn't", 'under', 'out', 'been', 'them', "wasn't", 'my', 'after', 'were', 'won', 'themselves', "that'll", 'mustn', 'do', 'just', 'i', 'when', 'and', 'at', 'who', 'now', 'him', 'being', 'has', "you'll", 'with', 'by', 'off', 'down', 'yours', 'was', 'wasn', 'is', 'during', 'aren', 'no', 'a', 's', 'shouldn', 'through', 'below', 'needn', 'his', 'than', 'an', 'above', 'isn', 'her', 'few', 'again', 'you', 'himself', 

In [8]:
# The stop-word set is the same for every document, so it is built once on
# first use and reused afterwards. Re-running this cell resets the cache.
_PREPROCESSING_STOP_WORDS = None


def _get_preprocessing_stop_words():
    """Return the negation-preserving English stop-word set, building it on first use."""
    global _PREPROCESSING_STOP_WORDS
    if _PREPROCESSING_STOP_WORDS is None:
        _PREPROCESSING_STOP_WORDS = frozenset(
            remove_negated_stopwords(set(stopwords.words('english')))
        )
    return _PREPROCESSING_STOP_WORDS


def data_preprocessing(data):
    """Normalise one review string for the models.

    The text is lower-cased and run through the spaCy pipeline ``nlp``.
    Tokens that are not purely alphabetic, or whose text is in the
    negation-preserving stop-word set, are dropped; the remaining tokens
    are replaced by their lemma.

    Parameters
    ----------
    data : str
        Raw review text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces (empty string if nothing
        is kept).
    """
    doc = nlp(data.lower())
    stop_words_to_drop = _get_preprocessing_stop_words()

    # NOTE(review): a token such as "n't" is not purely alphabetic, so if the
    # tokenizer splits contractions that way the negation is removed by the
    # is_alpha filter - confirm this is acceptable for free-text input.
    lemmatized_tokens = [
        token.lemma_
        for token in doc
        if token.is_alpha and token.text not in stop_words_to_drop
    ]

    # Convert tokens into text again (into single string)
    return ' '.join(lemmatized_tokens)

In [9]:
# Apply preprocessing to the text column.
# `assign` builds a new frame instead of writing into `data` in place; `data`
# comes from `dropna()`, and an in-place column write on such a frame can
# trigger pandas' chained-assignment warning.
data = data.assign(cleaned_review=data['cleaned_review'].apply(data_preprocessing))

print("Show preprocessed data:")
data

Show preprocessed data:


Unnamed: 0,sentiments,cleaned_review
0,positive,wish get one early love make work laptop much ...
1,neutral,learn lesson open package use product right aw...
2,neutral,slow lag find well option
3,neutral,roller ball stop work within month minimal use...
4,neutral,like color size day return period not hold charge
...,...,...
17335,positive,love speaker love take anywhere charge phone w...
17336,positive,use house easy connect loud clear music
17337,positive,bass good battery amazing much well charge thi...
17338,positive,love


In [10]:
# Features are the preprocessed review text; targets are the sentiment labels.
X, Y = data['cleaned_review'], data['sentiments']

In [11]:
# Label encoder. Codes follow the sorted class order:
# negative => 0, neutral => 1, positive => 2
label_encoder = LabelEncoder()

# Encode straight from the string column rather than from `Y`, so re-running
# this cell never refits the encoder on already-encoded integers (which would
# make `inverse_transform` return numbers instead of sentiment names).
Y = label_encoder.fit_transform(data['sentiments'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Word Embedding
# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# +1 because index 0 is reserved for padding
vocab_size = len(tokenizer.word_index) + 1

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# The padded length is taken from the training split only, so no property of
# the test set influences the model input shape. Longer test or user
# sequences are truncated by `pad_sequences`.
maxlen = max(len(seq) for seq in X_train_sequences)

X_train_pad = pad_sequences(X_train_sequences, maxlen=maxlen)
X_test_pad = pad_sequences(X_test_sequences, maxlen=maxlen)

In [19]:
# Model Training
def train_model(model_type, X_train, y_train, X_test, y_test,
                epochs=5, batch_size=32, callbacks=None):
    """Build, train and summarise a recurrent sentiment classifier.

    The network is Embedding -> (SimpleRNN | LSTM) -> Dropout -> Dense(3,
    softmax), compiled with Adam and sparse categorical cross-entropy.
    The embedding size is taken from the module-level ``vocab_size``.

    Parameters
    ----------
    model_type : str
        Either ``'SimpleRNN'`` or ``'LSTM'``.
    X_train, y_train
        Padded training sequences and their integer labels.
    X_test, y_test
        Data passed to ``fit`` as ``validation_data``.
        NOTE(review): the callers pass the test split here, so the
        validation metrics are the test metrics - consider a separate
        validation split.
    epochs : int, default 5
        Number of training epochs.
    batch_size : int, default 32
        Mini-batch size.
    callbacks : list or None, default None
        Optional Keras callbacks forwarded to ``fit`` (e.g. ``EarlyStopping``).

    Returns
    -------
    The fitted Keras ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    # Validate first: an unknown type would otherwise build a model with no
    # recurrent layer and fail later with an unrelated shape error.
    supported_types = ('SimpleRNN', 'LSTM')
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    model = Sequential()
    # `input_length` is deprecated on Embedding; the sequence length is
    # inferred from the data when the model is first fitted.
    model.add(Embedding(vocab_size, 128))

    recurrent_layer = SimpleRNN if model_type == 'SimpleRNN' else LSTM
    model.add(recurrent_layer(128, return_sequences=False))

    model.add(Dropout(0.5))

    # One output unit per sentiment class
    model.add(Dense(3, activation='softmax'))

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=callbacks,
    )

    # Model summary (printed after fitting, once the model is built)
    model.summary()

    return model

In [20]:
# Fit the SimpleRNN variant on the padded training sequences
simple_rnn_model = train_model(
    model_type='SimpleRNN',
    X_train=X_train_pad,
    y_train=y_train,
    X_test=X_test_pad,
    y_test=y_test,
)

Epoch 1/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 46ms/step - accuracy: 0.6389 - loss: 0.7784 - val_accuracy: 0.8108 - val_loss: 0.5129
Epoch 2/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 77ms/step - accuracy: 0.8237 - loss: 0.4699 - val_accuracy: 0.8209 - val_loss: 0.4755
Epoch 3/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 71ms/step - accuracy: 0.8468 - loss: 0.4096 - val_accuracy: 0.8377 - val_loss: 0.4624
Epoch 4/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 70ms/step - accuracy: 0.8886 - loss: 0.3198 - val_accuracy: 0.8449 - val_loss: 0.4414
Epoch 5/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 70ms/step - accuracy: 0.9182 - loss: 0.2560 - val_accuracy: 0.8480 - val_loss: 0.4757


In [21]:
# Score the SimpleRNN on the padded test set and report both metrics
rnn_metrics = simple_rnn_model.evaluate(X_test_pad, y_test)
rnn_loss, rnn_accuracy = rnn_metrics

print(f'RNN Model Accuracy: {rnn_accuracy}')
print(f'RNN Model rnn_loss: {rnn_loss}')

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8501 - loss: 0.4564
RNN Model Accuracy: 0.8480392098426819
RNN Model rnn_loss: 0.47567227482795715


In [22]:
# Fit the LSTM variant on the padded training sequences
lstm_model = train_model(
    model_type='LSTM',
    X_train=X_train_pad,
    y_train=y_train,
    X_test=X_test_pad,
    y_test=y_test,
)

Epoch 1/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 204ms/step - accuracy: 0.6790 - loss: 0.7014 - val_accuracy: 0.8377 - val_loss: 0.4169
Epoch 2/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 199ms/step - accuracy: 0.8755 - loss: 0.3401 - val_accuracy: 0.8622 - val_loss: 0.3743
Epoch 3/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 206ms/step - accuracy: 0.9115 - loss: 0.2453 - val_accuracy: 0.8737 - val_loss: 0.3751
Epoch 4/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 194ms/step - accuracy: 0.9263 - loss: 0.2138 - val_accuracy: 0.8685 - val_loss: 0.3893
Epoch 5/5
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 173ms/step - accuracy: 0.9351 - loss: 0.1924 - val_accuracy: 0.8777 - val_loss: 0.4117


In [23]:
# Score the LSTM on the padded test set and report both metrics
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test)

print(f' lstm Model Accuracy: {lstm_accuracy}')
# The label previously said "rnn_loss", a copy-paste slip from the RNN cell.
print(f' lstm Model lstm_loss: {lstm_loss}')

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 58ms/step - accuracy: 0.8780 - loss: 0.4032
 lstm Model Accuracy: 0.8777393102645874
 lstm Model rnn_loss: 0.411745548248291


In [17]:
# User Input Prediction
def predict_review(review):
    """Predict the sentiment of one raw review with both trained models.

    The review goes through ``data_preprocessing``, is converted to an index
    sequence with the fitted ``tokenizer`` and padded to ``maxlen``. Each
    model's highest-scoring class is mapped back to its label name via
    ``label_encoder``.

    Returns a dict with the keys ``'Simple RNN'`` and ``'LSTM'``.
    """
    cleaned_text = data_preprocessing(review)
    padded_input = pad_sequences(tokenizer.texts_to_sequences([cleaned_text]), maxlen=maxlen)

    def top_label(trained_model):
        # Index of the largest class score, translated back to its label name
        class_scores = trained_model.predict(padded_input)
        return label_encoder.inverse_transform(np.argmax(class_scores, axis=1))[0]

    return {
        'Simple RNN': top_label(simple_rnn_model),
        'LSTM': top_label(lstm_model),
    }

In [18]:
# Test user input prediction: read one review typed by the user, run it
# through both trained models and print the resulting label dictionary.
# Requires the model-training cells to have been executed first, because
# predict_review reads simple_rnn_model and lstm_model.
review = input()
predicted_class = predict_review(review)
print("Predicted class:", predicted_class)

 good


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step
Predicted class: {'Simple RNN': 'neutral', 'LSTM': 'neutral'}
