In [1]:
import re

import nltk
import pandas as pd
from keras.layers import Embedding, LSTM, Dense, SimpleRNN
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK stop-word corpus used by the preprocessing step below
# (the downloader reports "already up-to-date" when it is present locally).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\misho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Data Loading and Dropping Irrelevant Columns and Missing Values

In [2]:
# Read the review file, discard the score column that is not used as a feature,
# and remove any row that still contains a missing value.
df = (
    pd.read_csv('amazon_reviews.csv')
    .drop(columns='review_score')
    .dropna()
)

print(df)

      sentiments                                     cleaned_review  \
0       positive  i wish would have gotten one earlier love it a...   
1        neutral  i ve learned this lesson again open the packag...   
2        neutral          it is so slow and lags find better option   
3        neutral  roller ball stopped working within months of m...   
4        neutral  i like the color and size but it few days out ...   
...          ...                                                ...   
17335   positive  i love this speaker and love can take it anywh...   
17336   positive  i use it in my house easy to connect and loud ...   
17337   positive  the bass is good and the battery is amazing mu...   
17338   positive                                            love it   
17339    neutral                                       mono speaker   

       cleaned_review_length  
0                         19  
1                         88  
2                          9  
3                      

# Data Preprocessing

In [3]:
# Keep the word list under its own name: rebinding `stopwords` would shadow the
# imported nltk corpus reader, so running this cell a second time would fail
# with "'list' object has no attribute 'words'".
english_stopwords = stopwords.words('english')

# Escape each word so it is matched literally, and try longer words first so a
# contraction such as "don't" is removed as a whole instead of having only its
# "don" prefix stripped by an earlier, shorter alternative.
stopword_alternatives = sorted(map(re.escape, english_stopwords), key=len, reverse=True)
remove_stopwords_regex = r'\b(?:' + r'|'.join(stopword_alternatives) + r')\b\s*'

# Compile once and map over the single text column; a row-wise
# DataFrame.apply(axis=1) builds a Series per row just to read one field.
stopword_pattern = re.compile(remove_stopwords_regex)
df['cleaned_review'] = df['cleaned_review'].map(
    lambda review: stopword_pattern.sub('', review)
)


# Word Embedding

## Generating Unique Tokens

In [4]:
cleaned_reviews = df['cleaned_review']

# Collect every distinct space-separated word that appears in any review.
unique_tokens = {word for review in cleaned_reviews for word in review.split(' ')}

# Splitting on a single space yields '' for leading, trailing or repeated
# spaces. discard() removes it without raising when no review produced one
# (list.remove('') would raise ValueError in that case).
unique_tokens.discard('')

# Sort the vocabulary: iteration order of a set of strings changes between
# interpreter runs, which would give each word a different index on every
# restart and make a saved model unusable with a rebuilt vocabulary.
tokens = sorted(unique_tokens)

# Show the vocabulary size and a short sample rather than dumping every token.
print(len(tokens), tokens[:20])



## Converting reviews to sequences

In [5]:
# Vocabulary indices start at 1 so that 0 stays free for padding / unknown words.
word_to_index = {word: index for index, word in enumerate(tokens, 1)}

def review_to_indices(review):
    """Convert a review string into a list of vocabulary indices.

    The review is split on single spaces. Empty fragments (produced by leading,
    trailing or repeated spaces) are skipped so they do not insert index 0 in
    the middle of a sequence or inflate its length. Words that are not in
    ``word_to_index`` are mapped to 0.

    Args:
        review: The cleaned review text.

    Returns:
        A list of integer indices, one per non-empty word of the review.
    """
    return [word_to_index.get(word, 0) for word in review.split(' ') if word]

# Map over the text column directly instead of a row-wise apply(axis=1).
df['indexed_review'] = df['cleaned_review'].map(review_to_indices)

# Length of the longest indexed review; used as the padded sequence length.
max_length = max(len(review) for review in df['indexed_review'])

## Padding Sequences

In [6]:
# Pad every indexed review at the end up to the length of the longest one.
indexed_reviews = df['indexed_review']
max_length = max(map(len, indexed_reviews))
padded_sequences = pad_sequences(
    indexed_reviews,
    maxlen=max_length,
    padding='post',
)

# Splitting Data and Encoding Labels

In [7]:
# Features are the padded index sequences; targets are the sentiment labels.
X, y = padded_sequences, df['sentiments']

In [8]:
# encoding the target variable
# Fit on the original label column rather than on `y` itself: a second run of
# this cell would otherwise fit the encoder on already-encoded integers and
# lose the label names stored in `encoder.classes_`.
encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiments'])

In [9]:
# Splitting the data
# A fixed seed makes the split, and therefore the reported metrics, repeatable
# across runs; stratifying keeps the class proportions equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Building LSTM Model

In [10]:
# Building the model
# One output unit per encoded sentiment class instead of a hard-coded count.
num_classes = len(encoder.classes_)

model = Sequential()
# The vocabulary has len(tokens) words indexed from 1, plus index 0 for padding.
# mask_zero=True makes the recurrent layer skip the post-padding steps; without
# it the LSTM's final state is computed after a long run of padding tokens.
model.add(Embedding(len(tokens)+1, 128, input_length=max_length, mask_zero=True))
model.add(LSTM(128))
model.add(Dense(num_classes, activation='softmax'))

# Integer class ids as targets, hence the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




In [11]:
# Fit the compiled network on the training partition.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
[1m198/434[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m35s[0m 148ms/step - accuracy: 0.5297 - loss: 0.9422

KeyboardInterrupt: 

In [None]:
# Score the network on the held-out partition and report both metrics.
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print(f'Loss: {loss}, Accuracy: {accuracy}')

In [None]:
# Save the model
# Writes the current `model` (the network built in the LSTM section when the
# notebook is run top to bottom) to 'lstm.h5' in the working directory.
# NOTE(review): the .h5 extension selects the legacy HDF5 format; recent Keras
# releases recommend the native .keras format — confirm which one consumers expect.
model.save('lstm.h5')

# Building RNN Model

In [None]:
# Building the model
# One output unit per encoded sentiment class instead of a hard-coded count.
num_classes = len(encoder.classes_)

model = Sequential()
# The vocabulary has len(tokens) words indexed from 1, plus index 0 for padding.
# mask_zero=True makes the recurrent layer skip the post-padding steps; without
# it the SimpleRNN's final state is computed after a long run of padding tokens.
model.add(Embedding(len(tokens)+1, 128, input_length=max_length, mask_zero=True))
model.add(SimpleRNN(128))
model.add(Dense(num_classes, activation='softmax'))

# Integer class ids as targets, hence the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the compiled network on the training partition.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
)

In [None]:
# Score the network on the held-out partition and report both metrics.
evaluation = model.evaluate(X_test, y_test)
loss, accuracy = evaluation
print(f'Loss: {loss}, Accuracy: {accuracy}')

In [None]:
# Writes the current `model` (the network built in the RNN section when the
# notebook is run top to bottom) to 'rnn.h5' in the working directory.
# NOTE(review): the .h5 extension selects the legacy HDF5 format; recent Keras
# releases recommend the native .keras format — confirm which one consumers expect.
model.save('rnn.h5')