In [None]:
import re
import string

import nltk
import pandas as pd
import plotly.graph_objects as go
from keras import Sequential
from keras.src.layers import LSTM, Dense, Embedding, GlobalMaxPooling1D
from keras.utils import set_random_seed
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from nltk import PorterStemmer

In [None]:
# Read the tab-separated review file; bytes that cannot be decoded are ignored.
data = pd.read_csv("Restaurant_Reviews.tsv", sep="\t", encoding_errors="ignore")
# Rename the columns to the names the rest of the notebook refers to.
data = data.set_axis(["review", "label"], axis=1)
data

# Stemmed data

In [None]:
# Work on a separate copy so the stemming step leaves `data` untouched.
stemmed_data = data.copy()

In [None]:
# English stop-word list from NLTK (the "stopwords" corpus must be available
# locally, e.g. after nltk.download("stopwords")).
stopwords = nltk.corpus.stopwords.words("english")
ps = PorterStemmer()

# Translation table that deletes every character in string.punctuation.
_STEM_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
# Frozen set so the per-token stop-word test is a hash lookup, not a list scan.
_STEM_STOPWORDS = frozenset(stopwords)


def stemm_text(text):
    """Strip punctuation and stop words from ``text`` and stem what remains.

    Args:
        text: The review text as a ``str``.

    Returns:
        The stemmed tokens joined by single spaces, with no leading or
        trailing whitespace. An empty string is returned when no token
        survives the filtering.
    """
    text_without_punctuations = text.translate(_STEM_PUNCT_TABLE)
    # findall(r"\w+") returns only the non-empty word runs. Splitting on r"\W"
    # also yields empty strings for consecutive/leading/trailing separators,
    # which used to end up as stray spaces in the result.
    tokens = re.findall(r"\w+", text_without_punctuations)
    return " ".join(
        ps.stem(word) for word in tokens if word.lower() not in _STEM_STOPWORDS
    )

In [None]:
# Swap every review for its stemmed form, then drop fully duplicated rows.
stemmed_data = (
    stemmed_data
    .assign(review=stemmed_data["review"].apply(stemm_text))
    .drop_duplicates()
)
stemmed_data

# Lemmatized data

In [None]:
# Work on a separate copy so the lemmatization step leaves `data` untouched.
lemmatized_data = data.copy()

In [None]:
wnl = nltk.WordNetLemmatizer()

# Translation table that deletes every character in string.punctuation.
_LEMMA_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
# Frozen set so the per-token stop-word test is a hash lookup, not a list scan.
_LEMMA_STOPWORDS = frozenset(stopwords)


def lemmatize_text(text):
    """Strip punctuation and stop words from ``text`` and lemmatize the rest.

    Args:
        text: The review text as a ``str``.

    Returns:
        The lower-cased, lemmatized tokens joined by single spaces, with no
        leading or trailing whitespace. An empty string is returned when no
        token survives the filtering.
    """
    text_without_punctuations = text.translate(_LEMMA_PUNCT_TABLE)
    # findall(r"\w+") returns only the non-empty word runs, so consecutive
    # separators no longer produce empty tokens (and stray spaces).
    lowered_tokens = (
        word.lower() for word in re.findall(r"\w+", text_without_punctuations)
    )
    # Lower-case BEFORE lemmatizing: the WordNet lookup is case-sensitive, so a
    # capitalised token such as "Dogs" would otherwise come back unchanged.
    return " ".join(
        wnl.lemmatize(word) for word in lowered_tokens if word not in _LEMMA_STOPWORDS
    )

In [None]:
# Swap every review for its lemmatized form, then drop fully duplicated rows.
lemmatized_data = (
    lemmatized_data
    .assign(review=lemmatized_data["review"].apply(lemmatize_text))
    .drop_duplicates()
)
lemmatized_data

# Raw data

In [None]:
# Unprocessed baseline: a plain copy of the loaded reviews.
raw_data = data.copy()

# Model

In [None]:
# Settings shared by all three experiments.
SEED = 42                # re-applied before each model so runs are repeatable
NUM_WORDS = 2500         # vocabulary cap passed to the tokenizer
VALIDATION_SPLIT = 0.35  # fraction of rows, taken from the end, that is held out
EPOCHS = 50
LSTM_UNITS = 8
D = 40                   # embedding dimension

datasets = {"stemmed": stemmed_data, "lemmatized": lemmatized_data, "raw": raw_data}

# One [loss, accuracy] pair per dataset, in the order of `datasets`.
results = []

# The loop variable is intentionally not named `data`: reusing that name would
# overwrite the DataFrame loaded at the top of the notebook.
for dataset_name, dataset in datasets.items():
    set_random_seed(SEED)

    tokenizer = Tokenizer(num_words=NUM_WORDS, lower=True)
    tokenizer.fit_on_texts(dataset["review"])
    sequences = tokenizer.texts_to_sequences(dataset["review"])

    # Pad to the longest *tokenized* review. Counting with a regex split can
    # disagree with the tokenizer's own splitting and over-pad the inputs.
    MAX_WORDS = max(len(sequence) for sequence in sequences)
    padded_sequences = pad_sequences(sequences, maxlen=MAX_WORDS, padding='pre')
    labels = dataset["label"].to_numpy()

    # Explicit split: the leading rows train, the trailing rows are held out.
    # The held-out rows are reused below for the final score.
    n_train = int(len(padded_sequences) * (1 - VALIDATION_SPLIT))
    x_train, y_train = padded_sequences[:n_train], labels[:n_train]
    x_val, y_val = padded_sequences[n_train:], labels[n_train:]

    V = len(tokenizer.word_index)

    model = Sequential()
    # V + 1 rows because tokenizer indices start at 1 (0 is the padding value).
    model.add(Embedding(V + 1, D, input_shape=(MAX_WORDS,)))
    model.add(LSTM(units=LSTM_UNITS, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(units=1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    history = model.fit(
        x_train,
        y_train,
        epochs=EPOCHS,
        validation_data=(x_val, y_val),
        verbose=0,
    )

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['accuracy'], name='accuracy'))
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['val_accuracy'], name='val_accuracy'))
    fig.update_layout(
        title=f"Accuracy - {dataset_name} reviews",
        xaxis_title="epoch",
        yaxis_title="accuracy",
    )
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['loss'], name='loss'))
    fig.add_trace(go.Scatter(x=history.epoch, y=history.history['val_loss'], name='val_loss'))
    fig.update_layout(
        title=f"Loss - {dataset_name} reviews",
        xaxis_title="epoch",
        yaxis_title="loss",
    )
    fig.show()

    # Score only the held-out rows; including the rows the model was trained
    # on would overstate how well it generalises.
    results.append(model.evaluate(x_val, y_val))
