### Imports

In [None]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
import keras

from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Folder holding the Stanford Sentiment Treebank text files read below.
SST_DIR = "../datasets/stanfordSentimentTreebank"

# Input files (all inside SST_DIR).
DATASET_PATH = f"{SST_DIR}/datasetSentences.txt"
LABELS_PATH = f"{SST_DIR}/sentiment_labels.txt"
DICTIONARY_PATH = f"{SST_DIR}/dictionary.txt"
DATASPLIT_PATH = f"{SST_DIR}/datasetSplit.txt"

# Output artifacts: the trained model and the fitted tokenizer.
MODEL_PATH = "sentiment_analysis_model_02.keras"
TOKENIZER_PATH = "tokenizer_02.pickle"

### Preprocessing

In [None]:
# Sentences of the corpus, keyed by sentence_index.
sentences = (
    pd.read_csv(DATASET_PATH, sep="\t")
    .loc[:, ["sentence", "sentence_index"]]
    .set_index("sentence_index")
)

# Split label of every sentence, keyed by sentence_index as well.
split = (
    pd.read_csv(DATASPLIT_PATH, sep=",")
    .loc[:, ["sentence_index", "splitset_label"]]
    .set_index("sentence_index")
)

# Index-aligned left join: each sentence row gains its splitset_label.
train_df = sentences.join(split)
train_df.head()

In [None]:
# --- Sentiment score of every phrase, keyed by phrase_id ---
df = pd.read_csv(LABELS_PATH, sep="|")
df.columns = ["phrase_id", "sentiment_value"]
df = df.set_index("phrase_id").sort_index()

# Bucket the continuous score into three classes:
# score < 0.4 -> "negative", score < 0.6 -> "neutral", anything else -> "positive".
scores = df["sentiment_value"]
df["sentiment_class"] = np.select(
    [scores < 0.4, scores < 0.6],
    ["negative", "neutral"],
    default="positive",
)

# --- Phrase text of every phrase, keyed by phrase_id ---
dictionary = pd.read_csv(DICTIONARY_PATH, sep="|", header=None)
dictionary.columns = ["phrase", "phrase_id"]
dictionary = dictionary.set_index("phrase_id").sort_index()

# read_csv does not guarantee that every phrase comes back as `str` (missing
# values are floats, for example). Convert every value with str() in one
# vectorized pass instead of a row-by-row `.loc` loop, then lower-case.
dictionary["phrase"] = dictionary["phrase"].map(str).str.lower()

# Attach the phrase text to its label. phrase_ids without a dictionary entry
# come out of the left join with a NaN phrase; drop them from `df` (the frame
# the tokenizer consumes) so that only real text reaches the tokenizer.
df = df.join(dictionary, how="left")
df = df.dropna(subset=["phrase"])
df.head()

# NOTE(review): 42077, 67853 - presumably the phrase_ids whose phrase is not
# parsed as a string; confirm against dictionary.txt.

In [None]:
# Text the vocabulary is learned from: the phrase column of the labelled frame.
phrases = df["phrase"]

# num_words caps the vocabulary used when encoding; words outside it are
# encoded as the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=10000)
tokenizer.fit_on_texts(phrases)

# Integer-encode every phrase, then pad at the end to a common length.
sequences = tokenizer.texts_to_sequences(phrases)
padded_sequences = pad_sequences(sequences, padding="post")

In [None]:
# Fixed one-hot column order for both label matrices. Using a categorical
# dtype guarantees three columns even when a class is absent from a subset.
CLASS_NAMES = ["negative", "neutral", "positive"]
class_dtype = pd.CategoricalDtype(categories=CLASS_NAMES)

# --- Training data: every labelled phrase ---
# NOTE(review): `df` holds every labelled phrase, which presumably includes the
# test sentences themselves, so test metrics may be optimistic - confirm.
x_train = padded_sequences
y_train = pd.get_dummies(
    df["sentiment_class"].astype(class_dtype), dtype="float32"
).to_numpy()

# --- Test data: sentences whose splitset_label is 2 ---
test_sentences = train_df.loc[train_df["splitset_label"] == 2, "sentence"]

# `df` is keyed by phrase_id while `train_df` is keyed by sentence_index; the
# two id spaces are unrelated, so labels cannot be selected by comparing the
# indexes. Look the class up by (lower-cased) text instead.
phrase_to_class = (
    df.dropna(subset=["phrase"])
    .drop_duplicates(subset="phrase")
    .set_index("phrase")["sentiment_class"]
)
test_labels = test_sentences.str.lower().map(phrase_to_class)

# Sentences whose text is not found among the phrases have no label: drop them
# from both the inputs and the labels so the two stay aligned row by row.
unmatched = test_labels.isna()
if unmatched.any():
    print(
        f"Dropping {int(unmatched.sum())} of {len(test_labels)} test sentences "
        "with no matching dictionary phrase."
    )
test_sentences = test_sentences[~unmatched]
test_labels = test_labels[~unmatched]

x_test = pad_sequences(tokenizer.texts_to_sequences(test_sentences), padding="post")
y_test = pd.get_dummies(test_labels.astype(class_dtype), dtype="float32").to_numpy()

assert len(x_test) == len(y_test), "test inputs and labels are misaligned"
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

# 1-D convolutional text classifier producing a softmax over three classes.
model = keras.models.Sequential(
    [
        # input_dim 10000 matches the tokenizer's num_words; 100-d embeddings.
        keras.layers.Embedding(10000, 100),
        keras.layers.Conv1D(64, 5, activation="relu"),
        # Collapse the time axis so sequences of any padded length are accepted.
        keras.layers.GlobalMaxPooling1D(),
        keras.layers.Dense(32, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)),
        keras.layers.Dropout(0.7),
        keras.layers.Dense(32, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)),
        keras.layers.Dropout(0.6),
        keras.layers.Dense(3, activation="softmax"),
    ]
)

# Hand the configured optimizer instance to compile() so the learning rate set
# here is the one actually used (a bare Adam(...) expression has no effect).
optimizer = keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

### Training

In [None]:
# Train for 5 epochs. The test arrays are also passed as validation data, so
# the per-epoch validation metrics are computed on the evaluation set.
model.fit(
    x_train,
    y_train,
    epochs=5,
    batch_size=128,
    validation_data=(x_test, y_test),
)

### Evaluation

In [None]:
# Per-class scores for every test row -> index of the highest-scoring class.
y_pred = model.predict(x_test)
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)

# Use one averaging / zero-division policy for all three metrics so they are
# comparable. zero_division=0 scores an undefined per-class metric (e.g. the
# precision of a class that is never predicted) as 0 instead of inflating the
# weighted average with a 1.
weighted = {"average": "weighted", "zero_division": 0}

accuracy = accuracy_score(y_test_classes, y_pred_classes)
precision = precision_score(y_test_classes, y_pred_classes, **weighted)
recall = recall_score(y_test_classes, y_pred_classes, **weighted)
f1 = f1_score(y_test_classes, y_pred_classes, **weighted)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {f1}")

### Save Model

In [None]:
# Persist the trained model and the fitted tokenizer; both are needed to
# encode new text and run predictions later.
model.save(MODEL_PATH)
with open(TOKENIZER_PATH, "wb") as f:
    pickle.dump(tokenizer, f)

### Prediction

In [None]:
# Restore the saved model from disk.
model = keras.models.load_model(MODEL_PATH)

# Restore the fitted tokenizer.
# NOTE: pickle.load can execute arbitrary code - only open tokenizer files
# that were written by this notebook.
with open(TOKENIZER_PATH, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


def predict_sentiment(sentence):
    """Return the model's class scores for a single sentence.

    Args:
        sentence: Raw text to classify.

    Returns:
        The output of ``model.predict`` for the encoded sentence: one row of
        per-class scores, in the one-hot column order of the training labels.
    """
    text_sequence = tokenizer.texts_to_sequences([sentence])
    # Pad and truncate at the end, matching the padding="post" used when the
    # training and test sequences were built (pad_sequences defaults to "pre").
    text_sequence = pad_sequences(
        text_sequence, maxlen=100, padding="post", truncating="post"
    )

    return model.predict(text_sequence)


In [None]:
# Smoke-test the reloaded model on a few hand-written sentences.
example_sentences = [
    "I love this movie!",
    "I hate this movie!",
    "This movie is okay.",
    "This movie is the worst!",
    "This movie is the best!",
]

for example in example_sentences:
    print(predict_sentiment(example))
