# Read dataset

In [ ]:
import nltk
import pandas as pd
import seaborn as sns
import numpy as np
from nltk.tokenize import word_tokenize

from utils.lemmatization import Lemmatization
from utils.preprocessing import Utilities

# Plot

In [ ]:
# Load the TMDB movie metadata and report how many rows were read.
movies = pd.read_csv('input/tmdb_5000_movies.csv')
print(len(movies))

# Keep only movies with a usable plot: rows whose overview is missing are
# dropped first, then rows whose overview is the empty string.
movies = movies.dropna(axis=0, subset=['overview'])
movies = movies.loc[movies['overview'].ne('')]

# Row count after filtering, for comparison with the count printed above.
print(len(movies))

## Plot observations

In [ ]:
# Fetch the NLTK 'punkt_tab' resource before tokenizing the overviews.
nltk.download('punkt_tab')

# Store, next to each overview, the number of tokens word_tokenize finds in it.
movies['tokens'] = movies['overview'].map(lambda overview: len(word_tokenize(overview)))

# Smallest, largest and mean overview length, measured in tokens.
token_counts = movies['tokens']
min_tokens, max_tokens, avg_tokens = (
    token_counts.min(),
    token_counts.max(),
    token_counts.mean(),
)

print(f"Minimum number of tokens: {min_tokens}")
print(f"Maximum number of tokens: {max_tokens}")
print(f"Average number of tokens: {avg_tokens}")

In [ ]:
# Discard movies whose overview produced no tokens at all.
movies = movies.loc[movies['tokens'].ne(0)]

# Recompute the token statistics on the remaining movies.
token_counts = movies['tokens']
min_tokens, max_tokens, avg_tokens = (
    token_counts.min(),
    token_counts.max(),
    token_counts.mean(),
)

print(f"Minimum number of tokens: {min_tokens}")
print(f"Maximum number of tokens: {max_tokens}")
print(f"Average number of tokens: {avg_tokens}")

In [ ]:
# Preview the first rows of the filtered movies table.
movies.head()

In [ ]:
# Show the raw overview text of the first movie (by position).
movies.iloc[0]['overview']

## Preprocessing

Remove stop words and punctuation

In [ ]:
# Run the project's Utilities.preprocess on every overview and keep the result
# in 'processed_plot'. Per this section's text it removes stop words and
# punctuation; the implementation lives in utils.preprocessing.
movies['processed_plot'] = movies['overview'].apply(Utilities.preprocess)

Print plots before and after preprocessing for the first 5 movies.

In [ ]:
# Side-by-side view of the raw and preprocessed plot for the first 5 movies.
display(movies[['overview', 'processed_plot']].head(5))

## Lemmatization

In [ ]:
# Apply the project's Lemmatization.lemmatize_sent to every preprocessed plot;
# the lemmatized text is what the TF-IDF similarity below is computed on.
movies['lem_processed_plot'] = movies['processed_plot'].apply(Lemmatization.lemmatize_sent)

In [ ]:
# Compare the plots before and after lemmatization for the first 5 movies.
display(movies[['processed_plot', 'lem_processed_plot']].head(5))

### TF-IDF

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def calculate_similarity(df):
    """Return the pairwise cosine similarity of the movies' lemmatized plots.

    Args:
        df: DataFrame with a 'lem_processed_plot' column of text documents.

    Returns:
        The square matrix produced by ``cosine_similarity`` on the TF-IDF
        vectors of the plots; entry [i, j] compares the i-th and j-th rows of
        ``df`` by position.
    """
    # A dedicated vectorizer is created per call. The notebook later binds the
    # global name `vectorizer` to the TF-IDF model of the review classifier;
    # fitting through that shared global here would refit it on movie plots
    # whenever this cell is re-run after the classifier cells.
    plot_vectorizer = TfidfVectorizer()
    plot_vectors = plot_vectorizer.fit_transform(df['lem_processed_plot'])

    return cosine_similarity(plot_vectors)


similarity_matrix = calculate_similarity(movies)

In [ ]:

def get_top_n_similar_indices(similarity_matrix, n=10):
    """Map every row position to the positions of its ``n`` most similar rows.

    Args:
        similarity_matrix: Square 2-D array where entry [i, j] is the
            similarity between items i and j.
        n: Maximum number of neighbours to keep per item.

    Returns:
        dict mapping each row position ``i`` to a NumPy array with up to ``n``
        other row positions, ordered from most to least similar. An item is
        never listed as its own neighbour.
    """
    top_n_similar_indices = {}

    for i in range(similarity_matrix.shape[0]):
        similarity_scores = np.asarray(similarity_matrix[i])
        # Stable descending sort: tied scores keep ascending position order,
        # which makes the ranking deterministic.
        ranked = np.argsort(-similarity_scores, kind='stable')
        # Remove the item itself explicitly instead of assuming it is ranked
        # first: with tied scores (e.g. duplicate plots) or an all-zero row the
        # item is not guaranteed to be at the front of the ranking.
        top_n_similar_indices[i] = ranked[ranked != i][:n]

    return top_n_similar_indices

# get_top_n_similar_indices keys its result by row *position* (0..n-1), whereas
# `movies` still carries its original index labels, which have gaps after the
# filtering above. Assigning the dict directly would let pandas match dict keys
# against those labels, attaching neighbours to the wrong movies and leaving
# unmatched labels empty. The values are therefore attached positionally
# through a Series that is explicitly given the frame's own index.
top_similar = get_top_n_similar_indices(similarity_matrix)
movies['similar_movie_indices'] = pd.Series(
    [top_similar[position] for position in range(len(movies))],
    index=movies.index,
)

In [ ]:
# Inspect the first movie, including its new 'similar_movie_indices' value.
movies.iloc[0]

# Categorize reviews

In [ ]:
# Load the reviews dataset; later cells read its 'review' and 'sentiment' columns.
reviews = pd.read_csv('input/reviews.csv')

# Preview the first rows.
reviews.head()

## Dataset info

Let's see if the positive and negative reviews are equally distributed within the dataset.

In [ ]:
# Count each sentiment label once and reuse the counts for the plot and legend.
sentiment_counts = reviews['sentiment'].value_counts()

# The Axes returned by pandas is bound to `ax` rather than `plt`: this notebook
# later imports matplotlib.pyplot under the name `plt`, and sharing the name
# makes the cells break when they are run out of order.
ax = sentiment_counts.plot.pie(autopct='%1.1f%%',
                               colors=sns.palettes.mpl_palette('Dark2'),
                               labels=None,
                               legend=True,
                               startangle=90)
ax.legend(title="Sentiment", labels=sentiment_counts.index)

In [ ]:
# Horizontal bar chart of the number of reviews per sentiment label. The Axes
# is bound to `ax` rather than `plt`, because this notebook later imports
# matplotlib.pyplot under the name `plt`.
ax = reviews['sentiment'].value_counts().plot.barh(color=sns.palettes.mpl_palette('Dark2'))

# Write the count next to each bar.
ax.bar_label(ax.containers[0])

ax.set_xlim(right=29000)  # adjust xlim to fit labels

As we can see, the positive and negative reviews are equally distributed in our dataset.

## Preprocess

Since many reviews contain HTML tags, we are going to remove them as well.

In [ ]:
# Run the project's Utilities.remove_html on every review; per the text above
# this strips the HTML tags found in many reviews.
reviews['processed_review'] = reviews['review'].apply(Utilities.remove_html)

# Compare the first 5 reviews before and after the HTML removal.
display(reviews[['review', 'processed_review']].head(5))

In [ ]:
# Use the preprocess method that was used in the movies plot.
# It is applied to the HTML-stripped text and overwrites 'processed_review'.

reviews['processed_review'] = reviews['processed_review'].apply(Utilities.preprocess)

Print first five reviews before and after preprocessing

In [ ]:
# Raw review next to its fully preprocessed version, first 5 rows.
display(reviews[['review', 'processed_review']].head(5))

## Lemmatization

In [ ]:
# Lemmatize the preprocessed reviews with the project's Lemmatization helper;
# 'lem_processed_review' is the text the models below are trained on.
reviews['lem_processed_review'] = reviews['processed_review'].apply(Lemmatization.lemmatize_sent)

# Compare the first 5 reviews before and after lemmatization.
display(reviews[['processed_review', 'lem_processed_review']].head(5))

## Train models

## Process sentiment column

Convert the categorical labels of the sentiment column into binary values (1 for positive, 0 for negative).

Print the sentiment column unique values to confirm the encoding

In [ ]:
# Encode the labels as integers (1 for 'positive', 0 otherwise) in the
# 'sentiment_to_binary' column that the train/test split below reads.
reviews['sentiment_to_binary'] = (reviews['sentiment'] == 'positive').astype(int)

# Show the original labels and their encoded values to confirm the mapping.
print(reviews['sentiment'].unique())
print(reviews['sentiment_to_binary'].unique())

## Train models

### Split to train-test set

In [ ]:
from sklearn.model_selection import train_test_split

# Features are the lemmatized review texts, targets the encoded sentiment.
x = reviews['lem_processed_review'].to_numpy()
y = reviews['sentiment_to_binary'].to_numpy()

# Hold out 30% of the reviews for testing; random_state=0 keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.30)

Use TfidfVectorizer to transform a collection of text documents into a matrix of TF-IDF features, which reflects the importance of a term in a document relative to the entire corpus. TF-IDF adjusts the term frequencies based on how common or rare a term is across the documents.

TfidfVectorizer is also a pre-processing technique used to convert text data into numerical form. TfidfVectorizer not only counts the frequency of each word but also assigns a weight to each word based on its frequency in the document and its frequency in the entire corpus. This means that it gives higher weights to words that are important or informative in the document and lower weights to common words that are not. This is achieved through a term frequency-inverse document frequency (TF-IDF) formula that balances the frequency of a word in a document with its frequency in the entire corpus.
(https://www.kaggle.com/code/zeeshanlatif/countvectorizer-vs-tfidfvectorizer)

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: this binds the global name `vectorizer`, which the final
# recommendation cell also uses to transform new reviews.
vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the training texts only, then
# project the test texts into the same feature space.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

In [ ]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def calculate_metrics(test, pred, model):
    """Summarize a classifier's predictions as one result row.

    Args:
        test: Ground-truth labels.
        pred: Predicted labels.
        model: Value identifying the model; returned unchanged as the first
            element of the row.

    Returns:
        list: ``[model, accuracy, precision, recall, f1]`` where precision,
        recall and F1 are macro-averaged.
    """
    accuracy = accuracy_score(test, pred)
    macro_scores = [
        scorer(test, pred, average='macro')
        for scorer in (precision_score, recall_score, f1_score)
    ]
    return [model, accuracy, *macro_scores]


### Random Forest

In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
import time

# Number of independent fits whose timings and scores are averaged. The forest
# is created without a fixed random_state, so individual runs can differ.
N_RUNS = 5

average_training_time = 0
metrics_dict = {
    "precision": [],
    "recall": [],
    "f1-score": [],
}

for _ in range(N_RUNS):
    # The instance from the last iteration stays bound to `random_forest` and
    # is the classifier used by the final recommendation example.
    random_forest = RandomForestClassifier(n_estimators=120, n_jobs=-1, criterion='gini', min_samples_split=3)

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    random_forest.fit(x_train_tfidf, y_train)
    execution_time = time.perf_counter() - start_time
    average_training_time += execution_time

    y_pred_random_forest = random_forest.predict(x_test_tfidf)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_test, y_pred_random_forest, average='weighted'
    )

    metrics_dict["precision"].append(precision)
    metrics_dict["recall"].append(recall)
    metrics_dict["f1-score"].append(f1)


# Mean of every metric over all runs.
average_metrics = {metric: np.mean(values) for metric, values in metrics_dict.items()}

print(f"\n===== Average Metrics of Random forest after {N_RUNS} Runs =====")
print(f"Average Training Time: {average_training_time / N_RUNS:.3f} seconds")
print(f"Precision: {average_metrics['precision']:.3f}")
print(f"Recall: {average_metrics['recall']:.3f}")
print(f"F1-Score: {average_metrics['f1-score']:.3f}")

### Naive Bayes

In [ ]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support
import time

# Number of fits whose training times and scores are averaged.
N_RUNS = 5

average_training_time = 0
metrics_dict = {
    "precision": [],
    "recall": [],
    "f1-score": [],
}

for _ in range(N_RUNS):
    naive_bayes = MultinomialNB()

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() follows the wall clock and can jump.
    start_time = time.perf_counter()
    naive_bayes.fit(x_train_tfidf, y_train)
    execution_time = time.perf_counter() - start_time
    average_training_time += execution_time

    y_predict = naive_bayes.predict(x_test_tfidf)
    # Score this model's own predictions. Scoring `y_pred_random_forest` here
    # would report the Random Forest's results under the Naive Bayes heading.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_test, y_predict, average='weighted'
    )

    metrics_dict["precision"].append(precision)
    metrics_dict["recall"].append(recall)
    metrics_dict["f1-score"].append(f1)


# Mean of every metric over all runs.
average_metrics = {metric: np.mean(values) for metric, values in metrics_dict.items()}

print(f"\n===== Average Metrics of Naive Bayes after {N_RUNS} Runs =====")
print(f"Average Training Time: {average_training_time / N_RUNS:.3f} seconds")
print(f"Precision: {average_metrics['precision']:.3f}")
print(f"Recall: {average_metrics['recall']:.3f}")
print(f"F1-Score: {average_metrics['f1-score']:.3f}")

### RNN model (Recurrent Neural Network)

In [ ]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert text data to numerical format in order to feed them as input to the RNN model
VOCAB_SIZE = 5000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)  # Fit on training text

# Convert text to sequences (the tokenizer fitted on the training text is
# reused for the test text)
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Pad sequences to ensure consistent input length
MAX_SEQUENCE_LENGTH = 100
x_train_padded = pad_sequences(x_train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
x_test_padded = pad_sequences(x_test_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

In [ ]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# Creating the RNN model: embedding -> bidirectional LSTM -> single sigmoid
# unit that outputs the probability of the positive class.
model = tf.keras.Sequential([
    # The embedding table is sized from VOCAB_SIZE, the same constant that caps
    # the tokenizer's vocabulary, so the two cannot drift apart.
    Embedding(input_dim=VOCAB_SIZE, output_dim=50),
    Bidirectional(
        LSTM(units=64),
    ),
    Dense(units=1, activation='sigmoid')
])

In [ ]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [ ]:
# Train for 5 epochs and keep the per-epoch history for the plots below.
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are measured on the same data as the final evaluation.
history = model.fit(x_train_padded, y_train, validation_data=(x_test_padded, y_test), epochs=5, batch_size=64)

In [ ]:
# Loss and accuracy of the trained RNN on the held-out test split.
loss, accuracy = model.evaluate(x_test_padded, y_test, batch_size=64)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

In [ ]:
from sklearn.metrics import classification_report
# Get predicted probabilities (output of the model's sigmoid unit)
y_probs = model.predict(x_test_padded, batch_size=64)

# Convert probabilities to binary predictions (0 or 1) with a 0.5 threshold
y_pred = (y_probs > 0.5).astype(int)

In [ ]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the RNN predictions on the test split.
print(classification_report(y_test, y_pred))

In [ ]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the RNN predictions, drawn as an annotated heatmap with
# integer cell labels (fmt='d').
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [ ]:
import matplotlib.pyplot as plt

# One figure per tracked quantity: the training curve and the validation curve
# (stored by Keras under the 'val_' prefix) over the epochs.
for history_key, label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    plt.plot(history.history[history_key], label=f'Training {label}')
    plt.plot(history.history[f'val_{history_key}'], label=f'Validation {label}')
    plt.xlabel('Epochs')
    plt.ylabel(label)
    plt.legend()
    plt.title(f'{label} Over Epochs')
    plt.show()

### BERT Transformer (DistilBERT)

In [ ]:
import numpy as np
import evaluate

# Metric objects from the `evaluate` library, loaded once and reused on every
# evaluation pass.
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Compute weighted F1, precision and recall for a batch of predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            sample is the argmax of its logits over the last axis.

    Returns:
        dict with the keys "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Every metric reports its score under its own name, so the same key is
    # used both to pick the scorer and to read its result.
    scorers = {
        "f1": f1_metric,
        "precision": precision_metric,
        "recall": recall_metric,
    }
    return {
        name: scorer.compute(
            predictions=predicted_classes, references=labels, average="weighted"
        )[name]
        for name, scorer in scorers.items()
    }

# Imported here because no earlier cell brings AutoTokenizer into scope.
from transformers import AutoTokenizer

# Tokenize the reviews for DistilBERT. Every text is padded or truncated to
# exactly 128 tokens.
# NOTE: this rebinds the global `tokenizer`, previously the Keras tokenizer of
# the RNN section.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_inputs = tokenizer(x_train.tolist(), padding="max_length", max_length=128, truncation=True)

test_inputs = tokenizer(x_test.tolist(), padding="max_length", max_length=128, truncation=True)

In [ ]:
import torch

# Wrap tokenizer output and labels as a torch Dataset
# (https://www.analyticsvidhya.com/blog/2022/02/sentiment-analysis-using-transformers/)
class ReviewDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenized reviews together with their labels.

    Args:
        encodings: Mapping from field name to a per-sample sequence of values.
        labels: Sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including its 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [ ]:
from transformers import DataCollatorWithPadding

# Collator that batches samples using the DistilBERT tokenizer loaded above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [ ]:
# Pair the DistilBERT encodings with their labels (converted to plain lists).
train_labels, test_labels = y_train.tolist(), y_test.tolist()
train_dataset = ReviewDataset(train_inputs, train_labels)
test_dataset = ReviewDataset(test_inputs, test_labels)

# Print one sample from each split as a quick sanity check.
print(train_dataset[2])
print('\n')
print(test_dataset[2])

In [ ]:
# build the model: pretrained DistilBERT with a 2-class sequence-classification
# head. NOTE: this rebinds the global `model`, previously the Keras RNN.
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

In [ ]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: 2 epochs, batches of 16 for training and
# evaluation, learning rate 2e-5 with weight decay 0.01.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    report_to="none", #remove this to save to wandb
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
)

# The Trainer fine-tunes on the training split and evaluates on the test split
# with the weighted F1/precision/recall from compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [ ]:
# Fine-tune DistilBERT on the training split.
trainer.train()

In [ ]:
# Evaluate the fine-tuned DistilBERT model on the test split.
trainer.evaluate()

### RoBERTa Transformer

In [ ]:
from transformers import AutoTokenizer

# Prepare the data for the transformer model (padding and truncation to 128
# tokens are done by the RoBERTa tokenizer loaded here)
tokenizer_roberta = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

train_inputs_roberta = tokenizer_roberta(x_train.tolist(), padding="max_length", max_length=128, truncation=True)

test_inputs_roberta = tokenizer_roberta(x_test.tolist(), padding="max_length", max_length=128, truncation=True)

In [ ]:
# DataCollatorWithPadding is a class in Hugging Face Transformers that helps in preparing batches of data for training transformer models
from transformers import DataCollatorWithPadding

# NOTE: rebinds the global `data_collator`, this time built on the RoBERTa tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_roberta)

In [ ]:
# Pair the RoBERTa encodings with their labels (converted to plain lists).
train_labels, test_labels = y_train.tolist(), y_test.tolist()
train_roberta_dataset = ReviewDataset(train_inputs_roberta, train_labels)
test_roberta_dataset = ReviewDataset(test_inputs_roberta, test_labels)

# Print one sample from each split as a quick sanity check.
print(train_roberta_dataset[2])
print('\n')
print(test_roberta_dataset[2])

In [ ]:
# build the model: the pretrained sentiment RoBERTa checkpoint with a 2-class
# head. ignore_mismatched_sizes=True lets loading proceed when checkpoint
# weights do not match the requested shapes (presumably the checkpoint's
# classification head has a different number of labels; confirm against the
# model card).
from transformers import AutoModelForSequenceClassification

roberta_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest", num_labels=2, ignore_mismatched_sizes=True)

In [ ]:
from transformers import TrainingArguments, Trainer

# Same hyper-parameters as the DistilBERT run, but with a dedicated output
# directory so the two runs do not write their checkpoints over each other.
training_args = TrainingArguments(
    output_dir='./results_roberta',
    num_train_epochs=2,
    report_to="none", #remove this to save to wandb
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
)

# NOTE: rebinds the global `trainer`; from here on it refers to the RoBERTa run.
trainer = Trainer(
    model=roberta_model,
    args=training_args,
    train_dataset=train_roberta_dataset,
    eval_dataset=test_roberta_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [ ]:
# Fine-tune RoBERTa on the training split.
trainer.train()

In [ ]:
# Evaluate the fine-tuned RoBERTa model on the test split.
trainer.evaluate()

# Final example

In [ ]:
from utils.recommendation import Recommendation
import random

# Label the classifier emits for a positive review. `random_forest` is trained
# on reviews['sentiment_to_binary'] (1 = positive, 0 = negative), so this must
# be the integer 1. Use 'positive' only if the classifier is retrained on the
# raw string labels.
value_for_positive = 1


def recommend_movies_based_on(movie_index):
    """Print and return recommendations for the movie at ``movie_index``.

    The movies most similar by plot are read from the movie's
    'similar_movie_indices'. The reviews of each of them are classified with
    the trained `random_forest`, and the candidates are ranked by their share
    of positive reviews.

    Args:
        movie_index: Row position of the reference movie in ``movies``.

    Returns:
        tuple: ``(review_based, plot_based)`` where ``review_based`` holds the
        titles of up to five plot-similar movies ordered by descending share
        of positive reviews (movies without reviews count as 0), and
        ``plot_based`` holds the titles of all plot-similar movies in
        similarity order.
    """
    movie = movies.iloc[movie_index]
    print('Movie title: ' + movie['title'])

    print('\nSimilar movies based on plot:')
    plot_based_similar = []
    movies_with_reviews_perc = []

    for i in movie['similar_movie_indices']:
        similar_movie = movies.iloc[i]
        title = similar_movie['title']
        plot_based_similar.append(title)
        print(title)

        movie_reviews = Recommendation.get_reviews_for_movie(movies, similar_movie)
        df = pd.DataFrame(movie_reviews, columns=['review'])
        if df.empty:
            # No reviews available: rank the movie as having no positive ones.
            movies_with_reviews_perc.append((title, 0))
            continue

        # Reproduce the exact pipeline the classifier was trained on (HTML
        # removal, preprocessing, lemmatization); skipping a step would feed
        # the model tokens it never saw in that form.
        df['processed_review'] = (
            df['review']
            .apply(Utilities.remove_html)
            .apply(Utilities.preprocess)
            .apply(Lemmatization.lemmatize_sent)
        )

        review_pred = random_forest.predict(vectorizer.transform(df['processed_review'].to_numpy()))
        positive_count = np.count_nonzero(review_pred == value_for_positive)
        movies_with_reviews_perc.append((title, positive_count / len(df)))

    # Sort the list of movies by the second value (review percentage) in descending order
    sorted_movies = sorted(movies_with_reviews_perc, key=lambda pair: pair[1], reverse=True)

    print('\nSimilar movies based on reviews:')

    top_five_titles = [pair[0] for pair in sorted_movies[:5]]

    print(top_five_titles)
    return top_five_titles, plot_based_similar


In [ ]:
# Pick a random row position in `movies` and show its recommendations.
random_position = random.randrange(len(movies))
recommended, plot = recommend_movies_based_on(random_position)