In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.layers import TextVectorization
from tensorflow import keras
from keras import layers

import matplotlib.pyplot as plt
import plotly.express as px
from plotly.offline import init_notebook_mode
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Notebook-wide plotting setup: initialise Plotly's notebook mode, then apply
# the seaborn theme and the default matplotlib figure/font sizes.
init_notebook_mode(connected=True)
sns.set_style("darkgrid")
plt.rcParams.update({
    'figure.figsize': [20, 8],
    'font.size': 18,
})

## Load, Visualize and Prepare Data

In [None]:
# Fetch and unpack the two Kaggle review datasets used by this notebook.
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook/IPython session and needs the `kaggle` and `unzip` command-line
# tools (presumably with Kaggle API credentials configured - confirm locally).
import os

# McDonalds dataset: download the archive only when it is not already present.
if not os.path.exists('mcdonalds-store-reviews.zip'):
    print("Downloading McDonalds dataset...")
    !kaggle datasets download -d nelgiriyewithana/mcdonalds-store-reviews
# Unpack whenever the archive exists; `unzip -n` never overwrites files that
# were already extracted, so re-running the cell is harmless.
if os.path.exists('mcdonalds-store-reviews.zip'):
    print("Unzipping McDonalds dataset...")
    !unzip -n mcdonalds-store-reviews.zip

# IMDB dataset: same download-then-unpack pattern as above.
if not os.path.exists('imdb-dataset-of-50k-movie-reviews.zip'):
    print("Downloading IMDB dataset...")
    !kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
if os.path.exists('imdb-dataset-of-50k-movie-reviews.zip'):
    print("Unzipping IMDB dataset...")
    !unzip -n imdb-dataset-of-50k-movie-reviews.zip


In [None]:
# Load both review corpora. The McDonalds CSV is decoded as latin-1; the IMDB
# CSV is read with pandas' default encoding.
df_imdb = pd.read_csv('IMDB Dataset.csv')
df_mc = pd.read_csv(
    'McDonald_s_Reviews.csv',
    encoding="latin-1",
)

In [None]:
# Preview the first five McDonalds rows.
df_mc.head(5)

In [None]:
# Preview the first five IMDB rows.
df_imdb.head(5)

### Converting McDonalds review labels from categorical to binary

In [None]:
# Star rating -> binary sentiment: 1-2 stars are negative (0), 4-5 stars are
# positive (1). Neutral '3 stars' reviews have no entry and are dropped below.
rating_mapping_mc = {
    '1 star': 0,
    '2 stars': 0,
    '4 stars': 1,
    '5 stars': 1
}

# Keep only rows whose rating has a binary label. Filtering on the mapping's
# keys (instead of only excluding '3 stars') also removes missing or unexpected
# rating values, which `.map` would otherwise turn into NaN labels.
df_mc = df_mc[df_mc['rating'].isin(list(rating_mapping_mc))]

# Every remaining rating is mapped, so the labels can be a plain integer array.
label_mc = df_mc['rating'].map(rating_mapping_mc).astype('int64').to_numpy()
print(label_mc[:10])
print(f'Labels McDonalds: {len(label_mc)}')

### Converting IMDB review labels from categorical to binary 

In [None]:
# Binary sentiment labels for IMDB: 'positive' -> 1, 'negative' -> 0.
sentiment_mapping_imdb = {'positive': 1, 'negative': 0}
label_imdb = df_imdb['sentiment'].map(sentiment_mapping_imdb).to_numpy()

print(label_imdb[:10])
print(f'Labels IMDB: {len(label_imdb)}')

### Merging the data

In [None]:
# Stack both corpora into one flat review array and one flat label array.
# IMDB entries come first, followed by the McDonalds entries, in both arrays.
data_mc = df_mc['review'].to_numpy()
data_imdb = df_imdb['review'].to_numpy()

data = np.concatenate((data_imdb, data_mc), axis=None)
label = np.concatenate((label_imdb, label_mc), axis=None)

print(f'Reviews: {len(data)}')
print(f'Labels: {len(label)}')

### Visualize data distribution 

In [None]:
# Class balance over the merged corpus (count of label 1 vs. label 0).
px.bar(
    x=['positive', 'negative'],
    y=[np.count_nonzero(label == 1), np.count_nonzero(label == 0)],
    title='Overall review distribution',
)

In [None]:
# Class balance within the McDonalds reviews (count of label 1 vs. label 0).
px.bar(
    x=['positive', 'negative'],
    y=[np.count_nonzero(label_mc == 1), np.count_nonzero(label_mc == 0)],
    title='McDonalds review distribution',
)

In [None]:
# Class balance within the IMDB reviews (count of label 1 vs. label 0).
px.bar(
    x=['positive', 'negative'],
    y=[np.count_nonzero(label_imdb == 1), np.count_nonzero(label_imdb == 0)],
    title='IMDB review distribution',
)

### Some Word Clouds

In [None]:
# TODO

### Split data into train/test sets (the validation set is split off from the training data during `fit`)

In [None]:
# Hold out 20% of the merged corpus for testing; the fixed random_state keeps
# the split reproducible across runs.
train_data, test_data, train_label, test_label = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each split in the order: train data, train labels,
# test data, test labels.
for split in (train_data, train_label, test_data, test_label):
    print(split.shape)

## Tokenization / Vectorization / Word Embedding

In [None]:
# Hyperparameters shared by the vectorizer, the model and the training loop.
MAX_FEATURES = 30_000  # vocabulary cap (max_tokens of the TextVectorization layer)
SEQ_LENGTH = 100       # every review is encoded to exactly this many token ids
EMBEDDING_DIM = 100    # width of each token / position embedding vector
BATCH_SIZE = 128       # batch size passed to fit()
NUM_EPOCHS = 20        # maximum number of epochs passed to fit()

In [None]:
# Build the vocabulary from the TRAINING reviews only.
# The layer must not also be adapted on the test data: a second adapt() call
# resets the layer and relearns the vocabulary from that call's data alone,
# and using test text to build the vocabulary would leak test information
# into the model's input representation.
text_vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=SEQ_LENGTH)
text_vectorizer.adapt(train_data)

# Check the vocabulary
vocabulary = text_vectorizer.get_vocabulary()
vocab_size = len(vocabulary)
print(f'Vocabulary size: {vocab_size}')
print(f'First 10 Vocabulary Item: {vocabulary[:10]}')

# TODO The tail of the vocabulary already contains noise tokens if max_features
# is not enforced => the reviews need better cleaning before vectorization.
print(f'Last 10 Vocabulary Item: {vocabulary[-10:]}')

### Encode Reviews to SEQ_LENGTH words (padded)

In [None]:
# Replace the raw review strings with the vectorizer's integer token-id
# sequences. NOTE: this rebinds train_data/test_data, so the cell cannot be
# re-run without first re-running the train/test split cell.
train_data, test_data = text_vectorizer(train_data), text_vectorizer(test_data)

for split_name, encoded in (('Train', train_data), ('Test', test_data)):
    print(f'{split_name} data shape: {encoded.shape}')

## Defining the Model

Since we only want to classify our reviews as having either positive or negative sentiment, we don't need the full transformer architecture; the encoder block is sufficient. The architecture is described in [Attention Is All You Need](https://arxiv.org/abs/1706.03762).

In [None]:
class TransformerEncoder(layers.Layer):
    """Single Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer output goes through dropout, is added to the sub-layer's
    input (residual connection) and is then layer-normalised.

    Args:
        embed_dim: Width of the last input axis. It is used as the attention
            layer's per-head ``key_dim`` and as the output width of the
            feed-forward network, so the residual additions line up.
        heads: Number of attention heads.
        neurons: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer. Defaults to 0.5,
            the value that used to be hard-coded.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``); required so
            the layer can be rebuilt from its config.
    """

    def __init__(self, embed_dim, heads, neurons, rate=0.5, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.heads = heads
        self.neurons = neurons
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [layers.Dense(neurons, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the encoder block to ``inputs``.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the dropout layers. ``None`` (the default)
                lets Keras decide based on the calling context.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Self-attention: the sequence serves as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised.

        Without this override, saving a model that contains the layer in a
        config-based format (such as whole-model HDF5 checkpoints) can fail.
        """
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "heads": self.heads,
            "neurons": self.neurons,
            "rate": self.rate,
        })
        return config
    
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position embedding can represent.
        vocab_size: Number of rows of the token embedding table.
        embed_dim: Width of both embedding tables.
        embedding_matrix: Optional pre-trained token embedding table of shape
            ``(vocab_size, embed_dim)``. When given, the token embedding is
            initialised from it and frozen; the position embedding is always
            trainable.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``); required so
            the layer can be rebuilt from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, embedding_matrix=None, **kwargs):
        super().__init__(**kwargs)
        # Keep the scalar constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        if embedding_matrix is not None:
            # A Constant initializer is used to load pre-trained vectors
            # because the `weights=` constructor kwarg is not accepted by
            # every Keras version.
            self.token_emb = layers.Embedding(
                input_dim=vocab_size,
                output_dim=embed_dim,
                embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                trainable=False,
            )
        else:
            self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        Args:
            x: Integer tensor of token ids; its last axis is the sequence axis.

        Returns:
            Tensor of token embeddings plus position embeddings.
        """
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # `positions` has no batch axis; it broadcasts over the batch.
        return x + positions

    def get_config(self):
        """Return the scalar constructor arguments so the layer can be serialised.

        ``embedding_matrix`` is intentionally not part of the config. A layer
        rebuilt from this config therefore starts with a trainable, randomly
        initialised token embedding until saved weights are loaded into it.
        """
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [None]:
# Model hyperparameters consumed by create_model().
NUM_HEADS = 16   # attention heads in the transformer encoder block
FFN_DIM = 1024   # hidden width of the encoder's feed-forward network
DROPOUT = 0.5    # dropout rate of the classification head

In [None]:
def create_model(embedding_matrix=None):
    """Build, compile and summarise the transformer sentiment classifier.

    Architecture: token + position embedding -> one transformer encoder block
    -> global average pooling -> dropout -> Dense(64, relu) -> dropout ->
    Dense(1, sigmoid).

    Args:
        embedding_matrix: Optional pre-trained token embedding table of shape
            ``(MAX_FEATURES, EMBEDDING_DIM)``. When given, the token embedding
            is initialised from it and frozen. Position embeddings are still
            added, so both model variants see word-order information.

    Returns:
        The compiled ``keras.Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inputs = layers.Input(shape=(SEQ_LENGTH,))

    # Always use TokenAndPositionEmbedding. Swapping in a plain Embedding for
    # the pre-trained case would silently drop the position information that
    # the attention block needs.
    embedding_layer = TokenAndPositionEmbedding(
        SEQ_LENGTH, MAX_FEATURES, EMBEDDING_DIM, embedding_matrix=embedding_matrix
    )
    x = embedding_layer(inputs)

    transformer_block = TransformerEncoder(EMBEDDING_DIM, NUM_HEADS, FFN_DIM)
    x = transformer_block(x)

    # Collapse the sequence axis, then classify.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(DROPOUT)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(DROPOUT)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# Baseline model: token embeddings are learned from scratch.
transformer_model = create_model(embedding_matrix=None)

## Training the Model

### Defining some callbacks for better control of model training

In [None]:
from keras.callbacks import TensorBoard
import datetime

# TensorBoard: one timestamped sub-directory of `logs` per notebook run.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = os.path.join("logs", run_stamp)
tensorboard = TensorBoard(histogram_freq=1, log_dir=log_dir)

# Checkpoint: write the model to disk whenever the validation loss improves.
model_name = 'transformer_model.h5'
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath=model_name,
    save_best_only=True,
    monitor='val_loss',
    verbose=1,
)

# Early stopping: stop after 3 epochs without validation-loss improvement and
# roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
    verbose=1,
)

# Learning-rate decay: multiply the learning rate by 0.2 after 2 stagnant
# epochs, but never go below 1e-5.
learning_rate_decay = keras.callbacks.ReduceLROnPlateau(
    factor=0.2,
    patience=2,
    min_lr=1e-5,
    monitor='val_loss',
    verbose=1,
)

callbacks = [checkpoint, early_stopping, learning_rate_decay, tensorboard]


In [None]:
# Train the baseline model. The last 20% of the training data is held out by
# Keras as the validation set that the callbacks monitor.
history = transformer_model.fit(
    x=train_data,
    y=train_label,
    validation_split=0.2,
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
)

In [None]:
# Loss and accuracy of the baseline model on the held-out test set.
transformer_model.evaluate(x=test_data, y=test_label)

In [None]:
# Plotting/reporting helpers from the project-local `utils` package.
from utils.plot_utils import plot_history_metrics, get_classification_report 
# Plot the 'loss' and 'accuracy' metrics recorded in the baseline training
# history (presumably alongside their validation counterparts - confirm in
# utils.plot_utils).
plot_history_metrics(history, ['loss', 'accuracy'])

In [None]:
# Classification report for the baseline model on the test set. The exact
# metrics and threshold are defined in utils.plot_utils (not shown here).
get_classification_report(transformer_model, test_data, test_label)

### Results
The model isn't generalizing all too well: the validation loss increases during training, which is an indicator of overfitting. To combat this we could lower the vocabulary size, decrease the sequence length, or try pre-trained embeddings such as GloVe.

## Using pre-trained word embeddings (GloVe)

We will use the pre-trained [GloVe](https://github.com/stanfordnlp/GloVe) embeddings published by Stanford University. The 100-dimensional model is available on [Kaggle](https://www.kaggle.com/datasets/anindya2906/glove6b). The file `glove.6B.100d.txt` needs to be placed in the `embeddings` folder so that the path `embeddings/glove.6B.100d.txt` is available.

In [None]:
# read_embeddings is a project-local helper. Judging by how its result is used
# below, it returns a mapping from word to embedding vector - confirm in
# utils.file_utils.
from utils.file_utils import read_embeddings
# Location of the 100-dimensional GloVe vectors, relative to the notebook.
filepath = './embeddings/glove.6B.100d.txt'
GLOVE_EMBEDDINGS = read_embeddings(filepath)

### Checking the encoding of a random word

In [None]:
# Sanity check: look up the embedding of one known word and print it.
test_word = 'hello'
# Direct indexing: presumably raises KeyError if the word is missing, assuming
# GLOVE_EMBEDDINGS is a dict - confirm.
test_vector = GLOVE_EMBEDDINGS[test_word]
print(f'Vector for {test_word}:\n\n{test_vector}')

All vectors have 100 dimensions to capture the semantics of a word.

In [None]:
# Print the shape of the looked-up vector to confirm its dimensionality.
print(f'Vector shape: {test_vector.shape}')

### Encode our vocabulary with GloVe Vectors

In [None]:
# One row per vocabulary index, one column per embedding dimension.
# The width must be EMBEDDING_DIM, not SEQ_LENGTH: the two constants only
# happen to share the value 100, and the matrix would get the wrong shape as
# soon as either one is changed.
EMBEDDING_MATRIX = np.zeros((MAX_FEATURES, EMBEDDING_DIM))

# Row i holds the GloVe vector of vocabulary[i]. Words without a GloVe entry
# keep an all-zero row. The slice keeps every index inside the matrix even if
# the vocabulary were ever longer than MAX_FEATURES.
for i, word in enumerate(vocabulary[:MAX_FEATURES]):
    embedding_vector = GLOVE_EMBEDDINGS.get(word)
    if embedding_vector is not None:
        EMBEDDING_MATRIX[i] = embedding_vector

print(f'Embedding matrix shape: {EMBEDDING_MATRIX.shape}')

### Create model with glove embeddings

In [None]:
# Second model: same architecture, built from the GloVe embedding matrix.
glove_transformer_model = create_model(embedding_matrix=EMBEDDING_MATRIX)

In [None]:
# Fresh callbacks for the GloVe run. The callbacks of the baseline run are not
# reused: a used ModelCheckpoint instance still holds the best validation loss
# of the previous run, and sharing the checkpoint file and TensorBoard
# directory would mix the artefacts of both models.
glove_callbacks = [
    keras.callbacks.ModelCheckpoint(
        'glove_transformer_model.h5',
        monitor='val_loss',
        save_best_only=True,
        verbose=1
    ),
    keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1
    ),
    keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        patience=2,
        factor=0.2,
        min_lr=0.00001,
        verbose=1
    ),
    keras.callbacks.TensorBoard(
        log_dir=os.path.join("logs", "glove-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")),
        histogram_freq=1
    ),
]

# Train the GloVe model itself. Calling fit() on `transformer_model` here
# would continue training the baseline and leave the GloVe model untrained.
glove_history = glove_transformer_model.fit(
    train_data,
    train_label,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_split=0.2,
    callbacks=glove_callbacks
)

In [None]:
# Loss and accuracy of the GloVe model on the held-out test set.
glove_transformer_model.evaluate(x=test_data, y=test_label)

In [None]:
# Plot the training curves of the GloVe run. Passing `history` here would
# re-plot the baseline run instead.
plot_history_metrics(glove_history, ['loss', 'accuracy'])

In [None]:
%tensorboard --logdir logs