<a href="https://www.kaggle.com/code/maulikjain26/review-analysis?scriptVersionId=136165694" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install -q tokenizers

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization, MultiHeadAttention
from tokenizers import ByteLevelBPETokenizer, AddedToken

In [None]:
# Choose the distribution strategy: a TPU when one can be reached, otherwise
# all visible GPUs, otherwise the CPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Running on TPU:", tpu.master() or "local")
except ValueError:
    # A ValueError from the TPU setup is treated as "no TPU available".
    gpus = tf.config.list_physical_devices("GPU")

    if not gpus:
        # No accelerator at all: run everything on the CPU.
        strategy = tf.distribute.OneDeviceStrategy("CPU")
        print("Running on CPU")
    else:
        strategy = tf.distribute.MirroredStrategy()

        gpu_len = len(gpus)
        print("Running on", gpu_len, f"GPU{'s' if gpu_len != 1 else ''}")

In [None]:
# Load the McDonald's store reviews CSV; the file is decoded as latin-1.
df = pd.read_csv("/kaggle/input/mcdonalds-store-reviews/McDonald_s_Reviews.csv", encoding="latin-1")

In [None]:
# Preview the first rows of the raw dataframe.
df.head()

In [None]:
# Pull the first run of digits out of each rating string and store it as int.
# NOTE(review): assumes every rating contains a digit; a row without one would
# yield a missing value and make astype(int) fail — confirm against the data.
df["rating"] = df["rating"].str.extract(r"(\d+)", expand=False).astype(int)

In [None]:
# Strip non-ASCII characters: encode to ASCII while ignoring anything that
# cannot be encoded, then decode the bytes back into str.
df["review"] = df["review"].str.encode("ascii", "ignore").str.decode("utf-8")

In [None]:
# Binarise the rating: 1 and 2 become 0, 4 and 5 become 1, and 3 maps to None
# so that it turns into a missing value.
review_mapping = {1: 0, 2: 0, 3: None, 4: 1, 5: 1}
df["rating"] = df["rating"].map(review_mapping)

# Drop the rows whose rating was 3 (missing after the mapping above)
df = df.dropna(subset=["rating"])

In [None]:
# Keep only the review text and its binarised rating.
df = df[["review", "rating"]]

In [None]:
# Preview the dataframe after cleaning and column selection.
df.head()

In [None]:
# Tokenizer settings
sequence_length = 512  # length each encoded review is padded/truncated to
vocab_size = 10_000  # vocabulary size requested when training the tokenizer

# Instantiate the (not yet trained) byte-level BPE tokenizer
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train the tokenizer on the review texts, registering "<pad>" as the only
# special token.
# NOTE(review): presumably "<pad>" receives id 0, matching the zero padding
# and mask_zero embedding used later — verify with tokenizer.token_to_id.
tokenizer.train_from_iterator(
    df["review"], vocab_size=vocab_size, special_tokens=["<pad>"])

In [None]:
# Encode every review in one batch and keep only the token ids per review
encode_batch = tokenizer.encode_batch

df["encoded_sequence"] = [
    encoding.ids for encoding in encode_batch(df["review"])
]

In [None]:
# Longest encoded review, in tokens (to compare against sequence_length).
max(df["encoded_sequence"].apply(len))

In [None]:
# Pad (or truncate) every encoded review to exactly sequence_length tokens.
# Both padding and truncation happen at the end of the sequence ("post").
df["padded_sequence"] = tf.keras.preprocessing.sequence.pad_sequences(
    df["encoded_sequence"], maxlen=sequence_length, padding="post", truncating="post").tolist()

In [None]:
# Preview the first five rows, now including the encoded and padded columns.
df.head(5)

In [None]:
class BaseAttention(tf.keras.layers.Layer):
    """Common sub-layers for attention blocks.

    Creates a multi-head attention layer together with the ``LayerNormalization``
    and ``Add`` layers. This class defines no ``call`` of its own; subclasses
    provide it.

    Args:
        **kwargs: Forwarded unchanged to ``tf.keras.layers.MultiHeadAttention``.
    """

    def __init__(self, **kwargs):
        super().__init__()

        layers = tf.keras.layers
        self.mha = layers.MultiHeadAttention(**kwargs)
        self.layernorm = layers.LayerNormalization()
        self.add = layers.Add()

In [None]:
class GlobalSelfAttention(BaseAttention):
    """Self-attention block: attention over ``x``, residual add, layer norm."""

    def call(self, x):
        """Attend from ``x`` to itself and return the normalised residual sum."""
        # Query, key and value are all the same tensor.
        attended = self.mha(query=x, value=x, key=x)

        residual = self.add([x, attended])
        return self.layernorm(residual)

In [None]:
def positional_encoding(length, depth):
    """Build a sinusoidal positional-encoding table.

    The first half of the last axis holds sines and the second half cosines
    (the two halves are concatenated, not interleaved).

    Args:
        length: Number of positions (rows) to generate.
        depth: Size of the last axis; it is halved internally, one half per
            trigonometric function.

    Returns:
        A ``tf.float32`` tensor built from ``length`` rows of sin/cos values.
    """
    half_depth = depth / 2

    position_column = np.arange(length)[:, np.newaxis]  # (seq, 1)
    scaled_depths = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, depth)

    # One angular rate per depth index, multiplied by every position.
    angle_rads = position_column * (1 / (10000 ** scaled_depths))  # (pos, depth)

    sin_cos = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis=-1)
    return tf.cast(sin_cos, dtype=tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding scaled by sqrt(d_model) plus a positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width; also the depth of the positional encoding.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()

        self.d_model = d_model

        # mask_zero=True: token id 0 is treated as padding by the embedding.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, d_model, mask_zero=True)
        # The table is precomputed for 2048 positions and sliced in call().
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the inner embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed ``x``, scale by sqrt(d_model) and add the positional rows."""
        seq_len = tf.shape(x)[1]
        embedded = self.embedding(x)

        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        positions = self.pos_encoding[tf.newaxis, :seq_len, :]

        return embedded * scale + positions

In [None]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward block with dropout, residual add and layer norm.

    Args:
        d_model: Output width of the second dense layer.
        dff: Width of the first (GELU-activated) dense layer.
        dropout_rate: Dropout rate applied to the feed-forward output.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()

        expand = tf.keras.layers.Dense(dff, activation="gelu")
        project = tf.keras.layers.Dense(d_model)
        self.ffn = tf.keras.Sequential([expand, project])

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Return layer_norm(x + dropout(ffn(x)))."""
        update = self.dropout(self.ffn(x))
        return self.layer_norm(self.add([x, update]))

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward block.

    Args:
        d_model: Model width; used as the attention ``key_dim`` and as the
            feed-forward output width.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward block.
        dropout_rate: Dropout rate used by both the attention layer and the
            feed-forward block.
    """

    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward the configured rate; previously the feed-forward block
        # always fell back to its own default, ignoring dropout_rate.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x):
        """Apply self-attention, then the feed-forward block."""
        x = self.self_attention(x)
        x = self.ffn(x)

        return x

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Stack of encoder layers on top of a positional embedding.

    Args:
        num_layers: Number of ``EncoderLayer`` blocks to stack.
        d_model: Model width.
        num_heads: Number of attention heads per layer.
        dff: Hidden width of each feed-forward block.
        vocab_size: Vocabulary size for the embedding.
        dropout_rate: Dropout rate for the embedding output and the layers.
    """

    def __init__(self, *, num_layers, d_model, num_heads,
                 dff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        # Every layer in the stack shares the same configuration.
        layer_config = dict(
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            dropout_rate=dropout_rate,
        )
        self.enc_layers = [
            EncoderLayer(**layer_config) for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Embed ``x``, apply dropout, then run every encoder layer in order."""
        hidden = self.pos_embedding(x)  # (batch_size, seq_len, d_model)
        hidden = self.dropout(hidden)

        for encoder_layer in self.enc_layers:
            hidden = encoder_layer(hidden)

        return hidden  # (batch_size, seq_len, d_model)

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-only model wrapping :class:`Encoder`.

    Args:
        num_layers: Number of encoder layers.
        d_model: Model width.
        num_heads: Number of attention heads per layer.
        dff: Hidden width of each feed-forward block.
        input_vocab_size: Vocabulary size passed to the encoder.
        target_vocab_size: Accepted but not used by this class.
        dropout_rate: Dropout rate passed to the encoder.
    """

    def __init__(self, *, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()

        encoder_config = dict(
            num_layers=num_layers,
            d_model=d_model,
            num_heads=num_heads,
            dff=dff,
            vocab_size=input_vocab_size,
            dropout_rate=dropout_rate,
        )
        self.encoder = Encoder(**encoder_config)

    def call(self, inputs):
        """Return the encoder output for ``inputs``."""
        encoded = self.encoder(inputs)

        try:
            # Remove the Keras mask attribute when present.
            # NOTE(review): presumably so the padding mask stops propagating
            # to later layers and the loss — confirm this is intended.
            del encoded._keras_mask
        except AttributeError:
            pass

        return encoded

In [None]:
# Model hyper-parameters.
# vocab_size is re-read from the trained tokenizer, replacing the value that
# was requested when the tokenizer was configured.
vocab_size = tokenizer.get_vocab_size()
num_layers = 4  # number of encoder layers
d_model = 256  # model / embedding width
dff = 300  # hidden width of each feed-forward block
num_heads = 8  # attention heads per encoder layer
dropout_rate = 0.1

In [None]:
# Learning-rate schedule: start at 4e-4 and decay exponentially with rate 0.9
# over every 30 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=4e-4,
    decay_steps=30,
    decay_rate=0.9
)

In [None]:
# Build and compile the classifier inside the distribution strategy's scope.
with strategy.scope():
    # Variable-length sequences of token ids.
    inputs = tf.keras.Input(shape=(None,))

    transformer = Transformer(
        num_layers=num_layers,
        d_model=d_model,
        num_heads=num_heads,
        dff=dff,
        input_vocab_size=vocab_size,
        target_vocab_size=None,
        dropout_rate=dropout_rate
    )

    x = transformer(inputs)

    # Classification head: two bidirectional LSTMs (the second collapses the
    # sequence axis), each followed by layer norm, then a regularised dense
    # layer with batch norm and dropout.
    classif_layers = [
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(128, return_sequences=True)),
        tf.keras.layers.LayerNormalization(),

        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
        tf.keras.layers.LayerNormalization(),

        tf.keras.layers.Dense(128, activation="relu",
                              kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.20)
    ]

    for layer in classif_layers:
        x = layer(x)

    # Single sigmoid unit, so the loss below works on probabilities.
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=lr_schedule, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

    model.compile(loss=loss_fn, optimizer=optimizer, metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
def _as_readonly(values):
    """Return ``values`` as a C-contiguous NumPy array marked read-only."""
    frozen = np.ascontiguousarray(values)
    frozen.flags.writeable = False
    return frozen


# 80/20 train/test split with a fixed seed for reproducibility
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Features: the padded token ids. Labels: the ratings with an extra trailing
# axis added by tf.expand_dims.
X_train = _as_readonly(np.array(train_df["padded_sequence"].tolist()))
y_train = _as_readonly(tf.expand_dims(train_df["rating"].tolist(), axis=1))

X_test = _as_readonly(np.array(test_df["padded_sequence"].tolist()))
y_test = _as_readonly(tf.expand_dims(test_df["rating"].tolist(), axis=1))

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# 128 samples per replica, so the global batch scales with the device count.
global_batch_size = strategy.num_replicas_in_sync * 128

# Both pipelines are batched, cached after batching, then prefetched.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .batch(global_batch_size)
    .cache()
    .prefetch(AUTOTUNE)
)

test_ds = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(global_batch_size)
    .cache()
    .prefetch(AUTOTUNE)
)

In [None]:
# Stop once val_loss has not improved for 2 epochs and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=2, restore_best_weights=True)

# Train for at most 32 epochs; the test split doubles as validation data.
history = model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=32,
    callbacks=[early_stopping],
)

In [None]:
# Report loss and accuracy on the test dataset.
model.evaluate(test_ds)

In [None]:
# Plot the training and validation curves side by side: loss on the left,
# accuracy (in percent) on the right.
metrics = history.history
epochs = history.epoch

plt.figure(figsize=(16, 6))

plt.subplot(1, 2, 1)
# Each series gets its own plot call against the epoch axis. Passing both
# series to a single plot(x, y1, y2) call would draw the second one against
# an implicit 0..n-1 index instead of `epochs`.
plt.plot(epochs, metrics["loss"], label="loss")
plt.plot(epochs, metrics["val_loss"], label="val_loss")
plt.legend()
plt.ylim([0, max(plt.ylim())])
plt.xlabel("Epoch")
plt.ylabel("Loss [BinaryCrossentropy]")

plt.subplot(1, 2, 2)
plt.plot(epochs, 100 * np.array(metrics["accuracy"]), label="accuracy")
plt.plot(epochs, 100 * np.array(metrics["val_accuracy"]), label="val_accuracy")
plt.legend()
plt.ylim([0, 100])
plt.xlabel("Epoch")
plt.ylabel("Accuracy [%]")
plt.show()

In [None]:
# Save the full model to a single .h5 file
model.save("/kaggle/working/model.h5")

# Save only the model weights to a separate .h5 file
model.save_weights("/kaggle/working/model_weights.h5")

# Save the trained tokenizer
tokenizer.save("/kaggle/working/tokenizer.bpe")

In [None]:
class SentimentAnalysisModel():
    """End-to-end sentiment predictor: raw text in, label and confidence out.

    Args:
        model: Callable model returning one probability per input row
            (invoked as ``model(batch, training=False)``).
        tokenizer: Tokenizer exposing ``encode_batch``; each encoding must
            provide ``.ids``.
        sequence_length: Length every encoded text is padded/truncated to.
    """

    def __init__(self, model, tokenizer, sequence_length):
        self.model = model
        self.tokenizer = tokenizer
        self.sequence_length = sequence_length

    def __call__(self, x):
        """Classify one text or a list of texts.

        Args:
            x: A single string or a list of strings.

        Returns:
            A 1-D NumPy object array with one dict per input text, holding
            ``"label"`` (``"POSITIVE"`` when the probability exceeds 0.5,
            otherwise ``"NEGATIVE"``) and ``"confidence"`` (distance of the
            probability from 0.5, rescaled to the range 0..1).
        """
        if isinstance(x, str):
            x = [x]

        token_ids = [
            encoding.ids for encoding in self.tokenizer.encode_batch(x)]

        padded = tf.keras.preprocessing.sequence.pad_sequences(
            token_ids, maxlen=self.sequence_length,
            padding="post", truncating="post")

        pred = self.model(padded, training=False)

        # Work on the whole batch in NumPy rather than issuing one TensorFlow
        # op per sample; column 0 holds the probability of each row.
        probabilities = np.asarray(pred)[:, 0]
        confidences = np.abs(probabilities - 0.5) * 2

        res = np.empty((len(probabilities),), dtype=object)

        for i, (probability, confidence) in enumerate(
                zip(probabilities, confidences)):
            res[i] = {
                "label": "POSITIVE" if probability > 0.5 else "NEGATIVE",
                "confidence": confidence
            }

        return res


# Bundle model, tokenizer and padding length into one callable object.
export = SentimentAnalysisModel(model, tokenizer, sequence_length)

# Persist the whole wrapper (model included) with pickle.
with open("/kaggle/working/model_end2end", "wb") as f:
    pickle.dump(export, f)

In [None]:
# Reload the pickled end-to-end wrapper, with `Transformer` registered as a
# Keras custom object for the duration of the load.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles you created yourself.
custom_objs = {"Transformer": Transformer}

with tf.keras.utils.custom_object_scope(custom_objs), open("/kaggle/working/model_end2end", "rb") as f:
    loaded_model = pickle.load(f)

In [None]:
# Try the in-memory wrapper on two example sentences.
export(["This is a great product", "This is a horrible product"])

In [None]:
# Round-trip check: the reloaded wrapper should return exactly the same
# results as the in-memory one for the same inputs.
(export(["This is a great product", "This is a horrible product"]) == loaded_model(
    ["This is a great product", "This is a horrible product"])).all()