In [1]:
import math
import rich
import torch
import string
import numpy  as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm import tqdm
from torch   import nn
from termcolor import colored

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Layer
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks  import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-12-02 11:25:42.029307: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-02 11:25:45.617692: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-02 11:26:00.866122: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-02 11:26:00.866184: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-02 11:26:00.937432: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [None]:
# Pick the torch device once: CUDA when torch reports it as available,
# otherwise the CPU. The chosen device is echoed with rich markup.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rich.print(f"Device: [red]{DEVICE}")

In [None]:
df = pd.read_csv('clean_data.csv')

# Model input is the `headline` column; the label is the `is_sarcastic` column.
x, y = df['headline'].values, df['is_sarcastic'].values

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=2456
)

print(colored("\nDIVIDED SUCCESFULLY...", "green"))

In [None]:
# Report how many samples of each label value ended up in each split.
for title, labels in (("Training set distribution:", y_train),
                      ("\nTest set distribution:", y_test)):
    unique, counts = np.unique(labels, return_counts=True)
    print(title)
    print(dict(zip(unique, counts)))

In [None]:
# Add column 'headline_len': character length of each headline.
# The vectorised string accessor is used instead of `.map(len)` so that a
# missing headline yields NaN rather than raising
# "TypeError: object of type 'float' has no len()".
df['headline_len'] = df['headline'].str.len()

In [None]:
# Headline-length histogram, one panel per `is_sarcastic` value, stacked
# vertically on a shared x-axis with the per-class mean marked in red.
types = df['is_sarcastic'].unique()
cmap  = ['rocket_r', 'mako_r']

fig, axs = plt.subplots(2, 1, figsize=(10, 6), dpi=150,
                        sharex=True)
fig.subplots_adjust(hspace=.5)

# The loop variable is deliberately NOT called `x`: that name holds the
# headline array created in the split cell and must not be overwritten here.
for idx, label in enumerate(types):
    # Draw on the axes created above instead of pyplot's implicit "current" axes.
    ax = axs[idx]
    subset = df[df['is_sarcastic'] == label]
    # Only headlines shorter than 250 characters are plotted and averaged.
    short = subset[subset['headline_len'] < 250]
    sns.histplot(data = short,
                 x = 'headline_len', hue='is_sarcastic',
                 kde = True, palette=cmap[idx], ax=ax)
    ax.set_xticks(np.arange(0, 250, 10))
    ax.axvline(short['headline_len'].mean(),
               color='r', linestyle = '--')
    # Label every panel, not just the last one that was drawn.
    ax.set_ylabel('Frequence')

# The x-axis is shared, so only the bottom panel carries the x label.
axs[-1].set_xlabel('Headline Length')
fig.suptitle('Headline length histogram with marked mean',
                x=.5, y=.95, fontsize=18)
plt.tight_layout()
plt.show()

In [None]:
# Build the training vocabulary and find the longest training headline,
# both measured in WORD tokens.
#
# Each headline is a plain string, so `set.update(sent)` / `len(sent)` would
# operate on individual characters. The headline is therefore tokenised first,
# using the same Keras routine (lower-casing, punctuation filtering, whitespace
# split) that `Tokenizer` applies by default, so the numbers derived here match
# what the tokenizer will later produce.
unique_words = set()
len_max = 0

for sent in tqdm(x_train):
    words = tf.keras.preprocessing.text.text_to_word_sequence(sent)
    unique_words.update(words)
    len_max = max(len_max, len(words))

print(len(unique_words))  # vocabulary size (distinct word tokens)
print(len_max)            # longest headline, in tokens

In [None]:
# Fit the tokenizer on the training headlines only, then encode both splits
# with it so the test split never influences the vocabulary.
tokenizer = Tokenizer(num_words=len(unique_words))
tokenizer.fit_on_texts(list(x_train))

# Text -> integer id sequences -> fixed-width arrays of length `len_max`.
# NOTE(review): this rebinds x_train / x_test from raw text to padded arrays,
# so the cell is not safe to run twice without re-running the split cell.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=len_max)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=len_max)

print(x_train.shape, x_test.shape)

In [None]:
# Stop training once validation accuracy has failed to improve by at least
# 0.001 for 3 consecutive epochs.
# The monitored key must match the name Keras logs: the model is compiled with
# metrics=["accuracy"], so the validation metric is "val_accuracy" (the same
# key the accuracy plot reads from `history.history`). The previous value
# "val_acc" is never logged, so the callback could never trigger.
early_stopping = EarlyStopping(monitor = "val_accuracy",
                               min_delta = 0.001, mode = "auto",
                               patience = 3, verbose = 1)
callbacks = [early_stopping]

In [None]:
from tensorflow.keras.layers import Layer
import tensorflow.keras.backend as K

class AttentionLayer(Layer):
    """
    Attention pooling layer: collapses axis 1 of its input into a single
    weighted sum.

    A score is computed per position as ``tanh(x . W + b)``, the scores are
    normalised with a softmax over axis 1, and the input is summed over axis 1
    using those weights.

    NOTE(review): assumes the input is (batch, timesteps, features) with a
    fixed number of timesteps, since the bias is sized from ``input_shape[1]``
    -- confirm against the preceding layer.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weights once the input shape is known."""
        feature_dim = input_shape[-1]
        num_steps = input_shape[1]
        # One scoring column applied to the last input axis.
        self.W = self.add_weight(name='attention_weight',
                                 shape=(feature_dim, 1),
                                 initializer='random_normal',
                                 trainable=True)
        # One bias term per position along axis 1.
        self.b = self.add_weight(name='attention_bias',
                                 shape=(num_steps, 1),
                                 initializer='zeros',
                                 trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        # Unnormalised score per position.
        scores = K.tanh(K.dot(x, self.W) + self.b)
        # Normalise across axis 1 so the weights sum to one along it.
        weights = K.softmax(scores, axis=1)
        # Scale the input by its weights and reduce over axis 1.
        return K.sum(x * weights, axis=1)

    def get_config(self):
        """Return the base layer config; this layer adds no extra arguments."""
        return super().get_config()

from tensorflow.keras.optimizers import Adam

# Network: embedding -> LSTM (full sequence output) -> attention pooling ->
# a stack of ReLU dense layers with dropout -> single sigmoid output unit.
model = Sequential([
    # Embedding table is sized from the collected vocabulary set; inputs are
    # the sequences padded to `len_max`.
    Embedding(len(unique_words), 100, input_length=len_max),
    # return_sequences=True keeps every timestep so the attention layer can
    # pool over them.
    LSTM(64, return_sequences=True),
    AttentionLayer(),
    Dense(40, activation='relu'),
    Dropout(0.5),
    Dense(20, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='relu'),
    Dropout(0.5),
    Dense(5, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer=Adam(learning_rate=0.004),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Train for at most 5 epochs; the held-out split passed as `validation_data`
# is evaluated after each epoch, and the callbacks defined earlier are attached.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=5,
    batch_size=16,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
# Training vs. validation loss per epoch; the figure is also written to disk.
fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
epoch_num = range(1, len(history.history["loss"]) + 1)

ax.plot(epoch_num, history.history["loss"], "r--", label="Training loss")
ax.plot(epoch_num, history.history["val_loss"], "b-", label="Validation loss")
ax.legend()
ax.set_xlabel("Epoch numbers")
ax.set_ylabel("Loss")
ax.set_title('LSTM model: Training and validation loss')

fig.savefig('baseline_LSTM_loss_plot.png')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch; the figure is also written to disk.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
epochs = range(1, len(acc) + 1)

fig, ax = plt.subplots(figsize=(10, 6), dpi=100)
ax.plot(epochs, acc, 'bo', label='Training accuracy')
ax.plot(epochs, val_acc, 'r', label='Validation accuracy')
ax.legend()
ax.set_xlabel("Epoch numbers")
ax.set_ylabel("Accuracy")
ax.set_title('LSTM model: Training and validation accuracy')

fig.savefig('baseline_LSTM_acc_plot.png')
plt.show()