# Import Necessary Libraries

In [221]:
import gzip
import os
import argparse
import torch
import random
import torch.nn as nn
import time
import pandas as pd
import numpy as np
from collections import Counter
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import f1_score

In [222]:
def get_device():
    """
    Pick the torch device used for training.

    Preference order: CUDA GPU, then Apple-silicon MPS, then CPU.

    Returns
    -------
    torch.device
        The first available device in the order above.
    """
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        # Metal backend on Apple M1/M2 machines
        return torch.device("mps")
    return torch.device("cpu")

# Resolve the device once; later cells read this module-level name
device = get_device()
print(f"Device: {device}")

Device: cuda


In [223]:
def _str_to_bool(value):
    """
    Convert a command-line string into a real boolean.

    `type=bool` must not be used with argparse: bool("False") is True because
    any non-empty string is truthy. This converter accepts the usual spellings
    instead and rejects anything it does not recognise.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in {"true", "t", "yes", "y", "1"}:
        return True
    # The empty string stays falsy, exactly as it was under bool("")
    if text in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

# Hyper-parameters and run options; every value has a default so the notebook
# runs without any command-line arguments.
argument_parser = argparse.ArgumentParser()
argument_parser.add_argument("--epochs", dest = "EPOCHS", type = int, default = 5)
argument_parser.add_argument("--seed", dest = "SEED", type = int, default = 42)
argument_parser.add_argument("--batch_size", dest = "BATCH_SIZE", type = int, default = 128)
argument_parser.add_argument("--embed_dim", dest = "EMBED_DIM", type = int, default = 100)
argument_parser.add_argument("--hidden_dim", dest = "HIDDEN_DIM", type = int, default = 256)
argument_parser.add_argument("--num_layers", dest = "NUM_LAYERS", type = int, default = 2)
argument_parser.add_argument("--bidirectional", dest = "BIDIRECTIONAL", type = _str_to_bool, default = False)
argument_parser.add_argument("--optimizer", dest = "OPTIMIZER", type = str, default = 'AdamW')
argument_parser.add_argument("--loss_fn", dest = "LOSS_FN", type = str, default = 'BCELoss')
argument_parser.add_argument("--score_fn", dest = "SCORE_FN", type = str, default = 'F1_Score')
argument_parser.add_argument("--learning_rate", dest = "LEARNING_RATE", type = float, default =  1e-3)
argument_parser.add_argument("--dropout", dest = "DROPOUT", type = float, default =  0.2)
# parse_known_args ignores arguments it does not know (e.g. those injected by the Jupyter kernel)
args, _ = argument_parser.parse_known_args()
print(args)

Namespace(EPOCHS=5, SEED=42, BATCH_SIZE=128, EMBED_DIM=100, HIDDEN_DIM=256, NUM_LAYERS=2, BIDIRECTIONAL=False, OPTIMIZER='AdamW', LOSS_FN='BCELoss', SCORE_FN='F1_Score', LEARNING_RATE=0.001, DROPOUT=0.2)


In [224]:
def make_reproducible(seed:int = 42) -> None:
    """
    Seed every random number generator this notebook relies on.

    Seeds PyTorch (CPU and, when available, all CUDA devices), NumPy's global
    generator and Python's `random` module, and switches cuDNN to its
    deterministic mode.

    Parameters
    ----------
    seed : int
        The seed applied to all libraries.
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade some speed for repeatable cuDNN kernels
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    # Seed the *global* NumPy generator; creating a local default_rng(seed)
    # and discarding it would leave np.random.* calls unseeded.
    np.random.seed(seed)
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only takes
    # effect in Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)

In [225]:
def _load_headlines(path, label):
    """
    Read a gzip-compressed file of headlines into a DataFrame.

    The file is parsed as tab-separated text without a header row into a
    single 'headline' column; a constant 'label' column is then attached.
    """
    with gzip.open(path, 'rb') as handle:
        frame = pd.read_csv(handle, sep='\t', header=None, names=['headline'])
    frame['label'] = label
    return frame

# Clickbait headlines are the positive class (1), the rest the negative class (0)
clickbait_df = _load_headlines('dataset/clickbait_data.gz', 1)
non_clickbait_df = _load_headlines('dataset/non_clickbait_data.gz', 0)

# Stack both classes, then shuffle all rows with a fixed seed and renumber the index
data = (
    pd.concat([clickbait_df, non_clickbait_df])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

In [226]:
# Share of rows labelled as clickbait (label == 1), expressed as a percentage
clickbait_percentage = (data['label'].eq(1).sum() / len(data)) * 100

# Report the class balance
print(f"Percentage of clickbait headlines: {clickbait_percentage:.2f}%")

Percentage of clickbait headlines: 50.00%


In [227]:
# Hold out 30% of the rows, then cut the hold-out in half: 70% train / 15% dev / 15% test
train, holdout = train_test_split(data, test_size=0.3, random_state=args.SEED)
dev, test = train_test_split(holdout, test_size=0.5, random_state=args.SEED)
print(
    f"Train size: {len(train)}",
    f"Dev size: {len(dev)}",
    f"Test size: {len(test)}",
    sep="\n",
)

Train size: 22400
Dev size: 4800
Test size: 4800


In [228]:
class Vocabulary:
    """
    Word-to-index mapping built from the 'headline' column of a DataFrame.

    Every headline is split on whitespace and wrapped in '<sos>' / '<eos>'
    markers. Each distinct token gets a consecutive integer id, followed by the
    special tokens '<pad>' and '<unk>'. All ids lie in ``range(len(vocab))``, so
    ``len(vocab)`` can be used directly as the size of an embedding table.

    Note: construction adds a 'tokenized' column to the DataFrame passed in.

    Attributes
    ----------
    df : the DataFrame given to the constructor
    tokenized_df : Series of token lists, one per headline
    unique_words : Counter of token frequencies
    word2idx / idx2word : token -> id and id -> token dictionaries
    """

    def __init__(self, df: pd.DataFrame) -> None:
        self.df = df
        self.tokenized_df = self.tokenize()
        self.unique_words = self.load_unique_words()
        # Iterating a Counter yields its keys in first-seen order
        self.word2idx = {word:idx for idx, word in enumerate(self.unique_words)}
        # Append the special tokens with the next free ids. The id must be
        # len(word2idx) at the moment of insertion; adding 1 on top of that
        # would leave a gap and push '<unk>' past the end of the embedding.
        for special_token in ('<pad>', '<unk>'):
            self.word2idx.setdefault(special_token, len(self.word2idx))
        self.idx2word = {idx:word for word, idx in self.word2idx.items()}

    def load_unique_words(self) -> Counter:
        """Count how often each token occurs across all tokenized headlines."""
        word_list = [word for _, line in self.df['tokenized'].items() for word in line]
        return Counter(word_list)

    def tokenize(self) -> pd.Series:
        """Store whitespace tokens wrapped in <sos>/<eos> in df['tokenized'] and return that column."""
        self.df['tokenized'] = self.df['headline'].apply(lambda line: ["<sos>"] + line.strip().split() + ["<eos>"])
        return self.df['tokenized']

    def __len__(self) -> int:
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.word2idx)

# Build the vocabulary from the training split only
vocab = Vocabulary(train)

In [229]:
# Model I/O sizes: the input size defaults to the vocabulary size, the output is a single value
argument_parser.add_argument(
    "--input_dim",
    dest="INPUT_DIM",
    type=int,
    default=len(vocab.word2idx),
)
argument_parser.add_argument(
    "--output_dim",
    dest="OUTPUT_DIM",
    type=int,
    default=1,
)
# Re-parse so that `args` also carries the two new options
args = argument_parser.parse_known_args()[0]
print(args)

Namespace(EPOCHS=5, SEED=42, BATCH_SIZE=128, EMBED_DIM=100, HIDDEN_DIM=256, NUM_LAYERS=2, BIDIRECTIONAL=False, OPTIMIZER='AdamW', LOSS_FN='BCELoss', SCORE_FN='F1_Score', LEARNING_RATE=0.001, DROPOUT=0.2, INPUT_DIM=29430, OUTPUT_DIM=1)


In [230]:
class ClickbaitDataset(Dataset):
    """
    Torch dataset that serves (token ids, label) pairs for headlines.

    Headlines are tokenized on construction (whitespace split, wrapped in
    '<sos>' / '<eos>'); the result is stored in a 'tokenized' column of the
    DataFrame passed in. Tokens are mapped to ids through the given vocabulary.
    """

    def __init__(self, df: pd.DataFrame, vocab: Vocabulary) -> None:
        self.df = df
        self.vocab = vocab
        self.tokenized_df = self.tokenize()
        # Shortcuts to the vocabulary's lookup tables
        self.word2idx = self.vocab.word2idx
        self.idx2word = self.vocab.idx2word

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def tokenize(self) -> pd.DataFrame:
        """Write the token lists into df['tokenized'] and return that column."""
        def wrap_tokens(line):
            return ["<sos>", *line.strip().split(), "<eos>"]

        self.df['tokenized'] = self.df['headline'].apply(wrap_tokens)
        return self.df['tokenized']

    def __getitem__(self, idx):
        """Return (list of token ids, label) for the row at position `idx`."""
        token_ids = []
        for token in self.tokenized_df.iloc[idx]:
            if token in self.word2idx:
                token_ids.append(self.word2idx[token])
            else:
                # Token not present in the vocabulary
                token_ids.append(self.word2idx['<unk>'])
        row_label = self.df.iloc[idx]['label']
        return token_ids, row_label

# One dataset per split, all sharing the vocabulary built from the training data
train_ds, dev_ds, test_ds = (
    ClickbaitDataset(split_df, vocab) for split_df in (train, dev, test)
)

In [231]:
def custom_collate_fn(batch):
    """
    Turn a list of (token ids, label) samples into batch tensors.

    Shorter sequences are padded at the end with the vocabulary's '<pad>'
    index up to the longest sequence in the batch. Padding is done with
    ``batch_first = False``, i.e. the padded tensor is laid out as
    (max sequence length, batch size).

    Returns
    -------
    tuple
        (padded token-id tensor, label tensor, tensor of original sequence lengths)
    """
    token_id_lists, batch_labels = zip(*batch)

    # Original lengths stay on the CPU, whatever the training device is
    seq_lengths = torch.tensor([len(ids) for ids in token_id_lists], device = 'cpu')

    sequences = [torch.tensor(ids, device = device) for ids in token_id_lists]
    padded = pad_sequence(sequences,
                          batch_first = False,
                          padding_value = vocab.word2idx['<pad>'])

    return padded, torch.tensor(batch_labels, device = device), seq_lengths

# Only the training data is reshuffled every epoch. Evaluation metrics do not
# depend on sample order, so dev/test keep a fixed order for repeatable runs.
train_loader = DataLoader(train_ds, batch_size=args.BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
dev_loader = DataLoader(dev_ds, batch_size=args.BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_ds, batch_size=args.BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

In [232]:
class LSTM(nn.Module):
    """
    LSTM sequence classifier: embedding -> (multi-layer) LSTM -> linear head.

    Inputs are time-major (``batch_first = False``). The prediction is made
    from the final hidden state of the top LSTM layer; for a bidirectional LSTM
    the final forward and backward states are concatenated. The output of the
    linear head is returned as-is (no activation is applied).

    Parameters
    ----------
    input_dim : int
        Number of rows of the embedding table (vocabulary size).
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Size of the LSTM hidden state per direction.
    output_dim : int
        Number of output features of the linear head.
    n_layers : int
        Number of stacked LSTM layers.
    bidirectional : bool
        Whether the LSTM runs in both directions.
    dropout : float
        Dropout probability used on the embeddings, between LSTM layers and
        on the final hidden state.
    """

    def __init__(self, 
                 input_dim, 
                 embedding_dim, 
                 hidden_dim, 
                 output_dim, 
                 n_layers, 
                 bidirectional, 
                 dropout):
        super(LSTM, self).__init__()
        # Remembered so that forward() can select the right final hidden state(s)
        self.bidirectional = bool(bidirectional)
        # Initialize Embedding Layer
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # Initialize LSTM layer to process the vector sequences.
        # Inter-layer dropout only exists for stacked LSTMs; passing a non-zero
        # value with a single layer has no effect and triggers a warning.
        self.lstm = nn.LSTM(embedding_dim, 
                            hidden_dim,
                            num_layers = n_layers,
                            bidirectional = self.bidirectional,
                            dropout = dropout if n_layers > 1 else 0.0,
                            batch_first = False)
        num_directions = 2 if self.bidirectional else 1
        # Initialize Dense layer to predict
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        # Initialize dropout to improve with regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, 
                x, 
                x_lengths):
        """
        Compute the model output for a padded batch.

        Parameters
        ----------
        x : tensor of token ids, time-major (sequence length, batch size)
        x_lengths : unpadded length of every sequence in the batch

        Returns
        -------
        Tensor of shape (batch size, output_dim).
        """
        # Embedding Layer, followed by dropout before the LSTM
        embedded = self.dropout(self.embedding(x))
        # Pack so that the LSTM stops at each sequence's true length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, 
                                                            x_lengths, 
                                                            batch_first = False, 
                                                            enforce_sorted = False)
        # LSTM Layer; only the final hidden states are needed for classification
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.bidirectional:
            # Last layer: forward state is hidden[-2], backward state is hidden[-1]
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Unidirectional: hidden[-1] is the top layer. Concatenating it with
            # hidden[-2] would mix in the layer below and double the feature size.
            final_hidden = hidden[-1,:,:]
        # Dropout, then Fully Connected Layer
        return self.fc(self.dropout(final_hidden))

In [233]:
# Seed all RNGs before the weights are initialised, so that parameter
# initialisation, dropout and batch shuffling are repeatable for a given --seed.
make_reproducible(args.SEED)

model = LSTM(input_dim = args.INPUT_DIM, 
             embedding_dim = args.EMBED_DIM, 
             hidden_dim = args.HIDDEN_DIM, 
             output_dim = args.OUTPUT_DIM, 
             n_layers = args.NUM_LAYERS, 
             bidirectional = args.BIDIRECTIONAL, 
             dropout = args.DROPOUT).to(device)

print('LSTM Model: ', model)

LSTM Model:  LSTM(
  (embedding): Embedding(29430, 100)
  (lstm): LSTM(100, 256, num_layers=2, dropout=0.2)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [234]:
# Model Train Function
def train(loader, 
          model, 
          optimizer, 
          loss_fn):
    """
    Run one training pass over `loader` and return the mean batch loss.

    For every batch the gradients are reset, the loss is computed on the
    model output, back-propagated, and the optimizer takes one step.
    """
    model.train()
    batch_losses = []
    progress = tqdm(loader, desc = 'Training...', colour = 'red')
    for inputs, targets, input_lengths in progress:
        # Start every batch from clean gradients
        optimizer.zero_grad()

        # Forward pass; drop the trailing output dimension
        predictions = model(inputs, input_lengths).squeeze(1)

        batch_loss = loss_fn(predictions, targets.float())
        loss_value = batch_loss.item()
        progress.set_postfix({'Loss': loss_value})
        batch_losses.append(loss_value)

        # Back-propagate, then let the optimizer update the parameters
        batch_loss.backward()
        optimizer.step()
    return sum(batch_losses) / len(batch_losses)

# Model Evaluate Function
def evaluate(loader, 
             model, 
             loss_fn, 
             score_fn,
             threshold = 0.5):
    """
    Evaluate `model` on every batch of `loader`.

    The model output is treated as raw logits (this notebook pairs the model
    with BCEWithLogitsLoss): a sigmoid is applied and probabilities at or above
    `threshold` become class 1. The score is computed once, over the
    predictions of the whole loader, on CPU arrays.

    Parameters
    ----------
    loader : iterable yielding (inputs, labels, input lengths) batches
    model : the network to evaluate
    loss_fn : callable(logits, float labels) -> loss tensor
    score_fn : callable(y_true, y_pred) -> score, called with NumPy label arrays
    threshold : float
        Probability cut-off for predicting the positive class.

    Returns
    -------
    tuple
        (mean batch loss, score over all samples)

    Raises
    ------
    ValueError
        If `loader` yields no batches.
    """
    model.eval()
    losses = list()
    all_labels, all_predictions = list(), list()
    pbar = tqdm(loader, desc = 'Evaluation...', colour = 'green')
    # No gradients are needed for evaluation; skipping them saves memory and time
    with torch.no_grad():
        for x, y, x_lengths in pbar:

            # Calculate y_pred (logits)
            y_pred = model(x, x_lengths).squeeze(1)

            loss = loss_fn(y_pred, y.float())
            pbar.set_postfix({'Loss': loss.item()})
            losses.append(loss.item())

            # Hard 0/1 predictions, moved to the CPU for the metric
            predicted_labels = (torch.sigmoid(y_pred) >= threshold).long()
            all_labels.append(y.detach().cpu())
            all_predictions.append(predicted_labels.cpu())

    if not losses:
        raise ValueError("evaluate() received an empty loader")

    # Score the full set at once instead of keeping only the last batch's score
    score = score_fn(torch.cat(all_labels).numpy(),
                     torch.cat(all_predictions).numpy())

    return sum(losses) / len(losses), score

In [235]:
def epoch_time(start_time, end_time):
    """
    Split the duration between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds.

    Returns
    -------
    tuple
        (minutes, seconds), both truncated to integers
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [236]:
# Optimizer
if args.OPTIMIZER == 'AdamW':
    optimizer = torch.optim.AdamW(model.parameters(), 
                                  lr = args.LEARNING_RATE)
else:
    # Fail here with a clear message rather than with a NameError further down
    raise ValueError(f"Unsupported optimizer: {args.OPTIMIZER!r} (supported: 'AdamW')")

# Loss Function
# The 'BCELoss' option maps to BCEWithLogitsLoss: the model returns the raw
# output of its linear layer, and this loss applies the sigmoid internally.
if args.LOSS_FN == 'BCELoss':
    loss_fn = nn.BCEWithLogitsLoss().to(device)
else:
    raise ValueError(f"Unsupported loss function: {args.LOSS_FN!r} (supported: 'BCELoss')")

# Initialize Best Validation Loss
best_valid_loss = float('inf')
    
# Path to Save Best Model
PATH = 'lstm-best-model.pt'

# Score Function
if args.SCORE_FN == 'F1_Score':
    score_fn = f1_score
else:
    raise ValueError(f"Unsupported score function: {args.SCORE_FN!r} (supported: 'F1_Score')")

for epoch in range(args.EPOCHS):

    start_time = time.time()
    
    # Avg Train Loss
    train_loss = train(train_loader, 
                       model, 
                       optimizer, 
                       loss_fn)

    # Avg Val Loss, F1_Score (kept under the name `val_acc`)
    val_loss, val_acc = evaluate(dev_loader, 
                                 model, 
                                 loss_fn, 
                                 score_fn)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'\n\tEpoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tValidation Loss: {val_loss:.3f} | F1_Score: {val_acc*100:.2f}%\n')

    # Checkpoint whenever the validation loss improves
    if val_loss < best_valid_loss:
        best_valid_loss = val_loss
        torch.save(model.state_dict(), PATH)

Training...:   0%|[31m          [0m| 0/175 [00:00<?, ?it/s]


RuntimeError: philox_cuda_state for an unexpected CUDA generator used during capture. In regions captured by CUDA graphs, you may only use the default CUDA RNG generator on the device that's current when capture begins. If you need a non-default (user-supplied) generator, or a generator on another device, please file an issue.