In [None]:
import os
import copy
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm

import torch
import torch.nn as nn


from sklearn.model_selection import train_test_split, KFold

# importing HuggingFace transformers library which is all we need
import transformers
from transformers import get_linear_schedule_with_warmup

# Print the installed transformers version.
print(transformers.__version__)

  from tqdm.autonotebook import tqdm


4.35.2


In [None]:
class TweetDataset(torch.utils.data.Dataset):
    """
    Map-style dataset over a dataframe of texts.

    dataframe: must provide a 'text' column and, unless mode is "test",
        a 'target' column.
    tokenizer: callable invoked once, up front, on the full list of texts as
        tokenizer(texts, padding=True, truncation=True, max_length=max_length);
        its result must expose .items() yielding (key, per-sample values).
    mode: any value other than "test" makes each item carry a 'labels' entry.
    max_length: forwarded unchanged to the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, mode="train", max_length=None):
        self.dataframe = dataframe
        self.mode = mode

        # targets only exist for labelled splits
        if mode != "test":
            self.targets = dataframe['target'].values

        all_texts = list(dataframe['text'].values)
        self.encodings = tokenizer(
            all_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
        )

    def __getitem__(self, idx):
        # one tensor per tokenizer output key, taken from row `idx`
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])

        if self.mode == "test":
            return sample

        sample['labels'] = torch.tensor(self.targets[idx])
        return sample

    def __len__(self):
        # number of rows in the wrapped dataframe
        return len(self.dataframe)

In [None]:
def make_loaders(dataframe, tokenizer, mode="train", max_length=None,
                 batch_size=None, num_workers=None):
    """
    Build a DataLoader over a TweetDataset created from `dataframe`.

    dataframe, tokenizer, mode, max_length: forwarded to TweetDataset.
    batch_size, num_workers: DataLoader settings. When left as None they are
        read from the module-level `options` object, which is what this
        function always did; passing them explicitly removes the dependency
        on a global that is only defined in a later cell.

    Batches are shuffled only when mode == "train".
    """
    if batch_size is None:
        batch_size = options.batch_size
    if num_workers is None:
        num_workers = options.num_workers

    dataset = TweetDataset(dataframe, tokenizer, mode, max_length=max_length)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=(mode == "train"),
                                             num_workers=num_workers)
    return dataloader

In [None]:
class CustomModel(nn.Module):
    """
    Wraps an encoder (`bert_model`) with a small classification head.

    The head is Linear -> ReLU -> (Dropout | Identity) -> Linear, mapping
    `bert_hidden_dim` features to `num_labels` logits. When `dropout` is None
    the third slot is an nn.Identity, so the layer indices inside `head`
    are the same with or without dropout.
    """

    def __init__(self,
                 bert_model,
                 num_labels,
                 bert_hidden_dim=768,
                 classifier_hidden_dim=768,
                 dropout=None):

        super().__init__()
        self.bert_model = bert_model

        projection = nn.Linear(bert_hidden_dim, classifier_hidden_dim)
        # placeholder layer keeps the Sequential layout fixed when dropout is off
        if dropout is None:
            regulariser = nn.Identity()
        else:
            regulariser = nn.Dropout(dropout)
        classifier = nn.Linear(classifier_hidden_dim, num_labels)

        self.head = nn.Sequential(projection, nn.ReLU(), regulariser, classifier)

    def forward(self, batch):
        """
        batch: mapping with 'input_ids' and 'attention_mask' entries, both
            passed as keyword arguments to the wrapped encoder.
        Returns the head's output for the first position of the encoder's
        `last_hidden_state`.
        """
        encoder_out = self.bert_model(input_ids=batch['input_ids'],
                                      attention_mask=batch['attention_mask'])
        # (batch_size, seq_length, bert_hidden_dim) per the original author's note
        hidden_states = encoder_out.last_hidden_state
        # position 0 along the sequence axis is used as the CLS representation
        first_token_state = hidden_states[:, 0, :]
        return self.head(first_token_state)

In [None]:
class AvgMeter:
    """
    Running weighted average of a scalar metric.

    Attributes `sum`, `count` and `avg` are reset to 0 on construction and by
    reset(); update() folds in a new value weighted by `count`.
    """

    def __init__(self, name="Metric"):
        self.name = name
        self.reset()

    def reset(self):
        # start from a clean slate
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, count=1):
        # `val` is treated as a mean over `count` samples
        self.count += count
        self.sum += val * count
        self.avg = self.sum / self.count

    def __repr__(self):
        return "{}: {:.4f}".format(self.name, self.avg)

def one_epoch(model, criterion, loader, device, optimizer=None, lr_scheduler=None, mode="train", step="batch"):
    """
    Run one full pass over `loader` and return (loss_meter, acc_meter).

    model: called as model(batch) where batch is the dict produced by the
        loader with every value moved to `device`.
    criterion: called as criterion(preds, batch['labels']).
    optimizer: required when mode == "train"; unused otherwise.
    lr_scheduler: optional. It is stepped after every optimizer step only when
        mode == "train", step == "batch" and a scheduler was actually given.
    mode: "train" performs backward/optimizer steps; any other value only
        evaluates. Gradient tracking is not disabled here; callers wrap
        evaluation in torch.no_grad() themselves.
    step: "batch" to step the scheduler per batch; any other value leaves the
        scheduler untouched in this function.

    Both meters are weighted by the number of samples in each batch.
    """
    loss_meter = AvgMeter()
    acc_meter = AvgMeter()
    is_training = mode == "train"

    tqdm_object = tqdm(loader, total=len(loader))
    for batch in tqdm_object:
        batch = {k: v.to(device) for k, v in batch.items()}
        preds = model(batch)
        loss = criterion(preds, batch['labels'])
        if is_training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # The scheduler defaults to None; previously this branch called
            # None.step() whenever no scheduler was supplied.
            if step == "batch" and lr_scheduler is not None:
                lr_scheduler.step()

        count = batch['input_ids'].size(0)
        loss_meter.update(loss.item(), count)

        accuracy = get_accuracy(preds.detach(), batch['labels'])
        acc_meter.update(accuracy.item(), count)
        if is_training:
            tqdm_object.set_postfix(loss=loss_meter.avg, accuracy=acc_meter.avg, lr=get_lr(optimizer))
        else:
            tqdm_object.set_postfix(loss=loss_meter.avg, accuracy=acc_meter.avg)

    return loss_meter, acc_meter

def get_lr(optimizer):
    """
    Return the "lr" entry of the optimizer's first parameter group,
    or None when `optimizer.param_groups` is empty.
    """
    groups = iter(optimizer.param_groups)
    try:
        first_group = next(groups)
    except StopIteration:
        return None
    return first_group["lr"]

def get_accuracy(preds, targets):
    """
    Fraction of rows whose highest-scoring column equals the target.

    preds shape: (batch_size, num_labels)
    targets shape: (batch_size)
    Returns a 0-dim float tensor.
    """
    predicted_labels = preds.argmax(dim=1)
    hits = predicted_labels.eq(targets)
    return hits.float().mean()

In [None]:
def train_eval(epochs, model, train_loader, valid_loader,
               criterion, optimizer, device, options, lr_scheduler=None):
    """
    Train `model` for `epochs` epochs, validating after each one.

    Whenever the validation loss improves, the weights are kept in memory and
    also written to <options.model_path>/<options.model_save_name>.

    options: object providing `model_save_name`; `model_path` and `step` are
        optional and default to the current directory and "batch".
    lr_scheduler: optional. A ReduceLROnPlateau instance is stepped once per
        epoch on the validation loss; if that step changes the learning rate,
        the best weights seen so far are loaded back into the model. Any other
        scheduler is handed to one_epoch, which steps it per batch when
        options.step == "batch".

    Returns None; progress is reported via print.
    """
    best_loss = float('inf')
    best_model_weights = copy.deepcopy(model.state_dict())

    # `model_path` is not guaranteed to exist on the options object, so fall
    # back to the working directory instead of failing on the first save.
    model_dir = getattr(options, "model_path", ".")
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_path = os.path.join(model_dir, options.model_save_name)

    # Without a scheduler there is nothing to step per batch; "epoch" makes
    # one_epoch leave the (missing) scheduler alone.
    step = getattr(options, "step", "batch") if lr_scheduler is not None else "epoch"

    for epoch in range(epochs):
        print("*" * 30)
        print(f"Epoch {epoch + 1}")
        current_lr = get_lr(optimizer)

        model.train()
        train_loss, train_acc = one_epoch(model,
                                          criterion,
                                          train_loader,
                                          device,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          mode="train",
                                          step=step)
        model.eval()
        with torch.no_grad():
            valid_loss, valid_acc = one_epoch(model,
                                              criterion,
                                              valid_loader,
                                              device,
                                              optimizer=None,
                                              lr_scheduler=None,
                                              mode="valid")

        if valid_loss.avg < best_loss:
            best_loss = valid_loss.avg
            best_model_weights = copy.deepcopy(model.state_dict())
            torch.save(best_model_weights, checkpoint_path)
            print("Saved best model!")

        # or you could do: if step == "epoch":
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(valid_loss.avg)
            # if the learning rate changes by ReduceLROnPlateau, we are going to
            # reload our previous best model weights and start from there with a lower LR
            if current_lr != get_lr(optimizer):
                print("Loading best model weights!")
                # Restore from the in-memory copy: it always exists, whereas the
                # file on disk may be missing or left over from an earlier run.
                model.load_state_dict(best_model_weights)

        print(f"Train Loss: {train_loss.avg:.5f}")
        print(f"Train Accuracy: {train_acc.avg:.5f}")

        print(f"Valid Loss: {valid_loss.avg:.5f}")
        print(f"Valid Accuracy: {valid_acc.avg:.5f}")
        print("*" * 30)

In [None]:
class Options:
    """
    Run configuration shared by the data loaders, model and training loop.
    """
    model_name = 'distilbert-base-uncased'
    batch_size = 64
    num_labels = 2
    epochs = 10
    num_workers = 2
    learning_rate = 3e-5
    # one of "ReduceLROnPlateau" or "LinearWarmup" (the names one_fold checks for)
    scheduler = "ReduceLROnPlateau"
    patience = 2
    dropout = 0.5
    # Directory checkpoints are written to. train_eval reads this attribute,
    # so it must be defined; "." keeps the notebook portable
    # (e.g. set to "/kaggle/working" on Kaggle).
    model_path = "."
    max_length = 140
    model_save_name = "model.pt"
    n_folds = 5
    # When the scheduler is stepped ("epoch" or "batch"); one_fold overwrites
    # this to match the chosen scheduler.
    step = "epoch"

In [None]:
def make_folds(dataframe, n_splits=5):
    """
    Add an integer 'fold' column (0 .. n_splits-1) to `dataframe` and return it.

    Rows are assigned with a shuffled KFold (random_state=42); each row's fold
    is the split in which it belongs to the validation part. The dataframe is
    modified in place and also returned.

    Only the number of rows is used to build the splits, so no particular
    column (such as 'id') is required, and the fold ids are written by
    position, so the result is correct for any index, not just 0..n-1.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    n_rows = len(dataframe)
    fold_ids = np.zeros(n_rows, dtype=int)
    # KFold yields positional indices; record them in a positional array
    for i, (_, valid_idx) in enumerate(kf.split(np.arange(n_rows))):
        fold_ids[valid_idx] = i
    dataframe['fold'] = fold_ids
    return dataframe

In [None]:
def one_fold(fold, options, data_path="/content/drive/MyDrive/fakenews_data.csv"):
    """
    Train and validate one cross-validation fold.

    fold: index of the fold held out for validation.
    options: configuration object; this function sets `options.step` and
        `options.model_save_name` (to "model_fold_<fold>.pt") before training.
    data_path: CSV file to read. It must contain 'text' and 'target' columns.

    Raises:
        ValueError: `options.scheduler` is not "ReduceLROnPlateau" or
            "LinearWarmup".
        KeyError: the CSV lacks a required column.
    """
    print(f"Training Fold: {fold}")

    # Validate cheap things first so a bad config or dataset fails before the
    # pretrained weights are downloaded.
    supported_schedulers = ("ReduceLROnPlateau", "LinearWarmup")
    if options.scheduler not in supported_schedulers:
        raise ValueError(f"Unknown scheduler {options.scheduler!r}; "
                         f"expected one of {supported_schedulers}")

    dataframe = pd.read_csv(data_path)
    missing = [column for column in ("text", "target") if column not in dataframe.columns]
    if missing:
        raise KeyError(f"{data_path} is missing required column(s) {missing}; "
                       f"found {list(dataframe.columns)}")

    # Here, we load the pre-trained DistilBERT model from transformers library
    bert_model = transformers.DistilBertModel.from_pretrained(options.model_name)
    # Loading the corresponding tokenizer from HuggingFace by using AutoTokenizer class.
    tokenizer = transformers.AutoTokenizer.from_pretrained(options.model_name, use_fast=True)

    dataframe = make_folds(dataframe, n_splits=options.n_folds)
    train_dataframe = dataframe[dataframe['fold'] != fold].reset_index(drop=True)
    valid_dataframe = dataframe[dataframe['fold'] == fold].reset_index(drop=True)

    train_loader = make_loaders(train_dataframe, tokenizer, "train", options.max_length)
    valid_loader = make_loaders(valid_dataframe, tokenizer, "valid", options.max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CustomModel(bert_model, options.num_labels, dropout=options.dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=options.learning_rate)
    if options.scheduler == "ReduceLROnPlateau":
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                                  mode="min",
                                                                  factor=0.5,
                                                                  patience=options.patience)

        # when to step the scheduler: after an epoch or after a batch
        options.step = "epoch"

    else:  # "LinearWarmup" (validated above)
        num_train_steps = len(train_loader) * options.epochs
        lr_scheduler = get_linear_schedule_with_warmup(optimizer,
                                                       num_warmup_steps=0,
                                                       num_training_steps=num_train_steps)

        # when to step the scheduler: after an epoch or after a batch
        options.step = "batch"

    criterion = nn.CrossEntropyLoss()
    options.model_save_name = f"model_fold_{fold}.pt"
    train_eval(options.epochs, model, train_loader, valid_loader,
               criterion, optimizer, device, options, lr_scheduler=lr_scheduler)

In [None]:
def train_folds(options):
    """
    Run one_fold for every fold index from 0 to options.n_folds - 1, in order.
    """
    for fold_index in range(options.n_folds):
        one_fold(fold=fold_index, options=options)

In [None]:
# The name `options` matters: make_loaders reads the module-level `options`
# for batch_size and num_workers, so this must be a global with that name.
options = Options()
train_folds(options)

Training Fold: 0


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

KeyError: ignored