# **Transfer Learning in Transformers**
## **Sarcasm Prediction using News Headlines** 

In [40]:
import numpy as np
import pandas as pd
import os

from datasets import load_dataset, Dataset, DatasetDict, load_metric
from transformers import EarlyStoppingCallback, AdamW, get_scheduler, DataCollatorWithPadding, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from tqdm.auto import tqdm

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## **Dataset**

In [6]:
dataset = "data/sarcasm_headlines/Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(dataset, lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [8]:
dataset_hf = load_dataset('json', data_files=dataset)
dataset_hf

Using custom data configuration default-e4c5588aeed38262


Downloading and preparing dataset json/default to /home/mnk/.cache/huggingface/datasets/json/default-e4c5588aeed38262/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/mnk/.cache/huggingface/datasets/json/default-e4c5588aeed38262/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline', 'article_link'],
        num_rows: 28619
    })
})

In [9]:
dataset_hf=dataset_hf.remove_columns(['article_link'])
dataset_hf.set_format('pandas')
dataset_hf=dataset_hf['train'][:]

In [10]:
dataset_hf.rename(columns={'is_sarcastic':'label'},inplace=True)
dataset_hf.drop_duplicates(subset=['headline'],inplace=True)
dataset_hf=dataset_hf.reset_index()[['headline', 'label']]
dataset_hf=Dataset.from_pandas(dataset_hf)

train_testvalid = dataset_hf.train_test_split(test_size=0.2,seed=15)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

dataset_hf = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset_hf

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [11]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512


## **Tokenize Data**

In [12]:
def tokenize(data):
  return tokenizer(data["headline"], truncation=True, max_length=512)

tokenized_dataset = dataset_hf.map(tokenize, batched=True)
print(tokenized_dataset)
tokenized_dataset['train'][0]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})


{'headline': "dan harmon finally reveals reason behind 'rick and morty' delays",
 'label': 0,
 'input_ids': [101,
  4907,
  25546,
  2633,
  7657,
  3114,
  2369,
  1005,
  6174,
  1998,
  22294,
  2100,
  1005,
  14350,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [59]:
tokenized_dataset.set_format('torch', columns=["input_ids", "attention_mask", "label"] )
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # A DataCollator that will dynamically pad the inputs received.
data_collator

print(tokenized_dataset['train'][:3])
data_collator(tokenized_dataset['train'][:3])

{'label': tensor([0, 0, 1]), 'input_ids': [tensor([  101,  4907, 25546,  2633,  7657,  3114,  2369,  1005,  6174,  1998,
        22294,  2100,  1005, 14350,   102]), tensor([  101,  2654,  7800,  1997, 17077, 13318,  2111,   102]), tensor([  101, 13025,  1011,  3573,  7309,  2064,  1005,  1056,  4562,  2000,
         4521,  2833,  4902,   102])], 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]}


{'input_ids': tensor([[  101,  4907, 25546,  2633,  7657,  3114,  2369,  1005,  6174,  1998,
         22294,  2100,  1005, 14350,   102],
        [  101,  2654,  7800,  1997, 17077, 13318,  2111,   102,     0,     0,
             0,     0,     0,     0,     0],
        [  101, 13025,  1011,  3573,  7309,  2064,  1005,  1056,  4562,  2000,
          4521,  2833,  4902,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'labels': tensor([0, 0, 1])}

## **Transfer Learning**

In [27]:
class TaskSpecificTransformerHead(nn.Module): 
    def __init__(self, checkpoint:str, num_labels:int):
        super(TaskSpecificTransformerHead, self).__init__()
        self.num_labels = num_labels
        self.model = AutoModel.from_pretrained(checkpoint, config =  AutoConfig.from_pretrained(checkpoint, output_attention = True, output_hidden_states = True)) 

        self.classifier = nn.Sequential(nn.Linear(768, 256)
                                        ,nn.ReLU()
                                        ,nn.Dropout(0.1)
                                        ,nn.Linear(256, self.num_labels)
                                        ,nn.Softmax(dim=1))

    def forward(self, input_idx=None, attention_mask=None, labels=None):
        transformer_output = self.model(input_idx, attention_mask=attention_mask)
        logits = self.classifier(transformer_output[0][:, 0, :].view(-1, 768))
        loss = None

        if labels is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))
            return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=transformer_output.hidden_states, attentions=transformer_output.attentions)

In [28]:
task_specific_model = TaskSpecificTransformerHead(checkpoint, num_labels=2).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **PyTorch DataLoader**

In [65]:
train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = 32, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['valid'], shuffle = True, collate_fn = data_collator
)

In [67]:
train_dataloader.dataset[1]

{'label': tensor(0),
 'input_ids': tensor([  101,  2654,  7800,  1997, 17077, 13318,  2111,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1])}

## **Training**

In [51]:
optimizer = AdamW(task_specific_model.parameters(), lr = 1e-4 )
num_epoch = 10
num_training_steps = num_epoch * len(train_dataloader) # no_of_epochs * no_of_batches

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

metric = load_metric("f1")



In [52]:
'''
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir = './results',          # output directory
    num_train_epochs = num_epoch,              # total number of training epochs
    per_device_train_batch_size = 32,  # batch size per device during training
    per_device_eval_batch_size = 32,   # batch size for evaluation
    warmup_steps = 500,                # number of warmup steps for learning rate scheduler
    weight_decay = 0.01,               # strength of weight decay
    logging_dir = './logs',            # directory for storing logs
    logging_steps = 10,
    evaluation_strategy = "steps",
    eval_steps = 100,
    save_steps = 100,
    load_best_model_at_end = True,
    metric_for_best_model = "f1",
    greater_is_better = True,
    fp16 = True,
    dataloader_num_workers = 4,
    run_name = "sarcasm_headlines"
)

trainer = Trainer(
    model = task_specific_model,                         # the instantiated 🤗 Transformers model to be trained
    args = training_args,                  # training arguments, defined above
    train_dataset = train_dataloader.dataset,         # training dataset
    eval_dataset = eval_dataloader.dataset,             # evaluation dataset
    compute_metrics = compute_metrics,         # define metrics function
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)
'''


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [79]:
for epoch in tqdm(range(num_epoch)):
    task_specific_model.train()
    for batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        output = task_specific_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
    task_specific_model.eval()
    for batch in tqdm(eval_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        output = task_specific_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = output.loss
        logits = output.logits
        metric.add_batch(predictions=torch.argmax(logits, dim=-1), references=labels)
    print(metric.compute())


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/2850 [00:00<?, ?it/s]

{'f1': 0.8982591876208897}


  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/2850 [00:00<?, ?it/s]

{'f1': 0.9011827546737886}


  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/2850 [00:00<?, ?it/s]

{'f1': 0.9081404032860343}


  0%|          | 0/713 [00:00<?, ?it/s]

  0%|          | 0/2850 [00:00<?, ?it/s]

{'f1': 0.9003516998827666}


  0%|          | 0/713 [00:00<?, ?it/s]

KeyboardInterrupt: 

## **Evaluation**

In [78]:
task_specific_model.eval()

test_dataloader = DataLoader(
    tokenized_dataset['test'], batch_size = 32, collate_fn = data_collator
)

for batch in test_dataloader:
    batch = { k: v.to(device) for k, v in batch.items() }
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = task_specific_model(input_ids, attention_mask=attention_mask, labels=labels)
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    metric.add_batch(predictions = predictions, references=batch['labels'] )
    
metric.compute()  
    

{'f1': 0.9029931482149297}