# **Transfer Learning in Transformers**
**Sarcasm Prediction using News Headlines** 

In [26]:
import numpy as np
import pandas as pd
import os

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AdamW, get_scheduler, DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

import torch
import torch.nn as nn
from torch.utils.data import DataLoader


from datasets import load_metric
from tqdm.auto import tqdm

In [22]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## **Dataset**

In [3]:
dataset = "dataset/sarcasm_headlines/Sarcasm_Headlines_Dataset_v2.json"
df = pd.read_json(dataset, lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [6]:
dataset_hf = load_dataset('json', data_files=dataset)
dataset_hf

Using custom data configuration default-3d48aa61d6a02314
Found cached dataset json (/home/mnk/.cache/huggingface/datasets/json/default-3d48aa61d6a02314/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline', 'article_link'],
        num_rows: 28619
    })
})

In [7]:
dataset_hf=dataset_hf.remove_columns(['article_link'])
dataset_hf.set_format('pandas')
dataset_hf=dataset_hf['train'][:]

In [11]:
dataset_hf.rename(columns={'is_sarcastic':'label'},inplace=True)
dataset_hf.drop_duplicates(subset=['headline'],inplace=True)
dataset_hf=dataset_hf.reset_index()[['headline', 'label']]
dataset_hf=Dataset.from_pandas(dataset_hf)

train_testvalid = dataset_hf.train_test_split(test_size=0.2,seed=15)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

dataset_hf = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset_hf

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [12]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

## **Tokenize Data**

In [17]:
def tokenize(data):
  return tokenizer(data["headline"], truncation=True, max_length=512)

tokenized_dataset = dataset_hf.map(tokenize, batched=True)
print(tokenized_dataset)
tokenized_dataset['train'][0]

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})


{'headline': "dan harmon finally reveals reason behind 'rick and morty' delays",
 'label': 0,
 'input_ids': [101,
  4907,
  25546,
  2633,
  7657,
  3114,
  2369,
  1005,
  6174,
  1998,
  22294,
  2100,
  1005,
  14350,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
tokenized_dataset.set_format('torch', columns=["input_ids", "attention_mask", "label"] )
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

## **Transfer Learning**

In [44]:
class TransformerModel(nn.Module):
    def __init__(self, checkpoint, num_labels):
        super(TransformerModel, self).__init__()
        self.num_labels = num_labels
        self.model = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(
            checkpoint, output_attention=True, output_hidden_states=True))
    
        self.droutout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.model.config.hidden_size, self.num_labels)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs[0]
        sequence_output = self.droutout(last_hidden_state)
        logits = self.classifier(sequence_output[:, 0, :].view(-1, self.model.config.hidden_size))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss = loss, logits = logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
        

In [45]:
model_task_specific = TransformerModel(checkpoint=checkpoint, num_labels=2 ).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **PyTorch DataLoader**

In [41]:
train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle = True, batch_size = 32, collate_fn = data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['valid'], shuffle = True, collate_fn = data_collator
)

## **Training**

In [42]:
optimizer = AdamW(model_task_specific.parameters(), lr = 5e-5 )
num_epoch = 3
num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

metric = load_metric("f1")




In [46]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader) ))

for epoch in range(num_epoch):
    model_task_specific.train()
    for batch in train_dataloader:
        batch = { k: v.to(device) for k, v in batch.items() }
        outputs = model_task_specific(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)
        
    model_task_specific.eval()
    for batch in eval_dataloader:
        batch = { k: v.to(device) for k, v in batch.items() }
        with torch.no_grad():
            outputs = model_task_specific(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim = -1 )
        metric.add_batch(predictions = predictions, references = batch['labels'] )
        progress_bar_eval.update(1)
        
    print(metric.compute()) 
       

  0%|          | 0/2139 [00:00<?, ?it/s]

  0%|          | 0/8550 [00:00<?, ?it/s]

{'f1': 0.6350574712643678}
{'f1': 0.6350574712643678}
{'f1': 0.6350574712643678}


## **Evaluation**

In [48]:
model_task_specific.eval()

test_dataloader = DataLoader(
    tokenized_dataset['test'], batch_size = 32, collate_fn = data_collator
)

for batch in test_dataloader:
    batch = { k: v.to(device) for k, v in batch.items() }
    with torch.no_grad():
        outputs = model_task_specific(**batch)
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim = -1)
    metric.add_batch(predictions = predictions, references=batch['labels'] )
    
metric.compute()  
    

{'f1': 0.6433499881037354}