# **DistilBERT for Sarcasm detection**

**Author** : Gautham Gururajan \\
**Objective** : This project is a simple implementation of a language model making use of Hugging Face's compact, lightweight BERT - DistilBERT. \\
**Description** : The goal is to perform sarcasm detection (Classification) on a Reddit Corpus (SARC).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
!pip install transformers
from transformers import get_linear_schedule_with_warmup
from tokenizers.processors import BertProcessing
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import csv
import random
import re
import os
import json
from collections import defaultdict
from collections import Counter
import unicodedata
import codecs
from io import open
import itertools
import math
import pickle
import statistics
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import tqdm
import nltk
nltk.download('punkt')
from google.colab import files

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data source - https://nlp.cs.princeton.edu/SARC/2.0/

In [11]:
# Hosted the same data as above in the followind link - follow the github readme instructions to do the pull on your own
!pip install gdown
!gdown https://drive.google.com/uc?id=1_2YmBH3EnhX5bKl7B_y4vIiNR1ZpG_Ye

# Unzip the downloaded data
!unzip files.zip
responses = np.load('files/responses.npy')
responses = responses[:30000] # To make the model lighter
train_labels = np.load('files/train_labels.npy')
train_labels = train_labels[:30000]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=1_2YmBH3EnhX5bKl7B_y4vIiNR1ZpG_Ye
To: /content/files.zip
100% 21.1M/21.1M [00:00<00:00, 117MB/s] 


In [15]:
responses

array([['religion must have the answer',
        "it's obviously tracks from a giant water tractor, farming for giant arctic sea prawn!"],
       ['wow...he smoked pot...oh lord hes such a horrible person now..',
        "wow, his girlfriend is uhm... ah fuck it, he's an olympic champion, who am i to pass judgement..."],
       ['i think the government should track every mormon in the country for subversive activity.',
        'another idea from the party that wants to get government off our backs.'],
       ...,
       ['holy shit its fake', 'now that kid can buy a pc'],
       ['emma watson could do anything and i would still be in love with her',
        "because millions of dollars isn't enough."],
       ['no one who posts #refugeeswelcome could ever be an out of touch elite screwing the people!',
        'what a username, how specific']], dtype='<U3776')

In [14]:
train_labels

array([['1', '0'],
       ['1', '0'],
       ['0', '1'],
       ...,
       ['0', '1'],
       ['0', '1'],
       ['1', '0']], dtype='<U1')

Data Processing :    
* The data is in the following format :
  * Responses - list of lists - [[comment1, reply1], [comment2, reply2], ..]
  * Train labels - list of lists - [[isCommentSarcastic?, isReplySarcastic?], ..] \\
* We make use only of the label of the comment for this problem.


In [16]:
# Perform a simple concatenation of comment and reply to add more context - better performance
comment_label = np.array([a[0] for a in train_labels])

# To concat comment and reply
responses_concat = np.array([a[0] + '. '+ a[1] for a in responses])
# Uncomment the below and comment the above to use only comment
# responses_concat = np.array([a[0] for a in responses])

In [17]:
# To make sure we exclude sentences that are too long - DistilBert trains only on sentences with len<512
def make_words(s):
  w = s.split(' ')
  return ' '.join(w[:511])

In [18]:
dict_df = {'text' : responses_concat, 'target' : comment_label}
full_df = pd.DataFrame(dict_df)
full_df['text_len'] = full_df['text'].apply(lambda x: len(make_words(x)))
full_df['text'] = full_df['text'].apply(lambda x: make_words(x))
full_df = full_df[full_df['text_len']<510]
full_df = full_df[['text', 'target']]
full_df['target'] = full_df['target'].astype(np.int64)
full_df['text'] = full_df['text'].astype(str)

In [19]:
full_df.head()

Unnamed: 0,text,target
0,religion must have the answer. it's obviously ...,1
1,wow...he smoked pot...oh lord hes such a horri...,1
2,i think the government should track every morm...,0
3,"oh right, *both* wars were just jewish conspir...",1
4,good luck with that.. time to get that shack i...,1


Preprocess - make train, test valid data

In [20]:
import pandas as pd
import numpy as np
import sys
from functools import partial
import time

In [21]:
#divide data into train, validation, and test datasets
num_com = len(full_df)
idxs = list(range(num_com))
print('Total comments in dataset: ', num_com)
test_idx = idxs[:int(0.1*num_com)]
val_idx = idxs[int(0.1*num_com):int(0.2*num_com)]
train_idx = idxs[int(0.2*num_com):]

train_df = full_df.iloc[train_idx].reset_index(drop=True)
val_df = full_df.iloc[val_idx].reset_index(drop=True)
test_df = full_df.iloc[test_idx].reset_index(drop=True)

train_data = train_df[['text', 'target']]
val_data   = val_df[['text', 'target']]
test_data  = test_df[['text', 'target']]

Total comments in dataset:  29976


In [22]:
#Defining torch dataset class for sarcasm reddit dataset
class SARCDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.df.iloc[idx]

In [23]:
#set up train, validation, and testing datasets
train_dataset = SARCDataset(train_data)
val_dataset   = SARCDataset(val_data)
test_dataset  = SARCDataset(test_data)

Create some utility functions

In [24]:
def transformer_collate_fn(batch, tokenizer):
  bert_vocab = tokenizer.get_vocab()
  bert_pad_token = bert_vocab['[PAD]']
  bert_unk_token = bert_vocab['[UNK]']
  bert_cls_token = bert_vocab['[CLS]']
  
  sentences, labels, masks = [], [], []
  for data in batch:
    tokenizer_output = tokenizer([data['text']], max_length = 600, truncation=True)
    tokenized_sent = tokenizer_output['input_ids'][0]
    mask = tokenizer_output['attention_mask'][0]
    sentences.append(torch.tensor(tokenized_sent))
    labels.append(torch.tensor(data['target']))
    masks.append(torch.tensor(mask))
  sentences = pad_sequence(sentences, batch_first=True, padding_value=bert_pad_token)
  labels = torch.stack(labels, dim=0)
  masks = pad_sequence(masks, batch_first=True, padding_value=0.0)
  return sentences, labels, masks

In [25]:
#computes the amount of time that a training epoch took and displays it in human readable form
def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
#count the number of trainable parameters in the model
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [27]:
#train a given model, using a pytorch dataloader, optimizer, and scheduler (if provided)
def train(model,
          dataloader,
          optimizer,
          device,
          clip: float,
          scheduler = None):

    model.train()

    epoch_loss = 0

    for batch in dataloader:
        sentences, labels, masks = batch[0], batch[1], batch[2]

        optimizer.zero_grad()

        output = model(sentences.to(device), masks.to(device))
        loss = F.cross_entropy(output, labels.to(device))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        if scheduler is not None:
          scheduler.step()
          
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [28]:
#calculate the loss from the model on the provided dataloader
def evaluate(model,
             dataloader,
             device):

    model.eval()

    epoch_loss = 0
    with torch.no_grad():
      for batch in dataloader:
          sentences, labels, masks = batch[0], batch[1], batch[2]
          output = model(sentences.to(device), masks.to(device))
          loss = F.cross_entropy(output, labels.to(device))
            
          epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [29]:
#calculate the prediction accuracy on the provided dataloader
def evaluate_acc(model,
                 dataloader,
                 device):

    model.eval()

    epoch_loss = 0
    with torch.no_grad():
      total_correct = 0
      total = 0
      for i, batch in enumerate(dataloader):
          
          sentences, labels, masks = batch[0], batch[1], batch[2]
          output = model(sentences.to(device), masks.to(device))
          output = F.softmax(output, dim=1)
          output_class = torch.argmax(output, dim=1)
          total_correct += torch.sum(torch.where(output_class == labels.to(device), 1, 0))
          total += sentences.size()[0]

    return total_correct / total

In [30]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 13.5 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.11.0


In [31]:
from torchmetrics.classification import BinaryF1Score
f1_score = BinaryF1Score().to(device)

In [32]:
#calculate the prediction accuracy on the provided dataloader
def evaluate_f1(model,
                 dataloader,
                 device):

    model.eval()

    epoch_loss = 0
    with torch.no_grad():
      total_correct = 0
      total = 0
      preds = []
      labs = []
      for i, batch in enumerate(dataloader):
          
          sentences, labels, masks = batch[0], batch[1], batch[2]
          output = model(sentences.to(device), masks.to(device))
          output = F.softmax(output, dim=1)
          output_class = torch.argmax(output, dim=1)
          
          preds.append(output_class)
          labs.append(labels)
      
      preds = torch.cat(preds).to(device)
      labs = torch.cat(labs).to(device)
    return f1_score(preds, labs)

In [33]:
bert_model_name = 'distilbert-base-uncased' 

from transformers import DistilBertModel, DistilBertTokenizer
bert_model = DistilBertModel.from_pretrained(bert_model_name)
tokenizer = DistilBertTokenizer.from_pretrained(bert_model_name)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [34]:
# Classification module, add a dropout and a linear layer after the BERT

class SARCClassifier(nn.Module):
    def __init__(self,
                 bert_encoder: nn.Module,
                 enc_hid_dim=768, #default embedding size
                 outputs=2,
                 dropout=0.1):
        super().__init__()

        self.bert_encoder = bert_encoder
        self.enc_hid_dim = enc_hid_dim
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(enc_hid_dim, outputs)

    def forward(self,
                src,
                mask):
        bert_output = self.bert_encoder(src, mask)
        output = self.dropout(bert_output[0])
        output = self.fc(torch.max(output, dim = 1)[0])
        return output

In [35]:
# Initialize weights of the NN

def init_weights(m: nn.Module, hidden_size=768):
    k = 1/hidden_size
    for name, param in m.named_parameters():
        if 'weight' in name:
            print(name)
            nn.init.uniform_(param.data, a=-1*k**0.5, b=k**0.5)
        else:
            print(name)
            nn.init.uniform_(param.data, 0)

In [36]:
def init_classification_head_weights(m: nn.Module, hidden_size=768):
    k = 1/hidden_size
    for name, param in m.named_parameters():
        #if param.shape == torch.Size([2, 768]) or param.shape == torch.Size([2]):
        if name == "fc.weight":
          print(name)
          nn.init.uniform_(param.data, a=-1*k**0.5, b=k**0.5)
        elif name == "fc.bias":
          print(name)
          nn.init.uniform_(param.data, 0)

In [37]:
#define hyperparameters
BATCH_SIZE = 10
LR = 1e-5
WEIGHT_DECAY = 0
N_EPOCHS = 1 # Just for a demonstration
CLIP = 1.0

#define models, move to device, and initialize weights
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SARCClassifier(bert_model).to(device)
model.apply(init_classification_head_weights)
model.to(device)
print('Model Initialized')

fc.weight
fc.bias
Model Initialized


In [38]:
#create pytorch dataloaders from train_dataset, val_dataset, and test_datset
train_dataloader = DataLoader(train_dataset,batch_size=BATCH_SIZE,collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer), shuffle = True)
val_dataloader = DataLoader(val_dataset,batch_size=BATCH_SIZE,collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer))
test_dataloader = DataLoader(test_dataset,batch_size=BATCH_SIZE,collate_fn=partial(transformer_collate_fn, tokenizer=tokenizer))

In [40]:
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=N_EPOCHS*len(train_dataloader))

print(f'The model has {count_parameters(model):,} trainable parameters')

train_loss = evaluate(model, train_dataloader, device)
train_acc = evaluate_acc(model, train_dataloader, device)
train_f1 = evaluate_f1(model, train_dataloader, device)

valid_loss = evaluate(model, val_dataloader, device)
valid_acc = evaluate_acc(model, val_dataloader, device)
valid_f1 = evaluate_f1(model, val_dataloader, device)

print(f'Initial Train Loss: {train_loss:.3f}')
print(f'Initial Train Acc: {train_acc:.3f}')
print(f'Initial Train f1: {train_f1:.3f}')

print(f'Initial Valid Loss: {valid_loss:.3f}')
print(f'Initial Valid Acc: {valid_acc:.3f}')
print(f'Initial Valid f1: {valid_f1:.3f}')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_dataloader, optimizer, device, CLIP, scheduler)
    
    end_time = time.time()
    
    train_acc = evaluate_acc(model, train_dataloader, device)
    train_f1 = evaluate_f1(model, train_dataloader, device)
    valid_loss = evaluate(model, val_dataloader, device)
    valid_acc = evaluate_acc(model, val_dataloader, device)
    valid_f1 = evaluate_f1(model, val_dataloader, device)
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\tTrain Acc: {train_acc:.3f}')
    print(f'\tTrain f1: {train_f1:.3f}')

    print(f'\tValid Loss: {valid_loss:.3f}')
    print(f'\tValid Acc: {valid_acc:.3f}')
    print(f'\tValid f1: {valid_f1:.3f}')


The model has 66,364,418 trainable parameters
Initial Train Loss: 0.709
Initial Train Acc: 0.495
Initial Train f1: 0.654
Initial Valid Loss: 0.710
Initial Valid Acc: 0.492
Initial Valid f1: 0.652
Epoch: 01 | Time: 3m 21s
	Train Loss: 0.644
	Train Acc: 0.691
	Train f1: 0.695
	Valid Loss: 0.607
	Valid Acc: 0.659
	Valid f1: 0.666


# Great! Now we have a sarcasm detection model - Feel free to tweak the num. epochs, and other hyper parameters to get a better model performance!