# Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset
import math

!pip install -q transformers==4.28.0
!pip install -q evaluate
from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

# Read data into a dataframe
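Before you continue, download `Sarcasm_Headlines_Dataset_v2.json` from https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection and upload it to the Google Colaboratory runtime. The embedding cell further down also expects the pre-trained GloVe Twitter vectors file `glove.twitter.27B.200d.txt` (available from https://nlp.stanford.edu/projects/glove/) in the same runtime directory.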

In [None]:
# The dataset stores one JSON record per line, hence lines=True.
df = pd.read_json("Sarcasm_Headlines_Dataset_v2.json", lines=True)
# article_link is not used anywhere below, so it is removed up front.
df = df.drop(columns=['article_link'])
df.head()

In [None]:
import re
def clean_text(text):
    """Normalise a raw headline and split it into lowercase word tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace anything that starts with ``http`` (up to the next whitespace)
         with a space;
      3. replace every run of characters other than ``a-z``, apostrophe and
         space with a space;
      4. replace every apostrophe that is not surrounded by letters on both
         sides (i.e. not part of a contraction such as ``don't``) with a space.

    Args:
        text: the headline as a ``str``.

    Returns:
        list[str]: the whitespace-separated tokens that remain.
    """
    text = text.lower()

    # Raw strings keep the backslash escapes from being read as (invalid)
    # Python string escapes.
    text = re.sub(r'http\S+', ' ', text)

    # Keep only letters, spaces and apostrophes.
    text = re.sub(r"[^a-z' ]+", ' ', text)

    # Lookarounds test the neighbours without consuming them, so every stray
    # apostrophe is removed even in runs such as "a'''b"; a pattern that
    # consumes the neighbouring character skips the apostrophe after a match.
    text = re.sub(r"(?<![a-z])'|'(?![a-z])", ' ', text)

    return text.split()

# Replace each raw headline string with its list of cleaned tokens.
df['headline'] = df['headline'].apply(clean_text)

# Tokens that are dropped from every headline: common function words plus
# single-letter fragments.
stop_words = [ 'the', 'a', 'in', 'to', 'of', 'i', 'and', 'is', 'you', 'for', 'on', 'it', 'my', 'that',
               'with', 'are', 'at', 'by', 'this', 'have', 'from', 'be', 'was', 'do', 'will', 'as', 'up', 
               'me', 'am', 'so', 'we', 'your', 'has', 'when', 'an', 's', 'they', 'about', 'been', 'there',
               'who', 'would', 'into', 'his', 'them', 'did', 'w', 'their', 'm', 'its', 'does', 'where', 'th',
               'b', 'd', 'x', 'p', 'o', 'r', 'c', 'n', 'e', 'g', 'v', 'k', 'l', 'f', 'j', 'z', 'us', 'our',
               'all', 'can', 'may' ] 

# Hashed copy for constant-time membership tests; `stop_words` itself stays a
# list so any other cell that reads it keeps working.
_STOP_WORD_SET = frozenset(stop_words)

def remove_stop_words(words):
    """Return the tokens of `words` that are not stop words, in their original order.

    Args:
        words: iterable of string tokens.

    Returns:
        list[str]: a new list; the input is not modified.
    """
    return [word for word in words if word not in _STOP_WORD_SET]

# Filter the stop words out of every tokenised headline.
df['headline'] = df['headline'].apply(remove_stop_words)

# Look at a handful of random rows to sanity-check the cleaning.
df.sample(5)

# Embed headlines with GloVe and create training, evaluation, and testing splits

In [None]:
# Build the word -> vector lookup table from the GloVe text file.
# Each line holds a token followed by its vector components, separated by whitespace.
embed = dict()
# `with` guarantees the file is closed even if a line fails to parse.
with open('glove.twitter.27B.200d.txt', encoding="utf8") as glove:
    for line in glove:
        fields = line.split()  # split once and reuse for both the token and the vector
        if not fields:
            continue  # tolerate blank lines instead of failing on fields[0]
        embed[fields[0]] = np.asarray(fields[1:], dtype='float32')


In [None]:
# Preview the first rows; when the cells are run in order, `headline` holds
# token lists at this point (the embedding step further down overwrites it).
df.head()

In [None]:
def embedding(words, embeddings=None, dim=200):
    """Convert a list of tokens into a ``(len(words), dim)`` array of word vectors.

    Tokens found in the lookup table get their stored vector. Tokens that are
    missing get the mean of the vectors that *were* found in the same headline.

    Args:
        words: sequence of string tokens. It is not modified.
        embeddings: mapping token -> 1-D vector of length `dim`. Defaults to
            the notebook-level `embed` table.
        dim: length of each word vector; must match the vectors in the table.

    Returns:
        np.ndarray of dtype float32 and shape ``(len(words), dim)``.
        * If none of the tokens is in the table, every row is NaN. Callers use
          this as the "nothing usable" marker.
        * If `words` is empty the result has shape ``(0, dim)``.
    """
    table = embed if embeddings is None else embeddings

    # Look every token up once; None marks an out-of-vocabulary token.
    vectors = [table[token] if token in table else None for token in words]
    known = [vec for vec in vectors if vec is not None]

    if known:
        # Accumulate in float64, one vector at a time.
        total = np.zeros(dim)
        for vec in known:
            total += vec
        fallback = total / len(known)
    else:
        # Explicit NaN marker instead of relying on a 0/0 division (which also
        # emits a RuntimeWarning).
        fallback = np.full(dim, np.nan)

    rows = [fallback if vec is None else vec for vec in vectors]
    # reshape keeps the result 2-D even when there are no rows at all.
    return np.asarray(rows, dtype='float32').reshape(len(rows), dim)

# Swap each token list for its array of word vectors.
df['headline'] = df['headline'].apply(embedding)
df.head()

In [None]:
SPLIT_SEED = 42  # fixed so that the train/val/test membership is the same on every run
fractions = np.array([0.6, 0.2, 0.2]) # 60% training, 20% evaluation, 20% testing
df = df.sample(frac=1, random_state=SPLIT_SEED) # Shuffle the dataset

# Row positions where the training and validation parts end.
train_end, val_end = (fractions[:-1].cumsum() * len(df)).astype(int)

# Positional slicing replaces np.array_split(df, ...), which goes through the
# deprecated DataFrame.swapaxes on recent pandas versions.
# reset_index() gives each split a 0..n-1 index (the loops below index by
# position) and keeps the pre-split row label in an `index` column.
train = df.iloc[:train_end].reset_index()
val = df.iloc[train_end:val_end].reset_index()
test = df.iloc[val_end:].reset_index()

assert len(train) + len(val) + len(test) == len(df)
print(len(train['headline']))

In [None]:

# NOTE(review): a cell only renders its last expression, so this head() call
# produces no visible output.
train.head()
# Display the embedded form of the first validation headline.
val['headline'][0]

# Define the model

In [None]:
class LSTMnetwork(nn.Module):
    """Single-layer bidirectional LSTM that scores one headline at a time.

    The defaults reproduce the original fixed configuration (200-d inputs,
    75 hidden units per direction, dropout 0.47).

    Args:
        input_size: length of each word vector fed to the LSTM.
        hidden_size: hidden units per direction; the linear head therefore
            receives ``2 * hidden_size`` features.
        dropout: drop probability applied to the LSTM outputs before the head.
    """

    def __init__(self, input_size=200, hidden_size=75, dropout=0.47):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=1, bidirectional=True)
        # Forward and backward outputs are concatenated, hence the factor 2.
        self.linear = nn.Linear(in_features=2 * hidden_size, out_features=1)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the sigmoid score for one headline.

        Args:
            x: tensor whose first dimension is the number of words; it is
               viewed as ``(len(x), 1, -1)``, i.e. a batch of one sequence.

        Returns:
            Tensor of shape ``(1,)`` with a value in (0, 1), taken from the
            last time step of the LSTM output.
        """
        seq_len = len(x)

        # nn.LSTM (batch_first=False) expects (seq_len, batch, features).
        out1, _ = self.lstm(x.view(seq_len, 1, -1))

        # Drop the batch dimension: (seq_len, 2 * hidden_size).
        x1 = self.dropout(out1.view(seq_len, -1))

        # Score every step, then keep only the final one.
        pred = torch.sigmoid(self.linear(x1)[-1])

        return pred
    
    
# Model instance used by the training and evaluation cells below.
lstm = LSTMnetwork()

# Binary cross-entropy on probabilities; LSTMnetwork.forward already applies a sigmoid.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)
# Multiplies the learning rate by gamma (0.1) on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

# Train the model

In [None]:
import time
N_EPOCHS = 4
L2_WEIGHT = 0.002  # coefficient of the parameter-norm penalty added to the training loss


def _run_epoch(frame, training):
    """Run `lstm` once over every row of `frame`, one headline at a time.

    Args:
        frame: DataFrame with a `headline` column (array of word vectors per
            row) and an `is_sarcastic` column (label, assumed to be 0/1).
        training: when True the model is put in train mode and the optimizer
            is stepped after every sample; when False the model is put in eval
            mode and gradients are disabled.

    Returns:
        (mean_loss, accuracy, skipped): the averages are taken over the
        samples that were actually processed; `skipped` counts headlines that
        were empty or contained NaN vectors. The training loss includes the
        norm penalty, the evaluation loss does not.
    """
    lstm.train(training)
    total_loss, n_correct, n_used, n_skipped = 0.0, 0, 0, 0

    with torch.set_grad_enabled(training):
        for vectors, target in zip(frame['headline'], frame['is_sarcastic']):
            vectors = np.asarray(vectors)
            # Empty headlines have no first row to inspect; NaN rows mark
            # headlines for which no word vector was found.
            if vectors.size == 0 or np.isnan(vectors).any():
                n_skipped += 1
                continue

            sample = torch.as_tensor(vectors, dtype=torch.float32)
            label = torch.tensor([float(target)])

            if training:
                optimizer.zero_grad()  # the optimizer holds every model parameter

            pred = lstm(sample)
            sample_loss = criterion(pred, label)

            if training:
                # Penalise the norm of every parameter tensor.
                for param in lstm.parameters():
                    sample_loss = sample_loss + L2_WEIGHT * torch.norm(param)
                sample_loss.backward()
                optimizer.step()

            total_loss += sample_loss.item()
            n_correct += int((pred.item() > 0.5) == bool(target))
            n_used += 1

    denom = max(n_used, 1)  # avoid dividing by zero if every sample was skipped
    return total_loss / denom, n_correct / denom, n_skipped


start_time = time.time()

for epoch in range(N_EPOCHS):
    print('Epoch :', epoch)

    loss, accuracy, skipped = _run_epoch(train, training=True)
    vloss, vaccuracy, vskipped = _run_epoch(val, training=False)

    print('Epoch:', epoch)
    print('loss:', loss, 'accuracy:', accuracy)
    print('val loss:', vloss, 'val accuracy:', vaccuracy)
    print('skipped samples - train:', skipped, 'val:', vskipped)

    # Decay the learning rate once per epoch.
    scheduler.step()

print('Duration:', time.time() - start_time, ' seconds')

# Evaluate on test data

In [None]:

# Put the model in eval mode explicitly so dropout is off even when this cell
# is run without the training cell having just finished.
lstm.eval()

tloss = 0.0
tcorrect = 0
tused = 0      # samples that were actually scored
tskipped = 0   # empty headlines or headlines made only of NaN vectors

with torch.no_grad():

    for vectors, target in zip(test['headline'], test['is_sarcastic']):
        vectors = np.asarray(vectors)
        if vectors.size == 0 or np.isnan(vectors).any():
            tskipped += 1
            continue

        sample = torch.as_tensor(vectors, dtype=torch.float32)
        label = torch.tensor([float(target)])

        pred = lstm(sample)
        tloss += criterion(pred, label).item()

        # Read the probability once; calling .item() on the resulting float
        # (as the previous version did) raises AttributeError.
        prob = pred.item()
        predi = 1 if prob > 0.5 else 0
        if predi == int(target):
            tcorrect += 1
        tused += 1

# Average over the scored samples only; max() guards the all-skipped case.
tdenom = max(tused, 1)
print('test loss:', tloss / tdenom, 'test accuracy:', tcorrect / tdenom)
print('skipped samples - test:', tskipped)
