In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import transformers as ppb
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

### Importing the dataset

In [None]:
# Fixed seed so the shuffles below produce the same row order on every run.
# 256 matches the random_state already used for the later train_test_split call.
SHUFFLE_SEED = 256

# read train.EN.csv into a dataframe
df = pd.read_csv('https://raw.githubusercontent.com/iabufarha/iSarcasmEval/main/train/train.En.csv', index_col=False)

# read test.EN.csv into a dataframe
df_test = pd.read_csv('https://raw.githubusercontent.com/iabufarha/iSarcasmEval/main/test/task_A_En_test.csv', index_col=False)

# Shuffle data (reproducibly) and renumber the rows from 0
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [None]:
# Keep only the text and label columns used downstream.
# Training rows with a missing tweet are dropped: the tokenization step casts every value
# with str(), so a NaN would otherwise be encoded (and trained on) as the literal text "nan".
# This is a no-op when the column has no missing values.
df1 = df[['tweet', 'sarcastic']].dropna(subset=['tweet']).reset_index(drop=True)
# The test frame is left untouched so the evaluation set stays exactly as published.
df2 = df_test[['text', 'sarcastic']]

In [None]:
# Show the first training row to confirm the selected columns.
df1.head(1)

Unnamed: 0,tweet,sarcastic
0,y’all were all vote-blue-no-matter-who till be...,0


In [None]:
# Alias (not a copy) of the training frame; the tokenization and label cells read `batch_1`.
batch_1 = df1

### Loading the Pre-trained TwHinBERT model

In [None]:
from transformers import AutoModel, AutoTokenizer

# One checkpoint name shared by the encoder and its tokenizer.
checkpoint = "Twitter/TwHIN-BERT-base"

# Pre-trained encoder; later cells only run it under torch.no_grad() to extract features.
model = AutoModel.from_pretrained(checkpoint)

# Decision threshold applied to the classifier's sigmoid output in the evaluation cell.
threshold = 0.475

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at Twitter/TwHIN-BERT-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at Twitter/TwHIN-BERT-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weig

Downloading (…)okenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

## Model #1: TwHinBERT

We first use TwHinBERT to compute the [CLS] token embedding of every input sentence; these embeddings are the input features for the classifiers below.

### Tokenization

In [None]:
def _encode(text):
    """Return the token ids for one text, with the tokenizer's special tokens added."""
    return tokenizer.encode(str(text), add_special_tokens=True)


# One list of token ids per row, for the train and test text columns respectively.
tokenized = batch_1['tweet'].apply(_encode)
tokenized_test = df2['text'].apply(_encode)

### Padding

In [None]:
def _pad_to_longest(sequences, pad_id=0):
    """Right-pad every id list in `sequences` with `pad_id` up to the longest length.

    Returns a 2-D int64 array of shape (len(sequences), longest length); an empty
    input yields an array of shape (0, 0).
    """
    longest = max(map(len, sequences), default=0)
    rows = [list(ids) + [pad_id] * (longest - len(ids)) for ids in sequences]
    return np.array(rows, dtype=np.int64).reshape(len(sequences), longest)


# Train and test are padded independently, each to its own longest sequence.
# NOTE(review): 0 is the fill value this notebook has always used; confirm that it matches
# tokenizer.pad_token_id for this checkpoint.
padded = _pad_to_longest(tokenized.values)
padded_test = _pad_to_longest(tokenized_test.values)

# The original cell left `max_len` holding the test-set width; keep that binding.
max_len = padded_test.shape[1]

The training set is now in the `padded` variable and the test set in `padded_test`; we can view their dimensions below:

In [None]:
# (rows, padded length) for the train and test id matrices.
print(padded.shape, padded_test.shape)

(3468, 121) (1400, 157)


### Masking

In [None]:
def _length_mask(sequences, width):
    """Return a (len(sequences), width) 0/1 int64 array with ones over each sequence's real tokens."""
    lengths = np.array([len(ids) for ids in sequences], dtype=np.int64)
    return (np.arange(width) < lengths[:, None]).astype(np.int64)


# Build the masks from the un-padded sequence lengths instead of testing `padded != 0`:
# a value test also masks out any real token whose id happens to be 0 (which may include
# the leading special token, depending on the vocabulary) and silently breaks if the
# padding value ever changes. When id 0 only ever occurs as padding, both give the same mask.
attention_mask = _length_mask(tokenized.values, padded.shape[1])
attention_test_mask = _length_mask(tokenized_test.values, padded_test.shape[1])
print(attention_mask.shape, attention_test_mask.shape)

(3468, 121) (1400, 157)


### Embedding

In [None]:
# Use TwHinBERT to embed all training data
# The whole padded training matrix is sent through the encoder as a single batch.
input_ids = torch.tensor(padded)  
# Rebinds `attention_mask` from the NumPy array built in the masking cell to a tensor.
attention_mask = torch.tensor(attention_mask)

# no_grad: the encoder is only used for inference here, so no autograd graph is kept.
with torch.no_grad():
    # Despite the name, this is the model's full output; element [0] is indexed later
    # to obtain the per-token hidden states.
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [None]:
# Use TwHinBERT to embed all test data
# The whole padded test matrix is sent through the encoder as a single batch.
input_ids_test = torch.tensor(padded_test)
# Rebinds `attention_test_mask` from the NumPy array built in the masking cell to a tensor.
attention_test_mask = torch.tensor(attention_test_mask)

# no_grad: the encoder is only used for inference here, so no autograd graph is kept.
with torch.no_grad():
    # Full model output for the test set; element [0] is indexed later for the hidden states.
    last_hidden_states_test = model(input_ids_test, attention_mask=attention_test_mask)

In [None]:
# Keep only the vector at token position 0 (the [CLS] position) of every sentence as the
# classifier input. Output element 0 is indexed as (sentence, token position, hidden unit).
train_features = last_hidden_states[0][:, 0].numpy()
test_features = last_hidden_states_test[0][:, 0].numpy()

In [None]:
# Label arrays, taken from the same frames (and in the same row order) that were tokenized.
train_labels = batch_1['sarcastic'].to_numpy()
test_labels = df2['sarcastic'].to_numpy()

## Model #2: MLP

In [None]:
class MainNet(nn.Module):
    """Small MLP head mapping a 768-dimensional sentence embedding to a probability.

    Pipeline: LayerNorm(768) -> Linear(768, 64) -> LayerNorm(64) -> ReLU
    -> Linear(64, 1) -> Sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.layernorm1 = nn.LayerNorm(768)
        self.fc1 = nn.Linear(768, 64)
        self.layernorm2 = nn.LayerNorm(64)
        self.fc2 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_):
        """Return sigmoid scores of shape (..., 1) for inputs of shape (..., 768)."""
        out = self.layernorm1(input_)
        out = self.fc1(out)
        # layernorm2 and relu were constructed but never applied, so fc1 -> fc2 collapsed
        # into a single linear map; applying them makes the hidden layer non-linear.
        out = self.layernorm2(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)

        return out

In [None]:
def train_epoch(model, opt, criterion, batch_size=None):
    """Run one pass over the notebook-level `train_features` / `train_labels` arrays.

    Args:
        model: module called as ``model(x_batch)``; its output is passed to ``criterion``.
        opt: optimizer that is zeroed and stepped once per mini-batch.
        criterion: loss called as ``criterion(y_hat, y_batch)`` with float targets of
            shape (batch, 1).
        batch_size: rows per mini-batch. ``None`` (the default) means one batch holding
            the whole training set, resolved when the function is called rather than
            frozen to the size the data had when the function was defined.

    Returns:
        A list with one 0-d NumPy loss value per mini-batch, in order.
    """
    n_samples = len(train_features)
    if batch_size is None:
        # max(..., 1) keeps range() valid for an empty training set (which yields no batches).
        batch_size = max(n_samples, 1)

    model.train()
    losses = []
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        x_batch = torch.from_numpy(train_features[start:stop])
        # Labels become a float column vector so they line up with the (batch, 1) model output.
        y_batch = torch.from_numpy(train_labels[start:stop]).unsqueeze(1).float()

        opt.zero_grad()
        y_hat = model(x_batch)
        loss = criterion(y_hat, y_batch)
        loss.backward()
        opt.step()
        losses.append(loss.detach().numpy())

    return losses

In [None]:
# Carve 10% of the test embeddings off as a validation split (fixed random_state).
# NOTE(review): this overwrites test_features/test_labels with their own 90% subset, so
# re-running the cell shrinks the test set again — run it once per kernel session.
test_features, valid_features, test_labels, valid_labels = train_test_split(test_features, test_labels, test_size = 0.1, random_state=256)

In [None]:
# Seed PyTorch so the weight initialisation, and therefore the reported score, is repeatable.
torch.manual_seed(256)

e_losses = []  # one loss value per optimisation step, accumulated across all epochs
num_epochs = 5
net = MainNet()
# NOTE(review): train_epoch defaults to a single full-size batch, so this loop performs only
# `num_epochs` optimiser steps at lr=1e-5 — revisit these settings if the scores look flat.
opt = optim.Adam(net.parameters(), lr=0.00001, betas=(0.9, 0.999))
criterion = nn.BCELoss()
for epoch in range(num_epochs):
    e_losses += train_epoch(net, opt, criterion)
#plt.plot(e_losses)

## Evaluating Model

In [None]:
# Evaluate the MLP on the test split and print its F1 score
test_features_t = torch.from_numpy(test_features)
test_labels_t = torch.from_numpy(test_labels).unsqueeze(1).float()
net.eval()
# Inference only: skip autograd bookkeeping, and flatten the (n, 1) scores to shape (n,)
# so they line up with the 1-D label array.
with torch.no_grad():
    y_prob = net(test_features_t).numpy().ravel()
# Scores at or below `threshold` are class 0, everything above is class 1.
# (The result is already 0/1, so no further rounding is needed.)
y_hat = np.where(y_prob <= threshold, 0, 1)
print(f1_score(test_labels, y_hat))

F1 Score:  0.0
0.2826666666666667 0.47500000000000003


In [None]:
# Baseline: logistic regression with default settings, fitted directly on the training embeddings.
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

# NOTE(review): this baseline is scored on the validation split, whereas the MLP above is
# scored on the test split, so the two F1 values are not directly comparable.
y_hat = lr_clf.predict(valid_features)
print("F1 Score: ", f1_score(valid_labels, y_hat))

F1 Score:  0.17777777777777778
