## Python Imports

In [1]:
!pip install accelerate
!pip install transformers
!pip install sentence-transformers
!pip install datasets
# !pip -q install google-generativeai==0.3.0
# !pip -q install google-ai-generativelanguage==0.4.0
import accelerate
import torch
import pandas as pd
from torch.utils.data import Dataset
import numpy as np
from torch.utils.data import TensorDataset
from transformers import TrainingArguments, Trainer
import matplotlib.pyplot as plt

import os
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup
from datasets import Dataset, DatasetDict
from accelerate import Accelerator
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/297.6 kB[0m [31m984.6 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/297.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/297.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_

# Task 2 Setup & Baseline

## Downloads

In [2]:
!mkdir dataset
!curl https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/raw/main/task2/data/subtask-2-english/dev_en.tsv -o dataset/dev_en.tsv
!curl https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/raw/main/task2/data/subtask-2-english/dev_test_en.tsv -o dataset/dev_test_en.tsv
!curl https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/raw/main/task2/data/subtask-2-english/test_en.tsv -o dataset/test_en.tsv
!curl https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/raw/main/task2/data/subtask-2-english/train_en.tsv -o dataset/train_en.tsv

!curl -O https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/raw/main/task2/baselines/baseline.py
!curl -O https://gitlab.com/checkthat_lab/clef2024-checkthat-lab/-/raw/main/task2/scorer/evaluate.py

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 43329  100 43329    0     0   145k      0 --:--:-- --:--:-- --:--:--  144k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 47282  100 47282    0     0   117k      0 --:--:-- --:--:-- --:--:--  117k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 88045  100 88045    0     0   244k      0 --:--:-- --:--:-- --:--:--  244k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  152k  100  152k    0     0   481k      0 --:--:-- --:--:-- --:--:--  482k
  % Total    % Received % Xferd  Average Speed   Tim

## Baseline

In [3]:
# Run the downloaded baseline script; its own log reports the -trp argument as the
# "Train" path and the -ttp argument as the "Test" path.
!python baseline.py -trp dataset/train_en.tsv -ttp dataset/dev_test_en.tsv

2024-05-03 08:20:17.035410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-03 08:20:17.035459: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-03 08:20:17.036575: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO : Running baseline with following configuration: 
                 Train: dataset/train_en.tsv 
                 Test: dataset/dev_test_en.tsv
INFO : Load pretrained SentenceTransformer: paraphrase-multilingual-MiniLM-L12-v2
modules.json: 100% 229/229 [00:00<00:00, 1.26MB/s]
config_sentence_transformers.json: 100% 122/122 [00:00<00:00, 632kB/s]
README.md: 10

# My Task 2 Solution

## Load Datasets

In [4]:
# Shared label encoder plus the explicit id <-> name mappings handed to the model config.
labelencoder = LabelEncoder()
id2label = {0: "OBJ", 1: "SUBJ"}
label2id = {"OBJ": 0, "SUBJ": 1}

# Train split: the encoder is fitted here and only re-used (transform) on the other splits.
# The metadata columns dropped from the other splits are dropped here too, so every split
# carries the same kind of columns. NOTE(review): train_en.tsv's header is not visible in
# this notebook, hence errors="ignore" (a no-op for columns the file does not have).
train_dataset = Dataset.from_pandas(
    pd.read_csv("dataset/train_en.tsv", sep='\t')
    .drop(columns=["sentence_id", "solved_conflict"], errors="ignore")
    .assign(label=lambda frame: labelencoder.fit_transform(frame["label"])),
    preserve_index=False,
)

# Guard against the fitted encoder disagreeing with the mapping given to the model:
# a mismatch would silently swap or extend the classes the classifier is trained on.
assert list(labelencoder.classes_) == [id2label[i] for i in sorted(id2label)], (
    f"LabelEncoder classes {list(labelencoder.classes_)} do not match id2label {id2label}"
)

# Validation split (chained, non-mutating drop instead of inplace=True).
validation_dataset = Dataset.from_pandas(
    pd.read_csv("dataset/dev_en.tsv", sep='\t')
    .drop(columns=["sentence_id", "solved_conflict"])
    .assign(label=lambda frame: labelencoder.transform(frame["label"])),
    preserve_index=False,
)

# Labelled dev-test split, scored later with evaluate.py.
dev_test_dataset = Dataset.from_pandas(
    pd.read_csv("dataset/dev_test_en.tsv", sep='\t')
    .drop(columns=["sentence_id"])
    .assign(label=lambda frame: labelencoder.transform(frame["label"])),
    preserve_index=False,
)

# Test split: no label column is encoded for it.
test_dataset = Dataset.from_pandas(
    pd.read_csv("dataset/test_en.tsv", sep='\t').drop(columns=["sentence_id"]),
    preserve_index=False,
)

## Load Model & Tokenizer

In [5]:
# Hub checkpoint shared by the tokenizer and the classifier.
checkpoint = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Two-label sequence classifier; the label-name mappings are stored with the model.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Collator used as `collate_fn` by every DataLoader in this notebook.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Hyperparameters

In [6]:
# batch_size = 32
# learning_rate = 8e-6
# num_epochs = 8
# warmup_steps = 100
# weight_decay = 0.01

In [7]:
# batch_size = 32
# learning_rate = 8e-6
# num_epochs = 10
# warmup_steps = 100
# weight_decay = 0.01

In [None]:
# batch_size = 64
# learning_rate = 5e-6
# num_epochs = 12
# warmup_steps = 100
# weight_decay = 0.01

## Preprocessing Datasets

In [8]:
import pandas as pd

# GitHub raw URL of the TSV that replaces the training split loaded earlier
url = 'https://raw.githubusercontent.com/Khubaib2002/Subj-Classification-Model/main/data.tsv'

# Read the TSV file into a DataFrame (kept under its own name so `train_dataset`
# only ever refers to a `datasets.Dataset`)
train_df = pd.read_csv(url, sep='\t')

# Display the DataFrame
print(train_df)

# train_dataset = pd.read_csv("data.tsv", sep='\t')
# Re-use the already-fitted encoder instead of fitting it again: the validation and
# dev-test labels were encoded with that fit, so `transform` guarantees the same
# label -> id mapping here and raises on a label the encoder has never seen rather
# than silently building a different mapping.
train_df["label"] = labelencoder.transform(train_df["label"])
print(train_df)
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)


                                               sentence label
0     The implications of such a state are unclear, ...  SUBJ
1     The exact number of allowable unpaid sick days...   OBJ
2     Adult play is an essential component of a fulf...  SUBJ
3     Levelling up, under this administration, means...  SUBJ
4     After 2001, activists defiantly tried to reorg...   OBJ
...                                                 ...   ...
3262  The supporters of stakeholder capitalism congr...  SUBJ
3263  When there is no clear steer from the Kremlin,...   OBJ
3264  The infrastructure is aging and needs signific...  SUBJ
3265  Economists have praised its capacity to boost ...   OBJ
3266  Identified case percentages reach 10% in Denmark.   OBJ

[3267 rows x 2 columns]
                                               sentence  label
0     The implications of such a state are unclear, ...      1
1     The exact number of allowable unpaid sick days...      0
2     Adult play is an essential component

In [33]:
# Training configuration read by the dataloader, optimizer and scheduler cells.
num_epochs = 12          # full passes over the training dataloader
batch_size = 128         # examples per batch for every DataLoader
learning_rate = 5e-6     # `lr` handed to AdamW
weight_decay = 0.01      # `weight_decay` handed to AdamW
warmup_steps = 100       # `num_warmup_steps` of the linear schedule

In [34]:
def tokenize_function(example):
    """Run the notebook-level ``tokenizer`` over ``example["sentence"]``.

    Truncation is enabled; no padding is requested here.
    Returns the tokenizer's output unchanged.
    """
    sentences = example["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

def _tokenize_split(split):
    """Tokenize one split, drop its raw "sentence" column and switch it to torch format."""
    tokenized = split.map(tokenize_function, batched=True)
    tokenized = tokenized.remove_columns(["sentence"])
    tokenized.set_format(type='torch')
    return tokenized


# Same three-step pipeline for every labelled split, in the original order.
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_validation_dataset = _tokenize_split(validation_dataset)
tokenized_dev_test_dataset = _tokenize_split(dev_test_dataset)

# Only the training loader shuffles; the others keep file order.
train_dataloader = DataLoader(
    tokenized_train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
validation_dataloader = DataLoader(
    tokenized_validation_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)
dev_test_dataloader = DataLoader(
    tokenized_dev_test_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Map:   0%|          | 0/3267 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

In [35]:
# Tokenize the test split and drop the raw text column.
tokenized_test_dataset = (
    test_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["sentence"])
)
tokenized_test_dataset.set_format(type='torch')

# No shuffling: predictions are later written back next to the rows of test_en.tsv.
test_dataloader = DataLoader(
    tokenized_test_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

## Training Model

In [36]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device {device}")
model.to(device)

Training on device cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [37]:
# Optimizer over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# One scheduler step per batch, so the schedule spans epochs * batches-per-epoch steps.
num_training_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

# Hand the loaders, model and optimizer to Accelerate.
# NOTE(review): the scheduler is not passed to `prepare`, and the prepared
# validation dataloader is not used by any cell visible in this notebook.
accelerator = Accelerator()
(
    train_dataloader,
    validation_dataloader,
    model,
    optimizer,
) = accelerator.prepare(train_dataloader, validation_dataloader, model, optimizer)

In [38]:
# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # The batches carry the labels, so the model returns the loss directly.
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through Accelerate, matching the prepared model/optimizer.
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# Release the bar explicitly so it is finalized once training is done.
progress_bar.close()

# `accelerator.prepare` may return the model wrapped in another module; save through
# the unwrapped model so `save_pretrained` is always called on the transformers model.
accelerator.unwrap_model(model).save_pretrained("model")

  0%|          | 0/624 [00:00<?, ?it/s]

## Evaluating Model

In [39]:
model.eval()
# Take the device from the model's own parameters instead of hard-coding "cuda":
# the notebook explicitly falls back to CPU when no GPU is available.
model_device = next(model.parameters()).device

predictions = []
for batch in dev_test_dataloader:
    batch = {key: value.to(model_device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    # Argmax straight over the logits: sigmoid is monotonic, so it cannot change the
    # winner, but in float32 it can saturate both classes to 1.0 and create a false tie.
    predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())

# Write the predicted label names over the gold labels of a fresh copy of the split,
# in the format evaluate.py is later run against.
dev_test_pd = pd.read_csv("dataset/dev_test_en.tsv", sep='\t')
predicted_labels = labelencoder.inverse_transform(predictions)
dev_test_pd["label"] = predicted_labels
dev_test_pd.to_csv("dataset/dev_test_predictions_en.tsv", sep='\t', index=False)

In [40]:
model.eval()
# Take the device from the model's own parameters instead of hard-coding "cuda":
# the notebook explicitly falls back to CPU when no GPU is available.
model_device = next(model.parameters()).device

predictions = []
for batch in test_dataloader:
    batch = {key: value.to(model_device) for key, value in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    # Argmax straight over the logits: sigmoid is monotonic, so it cannot change the
    # winner, but in float32 it can saturate both classes to 1.0 and create a false tie.
    predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())

# Attach the predicted label names to the test rows and save them as a TSV.
test_pd = pd.read_csv("dataset/test_en.tsv", sep='\t')
predicted_labels = labelencoder.inverse_transform(predictions)
test_pd["label"] = predicted_labels
test_pd.to_csv("dataset/test_predictions_en.tsv", sep='\t', index=False)

In [41]:
# Score the fine-tuned model's dev-test predictions (-p) against the gold file (-g)
# with the downloaded scorer script.
!python evaluate.py -g dataset/dev_test_en.tsv -p dataset/dev_test_predictions_en.tsv

The file is properly formatted
Started evaluating results for task-2...
macro_F1:	0.82		macro_P:		0.82		macro_R:		0.82		SUBJ_F1:	0.83		SUBJ_P:		0.81		SUBJ_R:		0.85		accuracy:	0.82


In [42]:
# Score a second prediction file against the same gold file for comparison.
# NOTE(review): no cell visible in this notebook writes dataset/base_pred_lan.tsv —
# presumably it is produced by baseline.py; confirm before relying on this number.
!python evaluate.py -g dataset/dev_test_en.tsv -p dataset/base_pred_lan.tsv

The file is properly formatted
Started evaluating results for task-2...
macro_F1:	0.72		macro_P:		0.72		macro_R:		0.72		SUBJ_F1:	0.73		SUBJ_P:		0.73		SUBJ_R:		0.72		accuracy:	0.72
