In [1]:
%pip install torch tensorboard
%pip install  transformers datasets accelerate evaluate bitsandbytes huggingface_hub trl peft

Note: you may need to restart the kernel to use updated packages.
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading trl-0.15.2-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl, evaluate, bitsandbytes
Successfully installed bitsandbyt

In [None]:
import os
os.kill(os.getpid(), 9)

In [1]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import torch.nn.functional as F
import evaluate
from huggingface_hub import notebook_login

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from scipy.stats import pearsonr
from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)



In [2]:
data_folder = '/kaggle/input/clef2025-checkthat/data' # data
dataset = pd.DataFrame(columns=['sentence_id','sentence','label','lang','split'])

for language in os.listdir(data_folder):
    for filename in os.listdir(f"{data_folder}{os.sep}{language}"):
        if '.tsv' in filename:
            abs_path = f"{data_folder}{os.sep}{language}{os.sep}{filename}"
            df = pd.read_csv(abs_path, sep='\t', quoting=csv.QUOTE_NONE)
            if 'solved_conflict' in df.columns:
                df.drop(columns=['solved_conflict'], inplace=True)
            df['lang'] = language
            df['split'] = Path(filename).stem
            dataset = pd.concat([dataset, df], axis=0)

In [3]:
dataset = dataset[dataset['lang'] == 'english']

In [4]:
train = dataset[dataset['split'].str.contains('train')].copy()
dev = dataset[dataset['split'].str.contains('dev')].copy()
test = dataset[dataset['split'].str.contains('dev_test')].copy()

print(f"Train: {train.shape}")
print(f"Dev: {dev.shape}")
print(f"Test: {test.shape}")


Train: (830, 5)
Dev: (946, 5)
Test: (484, 5)


In [5]:
print(f"Train: {train['label'].value_counts(normalize=True)}")
print(f"Dev: {dev['label'].value_counts(normalize=True)}")
print(f"Test: {test['label'].value_counts(normalize=True)}")

Train: label
OBJ     0.640964
SUBJ    0.359036
Name: proportion, dtype: float64
Dev: label
OBJ     0.617336
SUBJ    0.382664
Name: proportion, dtype: float64
Test: label
OBJ     0.747934
SUBJ    0.252066
Name: proportion, dtype: float64


In [6]:
train.loc[:, 'label'] = train['label'].apply(lambda x: 0 if x == 'OBJ' else 1)
dev.loc[:, 'label'] = dev['label'].apply(lambda x: 0 if x == 'OBJ' else 1)
test.loc[:, 'label'] = test['label'].apply(lambda x: 0 if x == 'OBJ' else 1)

In [7]:
train['label'] = train['label'].astype(int)
dev['label'] = dev['label'].astype(int)
test['label'] = test['label'].astype(int)

In [8]:
train.head()

Unnamed: 0,sentence_id,sentence,label,lang,split
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,Gone are the days when they led the world in r...,1,english,train_en
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,The trend is expected to reverse as soon as ne...,0,english,train_en
2,4076639c-aa56-4202-ae0f-9d9217f8da68,But there is the specious point again.,0,english,train_en
3,b057c366-698e-419d-a284-9b16d835c64e,He added he wouldn’t be surprised to see a new...,0,english,train_en
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,"Not less government, you see; the same amount ...",1,english,train_en


In [9]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Converting pandas DataFrames into Hugging Face Dataset objects:
df_train = Dataset.from_pandas(train.drop(['sentence_id', 'lang', 'split'],axis=1).reset_index(drop=True))
df_dev = Dataset.from_pandas(dev.drop(['sentence_id', 'lang', 'split'],axis=1).reset_index(drop=True))
df_test = Dataset.from_pandas(test.drop(['sentence_id', 'lang', 'split'],axis=1).reset_index(drop=True))


In [24]:
dataset = DatasetDict({
    'train': df_train,
    'dev': df_dev,
    'test': df_test
})
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 830
    })
    dev: Dataset({
        features: ['sentence', 'label'],
        num_rows: 946
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 484
    })
})

In [25]:
df_train[0]

{'sentence': 'Gone are the days when they led the world in recession-busting',
 'label': 1}

In [13]:
train.label.value_counts(normalize=True)

label
0    0.640964
1    0.359036
Name: proportion, dtype: float64

In [14]:
class_weights=(1/train.label.value_counts(normalize=True).sort_index()).tolist()
class_weights=torch.tensor(class_weights)
class_weights=class_weights/class_weights.sum()
class_weights

tensor([0.3590, 0.6410])

In [15]:
model_name = "meta-llama/Llama-3.2-1B"

In [16]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # wether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2
)

model

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((20

In [19]:
model = prepare_model_for_kbit_training(model)
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((20

In [20]:
model = get_peft_model(model, lora_config)
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
           

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [22]:
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [29]:
MAX_LEN = 256

def llama_preprocessing_function(examples):
    return tokenizer(examples['sentence'], truncation=True, max_length=MAX_LEN)

tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True)

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/946 [00:00<?, ? examples/s]

Map:   0%|          | 0/484 [00:00<?, ? examples/s]

In [30]:
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [31]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    try:
        # it's a classification task, take the argmax
        predictions_processed = np.argmax(predictions, axis=1)


        f1 = f1_score(labels, predictions, average='macro')
        
        return {'f1': f1}
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        return {'f1': None}