In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import numpy as np
import pandas as pd

In [None]:
# Read the two labelled splits from CSV. The strings below are placeholders
# that must be replaced with the real file locations before running.
train_csv_path = r"train data path"
val_csv_path = r"validate data path"
train = pd.read_csv(train_csv_path)
val = pd.read_csv(val_csv_path)

In [None]:
# Preview the first 10 rows of the training frame.
train.head(10)

In [None]:
# Keep only the text column and the binary target for each split.
binary_columns = ['text', 'binary']
train_binary = train[binary_columns]
val_binary = val[binary_columns]

In [None]:
# Keep only the text column and the multiclass target for each split.
multiclass_columns = ['text', 'multiclass']
train_multiclass = train[multiclass_columns]
val_multiclass = val[multiclass_columns]

In [None]:
# Give all four frames the same two-column schema so that later cells can
# address the target as 'label' whichever task is selected.
for frame in (train_binary, val_binary, train_multiclass, val_multiclass):
    frame.columns = ['text', 'label']

# **For binary task**

In [None]:
# Binary task: uncomment the line below to build `df` from the binary-label
# frames (train and validation rows stacked into one frame).
# df = pd.concat([train_binary, val_binary], axis = 0)

# **For multiclass task**

In [None]:
# Multiclass task: stack the multiclass-label train and validation frames into
# one frame; the cells below re-split it with a fixed seed.
# This line must be active (or `df` built from train_binary / val_binary
# instead) because every later cell reads `df`.
df = pd.concat([train_multiclass, val_multiclass], axis=0)

In [None]:
# Print the column summary of `df` as a quick sanity check.
df.info()

In [None]:
# List the distinct values found in the target column.
df['label'].unique()

In [None]:
# Hold out 20% of the rows of `df` for validation (fixed seed for
# repeatability) and give each part a fresh positional index.
train_, val_ = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=221)
)

In [None]:
# Name of the target column; read when counting classes and when attaching
# labels inside the tokenization function.
feature = "label"

In [None]:
# CHANGE HERE
# Number of distinct values in the target column (a NaN, if present, is
# counted as a value too); used as the size of the classification head.
num_labels = df[feature].nunique(dropna=False)
num_labels

In [None]:
# Alternative checkpoints that can be swapped in for `model_name`:
# model_name = r"FacebookAI/xlm-roberta-large"
# model_name = r"facebook/bart-large"
# model_name = "FacebookAI/roberta-large"
# model_name="microsoft/mdeberta-v3-base"
# model_name = "answerdotai/ModernBERT-large"
# model_name = "microsoft/deberta-v3-base"
# model_name = r"BAAI/bge-m3"
model_name = r"BAAI/bge-large-en-v1.5"

# Pick the GPU when one is visible and fall back to the CPU otherwise, instead
# of crashing on a hard-coded "cuda" device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Sequence-classification model whose number of output classes is `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

In [None]:
def tokenized_data(data, max_length=512):
    """Tokenize a batch of examples and attach their labels.

    Written to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        data: Mapping with a ``"text"`` entry (the raw strings) and an entry
            named by the module-level ``feature`` (the targets).
        max_length: Upper bound on the number of tokens kept per example;
            longer texts are truncated to this length.

    Returns:
        The tokenizer output with an additional ``"label"`` entry copied from
        ``data[feature]``.
    """
    inputs = tokenizer(
        data["text"],
        max_length=max_length,
        # Truncation has to be on for `max_length` to be enforced; with it off,
        # long texts keep their full length and yield over-long, ragged
        # sequences.
        truncation=True,
        # No padding and no tensor conversion here: sequences are left at their
        # natural length and padded per batch by the DataCollatorWithPadding
        # that the notebook passes to the Trainer.
    )
    # CHANGE HERE
    inputs["label"] = data[feature]
    return inputs

In [None]:
from datasets import Dataset
# Wrap both pandas splits as `Dataset` objects.
pd_train, pd_val = (Dataset.from_pandas(split) for split in (train_, val_))
# pd_train = Dataset.from_pandas(df)

In [None]:
# Run the tokenization function over each split in batched mode.
encoded_train, encoded_val = (
    split.map(tokenized_data, batched=True) for split in (pd_train, pd_val)
)

In [None]:
# Display the encoded training dataset to check the result of the map step.
encoded_train

In [None]:
# Expose only the model inputs and the target, returned as torch tensors.
torch_columns = ["input_ids", "attention_mask", "label"]
for encoded_split in (encoded_train, encoded_val):
    encoded_split.set_format("torch", columns=list(torch_columns))

In [None]:
import os

import wandb

# Read the API key from the WANDB_API_KEY environment variable rather than
# pasting it into the notebook, where it would be saved and shared together
# with the file.
# When the variable is unset `key` is None, and wandb.login is left to find
# credentials on its own (NOTE(review): confirm this fallback is acceptable for
# non-interactive runs).
key = os.environ.get("WANDB_API_KEY")

wandb.login(key=key)

In [None]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with the class axis last. If it is a tuple of
            several model outputs, the first element is taken as the scores.

    Returns:
        dict: ``{"f1": <macro F1 over all classes>}``.
    """
    pred, true = eval_pred

    # Some model heads return extra outputs next to the logits (presumably the
    # case for encoder-decoder checkpoints such as the BART alternative), which
    # arrive here as a tuple; argmax must only see the logits.
    if isinstance(pred, tuple):
        pred = pred[0]

    true = np.array(true).flatten()
    pred = np.argmax(pred, axis=-1)

    f1 = f1_score(true, pred, average="macro")

    return {
        "f1": f1
    }

In [None]:
# Collator that pads the examples of each batch using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
train_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="/kaggle/working/",
    logging_dir="./logs",
    # Optimisation schedule.
    optim="adamw_torch",
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=200,
    num_train_epochs=3,
    seed=221,
    # 2 examples per step x 4 accumulated steps = 8 examples per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # Precision / memory switches.
    fp16=True,
    gradient_checkpointing=False,
    # Evaluate, log and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Keep at most two checkpoints; reload the best one (by 'f1') at the end.
    save_total_limit=2,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
)

# Wire the model, data, collator and metric together.
trainer = Trainer(
    model=model,
    args=train_args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=encoded_train,
    eval_dataset=encoded_val,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
# Start fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Write the model and the tokenizer files side by side into one directory.
export_dir = "/kaggle/working/bge-finetuned-multiclass"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)