In [21]:
from utils.download_kaggle_dataset import download_kaggle_competition
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [22]:
# Download the competition files; the helper's return value is used below as
# the directory prefix for every CSV path.
data = download_kaggle_competition("llm-classification-finetuning")

# Read the three competition tables in one go (same order as before).
train_df, test_df, sample_sub = (
    pd.read_csv(f"{data}/train.csv"),
    pd.read_csv(f"{data}/test.csv"),
    pd.read_csv(f"{data}/sample_submission.csv"),
)

Downloading competition data: llm-classification-finetuning
Unzipping files...
Competition data downloaded to: datasets


In [23]:
def get_label(row):
    """Map one row's winner flags to a single integer class id.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by
            "winner_model_a" and "winner_model_b".

    Returns:
        0 if "winner_model_a" is truthy, otherwise 1 if "winner_model_b" is
        truthy, otherwise 2.
    """
    # Columns are checked in priority order; the first truthy flag wins and
    # later columns are not read at all.
    for class_id, flag_column in enumerate(("winner_model_a", "winner_model_b")):
        if row[flag_column]:
            return class_id
    # Neither model was marked as the winner.
    return 2

# Collapse the per-row winner flags into one integer class column.
train_df["label"] = train_df.apply(get_label, axis=1)

# Hold out 10% of the rows for validation. The split is stratified on the
# label so both parts keep the same class balance, and seeded so it is
# reproducible across runs.
train_split, val_split = train_test_split(
    train_df,
    test_size=0.1,
    stratify=train_df["label"],
    random_state=42,
)

In [24]:
from transformers import AutoTokenizer

# Checkpoint identifier; used here to load the tokenizer and read again later
# when the classification model is loaded.
model_name = "distilbert-base-uncased"
# Module-level tokenizer shared by `tokenize` and the dataset classes, which
# call it directly as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize one example as a single prompt/response_a/response_b sequence.

    Args:
        example: Mapping with "prompt", "response_a" and "response_b" entries
            that support `+` concatenation with a str.

    Returns:
        Whatever the module-level `tokenizer` returns for the joined text,
        padded and truncated to 512 tokens.
    """
    # NOTE(review): "</s>" is a literal string here; it may not be a special
    # token for the loaded tokenizer (which exposes its own `sep_token`).
    # Confirm this is intended, and if it changes, change every place that
    # builds this text format together so training and inference agree.
    separator = " </s> "
    prompt = example["prompt"]
    first_reply = example["response_a"]
    second_reply = example["response_b"]
    joined_text = prompt + separator + first_reply + separator + second_reply

    return tokenizer(
        joined_text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )

In [25]:
class LLMTrainDataset(Dataset):
    """Labelled dataset that tokenizes one joined text per row on access.

    Attributes:
        texts: Series of "prompt </s> response_a </s> response_b" strings,
            one per row of the input frame.
        labels: List of integer class ids taken from the "label" column.
    """

    def __init__(self, df):
        """Build the joined texts and the label list from `df`.

        Args:
            df: DataFrame with "prompt", "response_a", "response_b" and
                "label" columns.
        """

        def _join_fields(row):
            # NOTE(review): "</s>" is a plain literal; it may not be a special
            # token for the loaded tokenizer. Keep it identical wherever this
            # text format is built so training and inference inputs match.
            return f"{row['prompt']} </s> {row['response_a']} </s> {row['response_b']}"

        self.texts = df.apply(_join_fields, axis=1)
        self.labels = df["label"].tolist()

    def __len__(self):
        """Return the number of labelled rows."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize the text at position `idx` and return model inputs.

        Returns:
            Dict with "input_ids" and "attention_mask" tensors (singleton
            dimensions squeezed away) and a scalar long "labels" tensor.
        """
        # Positional lookup: the Series keeps the source frame's index, which
        # need not be 0..n-1.
        encoded = tokenizer(
            self.texts.iloc[idx],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )
        sample = {
            field: encoded[field].squeeze()
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap the train and validation splits in the same labelled dataset class.
train_dataset, val_dataset = map(LLMTrainDataset, (train_split, val_split))

In [None]:
# Load the pretrained encoder with a 3-way classification head. The class ids
# follow get_label: 0 = model A wins, 1 = model B wins, 2 = neither.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=10,
    disable_tqdm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

# No evaluation schedule is configured above, so passing `eval_dataset` alone
# never scores the held-out split. Evaluate explicitly once training is done;
# the returned metrics dict (including the validation loss) is the cell output.
trainer.evaluate()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cpu




Step,Training Loss


In [None]:
class LLMTestDataset(Dataset):
    """Unlabelled dataset that tokenizes one joined text per row on access.

    Attributes:
        texts: Series of "prompt </s> response_a </s> response_b" strings,
            one per row of the input frame.
    """

    def __init__(self, df):
        """Build the joined texts from `df`.

        Args:
            df: DataFrame with "prompt", "response_a" and "response_b" columns.
        """

        def _join_fields(row):
            # NOTE(review): "</s>" is a plain literal; it may not be a special
            # token for the loaded tokenizer. Keep it identical wherever this
            # text format is built so training and inference inputs match.
            return f"{row['prompt']} </s> {row['response_a']} </s> {row['response_b']}"

        self.texts = df.apply(_join_fields, axis=1)

    def __len__(self):
        """Return the number of rows."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at position `idx` and return model inputs.

        Returns:
            Dict with "input_ids" and "attention_mask" tensors (singleton
            dimensions squeezed away); there is no "labels" entry.
        """
        encoded = tokenizer(
            self.texts.iloc[idx],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )
        return {
            field: encoded[field].squeeze()
            for field in ("input_ids", "attention_mask")
        }

test_dataset = LLMTestDataset(test_df)

# `predictions` holds the raw classifier logits, one row per test example and
# one column per class (0 = model A wins, 1 = model B wins, 2 = tie).
preds = trainer.predict(test_dataset).predictions
# Hard class ids are kept for inspection; they are not what gets submitted.
pred_labels = preds.argmax(axis=1)

# The competition is scored with log loss on predicted probabilities, so a
# one-hot 0/1 submission is penalised heavily for every wrong row. Convert the
# logits to a probability distribution per row instead (each row sums to 1).
probs = torch.softmax(torch.as_tensor(preds, dtype=torch.float32), dim=1).numpy()

submission_df = pd.DataFrame({
    "id": test_df["id"],
    "winner_model_a": probs[:, 0],
    "winner_model_b": probs[:, 1],
    "winner_tie": probs[:, 2],
})

submission_df.to_csv("submission.csv", index=False)