In [None]:
# Kaggle Notebook for LLM Classification Finetuning Competition
# Using BERT-base-uncased model for preference classification

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  # import AdamW from torch.optim now
from tqdm import tqdm


In [None]:
# 1. Load train and test data
# Both CSVs are read from the competition's Kaggle input directory into
# pandas DataFrames; later cells add columns to these two frames.
train = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
test = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')

# Print the (rows, columns) shape of each frame as a quick check on the load.
print(f"Train data shape: {train.shape}")
print(f"Test data shape: {test.shape}")



In [None]:
# 2. Prepare input text by combining prompt + responses for BERT input
def prepare_text(row):
    """Build the single text string fed to the tokenizer for one comparison row.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            'prompt', 'response_a' and 'response_b' fields.

    Returns:
        The three fields, each prefixed with its role label, joined by the
        literal separator " [SEP] ".
    """
    labelled_fields = (
        ("Prompt", 'prompt'),
        ("Response A", 'response_a'),
        ("Response B", 'response_b'),
    )
    return " [SEP] ".join(f"{label}: {row[key]}" for label, key in labelled_fields)

# Build the combined text for every row (axis=1 hands each row to
# prepare_text) and store it in a new 'input_text' column on both frames.
train['input_text'] = train.apply(prepare_text, axis=1)
test['input_text'] = test.apply(prepare_text, axis=1)


In [None]:
# 3. Convert winner columns into single label: 0 = A wins, 1 = B wins, 2 = Tie
def winner_to_label(row):
    """Collapse the winner indicator fields of one row into a class id.

    Args:
        row: Mapping-like record with a 'winner_model_a' field and, when that
            field is not 1, a 'winner_model_b' field.

    Returns:
        0 if 'winner_model_a' equals 1; otherwise 1 if 'winner_model_b'
        equals 1; otherwise 2 (the tie class).
    """
    # Model A is checked first, so it takes precedence if both flags are set.
    if row['winner_model_a'] == 1:
        return 0
    return 1 if row['winner_model_b'] == 1 else 2

# Apply winner_to_label row by row and keep the resulting class id
# (0, 1 or 2) in a new 'label' column of the training frame.
train['label'] = train.apply(winner_to_label, axis=1)



In [None]:
# 4. Load the BERT-base-uncased tokenizer from the local Kaggle input path
from transformers import BertTokenizer

# Local directory the tokenizer files are loaded from.
MODEL_PATH = "/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

# Sequence length handed to tokenize_texts as max_len for both train and test.
# NOTE(review): the prompt and both responses share this budget and the
# tokenizer is called with truncation=True, so text beyond 128 tokens is
# dropped; this may cut off most of the responses; confirm it is intended.
MAX_LEN = 128



In [None]:
# 5. Tokenize all text inputs for train and test
# The texts are materialised with list(...), so a pandas Series can be passed in directly.
def tokenize_texts(texts, tokenizer, max_len=128):
    """Tokenize a collection of strings into fixed-length encodings.

    Args:
        texts: Iterable of input strings; it is materialised into a list
            before being handed to the tokenizer.
        tokenizer: Callable invoked with the list of texts plus the padding,
            truncation, length and tensor-format options set below.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for these options (PyTorch tensors
        are requested via return_tensors='pt').
    """
    batch = list(texts)
    options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(batch, **options)
# Encode the full 'input_text' column of both frames (no subsampling is
# applied here), padding/truncating every example to MAX_LEN tokens.
train_encodings = tokenize_texts(train['input_text'], tokenizer, MAX_LEN)
test_encodings = tokenize_texts(test['input_text'], tokenizer, MAX_LEN)




In [None]:
# 6. Create custom Dataset class for PyTorch
class LLMPreferenceDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings with optional labels.

    Args:
        encodings: Mapping from field name (must include 'input_ids') to an
            indexable whose first axis is the example index.
        labels: Optional indexable of integer class ids aligned with the
            encodings. When omitted (as for the test set), items carry no
            'labels' entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the 'input_ids' field."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return a dict holding the idx-th entry of every encoding field.

        A scalar 'labels' tensor is added when labels were supplied.
        """
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            # Force int64: the classification loss expects long targets, and
            # the dtype inferred from the label container (e.g. an int32 or
            # float array) would otherwise leak through and break the loss.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap the encodings as datasets: the training set carries the label array,
# the test set is built without labels so its items have no 'labels' entry.
train_dataset = LLMPreferenceDataset(train_encodings, train['label'].values)
test_dataset = LLMPreferenceDataset(test_encodings)



In [None]:
# 7. Split train dataset into train and validation sets (80/20 split)
SPLIT_SEED = 42  # fixed so train/validation membership is identical on every run

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size  # remainder, so the two sizes always sum to the total

# A dedicated seeded generator makes the split reproducible without touching
# the global RNG state; previously the validation set (and therefore the
# reported accuracy) changed on every run.
# NOTE: this rebinds train_dataset to the 80% subset, so re-running this cell
# without re-creating the full dataset first would split the subset again.
train_dataset, val_dataset = random_split(
    train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training batches are reshuffled each epoch; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [None]:
# 8. Load BERT-base-uncased model with 3 output labels (A wins, B wins, Tie)

from transformers import BertForSequenceClassification
from torch.optim import AdamW
import torch
import torch.nn as nn  # no longer needed by this cell; kept so the name stays defined

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_path = "/kaggle/input/bert-base-uncased/pytorch/default/1/bert-base-uncased"

# Request the 3-way head directly instead of loading a 9-label head and
# swapping model.classifier by hand (which left model.config.num_labels at 9).
# ignore_mismatched_sizes=True lets from_pretrained re-initialise the
# classifier if the checkpoint stores a head of a different size, while all
# matching encoder weights are still loaded.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
    ignore_mismatched_sizes=True,
)

# The training, evaluation and prediction functions move every batch to
# `device`; the model has to live there too, otherwise the first forward pass
# on a GPU fails with a device-mismatch error.
model = model.to(device)


In [None]:
# 9. Training and evaluation functions
def train_epoch(model, dataloader):
    """Run one optimisation pass over `dataloader` and return the mean batch loss.

    Relies on the module-level `optimizer` and `device` objects, which must
    exist before this function is called.

    Args:
        model: Model called as model(input_ids, attention_mask=..., labels=...)
            whose output exposes a `.loss` tensor.
        dataloader: Sized iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        Sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for step_batch in tqdm(dataloader):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        on_device = {
            name: step_batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }
        outputs = model(
            on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
            labels=on_device['labels'],
        )
        batch_loss = outputs.loss

        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(dataloader)

def eval_model(model, dataloader):
    """Compute classification accuracy of `model` over `dataloader`.

    Relies on the module-level `device` object. Gradients are disabled for
    the whole pass.

    Args:
        model: Model called as model(input_ids, attention_mask=...) whose
            output exposes `.logits` with the class scores on dimension 1.
        dataloader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'labels' tensors.

    Returns:
        Number of examples whose arg-max class equals the label, divided by
        the number of examples seen.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(ids, attention_mask=mask).logits
            predicted = logits.argmax(dim=1)

            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)
    return n_correct / n_seen



In [None]:
# 10. Train model for 3 epochs
from torch.optim import AdamW  # PyTorch's own AdamW implementation

# Optimizer over all model parameters; train_epoch reads this module-level name.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Number of full passes over the training loader
EPOCHS = 3

# Each epoch: one training pass, then accuracy on the validation loader.
for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}/{EPOCHS}")
    
    train_loss = train_epoch(model, train_loader)   # mean per-batch training loss for this epoch
    val_acc = eval_model(model, val_loader)         # fraction of validation examples predicted correctly
    
    print(f"Train loss: {train_loss:.4f}, Validation accuracy: {val_acc:.4f}")


In [None]:
# 11. Predict probabilities on test set
def predict(model, dataloader):
    """Return class probabilities for every example in `dataloader`.

    Relies on the module-level `device` object. Gradients are disabled for
    the whole pass.

    Args:
        model: Model called as model(input_ids, attention_mask=...) whose
            output exposes `.logits`.
        dataloader: Iterable of batches, each a dict with 'input_ids' and
            'attention_mask' tensors.

    Returns:
        A CPU tensor made by concatenating, in batch order, the softmax over
        dimension 1 of each batch's logits.
    """
    model.eval()
    batch_probs = []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            logits = model(ids, attention_mask=mask).logits
            # Normalise scores to probabilities, then move them off the device
            # so the collected results do not accumulate there.
            batch_probs.append(torch.nn.functional.softmax(logits, dim=1).cpu())
    return torch.cat(batch_probs)

# Batch the test set (no shuffling is requested, so rows keep dataset order)
# and convert the predicted probabilities to a NumPy array for the submission.
test_loader = DataLoader(test_dataset, batch_size=32)
test_preds = predict(model, test_loader).numpy()



In [None]:
# 12. Prepare submission file
# One row per test id; the probability columns follow the label encoding
# used for training (column 0 = A wins, 1 = B wins, 2 = Tie).
submission = pd.DataFrame({
    'id': test['id'],
    'winner_model_a': test_preds[:, 0],
    'winner_model_b': test_preds[:, 1],
    'winner_tie': test_preds[:, 2]
})

# Written to the current working directory without the DataFrame index.
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")
