# Preparation

In [1]:
# Libraries
import ast
import re
import unicodedata
import torch
import torch.nn as nn
import optuna
import os
import pickle
import numpy as np
from collections import Counter
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from copy import deepcopy
from transformers.trainer import Trainer, TrainerCallback
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, TrainingArguments, EarlyStoppingCallback
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification
from torch.amp import autocast
from datasets import Dataset, Value, DatasetDict, load_from_disk
from sklearn.linear_model import LogisticRegression
from sklearn.isotonic import IsotonicRegression

2025-07-25 03:46:01.541262: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753415161.741488      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753415161.799620      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Configuration

In [2]:
# Define configuration settings
os.environ["TOKENIZERS_PARALLELISM"] = "false"

USE_CACHE = False
IGNORE_ALL_CACHE = False
CACHE_DIR = './cache'
os.makedirs(CACHE_DIR, exist_ok=True)

def load_cache(name):
    if not USE_CACHE or IGNORE_ALL_CACHE:
        return None
    pkl_path = os.path.join(CACHE_DIR, name + '.pkl')
    dir_path = os.path.join(CACHE_DIR, name)
    if os.path.exists(pkl_path):
        with open(pkl_path, 'rb') as f:
            return pickle.load(f)
    elif os.path.isdir(dir_path):
        return load_from_disk(dir_path)
    return None

def save_cache(obj, name):
    if not USE_CACHE or IGNORE_ALL_CACHE:
        return
    if isinstance(obj, pd.DataFrame):
        obj.to_pickle(os.path.join(CACHE_DIR, name + '.pkl'))
    elif isinstance(obj, (Dataset, DatasetDict)):
        obj.save_to_disk(os.path.join(CACHE_DIR, name))

def log(message):
    print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')} - {message}")

SEED = 379
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Load and Explore Data

In [3]:
def load_data():
    train_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/train.csv')
    test_df = pd.read_csv('/kaggle/input/llm-classification-finetuning/test.csv')
    return train_df, test_df

train_df, test_df = load_data()
log("Data loaded")

print("Train Data Info:")
print(train_df.info())
print("\nTest Data Info:")
print(test_df.info())
log("Printed data info")

print("\nTrain Data Sample:")
print(train_df.head())
print("\nTest Data Sample:")
print(test_df.head())
log("Printed data samples")

print("\nMissing Values in Train:")
print(train_df.isnull().sum())
print("\nMissing Values in Test:")
print(test_df.isnull().sum())
log("Printed missing values")

# sample 10,000 observations 
train_df = train_df.sample(10000, random_state=SEED).reset_index(drop=True)
log("Sampled smaller data for POC")

2025-07-25 03:46:19.112235 - Data loaded
Train Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57477 entries, 0 to 57476
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              57477 non-null  int64 
 1   model_a         57477 non-null  object
 2   model_b         57477 non-null  object
 3   prompt          57477 non-null  object
 4   response_a      57477 non-null  object
 5   response_b      57477 non-null  object
 6   winner_model_a  57477 non-null  int64 
 7   winner_model_b  57477 non-null  int64 
 8   winner_tie      57477 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 3.9+ MB
None

Test Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          3 non-null      int64 
 1   prompt      3 non-null      object
 2   response_a  3

# Features

In [4]:
def count_segments(text):
    if pd.isnull(text):
        return 0
    return len(re.findall(r'\["[^"]*"\]', text)) or len(re.findall(r'"[^"]*"', text))

n_prompts = train_df['prompt'].apply(count_segments)
n_responses_a = train_df['response_a'].apply(count_segments)
n_responses_b = train_df['response_b'].apply(count_segments)
log("Computed segment counts")

prompt_counts = n_prompts.value_counts().sort_index().reset_index()
prompt_counts.columns = ['num_prompts', 'num_observations']
response_a_counts = n_responses_a.value_counts().sort_index().reset_index()
response_a_counts.columns = ['num_responses_a', 'num_observations']
response_b_counts = n_responses_b.value_counts().sort_index().reset_index()
response_b_counts.columns = ['num_responses_b', 'num_observations']
log("Counted frequencies")

prompt_counts['percent'] = 100 * prompt_counts['num_observations'] / prompt_counts['num_observations'].sum()
response_a_counts['percent'] = 100 * response_a_counts['num_observations'] / response_a_counts['num_observations'].sum()
response_b_counts['percent'] = 100 * response_b_counts['num_observations'] / response_b_counts['num_observations'].sum()
log("Computed percentages")

def clean_text(text):
    text = str(text)
    text = text.encode('utf-8', 'replace').decode('utf-8')
    text = re.sub(r'[\uD800-\uDFFF]', '?', text)
    text = unicodedata.normalize('NFKD', text)
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return '[EMPTY]'
    return text

def parse_list(x):
    if isinstance(x, list):
        return x
    if pd.isnull(x) or x == '':
        return []
    if isinstance(x, str):
        x = clean_text(x)
        try:
            parsed = ast.literal_eval(x)
            return [clean_text(item) for item in parsed]
        except (ValueError, SyntaxError):
            return [x]
    return [str(x)]

def structure_conversational_data(row):
    prompts_raw = parse_list(row.get('prompt', ''))
    responses_a_raw = parse_list(row.get('response_a', ''))
    responses_b_raw = parse_list(row.get('response_b', ''))
    conv_a = ""
    conv_b = ""
    for p, ra, rb in zip(prompts_raw, responses_a_raw, responses_b_raw):
        cp = clean_text(p) if p else '[EMPTY]'
        cra = clean_text(ra) if ra else '[EMPTY]'
        crb = clean_text(rb) if rb else '[EMPTY]'
        conv_a += "[USER] " + cp + " [ASSISTANT] " + cra + " "
        conv_b += "[USER] " + cp + " [ASSISTANT] " + crb + " "
    return clean_text(conv_a.strip()), clean_text(conv_b.strip())

cached_train_structured = load_cache('train_df_structured_v4')
if cached_train_structured is not None:
    train_df = cached_train_structured
else:
    train_df[["response_a", "response_b"]] = train_df.apply(lambda row: pd.Series(structure_conversational_data(row)), axis=1)
    save_cache(train_df, 'train_df_structured_v4')
log("Structured conversational data for train_df")

cached_test_structured = load_cache('test_df_structured_v4')
if cached_test_structured is not None:
    test_df = cached_test_structured
else:
    test_df[["response_a", "response_b"]] = test_df.apply(lambda row: pd.Series(structure_conversational_data(row)), axis=1)
    save_cache(test_df, 'test_df_structured_v4')
log("Structured conversational data for test_df")

positive_words = {'good', 'great', 'excellent', 'wonderful', 'best', 'love', 'like', 'positive', 'happy', 'awesome', 'fantastic', 'amazing', 'super', 'nice', 'cool'}
negative_words = {'bad', 'poor', 'terrible', 'worst', 'hate', 'dislike', 'negative', 'sad', 'awful', 'horrible', 'boring', 'stupid', 'wrong', 'false', 'fail'}

def get_sentiment(text):
    words = re.findall(r'\b\w+\b', text.lower())
    pos = sum(1 for w in words if w in positive_words)
    neg = sum(1 for w in words if w in negative_words)
    total = len(words) + 1e-5
    return (pos - neg) / total

def get_length(text):
    return len(re.findall(r'\b\w+\b', text))

vectorizer = TfidfVectorizer(max_features=5000)
all_responses = pd.concat([train_df['response_a'], train_df['response_b'], test_df['response_a'], test_df['response_b']])
vectorizer.fit(all_responses)

def get_tfidf_sim(a, b):
    vec_a = vectorizer.transform([a])
    vec_b = vectorizer.transform([b])
    return cosine_similarity(vec_a, vec_b)[0][0]

cached_train_features = load_cache('train_df_features_v4')
if cached_train_features is not None:
    train_df = cached_train_features
else:
    train_df['length_a'] = train_df['response_a'].apply(get_length)
    train_df['length_b'] = train_df['response_b'].apply(get_length)
    train_df['sentiment_a'] = train_df['response_a'].apply(get_sentiment)
    train_df['sentiment_b'] = train_df['response_b'].apply(get_sentiment)
    train_df['tfidf_sim'] = train_df.apply(lambda row: get_tfidf_sim(row['response_a'], row['response_b']), axis=1)
    save_cache(train_df, 'train_df_features_v4')
log("Added features to train_df")

cached_test_features = load_cache('test_df_features_v4')
if cached_test_features is not None:
    test_df = cached_test_features
else:
    test_df['length_a'] = test_df['response_a'].apply(get_length)
    test_df['length_b'] = test_df['response_b'].apply(get_length)
    test_df['sentiment_a'] = test_df['response_a'].apply(get_sentiment)
    test_df['sentiment_b'] = test_df['response_b'].apply(get_sentiment)
    test_df['tfidf_sim'] = test_df.apply(lambda row: get_tfidf_sim(row['response_a'], row['response_b']), axis=1)
    save_cache(test_df, 'test_df_features_v4')
log("Added features to test_df")

cached_train_balanced = load_cache('train_df_balanced_v4')
if cached_train_balanced is not None:
    train_df = cached_train_balanced
else:
    if 'winner_tie' not in train_df.columns:
        train_df['winner_tie'] = ((train_df['winner_model_a'] == 0) & (train_df['winner_model_b'] == 0)).astype(int)
    class_counts = train_df.groupby(['winner_model_a', 'winner_model_b', 'winner_tie']).size()
    min_count = class_counts.min()
    balanced_df = pd.DataFrame()
    for label, group in train_df.groupby(['winner_model_a', 'winner_model_b', 'winner_tie']):
        balanced_df = pd.concat([balanced_df, group.sample(min_count, random_state=SEED, replace=False)])
    train_df = balanced_df.reset_index(drop=True)
    save_cache(train_df, 'train_df_balanced_v4')
log("Balanced samples")
log(f"Label distribution in train_df: {train_df[['winner_model_a', 'winner_model_b', 'winner_tie']].sum().to_dict()}")

MODEL_NAME = 'microsoft/deberta-v3-base'

def build_pointwise_df(df):
    rows = []
    for i, row in df.iterrows():
        tfidf_sim = row['tfidf_sim']
        if row['winner_model_a'] == 1 and row['winner_model_b'] == 0:
            # Response A is winner
            rows.append({
                'prompt': row['prompt'],
                'response': row['response_a'],
                'labels': 1,
                'length': row['length_a'],
                'sentiment': row['sentiment_a'],
                'tfidf_sim': tfidf_sim
            })
            rows.append({
                'prompt': row['prompt'],
                'response': row['response_b'],
                'labels': 0,
                'length': row['length_b'],
                'sentiment': row['sentiment_b'],
                'tfidf_sim': tfidf_sim
            })
        elif row['winner_model_b'] == 1 and row['winner_model_a'] == 0:
            # Response B is winner
            rows.append({
                'prompt': row['prompt'],
                'response': row['response_a'],
                'labels': 0,
                'length': row['length_a'],
                'sentiment': row['sentiment_a'],
                'tfidf_sim': tfidf_sim
            })
            rows.append({
                'prompt': row['prompt'],
                'response': row['response_b'],
                'labels': 1,
                'length': row['length_b'],
                'sentiment': row['sentiment_b'],
                'tfidf_sim': tfidf_sim
            })
        elif row['winner_tie'] == 1:
            # Tie: both are winners!
            rows.append({
                'prompt': row['prompt'],
                'response': row['response_a'],
                'labels': 1,
                'length': row['length_a'],
                'sentiment': row['sentiment_a'],
                'tfidf_sim': tfidf_sim
            })
            rows.append({
                'prompt': row['prompt'],
                'response': row['response_b'],
                'labels': 1,
                'length': row['length_b'],
                'sentiment': row['sentiment_b'],
                'tfidf_sim': tfidf_sim
            })
    return pd.DataFrame(rows)
    
cached_pointwise = load_cache('pointwise_df_v1')
if cached_pointwise is not None:
    pointwise_df = cached_pointwise
else:
    pointwise_df = build_pointwise_df(train_df)
    pointwise_df['response'] = pointwise_df['response'].apply(clean_text)
    save_cache(pointwise_df, 'pointwise_df_v1')
log("Built pointwise training dataframe")

pointwise_df['labels'] = pointwise_df['labels'].astype('float')
log(f"pointwise_df columns: {pointwise_df.columns.tolist()}")
log(f"pointwise_df sample: {pointwise_df.head().to_dict()}")
log(f"Label distribution in pointwise_df: {pointwise_df['labels'].value_counts().to_dict()}")

cached_dataset = load_cache('pointwise_dataset_v1')
if cached_dataset is not None:
    dataset = cached_dataset
else:
    dataset = Dataset.from_pandas(pointwise_df)
    save_cache(dataset, 'pointwise_dataset_v1')
log("Prepared pointwise dataset")

2025-07-25 03:46:19.316815 - Computed segment counts
2025-07-25 03:46:19.325821 - Counted frequencies
2025-07-25 03:46:19.328046 - Computed percentages
2025-07-25 03:46:29.642383 - Structured conversational data for train_df
2025-07-25 03:46:29.652625 - Structured conversational data for test_df
2025-07-25 03:46:52.683398 - Added features to train_df
2025-07-25 03:46:52.692636 - Added features to test_df
2025-07-25 03:46:52.714642 - Balanced samples
2025-07-25 03:46:52.717466 - Label distribution in train_df: {'winner_model_a': 3094, 'winner_model_b': 3094, 'winner_tie': 3094}
2025-07-25 03:46:55.160946 - Built pointwise training dataframe
2025-07-25 03:46:55.161772 - pointwise_df columns: ['prompt', 'response', 'labels', 'length', 'sentiment', 'tfidf_sim']
2025-07-25 03:46:55.166604 - pointwise_df sample: {'prompt': {0: '["you are a powerhouse of creative brilliance and marketing prowess. Your photography skills rival the most celebrated commercial photographers, capturing the essence

# Train Test Split

In [5]:
cached_split = load_cache('pointwise_split_v1')
if cached_split is not None:
    split = cached_split
else:
    split = dataset.train_test_split(test_size=0.2, seed=SEED)
    split = DatasetDict({'train': split['train'], 'validation': split['test']})  # Rename for clarity
    split['train'] = split['train'].cast_column('labels', Value('float32'))
    split['validation'] = split['validation'].cast_column('labels', Value('float32'))
    save_cache(split, 'pointwise_split_v1')
log("Prepared pointwise dataset and split (validation from train data)")

log(f"Dataset columns: {dataset.column_names}")
log(f"Train split columns: {split['train'].column_names}")
log(f"Test split columns: {split['validation'].column_names}")

Casting the dataset:   0%|          | 0/14851 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/3713 [00:00<?, ? examples/s]

2025-07-25 03:46:56.629320 - Prepared pointwise dataset and split (validation from train data)
2025-07-25 03:46:56.629466 - Dataset columns: ['prompt', 'response', 'labels', 'length', 'sentiment', 'tfidf_sim']
2025-07-25 03:46:56.629543 - Train split columns: ['prompt', 'response', 'labels', 'length', 'sentiment', 'tfidf_sim']
2025-07-25 03:46:56.629630 - Test split columns: ['prompt', 'response', 'labels', 'length', 'sentiment', 'tfidf_sim']


## Tokenize

In [6]:
model_path = '/kaggle/input/debertav3base'

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
tokenizer.add_special_tokens({'additional_special_tokens': ['[USER]', '[ASSISTANT]']})
log("Loaded tokenizer")

def preprocess_function(examples):
    inputs = ['[USER] ' + clean_text(p) + ' [ASSISTANT] ' + clean_text(r)
              for p, r in zip(examples['prompt'], examples['response'])]
    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )
    vocab_size = len(tokenizer)
    for i, input_ids in enumerate(tokenized['input_ids']):
        if any(idx >= vocab_size for idx in input_ids):
            log(f"Warning: Invalid input_ids detected in sample {i}: {input_ids.tolist()}")
            raise ValueError(f"input_ids exceed vocab size {vocab_size} in sample {i}")
        non_pad_tokens = [idx for idx in input_ids if idx != tokenizer.pad_token_id]
        if len(non_pad_tokens) <= 2:
            log(f"Warning: Near-empty input detected in sample {i}: {input_ids.tolist()}")
            raise ValueError(f"Near-empty input in sample {i}")
    if 'labels' in examples:
        labels = examples['labels']
        if not all(l in [0, 1] for l in labels):
            log(f"Warning: Invalid labels detected: {labels}")
            raise ValueError("Labels must be 0 or 1 only in pointwise training")
    input_lengths = [len([idx for idx in input_ids if idx != tokenizer.pad_token_id]) for input_ids in tokenized['input_ids']]
    # log(f"Input lengths stats: min={min(input_lengths)}, max={max(input_lengths)}, mean={np.mean(input_lengths):.2f}")
    tokenized['length'] = examples['length']
    tokenized['sentiment'] = examples['sentiment']
    tokenized['tfidf_sim'] = examples['tfidf_sim']
    return tokenized

cached_tokenized = load_cache('pointwise_tokenized_v1')
if cached_tokenized is not None:
    tokenized = cached_tokenized
else:
    tokenized = split.map(
        preprocess_function,
        batched=True,
        batch_size=1000,
        remove_columns=[col for col in dataset.column_names if col not in ['input_ids', 'attention_mask', 'labels', 'length', 'sentiment', 'tfidf_sim']]
    )
    save_cache(tokenized, 'pointwise_tokenized_v1')
log("Tokenized pointwise data")
log(f"Tokenized train columns: {tokenized['train'].column_names}")
log(f"Tokenized test columns: {tokenized['validation'].column_names}")
log(f"Label distribution in tokenized train: {Counter(tokenized['train']['labels'])}")
log(f"Label distribution in tokenized test: {Counter(tokenized['validation']['labels'])}")



2025-07-25 03:46:58.998824 - Loaded tokenizer


Map:   0%|          | 0/14851 [00:00<?, ? examples/s]

Map:   0%|          | 0/3713 [00:00<?, ? examples/s]

2025-07-25 03:51:37.294844 - Tokenized pointwise data
2025-07-25 03:51:37.294991 - Tokenized train columns: ['labels', 'length', 'sentiment', 'tfidf_sim', 'input_ids', 'token_type_ids', 'attention_mask']
2025-07-25 03:51:37.295062 - Tokenized test columns: ['labels', 'length', 'sentiment', 'tfidf_sim', 'input_ids', 'token_type_ids', 'attention_mask']
2025-07-25 03:51:37.302336 - Label distribution in tokenized train: Counter({1.0: 9916, 0.0: 4935})
2025-07-25 03:51:37.304704 - Label distribution in tokenized test: Counter({1.0: 2460, 0.0: 1253})


# Train Model

In [7]:
def stable_sigmoid(x):
    x = np.asanyarray(x)
    result = np.empty_like(x)
    pos = x >= 0
    result[pos] = 1 / (1 + np.exp(-x[pos]))
    result[~pos] = np.exp(x[~pos]) / (1 + np.exp(x[~pos]))
    return result

def compute_metrics(eval_pred):
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    logits = np.array(logits)
    if logits.ndim == 2 and logits.shape[1] == 1:
        logits = logits[:, 0]
    elif logits.ndim == 1:
        logits = logits
    else:
        logits = logits.flatten()
    labels = np.array(labels)
    if labels.ndim > 1:
        labels = labels.flatten()
    min_len = min(len(labels), len(logits))
    labels = labels[:min_len]
    logits = logits[:min_len]
    if len(labels) == 0:
        return {'accuracy': 0.0, 'f1': 0.0}
    logits = np.nan_to_num(logits, nan=0.0, posinf=709.0, neginf=-709.0)
    logits = np.clip(logits, -20, 20)
    probs = stable_sigmoid(logits)
    probs = np.nan_to_num(probs, nan=0.5)
    preds_bin = (probs > 0.5).astype(int)
    return {
        'accuracy': accuracy_score(labels, preds_bin),
        'f1': f1_score(labels, preds_bin, zero_division=1)
    }

from transformers import DataCollatorWithPadding
from torch import tensor as torch_tensor  # To avoid name conflict with built-in tensor

class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        batch = super().__call__(features)
        batch["length"] = torch_tensor([f["length"] for f in features], dtype=torch.float32)
        batch["sentiment"] = torch_tensor([f["sentiment"] for f in features], dtype=torch.float32)
        batch["tfidf_sim"] = torch_tensor([f["tfidf_sim"] for f in features], dtype=torch.float32)
        if "labels" in features[0]:
            batch["labels"] = torch_tensor([f["labels"] for f in features], dtype=torch.float32)
        return batch


class GradientLoggingCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 1000 == 0:
            model = kwargs['model']
            for name, param in model.named_parameters():
                if "classifier" in name and param.grad is not None:
                    print(f"{name}: grad norm = {param.grad.norm().item()}")

class CustomModel(nn.Module):
    def __init__(self, model_path, tokenizer):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_path, local_files_only=True)
        base = AutoModel.from_pretrained(model_path, local_files_only=True)
        base.resize_token_embeddings(len(tokenizer))
        self.base = base
        self.extra_fc = nn.Linear(3, 32)
        self.classifier = nn.Linear(self.config.hidden_size + 32, 1)

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                length=None, sentiment=None, tfidf_sim=None, labels=None, **kwargs):
        
        outputs = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )
        pooled = outputs.last_hidden_state[:, 0]

        if length is None or sentiment is None or tfidf_sim is None:
            raise ValueError("Extra features (length, sentiment, tfidf_sim) must not be None")
        extra_feats = torch.stack([
            length.float(), sentiment.float(), tfidf_sim.float()
        ], dim=1)

        extra = self.extra_fc(extra_feats)
        combined = torch.cat([pooled, extra], dim=1)
        logits = self.classifier(combined)

        # Ensure logits is [batch_size, 1]
        if logits.dim() == 1:
            logits = logits.unsqueeze(1)
        elif logits.dim() == 2 and logits.shape[1] != 1:
            logits = logits[:, :1]  # If multi-class by mistake, take first
        elif logits.dim() == 0:
            logits = logits.unsqueeze(0).unsqueeze(1)

        loss = None
        if labels is not None:
            loss_fn = torch.nn.BCEWithLogitsLoss()
            target = labels.float().view(-1, 1) 
            loss = loss_fn(logits, target)
        
        return {
            "loss": loss,
            "logits": logits
        }
        
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        inputs = self._prepare_inputs(inputs)  
        input_ids = inputs.get("input_ids")
        attention_mask = inputs.get("attention_mask")
        token_type_ids = inputs.get("token_type_ids")
        length = inputs.get("length")
        sentiment = inputs.get("sentiment")
        tfidf_sim = inputs.get("tfidf_sim")
        labels = inputs.get("labels")
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            length=length,
            sentiment=sentiment,
            tfidf_sim=tfidf_sim,
            labels=labels
        )
        loss = outputs["loss"]
        if return_outputs:
            return (loss, outputs)
        else:
            return loss

    def prediction_step(self, model: nn.Module, inputs: dict, prediction_loss_only: bool, ignore_keys: list = None):
        inputs = self._prepare_inputs(inputs)  
        has_labels = "labels" in inputs
        with torch.no_grad():
            with self.compute_loss_context_manager():
                outputs = model(**inputs) 
        loss = outputs["loss"].mean().detach() if outputs["loss"] is not None else None
        logits = outputs["logits"]
        labels = inputs["labels"] if has_labels else None
        if prediction_loss_only:
            return (loss, None, None)
        return (loss, logits, labels)

    def get_train_dataloader(self):
        dataset = self.train_dataset
        data_collator = self.data_collator
        return DataLoader(
            dataset,
            batch_size=self.args.per_device_train_batch_size,
            collate_fn=data_collator,
            num_workers=4,
            pin_memory=True
        )

    def get_eval_dataloader(self, eval_dataset=None):
        dataset = self.eval_dataset if eval_dataset is None else eval_dataset
        data_collator = self.data_collator
        return DataLoader(
            dataset,
            batch_size=self.args.per_device_eval_batch_size,
            collate_fn=data_collator,
            num_workers=4,
            pin_memory=True
        )

    def __init__(self, *args, tokenizer=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.data_collator = CustomDataCollator(tokenizer=tokenizer)

def objective(trial):
    torch.cuda.empty_cache()
    import gc
    gc.collect()
    log("Cleared cache and ran GC in objective")
    learning_rate = trial.suggest_float('learning_rate', 5e-5, 5e-4, log=True)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [4, 8])
    gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps', [1, 2])
    weight_decay = trial.suggest_float('weight_decay', 0.0, 0.1)
    model = CustomModel(model_path, tokenizer)
    log("Loaded model in objective")
    training_args = TrainingArguments(
        output_dir='./results',
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=3,
        eval_strategy="steps",
        eval_steps=100,
        save_steps=100,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        seed=SEED,
        report_to="none",
        logging_strategy="steps",
        logging_steps=100,
        warmup_ratio=0.1,
        fp16=False,
        gradient_checkpointing=False,
        max_grad_norm=1.0,
        weight_decay=weight_decay,
        adam_epsilon=1e-6
    )
    log("Set training args in objective")
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized['train'],
        eval_dataset=tokenized['validation'],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        callbacks=[GradientLoggingCallback(), EarlyStoppingCallback(early_stopping_patience=5)]
    )
    log("Initialized trainer in objective")
    with autocast('cuda'):
        trainer.train()
    log("Trained in objective")
    eval_result = trainer.evaluate()
    log(f"Evaluation result: {eval_result}")
    torch.cuda.empty_cache()
    gc.collect()
    return eval_result['eval_f1']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)
log("Optimized hyperparameters")
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

model1 = CustomModel(model_path, tokenizer)
log("Loaded final model1")

[I 2025-07-25 03:51:37,329] A new study created in memory with name: no-name-55d52678-8d30-4369-85e2-b5380b8c8e0b


2025-07-25 03:51:37.711112 - Cleared cache and ran GC in objective
2025-07-25 03:51:46.373707 - Loaded model in objective
2025-07-25 03:51:46.397739 - Set training args in objective
2025-07-25 03:51:46.776343 - Initialized trainer in objective


Step,Training Loss,Validation Loss,Accuracy,F1
100,5.7312,5.939292,0.337463,0.0
200,5.9507,5.908082,0.337463,0.0
300,6.1489,5.849838,0.337463,0.0
400,5.675,5.714549,0.337463,0.0
500,4.9542,4.332942,0.381632,0.234667
600,4.0609,3.649569,0.412335,0.393552
700,3.3396,3.318226,0.417722,0.408967
800,3.4761,3.068739,0.438998,0.478337
900,3.1381,2.895687,0.440345,0.469085
1000,3.0569,2.761033,0.457043,0.511391


2025-07-25 04:45:45.872366 - Trained in objective


2025-07-25 04:47:20.464688 - Evaluation result: {'eval_loss': 57.90950012207031, 'eval_accuracy': 0.6625370320495556, 'eval_f1': 0.797019277498785, 'eval_runtime': 94.5917, 'eval_samples_per_second': 39.253, 'eval_steps_per_second': 4.916, 'epoch': 1.1308562197092085}


[I 2025-07-25 04:47:21,198] Trial 0 finished with value: 0.797019277498785 and parameters: {'learning_rate': 5.3234585773462036e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.07069410132988903}. Best is trial 0 with value: 0.797019277498785.


2025-07-25 04:47:21.199927 - Optimized hyperparameters
Best hyperparameters: {'learning_rate': 5.3234585773462036e-05, 'per_device_train_batch_size': 8, 'gradient_accumulation_steps': 1, 'weight_decay': 0.07069410132988903}
2025-07-25 04:47:22.862702 - Loaded final model1


In [8]:
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    #per_device_eval_batch_size=1, 
    gradient_accumulation_steps=best_params['gradient_accumulation_steps'],
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    seed=SEED,
    report_to="none",
    logging_strategy="steps",
    logging_steps=100,
    warmup_ratio=0.1,
    fp16=False,
    save_total_limit=1,
    gradient_checkpointing=False,
    max_grad_norm=1.0,
    weight_decay=best_params['weight_decay'],
    adam_epsilon=1e-6
)
log("Set final training args")

trainer1 = CustomTrainer(
    model=model1,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[GradientLoggingCallback(), EarlyStoppingCallback(early_stopping_patience=5)]
)
log("Initialized trainer1")
with autocast('cuda'):
    trainer1.train()
log("Trained model1")

torch.manual_seed(SEED + 1)
torch.cuda.manual_seed(SEED + 1)
model2 = CustomModel(model_path, tokenizer)
log("Loaded model2")

trainer2 = CustomTrainer(
    model=model2,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['validation'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    callbacks=[GradientLoggingCallback(), EarlyStoppingCallback(early_stopping_patience=5)]
)
log("Initialized trainer2")
with autocast('cuda'):
    trainer2.train()
log("Trained model2")

cached_test_dataset = load_cache('test_dataset_v4')
if cached_test_dataset is not None:
    test_dataset = cached_test_dataset
else:
    test_dataset = Dataset.from_pandas(test_df)
    save_cache(test_dataset, 'test_dataset_v4')
log("Created test dataset")

def add_extra_feats_pointwise(examples, response_key):
    if response_key == 'response_a':
        length = examples['length_a']
        sentiment = examples['sentiment_a']
    else:
        length = examples['length_b']
        sentiment = examples['sentiment_b']
    tfidf_sim = examples['tfidf_sim']
    return {
        'length': length,
        'sentiment': sentiment,
        'tfidf_sim': tfidf_sim
    }

def tokenize_pointwise_batch(examples, response_key):
    inputs = ['[USER] ' + clean_text(p) + ' [ASSISTANT] ' + clean_text(r)
              for p, r in zip(examples['prompt'], examples[response_key])]
    tokenized = tokenizer(inputs, truncation=True, padding=True, max_length=512)
    return tokenized

cached_tokenized_test_a = load_cache('test_tokenized_a_pointwise_v1')
if cached_tokenized_test_a is not None:
    tokenized_test_a = cached_tokenized_test_a
else:
    test_with_feats = test_dataset.map(lambda examples: add_extra_feats_pointwise(examples, 'response_a'), batched=True)
    tokenized_test_a = test_with_feats.map(lambda examples: tokenize_pointwise_batch(examples, 'response_a'), batched=True, batch_size=1000)
    save_cache(tokenized_test_a, 'test_tokenized_a_pointwise_v1')

cached_tokenized_test_b = load_cache('test_tokenized_b_pointwise_v1')
if cached_tokenized_test_b is not None:
    tokenized_test_b = cached_tokenized_test_b
else:
    test_with_feats = test_dataset.map(lambda examples: add_extra_feats_pointwise(examples, 'response_b'), batched=True)
    tokenized_test_b = test_with_feats.map(lambda examples: tokenize_pointwise_batch(examples, 'response_b'), batched=True, batch_size=1000)
    save_cache(tokenized_test_b, 'test_tokenized_b_pointwise_v1')
log("Tokenized test data for pointwise")

2025-07-25 04:47:22.904820 - Set final training args
2025-07-25 04:47:23.123645 - Initialized trainer1


Step,Training Loss,Validation Loss,Accuracy,F1
100,13.2892,13.813048,0.337463,0.0
200,13.8659,13.778483,0.337463,0.0
300,14.4012,13.695602,0.337463,0.0
400,13.3362,13.448958,0.338002,0.004858
500,12.7917,12.833281,0.344196,0.034879
600,12.4395,11.764974,0.361971,0.133821
700,10.6035,10.753362,0.389173,0.249007
800,11.0978,10.255629,0.392405,0.29012
900,10.1719,9.923219,0.39456,0.306601
1000,10.217,9.638644,0.402101,0.335727


2025-07-25 05:52:35.161024 - Trained model1
2025-07-25 05:52:36.811721 - Loaded model2
2025-07-25 05:52:37.023219 - Initialized trainer2


Step,Training Loss,Validation Loss,Accuracy,F1
100,13.835,14.379614,0.337463,0.0
200,14.4423,14.345149,0.336924,0.0
300,15.0177,14.282002,0.337463,0.001623
400,13.9868,14.165951,0.337732,0.004856
500,13.4413,13.219483,0.345273,0.044794
600,12.245,11.434979,0.375438,0.20446
700,10.545,10.813251,0.385403,0.263396
800,11.2583,10.424236,0.398869,0.320755
900,10.3938,10.158037,0.401831,0.334432
1000,10.5208,9.931381,0.405063,0.351248


2025-07-25 07:20:26.744780 - Trained model2
2025-07-25 07:20:26.759072 - Created test dataset


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

2025-07-25 07:20:26.936443 - Tokenized test data for pointwise


# Prediction

In [9]:
def flatten_logits(logits):
    if isinstance(logits, dict):
        logits = logits["logits"]
    if logits is None:
        raise ValueError("Logits are None! Check your model's output and dataset.")
    logits = np.array(logits)
    if logits.ndim == 2 and logits.shape[1] == 1:
        logits = logits[:, 0]
    elif logits.ndim == 0:
        logits = np.array([logits])
    return logits

with autocast('cuda'):
    logits_a_model1 = flatten_logits(trainer1.predict(tokenized_test_a).predictions)
    logits_b_model1 = flatten_logits(trainer1.predict(tokenized_test_b).predictions)
    logits_a_model2 = flatten_logits(trainer2.predict(tokenized_test_a).predictions)
    logits_b_model2 = flatten_logits(trainer2.predict(tokenized_test_b).predictions)

probs_a_model1 = stable_sigmoid(logits_a_model1)
probs_b_model1 = stable_sigmoid(logits_b_model1)
probs_a_model2 = stable_sigmoid(logits_a_model2)
probs_b_model2 = stable_sigmoid(logits_b_model2)

probs_a = (probs_a_model1 + probs_a_model2) / 2
probs_b = (probs_b_model1 + probs_b_model2) / 2
log("Averaged predictions")

val_logits = flatten_logits(trainer1.predict(tokenized['validation']).predictions)
val_preds = stable_sigmoid(val_logits)
val_labels = tokenized['validation']['labels']

calibrator = IsotonicRegression(out_of_bounds='clip')
calibrator.fit(val_preds, val_labels)  

preds_a_calibrated = calibrator.predict(probs_a)
preds_b_calibrated = calibrator.predict(probs_b)
log("Calibrated predictions")

threshold = 0.05  
winner_a_prob = []
winner_b_prob = []
winner_tie_prob = []
for a, b in zip(preds_a_calibrated, preds_b_calibrated):
    diff = a - b
    p_tie = stable_sigmoid(-abs(diff) / threshold)  # Close to 1 if |diff| small, close to 0 if large
    p_a_wins_given_no_tie = stable_sigmoid(diff)
    p_b_wins_given_no_tie = 1 - p_a_wins_given_no_tie
    winner_a_prob.append((1 - p_tie) * p_a_wins_given_no_tie)
    winner_b_prob.append((1 - p_tie) * p_b_wins_given_no_tie)
    winner_tie_prob.append(p_tie)
log("Computed soft probabilities")

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'winner_model_a': winner_a_prob,
    'winner_model_b': winner_b_prob,
    'winner_tie': winner_tie_prob
})
submission_df.to_csv('submission.csv', index=False)
log("Created and saved submission.csv")

2025-07-25 07:20:27.356498 - Averaged predictions


2025-07-25 07:22:04.285121 - Calibrated predictions
2025-07-25 07:22:04.285786 - Computed soft probabilities
2025-07-25 07:22:04.293694 - Created and saved submission.csv


In [10]:
print("Successfully saved as CSV file")

Successfully saved as CSV file
