import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.model_selection import train_test_split
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')

def set_seed(seed: int = 42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

class PreferenceDataset(torch.utils.data.Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int = 512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        
        # Concatenate prompt with each response
        text_a = f"{row['prompt']} [SEP] {row['response_a']}"
        text_b = f"{row['prompt']} [SEP] {row['response_b']}"
        
        # Tokenize both texts
        encoded = self.tokenizer(
            text_a,
            text_b,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors=None
        )
        
        # Add label
        encoded['label'] = 0 if row['winner'] == 'model_a' else 1
        
        return encoded

class PreferenceTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        
        loss = torch.nn.functional.cross_entropy(outputs.logits, labels)
        
        return (loss, outputs.logits) if return_outputs else loss

def main():
    # Set random seed
    set_seed(42)
    
    # Load datasets
    print("Loading datasets...")
    train = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet")
    test = pd.read_parquet("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet")
    print(f"Train shape: {train.shape}, Test shape: {test.shape}")
    
    # Initialize tokenizer and model
    print("Initializing model and tokenizer...")
    model_path = "/kaggle/input/google-bert/transformers/default/1/bert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        num_labels=2,
        ignore_mismatched_sizes=True
    )
    
    # Split data into train and validation sets
    train_df, val_df = train_test_split(train, test_size=0.1, random_state=42, shuffle=True)
    
    # Create datasets
    train_dataset = PreferenceDataset(train_df, tokenizer)
    val_dataset = PreferenceDataset(val_df, tokenizer)
    
    # Initialize data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    # Training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_dir='./logs',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        max_grad_norm=1.0,
        report_to=["none"],
        learning_rate=2e-5,
        fp16=True,
        gradient_accumulation_steps=2,
        save_total_limit=2,
        logging_steps=100,
    )
    
    # Initialize trainer
    trainer = PreferenceTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )
    
    # Train model
    print("Training model...")
    trainer.train()
    
    # Generate predictions
    print("Generating predictions...")
    test_dataset = PreferenceDataset(test, tokenizer)
    predictions = []
    model.eval()
    device = next(model.parameters()).device
    
    with torch.no_grad():
        for i in range(len(test_dataset)):
            # Get both encodings
            text_a = f"{test.iloc[i]['prompt']} [SEP] {test.iloc[i]['response_a']}"
            text_b = f"{test.iloc[i]['prompt']} [SEP] {test.iloc[i]['response_b']}"
            
            # Encode both texts
            inputs_a = tokenizer(
                text_a,
                truncation=True,
                max_length=512,
                padding='max_length',
                return_tensors='pt'
            ).to(device)
            
            inputs_b = tokenizer(
                text_b,
                truncation=True,
                max_length=512,
                padding='max_length',
                return_tensors='pt'
            ).to(device)
            
            # Get logits for both
            outputs_a = model(**inputs_a)
            outputs_b = model(**inputs_b)
            
            # Compare logits
            pred = 'model_a' if outputs_a.logits[0, 1] > outputs_b.logits[0, 1] else 'model_b'
            predictions.append(pred)
    
    # Create submission
    submission = pd.DataFrame({
        'id': test['id'],
        'winner': predictions
    })
    submission.to_csv("submission.csv", index=False)
    print("Submission file created successfully!")

if __name__ == "__main__":
    main()

import os
import gc
from dataclasses import dataclass
from typing import List, Optional, Tuple
import warnings
from tqdm.auto import tqdm
import pandas as pd
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import transformers
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
from transformers import (
    GemmaTokenizerFast,
    GemmaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig
)

# Set environment variables for better memory management
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
warnings.simplefilter('ignore')

@dataclass
class PATHS:
    train_path: str = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
    test_path: str = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet'
    model_path: str = '/kaggle/input/googlegemma-7b/transformers/default/1/gemma-2b'
    output_dir: str = './gemma-finetuned'

@dataclass
class CFG:
    max_length: int = 256
    train_batch_size: int = 1
    eval_batch_size: int = 1
    num_epochs: int = 3
    learning_rate: float = 1e-4
    seed: int = 42
    gradient_accumulation_steps: int = 16
    warmup_ratio: float = 0.1
    valid_size: float = 0.1
    
    # LoRA specific configs
    lora_r: int = 8
    lora_alpha: int = 32
    lora_dropout: float = 0.1

def cleanup():
    """Aggressive cleanup of GPU memory"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def set_seed(seed: int = 42) -> None:
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(model.device)
        
        outputs = model(**inputs)
        loss = outputs.loss
        
        return (loss, outputs) if return_outputs else loss

def prepare_training_data(df: pd.DataFrame, tokenizer: GemmaTokenizerFast, max_length: int) -> Dataset:
    def prepare_features(examples):
        texts = [
            f"<prompt>: {prompt}\n\n<response_a>: {resp_a}\n\n<response_b>: {resp_b}"
            for prompt, resp_a, resp_b in zip(
                examples['prompt'],
                examples['response_a'],
                examples['response_b']
            )
        ]
        
        tokenized = tokenizer(
            texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )
        
        labels = [0 if winner == 'model_a' else 1 for winner in examples['winner']]
        
        return {
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': labels
        }
    
    dataset = Dataset.from_pandas(df)
    return dataset.map(
        prepare_features,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Processing dataset"
    )

def create_peft_config():
    return LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=CFG.lora_r,
        lora_alpha=CFG.lora_alpha,
        lora_dropout=CFG.lora_dropout,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        modules_to_save=["score"]
    )

def train():
    set_seed(CFG.seed)
    cleanup()
    
    if torch.cuda.is_available():
        torch.cuda.set_device(0)
    
    print("Loading data...")
    train_df = pl.read_parquet(PATHS.train_path).to_pandas()
    
    train_df, valid_df = train_test_split(
        train_df, 
        test_size=CFG.valid_size, 
        random_state=CFG.seed,
        stratify=train_df['winner']
    )
    
    print("Loading tokenizer and model...")
    tokenizer = GemmaTokenizerFast.from_pretrained(
        PATHS.model_path,
        padding_side="right",
        truncation_side="right",
    )
    tokenizer.pad_token = tokenizer.eos_token
    
    # Configure 4-bit quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
    
    # Load model with quantization
    print("Loading model with 4-bit quantization...")
    model = GemmaForSequenceClassification.from_pretrained(
        PATHS.model_path,
        num_labels=2,
        quantization_config=bnb_config,
        device_map="auto",
        use_cache=False
    )
    model.config.pad_token_id = model.config.eos_token_id
    
    print("Preparing model for LoRA...")
    peft_config = create_peft_config()
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    
    print("Preparing datasets...")
    train_dataset = prepare_training_data(train_df, tokenizer, CFG.max_length)
    valid_dataset = prepare_training_data(valid_df, tokenizer, CFG.max_length)
    cleanup()
    
    training_args = TrainingArguments(
        output_dir=PATHS.output_dir,
        num_train_epochs=CFG.num_epochs,
        per_device_train_batch_size=CFG.train_batch_size,
        per_device_eval_batch_size=CFG.eval_batch_size,
        gradient_accumulation_steps=CFG.gradient_accumulation_steps,
        warmup_ratio=CFG.warmup_ratio,
        learning_rate=CFG.learning_rate,
        fp16=True,
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to='none',
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        gradient_checkpointing=True,
    )
    
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding=True,
            max_length=CFG.max_length
        ),
    )
    
    print("Starting training...")
    trainer.train()
    
    model.save_pretrained(PATHS.output_dir)
    tokenizer.save_pretrained(PATHS.output_dir)
    print(f"Model saved to {PATHS.output_dir}")
    
    return model, tokenizer

def predict(model, tokenizer, test_df: pd.DataFrame):
    test_dataset = prepare_training_data(test_df, tokenizer, CFG.max_length)
    
    trainer = CustomTrainer(
        model=model,
        args=TrainingArguments(
            output_dir="./",
            per_device_eval_batch_size=CFG.eval_batch_size,
            report_to='none'
        ),
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding=True,
            max_length=CFG.max_length
        ),
    )
    
    predictions = trainer.predict(test_dataset).predictions
    return np.argmax(predictions, axis=1)

def main():
    print(f'PyTorch version: {torch.__version__}')
    print(f'Transformers version: {transformers.__version__}')
    
    try:
        model, tokenizer = train()
        
        test_df = pl.read_parquet(PATHS.test_path).to_pandas()
        predictions = predict(model, tokenizer, test_df)
        
        test_df['winner'] = ['model_a' if pred == 0 else 'model_b' for pred in predictions]
        test_df[['id', 'winner']].to_csv('submission.csv', index=False)
        print("Predictions saved to submission.csv")
        
    finally:
        cleanup()

if __name__ == "__main__":
    main()

!pip install -q -U bitsandbytes
!pip install -q -U accelerate
!pip install -q -U peft

In [10]:
import os
import gc
from dataclasses import dataclass
from typing import List, Optional, Tuple
import warnings
from tqdm.auto import tqdm
import pandas as pd
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import transformers
from datasets import Dataset
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType
)
from transformers import (
    GemmaTokenizerFast,
    GemmaForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# Suppress warnings and configure environment
warnings.simplefilter('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

@dataclass
class PATHS:
    train_path: str = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet'
    test_path: str = '/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet'
    model_path: str = '/kaggle/input/googlegemma-7b/transformers/default/1/gemma-2b'
    output_dir: str = './gemma-finetuned'

@dataclass
class CFG:
    max_length: int = 256
    train_batch_size: int = 1
    eval_batch_size: int = 1
    num_epochs: int = 3
    learning_rate: float = 1e-4
    seed: int = 42
    gradient_accumulation_steps: int = 16
    warmup_ratio: float = 0.1
    valid_size: float = 0.1
    
    # LoRA specific configs
    lora_r: int = 8
    lora_alpha: int = 32
    lora_dropout: float = 0.1

def cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def set_seed(seed: int = 42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        device = next(model.parameters()).device
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v 
                 for k, v in inputs.items()}
        
        outputs = model(**inputs)
        loss = outputs.loss
        
        return (loss, outputs) if return_outputs else loss

def prepare_training_data(df: pd.DataFrame, tokenizer: GemmaTokenizerFast, max_length: int) -> Dataset:
    def prepare_features(examples):
        texts = [
            f"<prompt>: {prompt}\n\n<response_a>: {resp_a}\n\n<response_b>: {resp_b}"
            for prompt, resp_a, resp_b in zip(
                examples['prompt'],
                examples['response_a'],
                examples['response_b']
            )
        ]
        
        tokenized = tokenizer(
            texts,
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )
        
        labels = [0 if winner == 'model_a' else 1 for winner in examples['winner']]
        
        return {
            'input_ids': tokenized['input_ids'],
            'attention_mask': tokenized['attention_mask'],
            'labels': labels
        }
    
    dataset = Dataset.from_pandas(df)
    return dataset.map(
        prepare_features,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Processing dataset",
        num_proc=1
    )

def create_peft_config():
    return LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=CFG.lora_r,
        lora_alpha=CFG.lora_alpha,
        lora_dropout=CFG.lora_dropout,
        bias="none",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        modules_to_save=["score"],
        inference_mode=False
    )

def train():
    set_seed(CFG.seed)
    cleanup()
    
    print("Loading data...")
    train_df = pl.read_parquet(PATHS.train_path).to_pandas()
    
    train_df, valid_df = train_test_split(
        train_df, 
        test_size=CFG.valid_size, 
        random_state=CFG.seed,
        stratify=train_df['winner']
    )
    
    print("Loading tokenizer and model...")
    tokenizer = GemmaTokenizerFast.from_pretrained(
        PATHS.model_path,
        padding_side="right",
        truncation_side="right",
    )
    tokenizer.pad_token = tokenizer.eos_token
    
    # Load model with bf16 instead of fp16
    model = GemmaForSequenceClassification.from_pretrained(
        PATHS.model_path,
        num_labels=2,
        torch_dtype=torch.bfloat16,  # Changed to bfloat16
        device_map="auto",
        use_cache=False
    )
    model.config.pad_token_id = model.config.eos_token_id
    
    print("Preparing model for LoRA...")
    peft_config = create_peft_config()
    model = get_peft_model(model, peft_config)
    
    model.print_trainable_parameters()
    
    print("Preparing datasets...")
    train_dataset = prepare_training_data(train_df, tokenizer, CFG.max_length)
    valid_dataset = prepare_training_data(valid_df, tokenizer, CFG.max_length)
    cleanup()
    
    training_args = TrainingArguments(
        output_dir=PATHS.output_dir,
        num_train_epochs=CFG.num_epochs,
        per_device_train_batch_size=CFG.train_batch_size,
        per_device_eval_batch_size=CFG.eval_batch_size,
        gradient_accumulation_steps=CFG.gradient_accumulation_steps,
        warmup_ratio=CFG.warmup_ratio,
        learning_rate=CFG.learning_rate,
        bf16=True,  # Changed to bf16
        fp16=False,  # Disabled fp16
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to='none',
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        gradient_checkpointing=True,
        optim="adamw_torch",
        adam_beta1=0.9,
        adam_beta2=0.999,
        adam_epsilon=1e-8,
        max_grad_norm=1.0,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        ddp_find_unused_parameters=False,
    )
    
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding=True,
            max_length=CFG.max_length
        ),
    )
    
    print("Starting training...")
    trainer.train()
    
    model.save_pretrained(PATHS.output_dir)
    tokenizer.save_pretrained(PATHS.output_dir)
    print(f"Model saved to {PATHS.output_dir}")
    
    return model, tokenizer

def predict(model, tokenizer, test_df: pd.DataFrame):
    test_dataset = prepare_training_data(test_df, tokenizer, CFG.max_length)
    
    trainer = CustomTrainer(
        model=model,
        args=TrainingArguments(
            output_dir="./",
            per_device_eval_batch_size=CFG.eval_batch_size,
            report_to='none'
        ),
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(
            tokenizer=tokenizer,
            padding=True,
            max_length=CFG.max_length
        ),
    )
    
    predictions = trainer.predict(test_dataset).predictions
    return np.argmax(predictions, axis=1)

def main():
    print(f'PyTorch version: {torch.__version__}')
    print(f'Transformers version: {transformers.__version__}')
    
    try:
        model, tokenizer = train()
        
        test_df = pl.read_parquet(PATHS.test_path).to_pandas()
        predictions = predict(model, tokenizer, test_df)
        
        test_df['winner'] = ['model_a' if pred == 0 else 'model_b' for pred in predictions]
        test_df[['id', 'winner']].to_csv('submission.csv', index=False)
        print("Predictions saved to submission.csv")
        
    finally:
        cleanup()

if __name__ == "__main__":
    main()

PyTorch version: 2.5.1+cu121
Transformers version: 4.47.0
Loading data...
Loading tokenizer and model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/googlegemma-7b/transformers/default/1/gemma-2b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preparing model for LoRA...
trainable params: 1,847,296 || all params: 2,508,023,808 || trainable%: 0.0737
Preparing datasets...


Processing dataset:   0%|          | 0/43595 [00:00<?, ? examples/s]

Processing dataset:   0%|          | 0/4844 [00:00<?, ? examples/s]

Starting training...


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward)