In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
# Print one path per line, in the order os.walk yields them.
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's sample submission and preview its first rows
# to see which columns the final CSV is expected to contain.
ss = pd.read_csv("/kaggle/input/wsdm-cup-multilingual-chatbot-arena/sample_submission.csv")
ss.head()

In [None]:
from datasets import Dataset

# Read the competition's train and test splits from their parquet files.
train_df = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet')
test_df = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet')

# Expand every comparison into two classification samples: the response_a
# sample first, immediately followed by the response_b sample, so the two
# rows belonging to one comparison stay adjacent.
# NOTE(review): when the winner is 'model_a' or 'model_b', both samples of a
# comparison receive the same label (0 for model_a, 1 for model_b) although
# each sample only contains one of the two responses -- confirm this is the
# intended formulation.
train_samples = []
comparison_columns = zip(
    train_df['prompt'],
    train_df['response_a'],
    train_df['response_b'],
    train_df['winner'],
)
for prompt_text, answer_a, answer_b, verdict in comparison_columns:
    train_samples.extend([
        {'prompt': prompt_text, 'response': answer_a, 'winner': 0 if verdict == 'model_a' else 1},
        {'prompt': prompt_text, 'response': answer_b, 'winner': 1 if verdict == 'model_b' else 0},
    ])

# Pivot the list of samples into one list per column and wrap it in a
# Hugging Face Dataset.
train_dataset = Dataset.from_dict({
    field: [sample[field] for sample in train_samples]
    for field in ('prompt', 'response', 'winner')
})

In [None]:
# Preview the first rows of the raw training frame.
train_df.head()

In [None]:
# Show the distinct values of the training frame's "language" column.
train_df["language"].unique()

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
# Use a subset of the dataset for quick testing
#train_dataset = train_dataset.select(range(2000))  # Select first 2000 examples

In [None]:
# Display the training dataset object as the cell output.
train_dataset

In [None]:
from transformers import AutoModelForSequenceClassification
import torch.nn.utils.weight_norm
import torch.nn as nn

class WeightNormModel(AutoModelForSequenceClassification):
    """Sequence-classification loader that weight-normalises the classifier head.

    Use it exactly like the auto class::

        model = WeightNormModel.from_pretrained(path, num_labels=2)

    The weight normalisation is applied in ``from_pretrained`` rather than in
    ``__init__`` because transformers' auto classes cannot be instantiated
    through ``__init__`` and their ``from_pretrained`` returns an instance of
    the concrete architecture class, so an ``__init__`` override on this
    subclass would never run on the loaded model.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Load the model as the auto class does, then wrap its classifier.

        Args:
            *args, **kwargs: forwarded unchanged to
                ``AutoModelForSequenceClassification.from_pretrained``.

        Returns:
            The loaded model with ``classifier`` replaced by its
            weight-normalised version.
        """
        model = super().from_pretrained(*args, **kwargs)
        # Applying weight normalization on the classifier layer.
        # Assumes the loaded architecture exposes its head as `.classifier`
        # (the original code made the same assumption) -- TODO confirm for
        # non-BERT checkpoints.
        model.classifier = torch.nn.utils.weight_norm(model.classifier)
        return model

# Load your model with weight normalization
#model = WeightNormModel.from_pretrained(
    #"/kaggle/input/bert-model/bert_base_uncased_model",
    #num_labels=2,
#)

# Seed torch *before* the model is built: if the checkpoint does not contain a
# 2-label classification head, that head is randomly initialised inside
# from_pretrained, so seeding only afterwards would leave it different on
# every run.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    "/kaggle/input/model-ml/Model_ml_bert",  # You can choose other model variants as well
    num_labels=2,
    id2label={0: "model_a", 1: "model_b"},  # Labels corresponding to models
    label2id={"model_a": 0, "model_b": 1},
)

# Freeze all parameters in the base model
for param in model.base_model.parameters():
    param.requires_grad = False

# Unfreeze the last 3 layers of the transformer.
# The start index is clamped at 0 so that an encoder with fewer than 3 layers
# unfreezes all of its layers instead of wrapping around to a negative index.
encoder_layers = model.base_model.encoder.layer
num_layers = len(encoder_layers)
for layer in encoder_layers[max(num_layers - 3, 0):]:
    for param in layer.parameters():
        param.requires_grad = True

In [None]:
# Add up the element counts of every parameter that will receive gradients.
total_parameters = 0
for param in model.parameters():
    if param.requires_grad:
        total_parameters += param.numel()
print(f"Number of parameters that require gradients: {total_parameters}")

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from transformers import AutoTokenizer
from datasets import DatasetDict
import torch
import random

# Seed every random number generator used here (Python, NumPy, torch CPU and
# all CUDA devices) with the same value for reproducibility
seed = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed)

# Ensure deterministic behavior: turn cuDNN auto-tuning off and request
# deterministic cuDNN algorithms
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# Tokenizer: loaded from a local directory; it is used below to tokenize the
# train and test samples and is also handed to the data collator and Trainer.
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/token-ml/tokenzer_ml_bert")

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of (prompt, response) pairs as two-segment inputs.

    Args:
        examples: mapping with a 'prompt' and a 'response' entry, each a list
            of strings of the same length (the batch handed over by a
            batched ``Dataset.map``).

    Returns:
        Whatever the tokenizer returns for the batch: truncated but
        *unpadded* encodings.
    """
    # No `padding` here: padding inside a batched map pads every example to the
    # longest sequence of its map batch; the DataCollatorWithPadding passed to
    # the Trainer pads each training/prediction batch dynamically instead.
    # No `return_tensors` either: the mapped dataset stores plain lists, so
    # building torch tensors first is wasted work.
    return tokenizer(examples['prompt'], examples['response'], truncation=True)

# Tokenize the dataset and expose the 'winner' column under the name 'labels'
tokenized_all = train_dataset.map(
    lambda examples: {
        **tokenize_function(examples),
        "labels": examples["winner"],
    },
    batched=True,
)

# Split the tokenized data into training and evaluation sets (80% / 20%).
#
# The dataset holds two consecutive rows per comparison (same prompt, one row
# per response). A plain row-level split can put one row of a comparison in
# the training set and its sibling in the evaluation set, which leaks the
# prompt into evaluation and inflates the metric used to pick the best
# checkpoint. So whole comparisons are assigned to one side whenever the rows
# are verifiably paired.
all_prompts = list(tokenized_all["prompt"])
rows_are_paired = (
    len(all_prompts) % 2 == 0
    and all_prompts[0::2] == all_prompts[1::2]
)

if rows_are_paired:
    n_pairs = len(all_prompts) // 2
    shuffled_pairs = np.random.default_rng(42).permutation(n_pairs)
    n_eval_pairs = int(round(0.2 * n_pairs))

    def _rows_of(pair_ids):
        """Return the row indices (both rows) of the given comparison indices."""
        return [int(row) for pair in pair_ids for row in (2 * pair, 2 * pair + 1)]

    tokenized_eval = tokenized_all.select(_rows_of(shuffled_pairs[:n_eval_pairs]))
    tokenized_train = tokenized_all.select(_rows_of(shuffled_pairs[n_eval_pairs:]))
else:
    # Rows are not laid out as adjacent pairs (e.g. the dataset was subsampled
    # differently): fall back to the plain row-level split.
    split_datasets = tokenized_all.train_test_split(test_size=0.2, seed=42)
    tokenized_train = split_datasets["train"]
    tokenized_eval = split_datasets["test"]

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: a pair that unpacks into (per-class scores, labels); the
            scores are reduced with an argmax over axis 1.

    Returns:
        dict with a single "accuracy" entry: the fraction of samples whose
        highest-scoring class equals the label.
    """
    logits, gold_labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)
    hits = predicted_classes == gold_labels
    return {"accuracy": hits.mean()}


# Prepare training arguments, grouped by concern
optimisation_settings = dict(
    learning_rate=1e-4,
    per_device_train_batch_size=150,
    num_train_epochs=5,
    weight_decay=0.01,
)
# Evaluate and checkpoint once per epoch, and reload the best checkpoint
# when training finishes.
checkpoint_settings = dict(
    output_dir="./model_output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
logging_settings = dict(
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Disable logging to WandB
)
training_args = TrainingArguments(
    **optimisation_settings,
    **checkpoint_settings,
    **logging_settings,
)

# Data collator: pads the samples of each batch using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize Trainer with the model, both dataset splits and the accuracy metric
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

# Train the model (the returned training summary is the cell's output)
trainer.train()

In [None]:
# Prepare test dataset for predictions: every test row becomes two samples
# sharing the same id and prompt -- the response_a sample first, then the
# response_b sample.
test_samples = []
test_columns = zip(
    test_df['id'],
    test_df['prompt'],
    test_df['response_a'],
    test_df['response_b'],
)
for row_id, prompt_text, answer_a, answer_b in test_columns:
    for answer in (answer_a, answer_b):
        test_samples.append({'id': row_id, 'prompt': prompt_text, 'response': answer})

# Pivot the samples into one list per column and wrap them in a Hugging Face Dataset
test_dataset = Dataset.from_dict({
    field: [sample[field] for sample in test_samples]
    for field in ('id', 'prompt', 'response')
})

In [None]:
# Tokenize the test dataset in batches with the same tokenize_function that
# was applied to the training data.
tokenized_test = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Step 3: Make predictions
predictions = trainer.predict(test_dataset=tokenized_test)
# Pick, for every test sample, the index of its highest-scoring class.
class_scores = predictions.predictions
predicted_labels = np.argmax(class_scores, axis=1)

In [None]:
# Prepare submission DataFrame

# Read the id column once. Evaluating `tokenized_test['id']` inside a per-row
# loop can re-read the whole column on every iteration, which makes building
# the submission quadratic in the number of test samples.
test_ids = list(tokenized_test['id'])
if len(test_ids) != len(predicted_labels):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(test_ids)} test samples"
    )

# Map each predicted class index to its label name (0 -> model_a, otherwise model_b).
submission_df = pd.DataFrame({
    'id': test_ids,
    'winner': ['model_a' if label == 0 else 'model_b' for label in predicted_labels],
})

# Each id appears on two rows (one per response): group by 'id' and keep the
# first predicted winner for each unique id.
submission_df = submission_df.groupby('id', as_index=False).first()

# Save as CSV
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Display the final submission frame as the cell output.
submission_df