In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install Dependencies

In [2]:
!pip install peft \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files --quiet

# Library Imports

In [3]:
from dataclasses import dataclass
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

import polars as pl
import time
import torch

# Configurations

In [4]:
@dataclass
class Config:
    qwen_dir='/kaggle/input/qwen2.5/transformers/1.5b-instruct/1'
    max_length=2048
    device=torch.device('cuda')

cfg = Config()

# Load Data

In [5]:
def load_data(file_path):
    return pl.read_parquet(file_path).to_pandas()

# Preprocess Data

In [6]:
def preprocess_data(df):
    df.prompt = df.prompt.fillna('')
    df.response_a = df.response_a.fillna('')
    df.response_b = df.response_b.fillna('')
    return df

# Initialize Model and Tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained(cfg.qwen_dir)
model = AutoModelForCausalLM.from_pretrained(cfg.qwen_dir)
model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

# Tokenise Data

In [8]:
instruction = """In the text provided for you below, PROMPT is the question presented; MODEL_A is the response from the first model; MODEL_B is the response from the second model. Please select the best answer from the two responses provided. If the first answer is better, return "model_a"; if the second answer is better, return "model_b"."""

def tokenize_data(df):
    tokenised_data = []
    ids = []

    for idx in range(len(df)):

        rec = df.iloc[idx,:]

        prompt = 'PROMPT: ' + rec['prompt']
        model_a = 'MODEL_A: ' + rec['response_a']
        model_b = 'MODEL_B: ' + rec['response_b']
        text = prompt + model_a + model_b

        messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": text}
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    
        tokenised_datum = tokenizer(
            [text],
            return_tensors="pt",
            padding=True,
            truncation=True
        ).to(cfg.device)

        tokenised_data.append(tokenised_datum)
        ids.append(rec['id'])

    return tokenised_data, ids

# Method to Make Prediction

In [9]:
def inference(tokenised_datum):
    model.to(cfg.device)
    input_ids = tokenised_datum.input_ids

    generated_ids = model.generate(
        input_ids,
        max_new_tokens=5
    )

    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(input_ids, generated_ids)
    ]
    
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response

# Train Model

In [10]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

train = load_data('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet')
train = train.head(25000)
train = preprocess_data(train)
tokenised_data_train, ids = tokenize_data(train)
predictions_train = []

for tokenised_datum_train in tokenised_data_train:
    try:
        torch.cuda.empty_cache()  # Clears GPU memory
        response = inference(tokenised_datum_train)
        predictions_train.append(response)
    except Exception as e:
        print(f"An error occurred: {e}")
        predictions_train.append("model_a")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


An error occurred: CUDA out of memory. Tried to allocate 9.18 GiB. GPU 0 has a total capacity of 15.89 GiB of which 7.37 GiB is free. Process 2695 has 8.52 GiB memory in use. Of the allocated memory 8.15 GiB is allocated by PyTorch, and 86.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
An error occurred: CUDA out of memory. Tried to allocate 11.55 GiB. GPU 0 has a total capacity of 15.89 GiB of which 7.09 GiB is free. Process 2695 has 8.79 GiB memory in use. Of the allocated memory 8.39 GiB is allocated by PyTorch, and 112.31 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://p

# Make Prediction

In [11]:
test = load_data('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet')
test = preprocess_data(test)
tokenised_data, ids = tokenize_data(test)
predictions = []

for tokenised_datum in tokenised_data:
    try:
        response = inference(tokenised_datum)
        predictions.append(response)
    except Exception as e:
        print(f"An error occurred: {e}")
        predictions.append("model_a")

In [12]:
submission = pd.DataFrame({
    'id': ids,
    'winner': predictions
})
submission.to_csv("submission.csv",index=False)
submission

Unnamed: 0,id,winner
0,327228,model_a
1,1139415,model_a
2,1235630,model_a
