In [22]:
from transformers import AutoTokenizer

from dotenv import load_dotenv
load_dotenv()

from transformers import AutoTokenizer


class SequenceProcessor:
    """Turn (prompt, response A, response B) rows into one templated token sequence.

    The assembled sequence has the layout::

        [BOS] "# Prompt\\n" <prompt> "\\n\\n# Response A\\n" <response_a>
        "\\n\\n# Response B\\n" <response_b> "\\n\\n# Which response is better?" [EOS]

    When the three texts do not fit in the token budget, each one is shortened
    by keeping its head and tail and joining them with an ellipsis marker.
    """

    # Tokens held back from ``max_length`` on top of the template tokens
    # (leaves room for BOS/EOS plus slack).
    BUFFER_TOKENS = 3
    # Share of the content budget reserved for the prompt / for each response.
    PROMPT_SHARE = 0.2
    RESPONSE_SHARE = 0.4

    def __init__(
        self,
        model_name: str,
        max_length: int = 1600,
        truncation_side: str = "left",
        padding_side: str = "left",
        add_eos_token: bool = False,
    ):
        """Load the tokenizer and pre-tokenize the fixed template pieces.

        Args:
            model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
                If it contains "qwen" (case-insensitive) no BOS token is emitted.
            max_length: Length limit from which the content token budget is derived.
            truncation_side: Forwarded to ``AutoTokenizer.from_pretrained``.
            padding_side: Forwarded to ``AutoTokenizer.from_pretrained``.
            add_eos_token: Stored on the tokenizer; when true, ``tokenize``
                appends the tokenizer's EOS id to every sequence.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side=padding_side,
            truncation_side=truncation_side,
        )
        self.tokenizer.add_eos_token = add_eos_token
        self.model_name = model_name

        self.max_length = max_length

        # Template parts (only tokenized once, during initialization).
        self.templates = {
            "start": self.tokenizer.encode("# Prompt\n", add_special_tokens=False),
            "response_a": self.tokenizer.encode("\n\n# Response A\n", add_special_tokens=False),
            "response_b": self.tokenizer.encode("\n\n# Response B\n", add_special_tokens=False),
            "question": self.tokenizer.encode("\n\n# Which response is better?", add_special_tokens=False),
            "ellipsis": self.tokenizer.encode(" [...] ", add_special_tokens=False),
        }

        # Length of the template pieces that are always present. The ellipsis
        # is excluded because it only appears inside truncated sections and is
        # paid for out of that section's own budget.
        self.template_length = sum(
            len(tokens) for name, tokens in self.templates.items() if name != "ellipsis"
        )

    def truncate_if_needed(self, tokens, max_tokens):
        """Shorten ``tokens`` to at most ``max_tokens`` ids.

        Args:
            tokens: List of token ids.
            max_tokens: Maximum number of ids allowed in the result. Negative
                values are treated as 0.

        Returns:
            ``tokens`` unchanged if it already fits. Otherwise the first and
            last ``keep`` ids joined by the ellipsis marker, where ``keep`` is
            half of what remains after paying for the ellipsis. If the budget
            is too small to hold the ellipsis plus at least one id on each
            side, the first ``max_tokens`` ids are returned instead.
        """
        if len(tokens) <= max_tokens:
            return tokens

        max_tokens = max(max_tokens, 0)
        ellipsis = self.templates["ellipsis"]
        keep_tokens = (max_tokens - len(ellipsis)) // 2

        # With keep_tokens == 0, ``tokens[-0:]`` would be the *whole* list and
        # the result would grow instead of shrink; negative values slice from
        # the wrong end. Fall back to a plain head cut in both cases.
        if keep_tokens <= 0:
            return tokens[:max_tokens]

        return tokens[:keep_tokens] + ellipsis + tokens[-keep_tokens:]

    def _assemble(self, prompt_tokens, response_a_tokens, response_b_tokens):
        """Wrap the three token lists in the template and build the output dict."""
        bos_id = self.tokenizer.bos_token_id
        # Qwen models get no BOS; also never emit a ``None`` id when the
        # tokenizer simply has no BOS token defined.
        skip_bos = bos_id is None or "qwen" in self.model_name.lower()

        final_sequence = (
            ([] if skip_bos else [bos_id])
            + self.templates["start"]
            + list(prompt_tokens)
            + self.templates["response_a"]
            + list(response_a_tokens)
            + self.templates["response_b"]
            + list(response_b_tokens)
            + self.templates["question"]
        )

        eos_id = self.tokenizer.eos_token_id
        if getattr(self.tokenizer, "add_eos_token", False) and eos_id is not None:
            final_sequence.append(eos_id)

        return {
            "input_ids": final_sequence,
            "attention_mask": [1] * len(final_sequence),
            "length": len(final_sequence),
        }

    def tokenize(self, row, tta=False):
        """Tokenize one example into a templated id sequence.

        Args:
            row: Mapping with string values under "prompt", "response_a" and
                "response_b" (for example a DataFrame row).
            tta: When true, the two responses swap positions in the template.

        Returns:
            Dict with "input_ids" (list of ids), "attention_mask" (all ones,
            same length) and "length" (number of ids).
        """
        prompt = row["prompt"]
        if tta:
            response_a, response_b = row["response_b"], row["response_a"]
        else:
            response_a, response_b = row["response_a"], row["response_b"]

        # Content budget: what is left after the fixed template tokens and the
        # reserved buffer. Clamped so a too-small max_length cannot go negative.
        available_tokens = max(0, self.max_length - self.template_length - self.BUFFER_TOKENS)

        # Tokenize all three texts in a single tokenizer call.
        enc = self.tokenizer([prompt, response_a, response_b], add_special_tokens=False)["input_ids"]
        prompt_tokens, response_a_tokens, response_b_tokens = enc[0], enc[1], enc[2]

        total_length = len(prompt_tokens) + len(response_a_tokens) + len(response_b_tokens)

        if total_length > available_tokens:
            # 20-40-40 split between prompt and the two responses.
            prompt_max = int(available_tokens * self.PROMPT_SHARE)
            response_max = int(available_tokens * self.RESPONSE_SHARE)

            # Whatever the prompt does not use of its share goes to the responses.
            prompt_needed = min(len(prompt_tokens), prompt_max)
            excess_tokens = prompt_max - prompt_needed

            # Half of the excess each; the odd leftover token goes to response B.
            response_a_max = response_max + excess_tokens // 2
            response_b_max = response_max + excess_tokens - (excess_tokens // 2)

            prompt_tokens = self.truncate_if_needed(prompt_tokens, prompt_needed)
            response_a_tokens = self.truncate_if_needed(
                response_a_tokens, min(len(response_a_tokens), response_a_max)
            )
            response_b_tokens = self.truncate_if_needed(
                response_b_tokens, min(len(response_b_tokens), response_b_max)
            )

        return self._assemble(prompt_tokens, response_a_tokens, response_b_tokens)

In [23]:
# Build the processor for the merged phi4 checkpoint with a 4096 length limit;
# an EOS token is appended to every sequence.
processor = SequenceProcessor(
    model_name="Gladiator/wsdm_cup_phi4_stage1_lora_700_merged",
    max_length=4096,
    add_eos_token=True,
)

In [24]:
import pandas as pd
# Load the combined dataset and keep only the rows whose split is 'valid'.
# .copy() detaches the filtered frame from the full table so that adding
# columns to `df` in later cells does not raise SettingWithCopyWarning.
df = pd.read_parquet("../data/pseudo_labeled_final_data/train_stage2_stage1_combined_v1.parquet")
df = df[df['split'] == 'valid'].copy()
df

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,split,source
0,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian,valid,current_comp
1,00172aab8af10cc0648041c94a41eeab7d9caaea7717a3...,kısaca dopamin detoksu,"Dopamin detoksu, beyninizin dopamin seviyeleri...","Dopamin detoksunun amacı, aşırı uyarılmaya ned...",model_b,grok-2-2024-08-13,gemini-1.5-pro-002,Finnish,valid,current_comp
2,00314ee979ffc9e4e4dd3716d02c401ba117d00640a3e1...,"In Webgpu/WGSL, what are override variables?",In **WebGPU** using **WGSL** (WebGPU Shading L...,"As of my last update in 2023, WebGPU and WGSL ...",model_a,chatgpt-4o-latest-20240903,gpt-4-0125-preview,English,valid,current_comp
3,003aa0a76eb58c06adce6e9db59ad1da73929a431f3f23...,"nêu lý do chọn đây là đề tài nghiên cứu ""Đánh...",Dưới đây là một số lý do có thể được sử dụng đ...,"Chọn đề tài ""Đánh giá tác động của Social Medi...",model_a,llama-3.1-405b-instruct-bf16,reka-core-20240904,Vietnamese,valid,current_comp
4,00448e4160ceb9578584293b6aee5d680c8e6dbbcd1d13...,package ex3_Excercise;\n//버블정렬\npublic class T...,```java\npackage ex3_Excercise;\n\npublic clas...,```java\npackage ex3_Excercise;\n\npublic clas...,model_a,gemma-2-9b-it,command-r-08-2024,English,valid,current_comp
...,...,...,...,...,...,...,...,...,...,...
4834,ffc2a1a209a1053658645fc60dd29249392c7e0040ae5d...,Are there decent AI therapy options yet? If so...,"As an AI, I cannot provide medical advice, and...",The fascinating intersection of AI and mental ...,model_b,gemma-2-9b-it,llama-3.1-nemotron-70b-instruct,English,valid,current_comp
4835,ffd929efef61e5b353180a6e790d35896363ecf02973ee...,Imagine you have to get two goats and a cabbag...,To solve the problem of getting two goats and ...,"To solve this classic problem, we'll break it ...",model_a,o1-mini,llama-3.1-405b-instruct-bf16,English,valid,current_comp
4836,ffe2645c0cdb2cbbb755ad0766cfc14663726619968c4e...,"翻译一下这句话：Builder Journey Experience, 优化开发者在网站及各...",Builder Journey Experience optimizes developer...,Here is a translation of the sentence:\n\n**Bu...,model_b,glm-4-plus,llama-3.1-nemotron-70b-instruct,Chinese,valid,current_comp
4837,ffe2a8b3cf149dc4ffd040623f4c8e4e9e87b98bd41b14...,Какое программное обеспечение развернуто на Уп...,На управляющих узлах OpenShift развернуто неск...,На Управляющих узлах (Master Nodes) OpenShift ...,model_a,gemini-1.5-flash-8b-001,yi-lightning,unknown,valid,current_comp


In [25]:
# Tokenize every row (tta left at its default); each cell of the new column
# holds the dict returned by processor.tokenize.
df["tokenized"] = df.apply(processor.tokenize, axis=1)

In [1]:
import pandas as pd

In [3]:
# Load the labeled parquet file and display it.
df = pd.read_parquet(
    "../data/pseudo_labeled_final_data/train_stage2_labeled_v1.parquet"
)
df

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,split,source,gemma9b,gemma27b,phi4,soft_labels
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak,train,current_comp,"[0.802734375, -0.159423828125]","[-0.728515625, -1.80859375]","[0.86328125, -0.49609375]","[0.985, 0.015]"
1,0000c48f341d489591e3bbeca1821926,¿Cómo hago una petición Ajax en django con jQu...,Para hacer una petición Ajax en Django con jQu...,¡Hola! ¡Claro que sí! Aquí te puedo ayudar con...,,Qwen/Qwen2-1.5B-Instruct,llama-2-13b-chat,Spanish,train,lmsys1m_vllm_gen,"[-0.40869140625, 0.94140625]","[-1.5546875, -0.9609375]","[-0.35595703125, 0.783203125]","[0.2412032986690225, 0.7587967013309777]"
2,0000d826867d47dabd7d4b110beea2e9,You are now an expert educator. You have acces...,Cognitive bias is a tendency of the mind to fo...,Cognitive Bias is a phenomenon that occurs whe...,model_a,fastchat-t5-3b,RWKV-4-Raven-14B,English,train,lmsys_33k,"[0.3466033935546875, 0.2843017578125]","[-1.45703125, -0.95703125]","[0.173828125, 0.28857421875]","[0.985, 0.015]"
3,0000f23b2f8049ac9d92c28d4817381e,Please act my sick old grandma who would read ...,"Oh dear, it sounds like you're looking for a c...","Oh, my dear sweet child, come closer and snugg...",,MiniMaxAI/MiniMax-Text-01,deepseek-ai/DeepSeek-V3,English,train,nicholas_broad_hf,"[0.16259765625, 0.3935546875]","[-1.421875, -1.01953125]","[0.25439453125, 0.1134033203125]","[0.45704432372858816, 0.5429556762714118]"
4,00010ed04b536f56ebe43eef1100c13906abea12bf9855...,You will be given a piece of news. Analyze it ...,Let's break down the news and analyze it accor...,"```json\n{\n ""contains_orgs"": true,\n ""orgs""...",model_a,gemma-2-27b-it,gemini-1.5-flash-002,Russian,valid,current_comp,"[1.048828125, -0.31005859375]","[-0.534423828125, -2.0859375]","[1.0, -0.515625]","[0.985, 0.015]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180693,fffe4d219bf14e50a7f29f64ee8a2fa0,puoi organizzare meglio questo documento latex...,"Certo, ecco il codice aggiornato con una maggi...",\documentclass{article}\n\n\usepackage[utf8]{i...,,vicuna-13b,microsoft/Phi-3-mini-4k-instruct,Italian,train,lmsys1m_vllm_gen,"[-0.564453125, 1.25]","[-1.6484375, -0.80078125]","[-0.40625, 0.83984375]","[0.18557792051464522, 0.8144220794853547]"
180694,fffe559651f241bdb473eb396ccdf042,zxcvzxcv,It looks like you've typed a random string of ...,It seems like you've entered a string of rando...,,MiniMaxAI/MiniMax-Text-01,meta-llama/Llama-3.1-405B-Instruct,Danish,train,nicholas_broad_hf,"[0.3017578125, 0.27392578125]","[-0.96875, -1.30859375]","[0.283203125, 0.07080078125]","[0.5340414940762446, 0.4659585059237554]"
180695,ffff059aea247f1dc7a09cfea55e00309b5b9a2e8cd9fc...,1000Tb 每秒传输10gb要多久才能完成,**1. 单位转换:**\n\n* 1000 Tb = 1000 * 1024 Gb = 1...,要计算传输 1000TB 数据以 10Gb/s 的速率需要多长时间，我们可以按以下步骤进行。...,model_a,gemini-1.5-flash-exp-0827,chatgpt-4o-latest-20240903,unknown,train,current_comp,"[0.701171875, 0.154296875]","[-1.6015625, -0.919921875]","[0.1044921875, 0.266357421875]","[0.985, 0.015]"
180696,ffff3da3bc7f4493b93b370bc0bab73e,I have submitted a paper on the topic of trust...,"Dear Reviewers,\n\nThank you for your valuable...",In response to the question about the meaning ...,,vicuna-33b,Nexusflow/Starling-LM-7B-beta,unknown,train,lmsys1m_vllm_gen,"[0.34814453125, 0.259521484375]","[-1.166015625, -1.3828125]","[0.4423828125, -0.06363677978515625]","[0.5544351263355298, 0.4455648736644703]"
