In [1]:
!pip install transformers peft accelerate bitsandbytes torch \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Processing /kaggle/input/lmsys-wheel-files/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl
Processing /kaggle/input/lmsys-wheel-files/torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl
Processing /kaggle/input/lmsys-wheel-files/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (from torch)
Processing /kaggle/input/lmsys-wheel-files/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (from torch)
Processing /kaggle/input/lmsys-wheel-files/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (from torch)
Processing /kaggle/input/lmsys-wheel-files/nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (from torch)
Processing /kaggle/input/lmsys-wheel-files/nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (from torch)
Processing /kaggle/input/lmsys-wheel-files/nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86

In [2]:
import os

# Set before any tokenizer is created. Presumably this switches off the
# Hugging Face `tokenizers` internal parallelism so it does not clash with
# the worker threads / `Dataset.map` used later - TODO confirm intent.
os.environ["TOKENIZERS_PARALLELISM"] = "False"

In [3]:
import torch
print(torch.__version__)

2.3.1+cu121


In [4]:
from collections import defaultdict
from typing import Dict, List, Optional, Union

import numpy as np
from transformers import AutoTokenizer


def parse_text(text: str) -> list:
    """Evaluate a serialized list literal, mapping the bare name ``null`` to ``""``.

    Args:
        text: Source text of a list literal, e.g. ``'["hi", null]'``.

    Returns:
        Whatever the expression evaluates to (a list for the inputs used in
        this notebook), with every bare ``null`` replaced by an empty string.
    """
    # NOTE(security): `eval` executes arbitrary Python expressions. It is kept
    # here to preserve the exact parsing behaviour; only feed it trusted data.
    namespace = {"null": ""}
    parsed = eval(text, namespace)
    return parsed


# Prompt templates, keyed by name, that TextProcessorV2 fills with `str.format`.
# Placeholders: {prompt}, {response_a}, {response_b} and, in the "*_token_num"
# variants, {a_word_num} / {b_word_num}. Despite the "words" wording in the
# text, TextProcessorV2 passes tokenizer token counts for those two fields.
# The literal "\n" inside each string is an escape for an extra newline.
templates_dict = {
    # Chat-turn markup (<bos>/<start_of_turn>/<end_of_turn>) with response
    # lengths; this is the default template used by TextProcessorV2.
    "chat_template_with_token_num": """<bos><start_of_turn>user
{prompt}\n
<response_a> ({a_word_num} words): {response_a}\n
<response_b> ({b_word_num} words): {response_b}
<end_of_turn>
<start_of_turn>model
""",
    # Chat-turn markup without response lengths.
    "chat_template": """<bos><start_of_turn>user
{prompt}\n
<response_a>: {response_a}\n
<response_b>: {response_b}
<end_of_turn>
<start_of_turn>model
""",
    # Plain layout terminated by <eos>. NOTE(review): byte-identical to
    # "template_with_eos" below - confirm whether that is intentional.
    "template": """{prompt}\n
<response_a>: {response_a}\n
<response_b>: {response_b}
<eos>
""",
    # Plain layout with response lengths and no terminator.
    "template_with_token_num": """{prompt}\n
<response_a> ({a_word_num} words): {response_a}\n
<response_b> ({b_word_num} words): {response_b}
""",
    # Plain layout with response lengths, terminated by <eos>.
    "template_with_token_num_eos": """{prompt}\n
<response_a> ({a_word_num} words): {response_a}\n
<response_b> ({b_word_num} words): {response_b}
<eos>
""",
    # Plain layout terminated by <eos> (same text as "template").
    "template_with_eos": """{prompt}\n
<response_a>: {response_a}\n
<response_b>: {response_b}
<eos>
""",
}


class TextProcessorV2:
    """Build length-budgeted, tokenized inputs for pairwise response comparison.

    Each row's prompt and two responses are joined into one text via a
    template from ``templates_dict``. When that text is longer than
    ``max_length`` tokens, the prompt and responses are truncated separately
    according to ``length_assign_method`` before the text is rebuilt.
    """

    # Tokens held back from the budget when parts are truncated separately.
    # Presumably a cushion because decoding truncated ids and re-tokenizing the
    # rebuilt text need not give exactly the same token count - TODO confirm.
    _TRUNCATION_SAFETY_MARGIN = 10

    def __init__(
        self,
        truncation_method: str,
        length_assign_method: str,
        tokenizer: "AutoTokenizer",
        max_length: int,
        chat_template: Optional[str] = None,
        get_labels: Optional[bool] = True,
    ):
        """
        Initializes the TextProcessor object.

        Args:
            truncation_method (str): Side from which over-long parts are cut,
                "left" or "right". It is only validated when a text actually
                needs truncation.
            length_assign_method (str): How the token budget is shared between
                prompt and responses: "method_1", "method_2", "method_3" or
                "method_4" (see ``__call__``).
            tokenizer (AutoTokenizer): The tokenizer object used for tokenization.
            max_length (int): The maximum number of tokens of a processed text.
            chat_template (Optional[str], optional): Key into ``templates_dict``.
                Defaults to None, which selects "chat_template_with_token_num".
            get_labels (Optional[bool], optional): Whether to derive labels from
                the batch. Defaults to True. [For Inference, set to False.]

        Raises:
            KeyError: If ``chat_template`` is not a key of ``templates_dict``.
        """
        template_key = (
            chat_template
            if chat_template is not None
            else "chat_template_with_token_num"
        )
        self.chat_template = templates_dict[template_key]

        self.truncation_method = truncation_method
        self.length_assign_method = length_assign_method
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.get_labels = get_labels

    @staticmethod
    def _join_turns(column: List[str]) -> List[str]:
        """Parse every serialized turn list and join its turns with spaces."""
        return [" ".join(parse_text(t)).strip() for t in column]

    def preprocess_batch(self, batch_data: Dict[str, List[str]]) -> tuple:
        """Flatten the serialized "prompt", "response_a" and "response_b" columns.

        Returns:
            tuple: ``(batch_prompt, batch_response_a, batch_response_b)``, three
            lists of plain strings aligned with the batch rows.
        """
        batch_prompt = self._join_turns(batch_data["prompt"])
        batch_response_a = self._join_turns(batch_data["response_a"])
        batch_response_b = self._join_turns(batch_data["response_b"])
        return batch_prompt, batch_response_a, batch_response_b

    def compute_token_num(self, text: str) -> int:
        """Return the number of tokens in ``text`` (no special tokens, no truncation)."""
        return len(
            self.tokenizer(text, add_special_tokens=False, truncation=False)[
                "input_ids"
            ]
        )

    def format_texts(
        self,
        batch_prompt: List[str],
        batch_response_a: List[str],
        batch_response_b: List[str],
        response_a_token_num: List[int],
        response_b_token_num: List[int],
    ) -> List[str]:
        """Fill the configured template once per row.

        The token counts are substituted for ``{a_word_num}`` / ``{b_word_num}``
        when the template contains them. ``str.format`` ignores unused keyword
        arguments, so the same call serves templates without those fields.
        """
        return [
            self.chat_template.format(
                prompt=prompt,
                response_a=response_a,
                response_b=response_b,
                a_word_num=a_num,
                b_word_num=b_num,
            )
            for prompt, response_a, response_b, a_num, b_num in zip(
                batch_prompt,
                batch_response_a,
                batch_response_b,
                response_a_token_num,
                response_b_token_num,
            )
        ]

    @staticmethod
    def _proportional_share(capacity, part, total) -> int:
        """Return ``int(capacity * part / total)``, or 0 when ``total`` is not positive.

        The guard prevents a division by zero when every part being compared
        is empty.
        """
        if total <= 0:
            return 0
        return int(capacity * part / total)

    def get_part_capacity(
        self,
        prompt_token_num: int,
        response_a_token_num: int,
        response_b_token_num: int,
        cur_max_token_capacity: int,
    ) -> tuple:
        """Split a token budget between the prompt and the two responses.

        Args:
            prompt_token_num: Token count of the untruncated prompt.
            response_a_token_num: Token count of the untruncated response A.
            response_b_token_num: Token count of the untruncated response B.
            cur_max_token_capacity: Tokens available for the three parts.

        Returns:
            tuple: ``(prompt_capacity, response_a_capacity, response_b_capacity)``.
            Response B always receives whatever remains after the other parts.

        Raises:
            ValueError: If ``length_assign_method`` is not "method_1",
                "method_2" or "method_3".
        """
        if self.length_assign_method == "method_1":
            # Keep the prompt; share the rest in proportion to response length.
            response_token_capacity = max(cur_max_token_capacity - prompt_token_num, 0)
            prompt_capacity = min(prompt_token_num, cur_max_token_capacity)
            response_a_capacity = self._proportional_share(
                response_token_capacity,
                response_a_token_num,
                response_a_token_num + response_b_token_num,
            )
            response_b_capacity = response_token_capacity - response_a_capacity
        elif self.length_assign_method == "method_2":
            # Share the whole budget in proportion to each part's length.
            total_tokens = (
                prompt_token_num + response_a_token_num + response_b_token_num
            )
            prompt_capacity = self._proportional_share(
                cur_max_token_capacity, prompt_token_num, total_tokens
            )
            response_a_capacity = self._proportional_share(
                cur_max_token_capacity, response_a_token_num, total_tokens
            )
            response_b_capacity = (
                cur_max_token_capacity - prompt_capacity - response_a_capacity
            )
        elif self.length_assign_method == "method_3":
            # Keep the prompt; split the rest evenly between the responses.
            response_token_capacity = max(cur_max_token_capacity - prompt_token_num, 0)
            prompt_capacity = min(prompt_token_num, cur_max_token_capacity)
            response_a_capacity = response_token_capacity // 2
            response_b_capacity = response_token_capacity - response_a_capacity
        else:
            raise ValueError("Method not supported")
        return prompt_capacity, response_a_capacity, response_b_capacity

    def _truncate_part(self, text: str, capacity) -> str:
        """Cut ``text`` to at most ``capacity`` tokens and decode it back to a string.

        Truncation uses the tokenizer's current ``truncation_side``; negative
        capacities are treated as 0.
        """
        encoded = self.tokenizer(
            text,
            max_length=max(capacity, 0),
            truncation=True,
            add_special_tokens=False,
        )
        return self.tokenizer.decode(encoded["input_ids"]).strip()

    def _encode_full(self, text: str):
        """Tokenize an already length-checked text without truncation."""
        return self.tokenizer(
            text,
            max_length=self.max_length,
            truncation=False,
            add_special_tokens=False,
        )

    def __call__(self, batch_data):
        """
        Preprocesses the text data in the batch and computes the token numbers for each part of the text.
        Then, it calculates the maximum token capacity for the data and assigns token capacities to different parts of the text based on the specified length assignment method.
        Finally, it truncates the text if necessary, encodes it into input_ids and attention_mask, and returns the final input.

        Args:
            batch_data (dict): A dictionary containing the batch data with keys "prompt", "response_a", and "response_b"
                (plus "winner_model_a" / "winner_model_b" when ``get_labels`` is true).

        Returns:
            dict: The tokenizer outputs per row (e.g. "input_ids" and "attention_mask") together with
            "original_prompt_length", "original_response_a_length", "original_response_b_length",
            "token_length" and, when ``get_labels`` is true, "labels".
            For "method_4", "token_length" is the length before truncation; otherwise it is the
            length of the final encoded text.

        Raises:
            ValueError: If a text needs truncation and ``length_assign_method`` or
                ``truncation_method`` is not supported.
            AssertionError: If an encoded text is still longer than ``max_length``.

        self.truncation_method is one of [left, right]: the side from which parts are truncated.
        self.length_assign_method is one of [method_1, method_2, method_3, method_4]:
            - method_1: keep the whole prompt; response_a and response_b share the rest by length
            - method_2: prompt, response_a and response_b all share the budget by length
            - method_3: keep the whole prompt; response_a and response_b split the rest equally
            - method_4: the original approach; the formatted text is truncated as a whole by the tokenizer

        The tokenizer's ``truncation_side`` is changed temporarily while parts are truncated
        and is put back to its previous value afterwards, even if an error is raised.
        """
        batch_prompt, batch_response_a, batch_response_b = self.preprocess_batch(
            batch_data
        )
        final_input = defaultdict(list)
        if self.get_labels:
            final_input["labels"] = self.extract_labels(batch_data)

        prompt_token_num = np.array([self.compute_token_num(p) for p in batch_prompt])
        response_a_token_num = np.array(
            [self.compute_token_num(r) for r in batch_response_a]
        )
        response_b_token_num = np.array(
            [self.compute_token_num(r) for r in batch_response_b]
        )
        final_input["original_prompt_length"] = list(prompt_token_num)
        final_input["original_response_a_length"] = list(response_a_token_num)
        final_input["original_response_b_length"] = list(response_b_token_num)

        concat_batch_text = self.format_texts(
            batch_prompt,
            batch_response_a,
            batch_response_b,
            response_a_token_num,
            response_b_token_num,
        )

        if self.length_assign_method == "method_4":
            # Untruncated pass only to record the real length of each text.
            tokenized = self.tokenizer(
                concat_batch_text,
                max_length=self.max_length,
                truncation=False,
                add_special_tokens=False,
            )
            token_length = [len(t) for t in tokenized["input_ids"]]

            tokenized_truncation = self.tokenizer(
                concat_batch_text,
                max_length=self.max_length,
                truncation=True,
                add_special_tokens=False,
            )
            for key in tokenized_truncation:
                final_input[key] = tokenized_truncation[key]
            final_input["token_length"] = token_length
            return final_input

        concat_batch_text_token_num = np.array(
            [self.compute_token_num(text) for text in concat_batch_text]
        )
        # Tokens contributed by the template itself (markers, length hints, ...).
        other_part_token_num = (
            concat_batch_text_token_num
            - prompt_token_num
            - response_a_token_num
            - response_b_token_num
        )
        max_token_capacity = (
            self.max_length - other_part_token_num - self._TRUNCATION_SAFETY_MARGIN
        )

        token_length = []
        # The tokenizer may be shared with other code, so remember its setting.
        previous_truncation_side = getattr(self.tokenizer, "truncation_side", "right")
        try:
            for i, token_num in enumerate(concat_batch_text_token_num):
                if token_num > self.max_length:
                    prompt_capacity, response_a_capacity, response_b_capacity = (
                        self.get_part_capacity(
                            prompt_token_num[i],
                            response_a_token_num[i],
                            response_b_token_num[i],
                            max_token_capacity[i],
                        )
                    )
                    if self.truncation_method not in ("left", "right"):
                        raise ValueError("Truncation method not supported")
                    self.tokenizer.truncation_side = self.truncation_method
                    # The length hints keep the ORIGINAL token counts on purpose.
                    text = self.format_texts(
                        [self._truncate_part(batch_prompt[i], prompt_capacity)],
                        [self._truncate_part(batch_response_a[i], response_a_capacity)],
                        [self._truncate_part(batch_response_b[i], response_b_capacity)],
                        [response_a_token_num[i]],
                        [response_b_token_num[i]],
                    )[0]
                else:
                    text = concat_batch_text[i]

                inputs = self._encode_full(text)
                assert len(inputs["input_ids"]) <= self.max_length
                token_length.append(len(inputs["input_ids"]))
                for key in inputs:
                    final_input[key].append(inputs[key])
        finally:
            self.tokenizer.truncation_side = previous_truncation_side
        final_input["token_length"] = token_length

        return final_input

    def extract_labels(self, batch_data: Dict[str, List[str]]) -> List[int]:
        """Map the winner columns to class ids: 0 = A wins, 1 = B wins, 2 = otherwise."""
        labels = [
            0 if a_win else 1 if b_win else 2
            for a_win, b_win in zip(
                batch_data["winner_model_a"], batch_data["winner_model_b"]
            )
        ]
        return labels


In [5]:
import copy
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import PeftModel
from sklearn.metrics import accuracy_score, log_loss
from tqdm import trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from transformers.data.data_collator import pad_without_fast_tokenizer_warning


# Inference only: switch autograd off for this thread.
# `torch.enable_grad` is a context manager/decorator, so calling it with
# `False` does not disable anything; `torch.set_grad_enabled(False)` does.
# Grad mode is thread-local, so code that runs in worker threads has to
# disable gradients itself.
torch.set_grad_enabled(False)


@dataclass
class Config:
    """Settings for the inference run.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a field: values can be overridden per instance, e.g.
    ``Config(tta=False)``, and show up in the repr. ``Config()`` keeps the
    defaults below.
    """

    # Fix
    model_name_or_path: str = "/kaggle/input/gemma-2-9b-it/gemma2"
    device_1: torch.device = torch.device("cuda:0")
    device_2: torch.device = torch.device("cuda:1")
    # Maybe change:
    test_data_path: str = "/kaggle/input/lmsys-chatbot-arena/test.csv"
    # True when the data carries winner columns, so labels and metrics are computed.
    test_local: bool = False
    # Need Change:
    lora_dir: str = "/kaggle/input/gemma2it-loraforlmsys/gemma_lora_result/gemma_template_2e-4lr_right_truncation_method_2/checkpoint-1435"
    model_max_length: int = 2048
    # Key into templates_dict; None selects TextProcessorV2's default template.
    default_chat_template: Optional[str] = None
    # Also predict with the two responses swapped and average the results.
    tta: bool = True
    truncation_method: str = "right"
    length_assign_method: str = "method_2"
    
    


def calculate_metrics(predictions_df, true_labels_df):
    """
    Score predicted class probabilities against one-hot ground truth.

    Both frames are indexed by their "id" column and sorted on it, so rows
    are compared id-by-id regardless of their incoming order.

    Parameters:
    predictions_df (pd.DataFrame): Columns "id", "winner_model_a",
        "winner_model_b", "winner_tie" holding predicted probabilities.
    true_labels_df (pd.DataFrame): Same columns holding the one-hot labels.

    Returns:
    tuple: (average log loss, accuracy)
    """
    target_columns = ["winner_model_a", "winner_model_b", "winner_tie"]

    aligned_predictions = predictions_df.set_index("id").sort_index()
    aligned_truth = true_labels_df.set_index("id").sort_index()

    one_hot_truth = aligned_truth[target_columns].values
    probabilities = aligned_predictions[target_columns].values

    avg_log_loss = log_loss(one_hot_truth, probabilities)

    # Accuracy compares the most likely class with the labelled class.
    accuracy = accuracy_score(
        np.argmax(one_hot_truth, axis=1),
        np.argmax(probabilities, axis=1),
    )

    return avg_log_loss, accuracy
cfg = Config()

2024-07-26 11:44:07.775802: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-26 11:44:07.775909: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-26 11:44:07.915621: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:


# Load the fast tokenizer stored with the base model checkpoint.
# Batches are padded on the right when they are assembled for the model.
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path, padding_side="right", use_fast=True,
)


In [7]:

# Build the text processor from the run configuration. Labels are only
# extracted for local evaluation (cfg.test_local).
preprocess = TextProcessorV2(
    tokenizer=tokenizer,
    max_length=cfg.model_max_length,
    chat_template=cfg.default_chat_template,
    truncation_method=cfg.truncation_method,
    length_assign_method=cfg.length_assign_method,
    get_labels=cfg.test_local
)

# Read the test CSV, tokenize it batch-wise and move the result into a
# DataFrame for the prediction loop.
raw_dataset = Dataset.from_csv(cfg.test_data_path)
test_dataset = raw_dataset.map(preprocess, batched=True)
tokenized_data = pd.DataFrame(test_dataset.to_dict())


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [8]:
preprocess.chat_template

'<bos><start_of_turn>user\n{prompt}\n\n<response_a> ({a_word_num} words): {response_a}\n\n<response_b> ({b_word_num} words): {response_b}\n<end_of_turn>\n<start_of_turn>model\n'

In [9]:

# Token count per row; used later to order rows before batching.
tokenized_data["length"] = tokenized_data["input_ids"].apply(len)
# 8-bit quantized weights (bitsandbytes); the same config is reused for model_2.
bnb_config =  BitsAndBytesConfig(load_in_8bit=True)
# First copy of the 3-class classifier, placed on cfg.device_1.
model_1 = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name_or_path,
    num_labels=3,
    device_map=cfg.device_1,
    low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# The base checkpoint has no classification head, so `score.weight` is
# freshly initialised here; print it before and after loading the LoRA
# checkpoint to confirm that the checkpoint's head replaced it.
print(model_1.score.weight)
model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir).eval()
print(model_1.score.weight)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-9b-it/gemma2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameter containing:
tensor([[ 0.0030, -0.0030, -0.0049,  ..., -0.0316, -0.0163, -0.0044],
        [-0.0037,  0.0082,  0.0450,  ..., -0.0064,  0.0357,  0.0060],
        [-0.0135,  0.0076,  0.0042,  ..., -0.0075,  0.0133,  0.0182]],
       device='cuda:0', dtype=torch.float16, requires_grad=True)
Parameter containing:
tensor([[-0.0471, -0.0010,  0.0039,  ..., -0.0140,  0.0141, -0.0112],
        [ 0.0183,  0.0070,  0.0034,  ..., -0.0038,  0.0591, -0.0366],
        [ 0.0070, -0.0040, -0.0132,  ...,  0.0063,  0.0181,  0.0217]],
       device='cuda:0', dtype=torch.float16, requires_grad=True)


In [10]:
# Second copy of the same classifier, placed on cfg.device_2 so that both
# GPUs can run inference at the same time. Reuses `bnb_config` defined in
# the cell that loads model_1.
model_2 = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name_or_path,
    num_labels=3,
    device_map=cfg.device_2,
    low_cpu_mem_usage=True,
#     torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)
# Before/after print of the classification head: after loading the LoRA
# checkpoint it should match the head printed for model_1.
print(model_2.score.weight)
model_2 = PeftModel.from_pretrained(model_2, cfg.lora_dir).eval()
print(model_2.score.weight)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma-2-9b-it/gemma2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parameter containing:
tensor([[ 0.0138,  0.0223, -0.0354,  ..., -0.0116, -0.0144,  0.0314],
        [ 0.0241,  0.0027, -0.0124,  ...,  0.0123, -0.0099, -0.0157],
        [ 0.0510, -0.0368,  0.0091,  ..., -0.0230, -0.0054, -0.0255]],
       device='cuda:1', dtype=torch.float16, requires_grad=True)
Parameter containing:
tensor([[-0.0471, -0.0010,  0.0039,  ..., -0.0140,  0.0141, -0.0112],
        [ 0.0183,  0.0070,  0.0034,  ..., -0.0038,  0.0591, -0.0366],
        [ 0.0070, -0.0040, -0.0132,  ...,  0.0063,  0.0181,  0.0217]],
       device='cuda:1', dtype=torch.float16, requires_grad=True)


In [11]:


def predict(data, model, tokenizer, batch_size=1):
    """Run the classifier over tokenized rows and return class probabilities.

    Args:
        data (pd.DataFrame): Rows with "id", "input_ids" and "attention_mask"
            columns (plus the winner columns when ``cfg.test_local`` is set).
        model: Sequence-classification model producing 3 logits per row.
        tokenizer: Tokenizer used to pad each batch to its longest row.
        batch_size (int): Number of rows per forward pass.

    Returns:
        pd.DataFrame: Columns "id", "winner_model_a", "winner_model_b" and
        "winner_tie" holding the softmax probabilities, in the row order of
        ``data``. ``data`` itself is not modified.
    """
    a_win, b_win, tie = [], [], []
    predict_df = copy.deepcopy(data)
    if cfg.test_local:
        # Local data carries integer labels in these columns; cast them so the
        # float probabilities written below keep their dtype.
        predict_df = predict_df.astype({'winner_model_a':float, 'winner_model_b': float, 'winner_tie': float})
    for i in trange(0, len(predict_df), batch_size):
        end_idx = min(i + batch_size, len(predict_df))
        tmp = predict_df.iloc[i:end_idx]
        input_ids = tmp["input_ids"].to_list()
        attention_mask = tmp["attention_mask"].to_list()
        inputs = pad_without_fast_tokenizer_warning(
                tokenizer,
                {"input_ids": input_ids, "attention_mask": attention_mask},
                padding="longest",
                pad_to_multiple_of=None,
                return_tensors="pt",
            ).to(model.device)
        # Grad mode is thread-local and this function runs in worker threads,
        # so autograd must be disabled here rather than globally.
        with torch.no_grad():
            outputs = model(**inputs)
            proba = outputs.logits.softmax(-1).cpu()
        a_win.extend(proba[:, 0].tolist())
        b_win.extend(proba[:, 1].tolist())
        tie.extend(proba[:, 2].tolist())
    # Ensure the correct dtype
    predict_df.loc[:, "winner_model_a"] = a_win
    predict_df.loc[:, "winner_model_b"] = b_win
    predict_df.loc[:, "winner_tie"] = tie
    return predict_df[["id", "winner_model_a", "winner_model_b","winner_tie"]]

In [12]:
# Order rows longest-first and deal them alternately into two shards so both
# GPUs receive similar sequence lengths.
sort_data = tokenized_data.sort_values(by="length", ascending=False)
sub_1, sub_2 = (sort_data.iloc[offset::2].copy() for offset in (0, 1))

# One worker thread per model/GPU, each predicting its shard in batches of 4.
shard_jobs = [(sub_1, model_1), (sub_2, model_2)]
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(
        lambda job: predict(job[0], job[1], tokenizer, 4), shard_jobs
    )

# Stitch the shards back together and restore id order.
result_df = pd.concat(list(results), axis=0).sort_values("id")


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:02<00:00,  2.35s/it]
100%|██████████| 1/1 [00:05<00:00,  5.07s/it]


In [13]:
# Local evaluation only: print (log loss, accuracy) of the original-order
# predictions against the labels in `tokenized_data`.
if cfg.test_local:
    print(calculate_metrics(result_df, tokenized_data))

# TTA Test


In [14]:
# Test-time augmentation input: the same rows with response A and response B
# exchanged, so the model sees the pair in the opposite order.
aug_test_dataset = raw_dataset.rename_columns(
    {
        "response_a": "response_b",
        "response_b": "response_a",
    }
)
# With local labels, the winner columns are exchanged as well so that they
# still refer to the swapped responses.
if cfg.test_local:
    aug_test_dataset = aug_test_dataset.rename_columns(
        {
            "winner_model_a": "winner_model_b",
            "winner_model_b": "winner_model_a"
        }
    )
            
# Tokenize the swapped rows with the same processor as the original rows.
aug_test_dataset = aug_test_dataset.map(preprocess, batched=True)

aug_tokenized_data = pd.DataFrame(aug_test_dataset.to_dict())

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [15]:
# Start the final answer from an independent, id-sorted copy of the
# original-order predictions, so `result_df` itself stays unchanged.
final_result = copy.deepcopy(result_df).sort_values("id")

In [16]:

# Test-time augmentation: predict on the swapped-response rows and average
# the result with the original-order predictions.
if cfg.tta:
    # Longest-first, alternating split across the two models/GPUs.
    sort_aug_data = aug_tokenized_data.sort_values("token_length", ascending=False)
    sub_aug_1 = sort_aug_data.iloc[0::2].copy()
    sub_aug_2 = sort_aug_data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(predict, (sub_aug_1, sub_aug_2), (model_1, model_2), (tokenizer, tokenizer), (4, 4))

    aug_result_df = pd.concat(list(results), axis=0)
    # Sorting by id lines the rows up with `final_result`; the element-wise
    # average below relies on both frames being in the same id order.
    aug_result_df = aug_result_df.sort_values("id")

    if cfg.test_local:
        print(calculate_metrics(aug_result_df, aug_tokenized_data))
    # The augmented frame's A/B columns are read in swapped order so they map
    # back to the original A/B; the tie column is unaffected by the swap.
    final_result[["winner_model_a", "winner_model_b", "winner_tie"]] = (final_result[["winner_model_a", "winner_model_b", "winner_tie"]].values + aug_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values ) / 2


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
100%|██████████| 1/1 [00:03<00:00,  3.75s/it]


In [17]:

# Local evaluation summary: (log loss, accuracy) for the original order and,
# when TTA ran, for the swapped order and for the averaged predictions.
if cfg.test_local:
    print("ab_score: ", calculate_metrics(result_df, tokenized_data))
    # `aug_result_df` only exists when the TTA cell ran, and without TTA
    # `final_result` is just a copy of `result_df`.
    if cfg.tta:
        print("ba_score: ", calculate_metrics(aug_result_df, aug_tokenized_data))
        print("tta_score: ", calculate_metrics(final_result, tokenized_data))

In [18]:
# Write the submission file: rows ordered by id, index renumbered from 0 and
# left out of the CSV.
submission_df = final_result.sort_values(by='id').reset_index(drop=True)
submission_df.to_csv('submission.csv', index=False)