In [None]:
import copy
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Optional

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import PeftModel
from sklearn.metrics import accuracy_score, log_loss
from tqdm import trange
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from transformers.data.data_collator import pad_without_fast_tokenizer_warning

from text_process import TextProcessorV2

# Inference only: switch autograd off for this (main) thread.
# ``torch.enable_grad`` is a context manager / decorator, so calling it with
# ``False`` never disabled gradient tracking; ``set_grad_enabled`` is the
# function that actually flips the mode.
# NOTE: grad mode is thread-local, so code running in worker threads still has
# to guard its own forward passes (e.g. with ``torch.no_grad()``).
torch.set_grad_enabled(False)


@dataclass
class Config:
    """Run configuration for the inference notebook.

    Every attribute is annotated so that ``@dataclass`` really turns it into a
    field: without annotations the decorator generates no fields, which makes
    ``Config(tta=False)`` a ``TypeError`` and the repr empty. All fields keep
    their previous defaults, so ``Config()`` behaves as before.
    """

    # Base checkpoint used for both the tokenizer and the classifier.
    model_name_or_path: str = "/data/share/pyz/model_weight/gemma-2-9b-it"
    # Directory of the adapter checkpoint handed to ``PeftModel.from_pretrained``.
    lora_dir: str = "output/gemma_template_2e-4lr_right_truncation_method_2/checkpoint-1435"
    # Passed to the text preprocessor as ``max_length``.
    model_max_length: int = 2048
    # One model replica is placed on each device.
    device_1: torch.device = torch.device("cuda:0")
    device_2: torch.device = torch.device("cuda:1")
    # CSV file that is loaded as the test set.
    test_data_path: str = "data/split/test.csv"
    # Forwarded to the text preprocessor as ``chat_template``.
    default_chat_template: Optional[str] = None
    # When true, predictions are averaged with a swapped-response pass.
    tta: bool = True
    # When true, metrics are printed against the labels found in the test data.
    test_local: bool = False


def calculate_metrics(predictions_df, true_labels_df):
    """
    Calculate log loss and accuracy between predictions and true labels.

    Both frames are indexed by their ``id`` column and sorted, and must then
    cover exactly the same ids; otherwise rows would be compared positionally
    against the wrong samples.

    Parameters:
    predictions_df (pd.DataFrame): DataFrame with an ``id`` column and the
        predicted probabilities in ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie``.
    true_labels_df (pd.DataFrame): DataFrame with an ``id`` column and the true
        labels in the same three columns (used as one-hot vectors).

    Returns:
    tuple: (average log loss, accuracy)

    Raises:
    ValueError: if the two frames do not contain the same set of ids.
    """
    label_columns = ["winner_model_a", "winner_model_b", "winner_tie"]

    # Index by id and sort so that row i refers to the same sample in both frames.
    predictions_df = predictions_df.set_index("id").sort_index()
    true_labels_df = true_labels_df.set_index("id").sort_index()

    # Sorting alone does not guarantee alignment: verify the ids really match
    # before any positional comparison is made.
    if not predictions_df.index.equals(true_labels_df.index):
        raise ValueError(
            "predictions_df and true_labels_df must contain the same ids "
            f"(got {len(predictions_df)} prediction rows and "
            f"{len(true_labels_df)} label rows)"
        )

    # True labels as one-hot encoded vectors, predictions as probabilities.
    true_labels = true_labels_df[label_columns].values
    predicted_probabilities = predictions_df[label_columns].values

    # Calculate log loss
    avg_log_loss = log_loss(true_labels, predicted_probabilities)

    # Collapse both matrices to class indices for the accuracy calculation.
    true_label_indices = np.argmax(true_labels, axis=1)
    predicted_label_indices = np.argmax(predicted_probabilities, axis=1)

    # Calculate accuracy
    accuracy = accuracy_score(true_label_indices, predicted_label_indices)

    return avg_log_loss, accuracy

In [None]:

# Instantiate the run configuration, then load the fast tokenizer that belongs
# to the base checkpoint; padding is applied on the right-hand side.
cfg = Config()
tokenizer = AutoTokenizer.from_pretrained(
    cfg.model_name_or_path,
    use_fast=True,
    padding_side="right",
)


In [None]:

# Project-local preprocessor, configured from the run config with right
# truncation and length assignment "method_2".
# NOTE(review): the exact meaning of these options lives in text_process.py.
preprocess = TextProcessorV2(
    tokenizer=tokenizer,
    max_length=cfg.model_max_length,
    chat_template=cfg.default_chat_template,
    truncation_method="right",
    length_assign_method="method_2"
)

# Load the test CSV, run the preprocessor over it in batches and keep the
# result as a pandas DataFrame; later cells read its "input_ids" and
# "attention_mask" columns.
raw_dataset = Dataset.from_csv(cfg.test_data_path)
test_dataset = raw_dataset.map(preprocess, batched=True)
tokenized_data = pd.DataFrame(test_dataset.to_dict())


In [None]:
# Show the chat template held by the preprocessor as the cell output.
preprocess.chat_template

In [None]:

# Token count per sample; the inference cell sorts on this column.
tokenized_data["length"] = tokenized_data["input_ids"].map(len)

# NOTE(review): this 4-bit config is built but never handed to
# ``from_pretrained`` in this cell, so the model below is loaded in plain
# bfloat16 — confirm whether quantisation was intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# First replica: 3-way classifier on ``cfg.device_1``.
model_1 = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map=cfg.device_1,
    num_labels=3,
)
# Classification head before the adapter checkpoint is attached ...
print(model_1.score.weight)
model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir)
model_1 = model_1.eval()
# ... and after, to eyeball whether the head weights changed.
print(model_1.score.weight)


In [None]:
# Second replica of the same classifier, placed on ``cfg.device_2``.
model_2 = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map=cfg.device_2,
    num_labels=3,
)
# Classification head before the adapter checkpoint is attached ...
print(model_2.score.weight)
model_2 = PeftModel.from_pretrained(model_2, cfg.lora_dir)
model_2 = model_2.eval()
# ... and after, to eyeball whether the head weights changed.
print(model_2.score.weight)

In [None]:


def predict(data, model, tokenizer, batch_size=1):
    """Run the classifier over ``data`` in batches and return class probabilities.

    Parameters:
    data (pd.DataFrame): Frame with ``id``, ``input_ids`` and ``attention_mask``
        columns; it is deep-copied, so the caller's frame is left untouched.
    model: Model whose output exposes ``logits`` with at least three classes on
        the last dimension; inputs are moved to ``model.device``.
    tokenizer: Tokenizer used to pad every batch to its longest sequence.
    batch_size (int): Number of rows per forward pass.

    Returns:
    pd.DataFrame: Columns ``id``, ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie``, in the row order (and with the index) of ``data``.
    """
    a_win, b_win, tie = [], [], []
    predict_df = copy.deepcopy(data)

    # Autograd mode is thread-local: when this function runs in a worker thread
    # (e.g. a ThreadPoolExecutor) a grad switch set in the main thread does not
    # apply, so gradient tracking is disabled explicitly here.
    with torch.no_grad():
        for start in trange(0, len(predict_df), batch_size):
            # Slicing past the end is clamped by ``iloc``; no explicit min() needed.
            batch = predict_df.iloc[start : start + batch_size]
            inputs = pad_without_fast_tokenizer_warning(
                tokenizer,
                {
                    "input_ids": batch["input_ids"].to_list(),
                    "attention_mask": batch["attention_mask"].to_list(),
                },
                padding="longest",
                pad_to_multiple_of=None,
                return_tensors="pt",
            ).to(model.device)
            outputs = model(**inputs)
            # Upcast before the softmax so that the probabilities are not
            # rounded to the model's low-precision dtype.
            proba = outputs.logits.float().softmax(-1).cpu()
            a_win.extend(proba[:, 0].tolist())
            b_win.extend(proba[:, 1].tolist())
            tie.extend(proba[:, 2].tolist())

    predict_df.loc[:, "winner_model_a"] = a_win
    predict_df.loc[:, "winner_model_b"] = b_win
    predict_df.loc[:, "winner_tie"] = tie
    return predict_df[["id", "winner_model_a", "winner_model_b", "winner_tie"]]

In [None]:
# Longest samples first, then deal the rows out alternately so that both
# replicas receive a similar mix of sequence lengths.
sort_data = tokenized_data.sort_values(by="length", ascending=False)
sub_1, sub_2 = (sort_data.iloc[offset::2].copy() for offset in (0, 1))

# One worker thread per replica, batch size 4 on each.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(
        predict,
        (sub_1, sub_2),
        (model_1, model_2),
        (tokenizer, tokenizer),
        (4, 4),
    )

# Stack the two halves row-wise; each keeps the row index of ``tokenized_data``.
result_df = pd.concat(list(results), axis=0)

In [None]:
# Local sanity check: print (log loss, accuracy) of the first-pass predictions.
# This needs the winner_* label columns to be present in ``tokenized_data``.
if cfg.test_local:
    print(calculate_metrics(result_df, tokenized_data))

# Test-time augmentation (TTA): the following cells score the test set a second time with response_a and response_b swapped.


In [None]:
# Build the augmented test set by swapping the two responses.
# The label columns are swapped as well so that they stay consistent with the
# swapped responses, but only when the dataset has labels: ``rename_columns``
# raises on column names that do not exist, which would otherwise break this
# cell on an unlabelled test file.
column_swap = {
    "response_a": "response_b",
    "response_b": "response_a",
}
label_swap = {
    "winner_model_a": "winner_model_b",
    "winner_model_b": "winner_model_a",
}
# If only one of the two label columns is present the mapping is still applied,
# so the inconsistency surfaces as an error instead of being silently ignored.
if any(column in raw_dataset.column_names for column in label_swap):
    column_swap.update(label_swap)

aug_test_dataset = raw_dataset.rename_columns(column_swap)
aug_test_dataset = aug_test_dataset.map(preprocess, batched=True)

aug_tokenized_data = pd.DataFrame(aug_test_dataset.to_dict())

In [None]:
# Peek at the first augmented row to check the swapped columns by eye.
aug_tokenized_data.head(1)

In [None]:
# Start from the first-pass predictions; they are the final answer when TTA is off.
final_result = copy.deepcopy(result_df)

if cfg.tta:
    prob_cols = ["winner_model_a", "winner_model_b", "winner_tie"]
    # Same columns with a/b exchanged, used to undo the response swap.
    flipped_cols = ["winner_model_b", "winner_model_a", "winner_tie"]

    # Sort by token count computed from ``input_ids``, exactly like the first
    # pass does, instead of relying on a "token_length" column that this
    # notebook never creates. ``assign`` leaves ``aug_tokenized_data`` untouched.
    sort_aug_data = aug_tokenized_data.assign(
        length=aug_tokenized_data["input_ids"].apply(len)
    ).sort_values("length", ascending=False)
    sub_aug_1 = sort_aug_data.iloc[0::2].copy()
    sub_aug_2 = sort_aug_data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(
            predict,
            (sub_aug_1, sub_aug_2),
            (model_1, model_2),
            (tokenizer, tokenizer),
            (4, 4),
        )

    aug_result_df = pd.concat(list(results), axis=0)
    if cfg.test_local:
        print(calculate_metrics(aug_result_df, aug_tokenized_data))

    # Map the augmented predictions back to the original a/b orientation.
    # Assigning a DataFrame to a list of columns pairs target and source
    # columns by position and aligns rows on the index, so this relies on both
    # result frames carrying the row index of their tokenized frames.
    aug_flip_result_df = copy.deepcopy(result_df)
    aug_flip_result_df[prob_cols] = aug_result_df[flipped_cols]

    # Average the two passes.
    final_result[prob_cols] = (
        final_result[prob_cols] + aug_flip_result_df[prob_cols]
    ) / 2

In [None]:
# Write the final probabilities to submission.csv without the DataFrame index.
final_result.to_csv('submission.csv', index=False)

In [None]:
# Local sanity check: print (log loss, accuracy) of the final predictions
# against the labels in ``tokenized_data``.
if cfg.test_local:
    print(calculate_metrics(final_result, tokenized_data))