In [None]:
import ctypes
import gc
import json
import os
import random
import threading
import time

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import log_loss
from torch.utils.data import Dataset
from tqdm import tqdm


def seed_everything(seed=None):
    """
    Fix the seed of every random number generator used in this notebook.

    :param seed: int or None. Used as-is when it fits in the uint32 range;
        when it is None or out of range a random uint32 seed is drawn instead.
    :return: int, the seed that was actually applied.
    """
    uint32_limits = np.iinfo(np.uint32)
    lowest, highest = uint32_limits.min, uint32_limits.max

    usable = (seed is not None) and (lowest <= seed <= highest)
    if not usable:
        seed = random.randint(lowest, highest)

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python, NumPy, then torch (CPU, current CUDA device, all CUDA devices).
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Turn off cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    return seed


# Seed every RNG once, ahead of all data-loading and model cells below.
seed_everything(42)

In [None]:
# Warn early when no CUDA device is visible; execution still continues.
if not torch.cuda.is_available():
    print("Sorry - GPU Required!")

# Length budget used for prompt chunking and tokenizer truncation.
MAX_LENGTH = 2300
# Not referenced by any other cell in this notebook.
INFERENCE_MAX_LENGTH = [2300]
# PEFT adapter checkpoint(s); the inference cell runs once per entry.
TRAINING_MODEL_PATH = ["../data/checkpoint-1800_8685"]

In [None]:
class InstructionDataSet(Dataset):
    """
    Map-style dataset that wraps each conversation in the chat prompt template.

    Every item is the decoded text of
    ``header + truncated conversation + options + model-turn marker``
    together with the row's ``id``.

    :param data: DataFrame with at least the columns ``id`` and ``prompt_response``.
    :param tokenizer: callable tokenizer returning ``{"input_ids": [...]}`` and
        offering ``decode``.
    :param max_source_length: truncation length applied to the conversation text.
    :param max_target_length: stored on the instance but not used by this class.
    """

    PROMPT_HEADER = "<start_of_turn>user\nHere are two question-answering dialogues. Compare two model performance on answering question, determine which is better.\n\n"
    PROMPT_OPTIONS = "\n###options\nA. Model A\nB. Model B\nC. Tie\n<end_of_turn>\n"
    MODEL_TURN = "<start_of_turn>model\n"

    def __init__(self, data, tokenizer, max_source_length, max_target_length):
        super(InstructionDataSet, self).__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

        # The template pieces are constant, so tokenize them once here instead
        # of on every __getitem__ call.
        self._header_ids = self._encode_template(self.PROMPT_HEADER, keep_first_token=True)
        self._options_ids = self._encode_template(self.PROMPT_OPTIONS, keep_first_token=False)
        self._model_turn_ids = self._encode_template(self.MODEL_TURN, keep_first_token=False)

    def _encode_template(self, text, keep_first_token):
        """Tokenize a fixed template string, optionally dropping its first token."""
        token_ids = self.tokenizer(text=text, add_special_tokens=True, padding=False)["input_ids"]
        # Presumably the first token is the BOS the tokenizer prepends; only the
        # header keeps it so the assembled sequence has one - TODO confirm.
        return token_ids if keep_first_token else token_ids[1:]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``{"input_ids": <prompt text>, "id": <row id>}`` for the row at position ``index``."""
        # Samplers hand out positions in [0, len), so index by position; label
        # based .loc only works when the frame happens to have a 0..n-1 index.
        row = self.data.iloc[index]

        conversation_ids = self.tokenizer(
            text=row["prompt_response"],
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_source_length,
            padding=False
        )["input_ids"][1:]

        input_ids = self._header_ids + conversation_ids + self._options_ids + self._model_turn_ids
        # The item carries text, not ids, even though the key is "input_ids".
        input_text = self.tokenizer.decode(input_ids, skip_special_tokens=False)

        return {
            "input_ids": input_text,
            "id": row["id"]
        }

In [None]:
def process(input_str):
    """Parse one JSON-encoded string and return the decoded Python object."""
    decoded = json.loads(input_str)
    return decoded


def get_label(row):
    """
    Map a row's one-hot winner columns to the option letter used in the prompt.

    :param row: mapping / Series with ``winner_model_a``, ``winner_model_b``
        and ``winner_tie``.
    :return: "A", "B" or "C". When several flags equal 1, the last one in
        A / B / tie order wins.
    :raises ValueError: when none of the three flags equals 1.
    """
    letter_by_column = (
        ("winner_model_a", "A"),
        ("winner_model_b", "B"),
        ("winner_tie", "C"),
    )
    flagged = [letter for column, letter in letter_by_column if row[column] == 1]
    if not flagged:
        raise ValueError("row has no winner flag set in winner_model_a / winner_model_b / winner_tie")
    return flagged[-1]


def load_json(data):
    """
    Decode the JSON-encoded ``prompt``, ``response_a`` and ``response_b`` columns.

    The frame is modified in place and also returned.

    :param data: DataFrame whose three text columns hold JSON strings.
    :return: the same DataFrame with those columns replaced by parsed objects.
    """
    for column in ("prompt", "response_a", "response_b"):
        # Replace the column outright. Writing through ``.loc[:, column]`` sets
        # values into the existing column's dtype, which only works while that
        # column can hold arbitrary Python objects such as lists.
        data[column] = data[column].apply(process)
    return data


def prompt_1(data):
    """
    Build one prompt per conversation that lists each model's dialogue in turn.

    Resulting ``prompt_response`` layout:

        #Model A
        Prompt: xxx
        Response: xxx

        Prompt: xxx
        Response: xxx

        #Model B
        Prompt: xxx
        Response: xxx

        Prompt: xxx
        Response: xxx

    :param data: one row per turn with ``id``, ``prompt``, ``response_a`` and
        ``response_b``; a ``label`` column is carried through when present.
    :return: new DataFrame with one row per ``id`` (ordered by the groupby) and
        the columns ``prompt_response_A``, ``prompt_response_B``, optional
        ``label`` and ``prompt_response``. The input frame is left untouched.
    """
    prompt_part = "Prompt: " + data["prompt"] + "\n"
    turns = data.assign(
        prompt_response_A=prompt_part + "Response: " + data["response_a"],
        prompt_response_B=prompt_part + "Response: " + data["response_b"],
    )

    aggregations = {"prompt_response_A": "\n\n".join, "prompt_response_B": "\n\n".join}
    # Inference data has no label column; aggregating it unconditionally would
    # raise a KeyError.
    if "label" in turns.columns:
        aggregations["label"] = lambda labels: list(labels)[0]

    grouped = turns.groupby("id").agg(aggregations).reset_index()
    grouped["prompt_response"] = (
        "#Model A\n" + grouped["prompt_response_A"] + "\n\n#Model B\n" + grouped["prompt_response_B"]
    )
    return grouped


def prompt_2(data, max_length, if_train):
    """
    Build prompts turn by turn, packing consecutive turns of one conversation
    into a chunk until the word budget is exceeded; an overflowing turn starts
    a new row that keeps the same id and label.

    Layout of one chunk:

        #Prompt
        xxxx

        #Response
        ##Model A
        xxxx

        ##Model B
        xxxx

        #Prompt
        ...

    :param data: one row per turn with ``id``, ``prompt``, ``response_a``,
        ``response_b`` (and ``label`` when ``if_train``). Turns of one
        conversation are expected on consecutive rows.
    :param max_length: maximum number of space-separated words per chunk.
    :param if_train: when True a ``label`` column is read and returned.
    :return: new DataFrame with ``id``, ``prompt_response`` and, when
        ``if_train``, ``label``. The input frame is left untouched.
    """
    turn_texts = (
        "#Prompt\n" + data["prompt"] + "\n\n" + "#Response\n"
        + "##Model A\n" + data["response_a"] + "\n\n"
        + "##Model B\n" + data["response_b"]
    )
    turn_labels = data["label"] if if_train else [None] * len(data)

    chunk_ids = []
    chunk_texts = []
    chunk_labels = []
    words_in_chunk = 0
    rows = zip(data["id"], turn_texts, turn_labels)
    for conversation_id, text, label in tqdm(rows, total=len(data)):
        word_count = len(text.split(" "))
        # Only compare with the chunk currently being filled: this is O(1) per
        # row and never appends a turn to a different conversation's chunk.
        continues_chunk = bool(chunk_ids) and chunk_ids[-1] == conversation_id
        if continues_chunk and words_in_chunk + word_count <= max_length:
            chunk_texts[-1] = chunk_texts[-1] + "\n\n" + text
            words_in_chunk += word_count
        else:
            chunk_ids.append(conversation_id)
            chunk_texts.append(text)
            words_in_chunk = word_count
            if if_train:
                chunk_labels.append(label)

    columns = {"id": chunk_ids, "prompt_response": chunk_texts}
    if if_train:
        columns["label"] = chunk_labels
    return pd.DataFrame(columns)


def load_split_data(data_path, prompt_type, max_length, if_train, split=False):
    """
    Load a conversations file, expand it to one row per turn and build prompts.

    :param data_path: path containing "csv" (JSON-encoded text columns are
        decoded with ``load_json``) or "json" (read with ``pd.read_json``).
    :param prompt_type: 1 uses ``prompt_1``, 2 uses ``prompt_2``; no branch
        handles any other value, so the frame is then returned without a
        ``prompt_response`` column.
    :param max_length: length budget; ``prompt_2`` receives ``max_length * 0.75``
        as its word budget (presumably a tokens-to-words ratio - TODO confirm).
    :param if_train: True or False; when True a ``label`` column is derived
        from the winner columns.
    :param split: when True, every 20th unique id goes to a validation frame.
        Defaults to False so callers that only need the full frame may omit it.
    :return: ``(train, valid)`` when ``split`` is True, otherwise ``(data, None)``.
    :raises ValueError: when ``data_path`` names neither a csv nor a json file.
    """
    if "csv" in data_path:
        data = load_json(pd.read_csv(data_path))
    elif "json" in data_path:
        data = pd.read_json(data_path)
    else:
        raise ValueError(f"Unsupported data file (expected csv or json): {data_path}")

    # One row per (prompt, response_a, response_b) turn of each conversation.
    data = data.explode(["prompt", "response_a", "response_b"]).reset_index(drop=True)

    if if_train:
        data["label"] = data.apply(get_label, axis=1)

    # Missing and empty responses both become the literal string "None".
    data = data.fillna("None")
    for column in ("response_a", "response_b"):
        data[column] = data[column].apply(lambda x: "None" if len(x) == 0 else x)

    if prompt_type == 1:
        data = prompt_1(data)
    elif prompt_type == 2:
        data = prompt_2(data, max_length * 0.75, if_train)

    if split:
        unique_ids = data.id.unique()
        valid_ids = [unique_ids[i] for i in range(len(unique_ids)) if i % 20 == 0]
        in_valid = data.id.isin(valid_ids)
        valid = data.loc[in_valid,].reset_index(drop=True)
        train = data.loc[~in_valid,].reset_index(drop=True)
        return train, valid

    return data, None

In [None]:
data_path = "../data/train.csv"
prompt_type = 2
# load_split_data returns a (data, valid) pair; without splitting the second
# element is None, so keep only the frame. `split` is passed explicitly because
# the function signature requires it.
test = load_split_data(data_path, prompt_type, MAX_LENGTH, if_train=False, split=False)[0]

In [None]:
def collate_fn(batch):
    """
    Tokenize the prompt texts of a batch and pad them to the longest sample.

    Reads the notebook-level ``tokenizer`` and ``MAX_LENGTH``.

    :param batch: list of dataset items with the keys ``input_ids`` (prompt
        text) and ``id``.
    :return: ``(tokenizer output, list of ids)``.
    """
    prompt_texts = [sample["input_ids"] for sample in batch]
    sample_ids = [sample["id"] for sample in batch]

    encoded = tokenizer(
        prompt_texts,
        padding="longest",
        truncation=True,
        return_tensors="pt",
        add_special_tokens=True,
        # 50 tokens of headroom on top of the conversation budget.
        max_length=MAX_LENGTH + 50
    )
    return encoded, sample_ids

In [None]:
def inference(model, test_dataloader, gpu_id):
    """
    Score the options A / B / C for every sample by generating a single token.

    Reads the notebook-level ``devices`` list and the ``A_TOKEN_IDS``,
    ``B_TOKEN_IDS`` and ``C_TOKEN_IDS`` lists (their first id is used).

    :param model: object whose ``generate`` returns ``.scores`` for the new token.
    :param test_dataloader: iterable of ``(batch_input, ids)`` pairs as
        produced by ``collate_fn``.
    :param gpu_id: index into ``devices`` selecting where inputs are moved.
    :return: list with one entry per batch; each entry is a list of
        ``[id, probabilities]`` where ``probabilities`` is a float32 NumPy
        array of shape (3,) ordered A, B, C.
    """
    device = devices[gpu_id]
    option_token_ids = [A_TOKEN_IDS[0], B_TOKEN_IDS[0], C_TOKEN_IDS[0]]

    test_predictions = []
    for batch_input, sample_ids in test_dataloader:
        model_inputs = {name: tensor.to(device) for name, tensor in batch_input.items()}
        with torch.no_grad():
            response = model.generate(
                **model_inputs, max_new_tokens=1, return_dict_in_generate=True, output_scores=True
            )
        # scores[0] holds the vocabulary scores of the single generated step.
        step_scores = response.scores[0]
        # Index all three option columns at once to get a (batch, 3) tensor that
        # stays on the device and works for any batch size; cast to float32
        # before the softmax.
        option_logits = step_scores[:, option_token_ids].float()
        probabilities = torch.softmax(option_logits, dim=-1).cpu().numpy()
        # Pair by the real batch length so a short final batch cannot overrun.
        node_result = [[sample_id, probs] for sample_id, probs in zip(sample_ids, probabilities)]
        test_predictions.append(node_result)
    return test_predictions

In [None]:
# Load the C library so malloc_trim can be called after garbage collection to
# ask the allocator to release free heap memory.
# NOTE(review): "libc.so.6" is the glibc soname, so this presumably only works on Linux - confirm.
libc = ctypes.CDLL("libc.so.6")
_ = gc.collect()
libc.malloc_trim(0)

# One device handle per GPU; the inference cell loads one model copy onto each.
device0 = torch.device("cuda:0")
device1 = torch.device("cuda:1")

In [None]:
# Identifier of the base model that each adapter checkpoint is loaded on top of.
base_model = "google/gemma-2-9b-it"

In [None]:
%%time
# NOTE(review): AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
# and PeftModel are used below but are not imported by the notebook's import cell -
# confirm they are imported (transformers / peft) before this cell runs.

# Read the Hugging Face access token from the environment; a secret must never
# be committed inside the notebook. With the variable unset this is None,
# which presumably leaves credential lookup to the library - TODO confirm.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Split the prompts in two halves, one per GPU.
N_SAMPLES = test.shape[0]
half = N_SAMPLES // 2
sub_df_1 = test.iloc[0:half].copy().reset_index(drop=True)
sub_df_2 = test.iloc[half:].copy().reset_index(drop=True)
batch_size = 1

# Created once, outside the checkpoint loop, so the results of every
# checkpoint are kept rather than only those of the last one.
pred_0 = []
pred_1 = []


def _load_adapter_model(adapter_path, config, bnb_config, device):
    """Load the quantized base model on `device`, attach the adapter at `adapter_path`, set eval mode."""
    quantized_base = AutoModelForCausalLM.from_pretrained(
        base_model,
        config=config,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        device_map=device,
        trust_remote_code=True,
        token=HF_TOKEN
    )
    adapted = PeftModel.from_pretrained(quantized_base, adapter_path).to(device)
    adapted.eval()
    return adapted


for idx, model_path in enumerate(TRAINING_MODEL_PATH):
    print("#"*25)
    print("=> Inferring",model_path)
    # collate_fn reads this notebook-level tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    config = AutoConfig.from_pretrained(base_model, trust_remote_code=True, token=HF_TOKEN)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True
    )

    # One model copy per GPU; inference() looks both lists up by gpu_id.
    devices = [device0, device1]
    models = [_load_adapter_model(model_path, config, bnb_config, device) for device in devices]

    # Token ids of the three answer letters, with the first token dropped
    # (presumably the BOS added by the tokenizer - TODO confirm).
    A_TOKEN_IDS = tokenizer("A",add_special_tokens=True, truncation=True, max_length=1024)["input_ids"][1:]
    B_TOKEN_IDS = tokenizer("B",add_special_tokens=True, truncation=True, max_length=1024)["input_ids"][1:]
    C_TOKEN_IDS = tokenizer("C",add_special_tokens=True, truncation=True, max_length=1024)["input_ids"][1:]

    lock = threading.Lock()
    # Exceptions raised inside a worker thread never reach this cell on their
    # own; collect them so a failure cannot yield a silently partial result.
    thread_errors = []

    def inference_thread(gpu_id, lock, sub_test):
        """Run inference for one half of the data on one GPU and store the result."""
        try:
            with lock:
                print(f"Thread {gpu_id} started on GPU {gpu_id}")
            thread_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
            tokenized_dataset = InstructionDataSet(sub_test, thread_tokenizer, MAX_LENGTH, 1)
            test_dataloader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=batch_size ,collate_fn=collate_fn)

            sub_pred = inference(model=models[gpu_id], test_dataloader=test_dataloader, gpu_id=gpu_id)
            # inference() returns one entry per batch, so this holds for batch_size == 1.
            assert len(sub_pred) == sub_test.shape[0]
            with lock:
                print(f"Thread {gpu_id} finished on GPU {gpu_id}")
            if gpu_id == 0:
                pred_0.append(sub_pred)
            if gpu_id == 1:
                pred_1.append(sub_pred)
        except Exception as exc:
            thread_errors.append((gpu_id, exc))

    thread1 = threading.Thread(target=inference_thread, args=(0, lock, sub_df_1))
    thread2 = threading.Thread(target=inference_thread, args=(1, lock, sub_df_2))

    start_time = time.time()
    thread1.start()
    thread2.start()

    thread1.join()
    thread2.join()
    print(f"Inference Time {time.time() - start_time}")
    print("Both threads have finished.")

    if thread_errors:
        failed_gpu, first_error = thread_errors[0]
        raise RuntimeError(f"Inference failed on GPU {failed_gpu} for {model_path}") from first_error

    # Drop every reference to the models before asking CUDA and the allocator
    # to release memory.
    del models, tokenizer
    torch.cuda.empty_cache()
    _ = gc.collect()
    libc.malloc_trim(0)

In [None]:
def get_pre(sub_pred):
    """
    Flatten per-batch inference results into plain rows.

    :param sub_pred: list of batches; each batch is a list of
        ``[id, probabilities]`` entries as returned by ``inference``.
    :return: list of ``[id, p_a, p_b, p_c]`` rows of plain Python values, one
        per sample (every sample of a batch is kept, not only the first).
    """
    processed_data = []
    for batch_result in sub_pred:
        for sample_id, probabilities in batch_result:
            # ids may be NumPy / torch scalars (which offer .item()) or plain
            # Python values, depending on how the source frame stored them.
            plain_id = sample_id.item() if hasattr(sample_id, "item") else sample_id
            processed_data.append([plain_id] + np.asarray(probabilities).tolist())
    return processed_data

In [None]:
# pred_0 / pred_1 each hold one list of per-batch results per inference run;
# flatten them into a single list of batches, GPU 0 first, then GPU 1.
prediction = [
    batch_result
    for per_gpu_runs in (pred_0, pred_1)
    for run in per_gpu_runs
    for batch_result in run
]
prediction = get_pre(prediction)
new_columns = ["id", "winner_model_a", "winner_model_b", "winner_tie"]
new_columns_df = pd.DataFrame(data=prediction, columns=new_columns)

In [None]:
# Several rows can share an id (prompt_2 may split one conversation into
# multiple chunks); average their probabilities into one row per id.
new_columns_df = new_columns_df.groupby("id").mean().reset_index()

In [None]:
# Display the per-id averaged probabilities.
new_columns_df

In [None]:
test = pd.read_csv("../data/train.csv")
assert new_columns_df.shape[0] == test.shape[0]
# Attach predictions by id, not by row position: groupby("id") sorted the
# predictions, so a positional column assignment would overwrite `id` and leave
# every other CSV column paired with the wrong row. Same-named columns already
# present in the CSV are replaced by the predicted values.
probability_columns = [column for column in new_columns if column != "id"]
test = test.drop(columns=probability_columns, errors="ignore").merge(
    new_columns_df[["id"] + probability_columns],
    on="id",
    how="left",
    validate="one_to_one",
)
assert test[probability_columns].notna().all().all(), "some ids received no prediction"

In [None]:
# Use a dedicated name here: rebinding `new_columns` to only the three
# probability names would make the cell that attaches the four-column
# prediction frame to `test` fail when it is re-run afterwards.
submission_columns = ["id", "winner_model_a", "winner_model_b", "winner_tie"]
test[submission_columns].to_csv("predictions.csv",index=False)

In [None]:
# Read the written file back to inspect what was saved.
pd.read_csv("predictions.csv")