In [1]:
import lmppl
import pandas as pd, numpy as np
from transformers import AutoTokenizer

checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
scorer = lmppl.LM(checkpoint)
# scorer = lmppl.LM("Qwen/Qwen2-7B-Instruct")
# scorer = lmppl.LM("google/gemma-2-9b-it")



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
class FormattingFunction:
    def __init__(self, model_checkpoint):
        self.instruction_all = {
            "meta-llama/Meta-Llama-3-8B-Instruct": """
        <|begin_of_text|>
        <|start_header_id|>system<|end_header_id|>
        당신의 역할은 한국어로 답변하는 **한국어 AI 어시트턴트**입니다. 주어진 질문에 대해 한국어로 답변해주세요.<|eot_id|>
        <|start_header_id|>user<|end_header_id|>
        아래 질문을 한국어로 정확하게 답변해주세요. **질문**: {}<|eot_id|>
        <|start_header_id|>assistant<|end_header_id|>\n\n{}<|eot_id|>
        <|end_of_text|>""",
            "google/gemma-2-9b-it": """
        <bos><start_of_turn>model
        당신의 역할은 한국어로 답변하는 **한국어 AI 어시트턴트**입니다. 주어진 질문에 대해 한국어로 답변해주세요.<end_of_turn>
        <start_of_turn>user
        아래 질문을 한국어로 정확하게 답변해주세요. **질문**: {}<end_of_turn>
        <start_of_turn>model
        {}<end_of_turn>
        <eos>""",
            "Qwen/Qwen2-7B-Instruct": """
        <|im_start|>system
        당신의 역할은 한국어로 답변하는 **한국어 AI 어시트턴트**입니다. 주어진 질문에 대해 한국어로 답변해주세요.\n<|im_end|>
        <|im_start|>user
        아래 질문을 한국어로 정확하게 답변해주세요. **질문**: {}<|im_end|>
        <|im_start|>system
        {}<|im_end|>
        <|endoftext|>""",
        }

        self.instruction = self.instruction_all[model_checkpoint]

    def __call__(self, examples):

        final_texts = []
        for i in range(len(examples["input"])):
            final_text = self.instruction.format(
                examples["input"][i], examples["output"][i]
            )
            final_texts.append(final_text)

        return final_texts


class DataBase:
    def __init__(self):
        train_url = "/node_storage2/data_llm_kr/data_it_train_240724.csv"
        eval_url = "/node_storage2/data_llm_kr/data_it_eval_240724.csv"

        eval_translate_url = "/home/hyohyeongjang/bigdata/ModelTrainer/llm-kr/data/oig-smallchip2-dedu-slice_reviewed_week1-7_instruction_valid.csv"

        self.train_df = pd.read_csv(train_url)
        self.eval_df = pd.read_csv(eval_url)
        self.eval_translate = pd.read_csv(eval_translate_url)


d = DataBase()
f = FormattingFunction(checkpoint)

  self.train_df = pd.read_csv(train_url)


In [3]:
def count_tokens(sentences):
    target = sentences.apply(lambda x: f"{x['input']} {x['output']}", axis=1).tolist()

    checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    out_llama = tokenizer(target)
    out_llama = pd.Series([len(i) for i in out_llama["input_ids"]], name="llama_token")

    checkpoint = "Qwen/Qwen2-7B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    out_qwen = tokenizer(target)
    out_qwen = pd.Series([len(i) for i in out_qwen["input_ids"]], name="qwen_token")

    checkpoint = "google/gemma-2-9b-it"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    out_gemma = tokenizer(target)
    out_gemma = pd.Series([len(i) for i in out_gemma["input_ids"]], name="gemma_token")

    x = pd.concat([sentences, out_llama, out_qwen, out_gemma], axis=1)
    x.groupby("task")[["llama_token", "qwen_token", "gemma_token"]].agg(
        ["mean", "max", "min"]
    ).round(1)

    return x


def get_perplexity(sentences, checkpoint_type, sample=True, sample_num=100):

    # sentences = d.train_df
    if sample:
        samples = (
            sentences[["input", "output", "task"]]
            .groupby("task")
            .sample(sample_num, random_state=21)
            .reset_index(drop=True)
        )
    else:
        samples = sentences

    # samples = sentences
    targets = f(samples)

    ppl = np.array(scorer.get_perplexity(targets, batch=8)).round(2)
    x = pd.concat([samples, pd.Series(ppl, name=checkpoint_type)], axis=1)
    x.groupby("task")[checkpoint_type].mean()

    return x

In [4]:
if __name__ == "__main__":

    # count_tokens(d.train_df).to_csv("/home/hyohyeongjang/bigdata/ModelTrainer/llm-kr/data/token_stat.csv")

    tasks = d.train_df.task.drop_duplicates().tolist()

    partial_task = len(tasks) // 5

    ppl_lst = []
    for i in range(5):
        if i == 4:
            this_task = tasks[i * partial_task :]
        else:
            this_task = tasks[i * partial_task : (i + 1) * partial_task]

        sentences = d.train_df[d.train_df.task.map(lambda x: x in this_task)]
        # sentences = d.train_df.loc[
        #     d.train_df.task.map(
        #         lambda x: x
        #         in [
        #             "doosan_253717",
        #             "gen_argument",
        #             "gen_news_classification",
        #             "gen_sentiment_classification",
        #             "gen_sts_classification",
        #             "med_translation",
        #         ]
        #     )
        # ].reset_index(drop=True)

        ppl_out = get_perplexity(sentences, checkpoint_type="llama_ppl")
        ppl_lst.append(ppl_out)

  0%|                                 | 0/63 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
100%|████████████████████████| 63/63 [07:30<00:00,  7.15s/it]


KeyError: 'Column not found: 0'

In [None]:
import torch

torch.cuda.empty_cache()