In [59]:
import pandas as pd
import datasets
from utils import remove_hangul
from transformers import AutoTokenizer

In [None]:
# Load the rows to run inference on, plus the sample file that carries the
# system prompt (that file is stored in the cp949 code page, not UTF-8).
df = pd.read_csv("../data/inference_data.csv", encoding="utf-8")
df_text = pd.read_csv("../data/data_sample_20251111_01.csv", encoding="cp949")

# Take the first row's prompt positionally so the lookup does not depend on
# the index labels, and use bracket access so the column name cannot be
# confused with a DataFrame attribute.
system_prompt = df_text["system"].iloc[0]

ds = datasets.Dataset.from_pandas(df)
# Every column except the record id and the prompt text is dropped below.
columns_to_remove = [f for f in ds.features if f not in ("subject_id", "text")]

## Explicit format
# Replace the raw `text` string with a two-message conversation:
# the shared system prompt followed by the row's own text as the user turn.
train_ds = ds.map(
    lambda sample: {
        "text": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": sample["text"]},
        ]
    }
)

# NOTE(review): `text` is already a list of chat messages at this point, so
# remove_hangul receives the wrapped conversation (system prompt included) —
# confirm that this ordering is intended.
train_ds = train_ds.map(lambda sample: remove_hangul(sample, column="text"))

# Drop unused columns with the dedicated method instead of a function-less
# map(): no per-example pass over the data is needed just to discard columns.
train_ds = train_ds.remove_columns(columns_to_remove)

Map: 100%|██████████| 10000/10000 [00:00<00:00, 10456.67 examples/s]
Map: 100%|██████████| 10000/10000 [00:01<00:00, 6865.58 examples/s]
Map: 100%|██████████| 10000/10000 [00:00<00:00, 22319.99 examples/s]


In [None]:
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# Load the fast tokenizer for the model. `trust_remote_code` is deliberately
# not enabled: it allows Python code shipped inside the Hub repository to run
# locally, which a standard tokenizer load should not need.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
)
# Reuse the EOS token for padding and pad on the left side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Custom chat template, assigned to the tokenizer below.
#   * starts with the BOS token
#   * system / user turns: role header, blank line, content, EOS token
#   * assistant turns: role header, then content + EOS wrapped in
#     {% generation %} ... {% endgeneration %} markers
#   * when `add_generation_prompt` is set, an open assistant header is
#     appended so the model continues from there
LLAMA_3_CHAT_TEMPLATE = (
    "{{ bos_token }}"
    "{% for message in messages %}"
        "{% if message['role'] == 'system' %}"
            "{{ '<|start_header_id|>system<|end_header_id|>\n\n' + message['content'] + eos_token }}"
        "{% elif message['role'] == 'user' %}"
            "{{ '<|start_header_id|>user<|end_header_id|>\n\n' + message['content'] +  eos_token }}"
        "{% elif message['role'] == 'assistant' %}"
            "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n'}}"
            "{% generation %}"
            "{{ message['content'] +  eos_token }}"
            "{% endgeneration %}"
        "{% endif %}"
    "{% endfor %}"
    "{%- if add_generation_prompt %}"
    "{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
    "{%- endif %}"
)

# Override whatever template was loaded with the tokenizer.
tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE

In [75]:
def template_dataset(example):
    """Count the tokens of one example after applying the chat template.

    Args:
        example: A dataset row whose ``"text"`` field holds the list of chat
            messages (``{"role": ..., "content": ...}`` dicts).

    Returns:
        A dict with a single key ``"token_size"``: the number of token ids
        produced by rendering ``example["text"]`` through the module-level
        ``tokenizer``'s chat template.
    """
    # Ask explicitly for a flat list of token ids. If a library version
    # returns a dict-like encoding by default, len() would count its keys
    # instead of the tokens — pinning return_dict=False avoids that.
    token_ids = tokenizer.apply_chat_template(
        example["text"],
        tokenize=True,
        return_dict=False,
    )
    return {"token_size": len(token_ids)}

In [76]:
# Add a `token_size` column holding each example's templated token count.
train_ds = train_ds.map(template_dataset)

# Read the column once and reuse it for both statistics instead of
# fetching it from the dataset separately for max and min.
token_sizes = list(train_ds["token_size"])
print(f"max token_size: {max(token_sizes)}")
print(f"min token_size: {min(token_sizes)}")

Map: 100%|██████████| 10000/10000 [00:36<00:00, 277.29 examples/s]


max token_size: 17176
min token_size: 286
