# Exploring chat templates

In [1]:
from transformers import AutoTokenizer
from trl.trainer import DataCollatorForCompletionOnlyLM
from datasets import load_dataset, Dataset

# Hugging Face Hub id of the dataset explored in this notebook.
dataset_name: str = "GAIR/lima"

# Load the 'plain_text' configuration of that dataset.
ds = load_dataset(path=dataset_name, name="plain_text")

# Llama-2 and OPT tokenizers, kept side by side so their chat templates can be compared.
tokenizer_opt = AutoTokenizer.from_pretrained("facebook/opt-125m")
tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Inspect the fallback chat template. Per the warning shown in this cell's output,
# this tokenizer defines no `chat_template` of its own, so the class default is used.
tokenizer_llama.default_chat_template


No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



"{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don\\'t know the answer to a question, please don\\'t share false information.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must

In [3]:
# First training example; its 'conversations' entry is indexed below as
# [user prompt, assistant reply].
x = ds['train'][0]['conversations']

# Wrap both turns in the role/content message format passed to `apply_chat_template`.
x_formatted = [
    dict(role='user', content=x[0]),
    dict(role='assistant', content=x[1]),
]

# Render to a string instead of token ids.
# Author's note: this method is also what SFTTrainer calls, so the data has to be
# in the right format before it gets there.
tokenizer_llama.apply_chat_template(x_formatted, tokenize=False)

# Author's note: apply_chat_template does NOT seem to support the usual instruction
# format, contrary to what is claimed here:
# https://huggingface.co/docs/trl/sft_trainer#dataset-format-support

'<s>[INST] Can brain cells move? By movement I mean long distance migration (preferably within the brain only). [/INST] The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells migrate ov

In [4]:
# Render the same messages with the OPT tokenizer, for comparison with the Llama output.
tokenizer_opt.apply_chat_template(x_formatted, tokenize=False)


No chat template is defined for this tokenizer - using the default template for the GPT2TokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).</s>The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells migrate over long distanc

# Data processing

In [45]:
from functools import partial

def filter_example(x):
    """Return True for rows to keep, i.e. rows whose "source" is not "multi_turn"."""
    is_multi_turn = x["source"] == "multi_turn"
    return not is_multi_turn

def process_example(x, tokenizer):
    """Render a single-turn conversation into one training string.

    Args:
        x: Dataset row whose 'conversations' entry holds exactly two items,
            the user prompt followed by the assistant reply.
        tokenizer: Object exposing `apply_chat_template` (e.g. a Hugging Face
            tokenizer); its chat template decides the final text layout.

    Returns:
        A dict of the form `{'text': <rendered conversation>}`.

    Raises:
        AssertionError: If the conversation does not have exactly two turns.
    """
    turns = x['conversations']
    assert len(turns) == 2, "The multi-turn format is not supported"
    messages = [
        {'role': 'user', 'content': turns[0]},
        {'role': 'assistant', 'content': turns[1]},
    ]
    # The conversation already ends with the assistant's reply, so no generation
    # prompt is requested: with templates that honour the flag it would append an
    # empty assistant header to the end of every training string.
    # NOTE: Calling `apply_chat_template` on the instance again is no bueno
    return {
        'text': tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
    }

# Drop the raw input columns once the 'text' column has been produced.
map_kwargs = {'remove_columns': ["conversations", "source"]}

# Bind each tokenizer to the shared processing function.
opt_processer = partial(process_example, tokenizer=tokenizer_opt)
llama_processer = partial(process_example, tokenizer=tokenizer_llama)

# The filter does not depend on the tokenizer, so run it once and reuse the result
# for both tokenizer-specific mappings.
train_ds_single_turn = ds['train'].filter(filter_example)

train_ds_opt = train_ds_single_turn.map(opt_processer, **map_kwargs)
train_ds_llama = train_ds_single_turn.map(llama_processer, **map_kwargs)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 3498.87 examples/s]


In [78]:
# Choose which tokenizer to inspect; assign `tokenizer_opt` here to check the OPT setup.
tokenizer = tokenizer_llama

# Just picking the dataset, and instruction and response templates.
# Identity (`is`) is used because the question is which of the two tokenizer objects
# was selected above, not whether two tokenizers happen to compare equal.
if tokenizer is tokenizer_llama:
    instruction_template = '[INST]'
    response_template = '[/INST]'
    x = train_ds_llama[0]
elif tokenizer is tokenizer_opt:
    instruction_template = tokenizer_opt.bos_token
    response_template = tokenizer_opt.eos_token
    x = train_ds_opt[0]
else:
    # Fail here instead of hitting a NameError on the undefined templates below.
    raise ValueError("`tokenizer` must be either `tokenizer_llama` or `tokenizer_opt`")
print(f"{x}\n")

x_tokenized = tokenizer(x['text'], return_tensors='pt')

# NOTE(review): the templates are passed as strings; whether they tokenize the same
# way on their own as inside the full text has not been checked here -- confirm.
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    mlm=False,
    instruction_template=instruction_template,
    response_template=response_template,
)
# Collate a one-example batch made of the first row of `input_ids`.
collated_x = collator.torch_call([x_tokenized['input_ids'][0]])
input_ids = collated_x['input_ids']
labels = collated_x['labels']

# Decode only the tokens whose label differs from the collator's ignore index.
print(f"{tokenizer.decode(input_ids[labels != collator.ignore_index])}")

{'text': '<s>[INST] Can brain cells move? By movement I mean long distance migration (preferably within the brain only). [/INST] The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells m