In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, get_scheduler
from trl import SFTTrainer, SFTConfig
import torch

from dotenv import load_dotenv
load_dotenv()

import json
import pickle
import random

from utils_simulate import run_games

import os
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token(os.environ['HUGGINGFACE_TOKEN'])

import datasets

import pandas as pd

# from transformers.utils import logging
# logging.set_verbosity_info()

import logging
from transformers.utils import logging as transformers_logging

class CustomFilter(logging.Filter):
    def filter(self, record):
        return 'right-padding was detected' not in record.getMessage()

logger = logging.getLogger('transformers')

for handler in logger.handlers:
    handler.addFilter(CustomFilter())

In [2]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model_id = "astronomer/Llama-3-8B-Special-Tokens-Adjusted"

# tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='right')
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.gradient_checkpointing_enable()
id_eot = tokenizer.convert_tokens_to_ids(["<|eot_id|>"])[0]
pad_token_id = tokenizer.convert_tokens_to_ids(["<|end_of_text|>"])[0]

tokenizer.pad_token_id = pad_token_id
model.config.pad_token_id = pad_token_id

# tokenizer.pad_token_id = id_eot
# model.config.pad_token_id = id_eot

# tokenizer.add_special_tokens({"pad_token": "<|padding_token|>"})
# model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
1

1

In [4]:
tokenizer.pad_token, tokenizer.pad_token_id

('<|end_of_text|>', 128001)

In [5]:
id_eot, pad_token_id

(128009, 128001)

In [6]:
d_basic = pickle.load(open("data_collection/!final_dataset_11194_basic.pkl", "rb"))
d_hints = pickle.load(open("data_collection/!final_dataset_6016_with_hints_after_n_step.pkl", "rb"))
d_snap = pickle.load(open("data_collection/!final_dataset_17476_from_snapshots_fixed.pkl", "rb"))

In [7]:
def split_dataset(dataset, split_ratio=0.05):
    random.shuffle(dataset)
    split_index = int(len(dataset) * split_ratio)
    eval_set = dataset[:split_index]
    train_set = dataset[split_index:]
    return train_set, eval_set

d_basic, d_basic_eval = split_dataset(d_basic)
d_hints, d_hints_eval = split_dataset(d_hints)
d_snap, d_snap_eval = split_dataset(d_snap)

d_eval = d_basic_eval + d_hints_eval + d_snap_eval

In [8]:
# # # TODO TEMP!
# d_basic = d_basic[:1000]
# d_hints = d_hints[:30]
# d_snap = d_snap[:30]
# d_eval = d_eval[:30]

In [9]:
# def prepare_data(data):
#     prompts = [item['prompt'] for item in data]
#     responses = [item['response'] for item in data]
#     return datasets.Dataset.from_dict({"prompt": prompts, "response": responses})


from utils import SYSTEM_PROMPT_ASKER, SYSTEM_PROMPT_ANSWERER
# def prepare_inputs(chats: list[dict]):
#     tokenizer.padding_side = "left"
#     r_inputs = tokenizer.apply_chat_template(
#         [
#             [
#                 {"role": "system", "content": SYSTEM_PROMPT_ASKER if "only YES or NO" not in o['prompt'] else SYSTEM_PROMPT_ANSWERER},
#                 {"role": "user", "content": o['prompt']},
#             ] for o in chats
#         ],
#         tokenize=True,
#         padding="longest",
#         return_dict=True,
#     )
#     tokenizer.padding_side = "right"
#     r_labels = tokenizer.apply_chat_template(
#         [
#             [
#                 {"role": "assistant", "content": o['response']}
#             ] for o in chats
#         ],
#         tokenize=True,
#         padding="longest",
#         # return_dict=True,
#     )
#     return datasets.Dataset.from_dict(
#         {
#             "input_ids": r_inputs['input_ids'],
#             "attention_mask": r_inputs['attention_mask'],
#             "labels": r_labels,
#         }
#     ).shuffle(seed=42)


def prepare_inputs(chats: list[dict]):
    r_inputs = tokenizer.apply_chat_template(
        [
            [
                {"role": "system", "content": SYSTEM_PROMPT_ASKER if "only YES or NO" not in o['prompt'] else SYSTEM_PROMPT_ANSWERER},
                {"role": "user", "content": o['prompt']},                
                {"role": "assistant", "content": o['response']}
            ] for o in chats
        ],
        tokenize=False,
        padding=False,
        return_dict=False,
    )
    return datasets.Dataset.from_dict(
        {"texts": [r[:-len("<|start_header_id|>assistant<|end_header_id|>\n\n")] for r in r_inputs]}
    ).shuffle(seed=42)


# def tokenize_function(examples):
    # return tokenizer(examples['prompt'], text_target=examples['response'], truncation=True, padding="max_length", max_length=2500)

In [10]:
ds_basic = prepare_inputs(d_basic)
print("ds_basic", len(ds_basic))

ds_hints = prepare_inputs(d_hints)
print("d_hints", len(d_hints))

ds_snap = prepare_inputs(d_snap)
print("ds_snap", len(ds_snap))

ds_eval = prepare_inputs(d_eval)
print("ds_eval", len(ds_eval))

ds_basic 7961
d_hints 4287
ds_snap 16603
ds_eval 1517


In [11]:
for name, param in model.named_parameters():
    if "model.layers." not in name and not "head" in name and not "embed_tokens" in name:
        param.requires_grad = False
        continue
    if "mlp" in name:
        param.requires_grad = False
        continue
    if "norm" in name:
        param.requires_grad = False
        continue
    if "head" in name or "embed_tokens" in name:
        continue
    layer_num = int(name.split(".")[2])
    if layer_num <= 21:
        param.requires_grad = False
        continue

# for name, param in model.named_parameters():
#     if "model.layers." not in name and not "head" in name:
#         param.requires_grad = False
#         continue
#     if "mlp" in name:
#         param.requires_grad = False
#         continue
#     if "norm" in name:
#         param.requires_grad = False
#         continue
#     if "head" in name:
#         continue
#     layer_num = int(name.split(".")[2])
#     if layer_num <= 20:
#         param.requires_grad = False
#         continue

In [12]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.weight False
model.layers.0.self_attn.k_proj.weight False
model.layers.0.self_attn.v_proj.weight False
model.layers.0.self_attn.o_proj.weight False
model.layers.0.mlp.gate_proj.weight False
model.layers.0.mlp.up_proj.weight False
model.layers.0.mlp.down_proj.weight False
model.layers.0.input_layernorm.weight False
model.layers.0.post_attention_layernorm.weight False
model.layers.1.self_attn.q_proj.weight False
model.layers.1.self_attn.k_proj.weight False
model.layers.1.self_attn.v_proj.weight False
model.layers.1.self_attn.o_proj.weight False
model.layers.1.mlp.gate_proj.weight False
model.layers.1.mlp.up_proj.weight False
model.layers.1.mlp.down_proj.weight False
model.layers.1.input_layernorm.weight False
model.layers.1.post_attention_layernorm.weight False
model.layers.2.self_attn.q_proj.weight False
model.layers.2.self_attn.k_proj.weight False
model.layers.2.self_attn.v_proj.weight False
model.layers.2.self_attn.o_proj.

In [13]:
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

num_trainable_params = count_trainable_parameters(model)
print(f"Number of trainable parameters: {num_trainable_params}")

Number of trainable parameters: 1470103552


In [14]:
import logging
logging.basicConfig(filename='training_log.txt', level=logging.INFO)

In [15]:
# eval_set = json.load(open("data/eval_set.json", "r"))[:10] # TODO!!!!
eval_set = json.load(open("data/eval_set.json", "r"))
train_set = json.load(open("data/train_set.json", "r"))
def custom_evaluation(model, epoch, short=True):
    if short:
        keywords = [e['keyword'] for e in eval_set if e['difficulty'] == "Easy"]
    else:
        keywords = [e['keyword'] for e in eval_set]

    games = run_games(
        keywords,
        tokenizer,
        model,
        id_eot,
        batch_size=15,
    )
    if not short:
        pickle.dump(games, open(f"full_eval_{epoch}.pkl", "wb"))
    return sum([g.win for g in games]) / len(games)

def custom_evaluation_on_train(model, n_examples=100):
    import random
    random.seed(42)
    random.shuffle(train_set)
    
    keywords = [e['keyword'] for e in train_set[:n_examples]]
    games = run_games(
        keywords,
        tokenizer,
        model,
        id_eot,
        batch_size=15,
    )
    return sum([g.win for g in games]) / len(games)

In [16]:
from trl.trainer import DataCollatorForCompletionOnlyLM

data_collator = DataCollatorForCompletionOnlyLM(response_template="<|start_header_id|>assistant<|end_header_id|>", tokenizer=tokenizer)

In [17]:
model = model.train()

In [18]:
save_epochs = set([0])
cumulative_epoch = 0
for ds, num_epochs in (
    # (ds_basic, 1),
    # (ds_hints, 1),
    (ds_snap, 1),
):
    for epoch in range(num_epochs):
        training_args = SFTConfig(
            output_dir="./results",
            num_train_epochs=1,
            per_device_train_batch_size=25,
            per_device_eval_batch_size=25,
            save_strategy="no",
            learning_rate=1e-05,
            weight_decay=0.2,
            bf16=True,
            lr_scheduler_type="linear",
            warmup_ratio=0.1,
            eval_strategy="steps",
            eval_steps=150,
            logging_steps=150,
            neftune_noise_alpha=None,
            max_seq_length=5000,
            num_of_sequences=5000,
            dataset_text_field="texts",
        )

        trainer = SFTTrainer(
            model=model,
            args=training_args,
            train_dataset=ds,
            eval_dataset=ds_eval,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        train_result = trainer.train()
        train_loss = train_result.training_loss
        logging.info(f"Epoch {cumulative_epoch} - Train Loss: {train_loss}")
        print(f"Epoch {cumulative_epoch} - Train Loss: {train_loss}")
        
        eval_result = trainer.evaluate()
        eval_loss = eval_result['eval_loss']
        
        logging.info(f"Epoch {cumulative_epoch} - Eval Loss: {eval_loss}")
        print(f"Epoch {cumulative_epoch} - Eval Loss: {eval_loss}")
        
        eval_metrics = custom_evaluation(model, cumulative_epoch)
        logging.info(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
        print(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
        
        if cumulative_epoch in save_epochs:
            # TODO!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            # output_dir = f"./results/llama_{cumulative_epoch}"
            # os.makedirs(output_dir, exist_ok=True)
            # model.save_pretrained(output_dir)
            # tokenizer.save_pretrained(output_dir)
            eval_metrics = custom_evaluation(model, cumulative_epoch, short=False)
            logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
            print(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
            eval_metrics_on_train = custom_evaluation_on_train(model)
            logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")
            print(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")
        
        print(f"Finished epoch {cumulative_epoch + 1}")
        cumulative_epoch += 1  # Increment the overall epoch counter



Map:   0%|          | 0/16603 [00:00<?, ? examples/s]

Map:   0%|          | 0/1517 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
150,0.8384,0.733219
300,0.5993,0.704635
450,0.5631,0.698442
600,0.5664,0.697272


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch 0 - Train Loss: 0.63388595007416


Epoch 0 - Eval Loss: 0.6964104175567627
Processing batch of games 2024-07-25 09:29:41.162127
Processing batch of games 2024-07-25 09:30:44.106538
Processing batch of games 2024-07-25 09:32:03.887699
Epoch 0 - Custom Eval: 0.08108108108108109
Processing batch of games 2024-07-25 09:32:41.498718
Processing batch of games 2024-07-25 09:33:44.521859
Processing batch of games 2024-07-25 09:35:00.919580
Processing batch of games 2024-07-25 09:36:12.841255
Processing batch of games 2024-07-25 09:37:22.326015
Processing batch of games 2024-07-25 09:38:36.518110
Processing batch of games 2024-07-25 09:39:52.614469
Processing batch of games 2024-07-25 09:41:04.384166
Processing batch of games 2024-07-25 09:42:23.205899
Processing batch of games 2024-07-25 09:43:39.035542
Epoch 0 - Custom Eval (full): 0.050359712230215826
Processing batch of games 2024-07-25 09:44:13.208081
Processing batch of games 2024-07-25 09:45:23.952848
Processing batch of games 2024-07-25 09:46:34.631723
Processing batch o

In [18]:
cumulative_epoch = 100
eval_metrics = custom_evaluation(model, cumulative_epoch)
logging.info(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
print(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
eval_metrics = custom_evaluation(model, cumulative_epoch, short=False)
logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
print(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
eval_metrics_on_train = custom_evaluation_on_train(model)
logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")
print(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")

Processing batch of games 2024-07-25 10:09:24.269525
Processing batch of games 2024-07-25 10:10:55.196423
Processing batch of games 2024-07-25 10:12:17.819128
Epoch 100 - Custom Eval: 0.02702702702702703
Processing batch of games 2024-07-25 10:12:58.992802
Processing batch of games 2024-07-25 10:14:32.838505
Processing batch of games 2024-07-25 10:16:13.000444
Processing batch of games 2024-07-25 10:17:37.062857
Processing batch of games 2024-07-25 10:18:52.478248
Processing batch of games 2024-07-25 10:20:16.587125
Processing batch of games 2024-07-25 10:21:53.740797
Processing batch of games 2024-07-25 10:23:17.495005
Processing batch of games 2024-07-25 10:24:45.651247
Processing batch of games 2024-07-25 10:26:05.676559
Epoch 100 - Custom Eval (full): 0.007194244604316547
Processing batch of games 2024-07-25 10:26:36.769083
Processing batch of games 2024-07-25 10:28:00.031064
Processing batch of games 2024-07-25 10:29:49.797583
Processing batch of games 2024-07-25 10:31:15.785376
P

In [20]:
# eval_metrics = custom_evaluation(model, cumulative_epoch)

In [None]:
# eval_metrics

In [None]:
# output_dir = f"./results/llama_{cumulative_epoch}"
# os.makedirs(output_dir, exist_ok=True)
# model.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)

In [None]:
# eval_metrics = custom_evaluation(model, 111, short=False)

In [18]:
save_epochs = set([0])
cumulative_epoch = 0
for ds, num_epochs in (
    # (ds_basic, 1),
    (ds_hints, 1),
    # (ds_snap, 1),
):
    for epoch in range(num_epochs):
        training_args = SFTConfig(
            output_dir="./results",
            num_train_epochs=1,
            per_device_train_batch_size=25,
            per_device_eval_batch_size=25,
            save_strategy="no",
            learning_rate=1e-05,
            weight_decay=0.25,
            bf16=True,
            lr_scheduler_type="linear",
            warmup_ratio=0.1,
            eval_strategy="steps",
            eval_steps=150,
            logging_steps=150,
            neftune_noise_alpha=None,
            max_seq_length=5000,
            num_of_sequences=5000,
            dataset_text_field="texts",
        )

        trainer = SFTTrainer(
            model=model,
            args=training_args,
            train_dataset=ds,
            eval_dataset=ds_eval,
            tokenizer=tokenizer,
            # data_collator=data_collator,
        )

        train_result = trainer.train()
        train_loss = train_result.training_loss
        logging.info(f"Epoch {cumulative_epoch} - Train Loss: {train_loss}")
        print(f"Epoch {cumulative_epoch} - Train Loss: {train_loss}")
        
        eval_result = trainer.evaluate()
        eval_loss = eval_result['eval_loss']
        
        logging.info(f"Epoch {cumulative_epoch} - Eval Loss: {eval_loss}")
        print(f"Epoch {cumulative_epoch} - Eval Loss: {eval_loss}")
        
        eval_metrics = custom_evaluation(model, cumulative_epoch)
        logging.info(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
        print(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
        
        if cumulative_epoch in save_epochs:
            # TODO !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            # output_dir = f"./results/llama_no_mask_{cumulative_epoch}"
            # os.makedirs(output_dir, exist_ok=True)
            # model.save_pretrained(output_dir)
            # tokenizer.save_pretrained(output_dir)
            eval_metrics = custom_evaluation(model, cumulative_epoch, short=False)
            logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
            print(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
            eval_metrics_on_train = custom_evaluation_on_train(model)
            logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")
            print(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")
       
        print(f"Finished epoch {cumulative_epoch + 1}")
        cumulative_epoch += 1  # Increment the overall epoch counter



Map:   0%|          | 0/4287 [00:00<?, ? examples/s]

Map:   0%|          | 0/1517 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
150,0.7303,0.161857


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch 0 - Train Loss: 0.6553056808405144


Epoch 0 - Eval Loss: 0.16271020472049713
Processing batch of games 2024-07-25 14:14:43.680049
Processing batch of games 2024-07-25 14:15:57.433648
Processing batch of games 2024-07-25 14:17:28.763690
Epoch 0 - Custom Eval: 0.08108108108108109
Processing batch of games 2024-07-25 14:18:15.573899
Processing batch of games 2024-07-25 14:19:51.400755
Processing batch of games 2024-07-25 14:21:17.810188
Processing batch of games 2024-07-25 14:22:39.612578
Processing batch of games 2024-07-25 14:23:45.895106
Processing batch of games 2024-07-25 14:25:01.293184
Processing batch of games 2024-07-25 14:26:19.515741
Processing batch of games 2024-07-25 14:27:54.013202
Processing batch of games 2024-07-25 14:29:35.996222
Processing batch of games 2024-07-25 14:31:00.559145
Epoch 0 - Custom Eval (full): 0.050359712230215826
Processing batch of games 2024-07-25 14:31:33.966403
Processing batch of games 2024-07-25 14:32:59.482198
Processing batch of games 2024-07-25 14:34:32.018090
Processing batch 

In [19]:
# eval_metrics = custom_evaluation(model, cumulative_epoch)
# logging.info(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
# print(f"Epoch {cumulative_epoch} - Custom Eval: {eval_metrics}")
# eval_metrics = custom_evaluation(model, cumulative_epoch, short=False)
# logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
# print(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
# eval_metrics_on_train = custom_evaluation_on_train(model)
# logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")
# print(f"Epoch {cumulative_epoch} - Custom Eval (full train): {eval_metrics_on_train}")

In [None]:
output_dir = f"./results/llama_no_mask_{cumulative_epoch}"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
eval_metrics = custom_evaluation(model, cumulative_epoch, short=False)
logging.info(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")
print(f"Epoch {cumulative_epoch} - Custom Eval (full): {eval_metrics}")

Configuration saved in ./results/llama_no_mask_2/config.json
Configuration saved in ./results/llama_no_mask_2/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 4 checkpoint shards. You can find where each parameters has been saved in the index located at ./results/llama_no_mask_2/model.safetensors.index.json.
tokenizer config file saved in ./results/llama_no_mask_2/tokenizer_config.json
Special tokens file saved in ./results/llama_no_mask_2/special_tokens_map.json


Processing batch of games 2024-07-24 20:30:57.513520
