In [9]:
#this notebook follows along with https://wandb.ai/capecape/alpaca_ft/reports/How-to-Fine-Tune-an-LLM-Part-2-Instruction-Tuning-Llama-2--Vmlldzo1NjY0MjE1#:~:text=Transformer%2Dbased%20models%20like%20Llama,the%20last%208%20of%20them
import wandb
from datasets import load_from_disk 

# Open a W&B run and fetch the pre-packed Alpaca dataset artifact through it.
run = wandb.init(project="alpaca_ft")
artifact = run.use_artifact(
    "capecape/alpaca_ft/packed_alpaca_hf:v0",
    type="dataset",
)
artifact_dir = artifact.download()

# The downloaded directory is loaded with `load_from_disk` and indexed by its
# "train" and "eval" splits.
ds_packed = load_from_disk(artifact_dir)
train_ds_packed, eval_ds_packed = ds_packed["train"], ds_packed["eval"]



[34m[1mwandb[0m: Downloading large artifact packed_alpaca_hf:v0, 178.69MB. 7 files... 
[34m[1mwandb[0m:   7 of 7 files downloaded.  
Done. 0:0:28.8


In [11]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 8  # I have an A100 GPU with 40GB of RAM 😎

# Both loaders share the same batch size and collator (we don't need any
# special collator 😎), so the common settings are spelled out once.
_loader_kwargs = dict(batch_size=batch_size, collate_fn=default_data_collator)

# The training loader leaves `shuffle` at the DataLoader default.
train_dataloader = DataLoader(train_ds_packed, **_loader_kwargs)

# The evaluation loader explicitly disables shuffling.
eval_dataloader = DataLoader(eval_ds_packed, shuffle=False, **_loader_kwargs)

In [17]:
# Pull a single training batch to sanity-check its structure: the available
# keys, plus the first 25 input ids and first 25 labels of the first sequence.
b = next(iter(train_dataloader))
b.keys(), b["input_ids"][0][:25], b["labels"][0][:25]

(dict_keys(['input_ids', 'labels']),
 tensor([    1, 13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889,
         14350,   263,  2933,   393,  7128,  2486,  1614,  2167,   278,  2009,
         29889,    13,    13,  2277, 29937]),
 tensor([13866,   338,   385, 15278,   393, 16612,   263,  3414, 29889, 14350,
           263,  2933,   393,  7128,  2486,  1614,  2167,   278,  2009, 29889,
            13,    13,  2277, 29937,  2799]))

In [24]:
from types import SimpleNamespace

# Target an effective batch of 32 sequences per optimizer update. Clamp to at
# least 1: with batch_size > 32 the integer division would give 0, and the
# total_train_steps computation below would raise ZeroDivisionError.
gradient_accumulation_steps = max(1, 32 // batch_size)
max_sequence_len = 128  # the tutorial doesn't state this value; assumed to be a safe choice

# Single namespace holding every hyperparameter used by this notebook.
config = SimpleNamespace(
    model_id='NousResearch/Llama-2-7b-hf',
    dataset_name="alpaca-gpt4",
    precision="bf16",  # faster and better than fp16, requires new GPUs
    n_freeze=24,  # How many layers we don't train, Llama 7B has 32.
    lr=2e-4,
    n_eval_samples=10,  # How many samples to generate on validation
    max_seq_len=max_sequence_len,  # Length of the sequences to pack
    epochs=3,  # we do 3 passes over the dataset.
    gradient_accumulation_steps=gradient_accumulation_steps,  # every how many iterations we update the gradients, simulates larger batch sizes
    batch_size=batch_size,  # what my GPU can handle, depends on how many layers we are training
    log_model=False,  # upload the model to W&B?
    mom=0.9,  # optim param
    gradient_checkpointing=True,  # saves even more memory
    freeze_embed=True,  # why train this? let's keep them frozen ❄️
)

# Number of optimizer updates over the whole run: batches seen across all
# epochs, divided by how many batches are accumulated per update.
config.total_train_steps = config.epochs * len(train_dataloader) // config.gradient_accumulation_steps


In [25]:
from transformers import AutoModelForCausalLM # tutorial forgets this too...
import torch # and this lol

# Load the base checkpoint named in `config.model_id` onto device 0 in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    config.model_id,
    device_map=0,
    # Llama is implemented natively in transformers, so there is no need to
    # let Python code shipped inside the Hub repository run on this machine.
    # Only flip this to True for a checkpoint whose custom code you have reviewed.
    trust_remote_code=False,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    use_cache=False,  # NOTE(review): presumably disabled because gradient checkpointing is enabled later — confirm
)

Downloading config.json: 100%|██████████| 583/583 [00:00<00:00, 3.92MB/s]
Downloading (…)fetensors.index.json: 100%|██████████| 26.8k/26.8k [00:00<00:00, 21.5MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 9.98G/9.98G [24:05<00:00, 6.90MB/s]
Downloading (…)of-00002.safetensors: 100%|██████████| 3.50G/3.50G [08:27<00:00, 6.90MB/s]
Downloading shards: 100%|██████████| 2/2 [32:33<00:00, 976.53s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.27s/it]
Downloading generation_config.json: 100%|██████████| 179/179 [00:00<00:00, 1.45MB/s]


In [None]:
# You can play with this parameter via `config.n_freeze`. It must be an int:
# it is used as a slice index below, and a float such as `24.` raises
# "TypeError: slice indices must be integers".
n_freeze = config.n_freeze

# freeze layers (disable gradients): start from everything frozen...
for param in model.parameters():
    param.requires_grad = False
# ...then re-enable the output head and every decoder layer from n_freeze onwards
for param in model.lm_head.parameters():
    param.requires_grad = True
for param in model.model.layers[n_freeze:].parameters():
    param.requires_grad = True
# freeze embeddings
# NOTE(review): the blanket loop above already froze them and nothing re-enables
# them, so setting config.freeze_embed=False does not make the embeddings trainable.
if config.freeze_embed:
    model.model.embed_tokens.weight.requires_grad_(False)
# use model checkpointing
if config.gradient_checkpointing:
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})  # <- pytorch changed this

In [None]:
from transformers import get_cosine_schedule_with_warmup

# Adam is handed every model parameter, frozen ones included.
optim = torch.optim.Adam(
    model.parameters(),
    lr=config.lr,
    betas=(0.9, 0.99),
    eps=1e-5,
)

# Cosine learning-rate schedule; the warmup spans the first tenth of the
# planned optimizer steps.
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optim,
    num_warmup_steps=config.total_train_steps // 10,
    num_training_steps=config.total_train_steps,
)

def loss_fn(x, y):
    """A Flat CrossEntropy.

    Collapses every leading dimension of the logits `x` (keeping the last one
    as the class dimension) and flattens the targets `y` to 1-D, then returns
    `torch.nn.functional.cross_entropy` of the two.

    Args:
        x: logits tensor whose last dimension indexes the classes.
        y: integer target tensor with one entry per row of the flattened logits.

    Returns:
        The scalar loss tensor produced by `cross_entropy`.
    """
    # `reshape` instead of `view`: identical for contiguous tensors, but it also
    # accepts non-contiguous inputs (e.g. transposed or sliced tensors), where
    # `view` raises a RuntimeError.
    flat_logits = x.reshape(-1, x.shape[-1])
    flat_targets = y.reshape(-1)
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

In [None]:
#tutorial forgets this as well
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint id as the model (config.model_id)
# so the two cannot drift apart if the model is swapped in the config.
tokenizer = AutoTokenizer.from_pretrained(config.model_id)


In [None]:
from transformers import GenerationConfig
# Generation settings loaded from the same checkpoint id as the model.
gen_config = GenerationConfig.from_pretrained(config.model_id)

def generate(prompt, max_new_tokens=100, gen_config=gen_config):
    """Generate a continuation for `prompt` with the notebook's global model.

    Args:
        prompt: text passed to the global `tokenizer`.
        max_new_tokens: value forwarded to `model.generate` as `max_new_tokens`.
        gen_config: generation config forwarded to `model.generate`; defaults
            to the module-level `gen_config` as it exists when this cell runs.

    Returns:
        The decoded text of the tokens that follow the prompt in the first
        returned sequence, with special tokens skipped.
    """
    with torch.inference_mode():
        # Put the prompt on whatever device the model lives on instead of
        # assuming the current CUDA device.
        tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].to(model.device)
        output = model.generate(tokenized_prompt,
                                max_new_tokens=max_new_tokens,
                                generation_config=gen_config)
    # Drop the leading prompt tokens so only the newly generated part is decoded.
    prompt_len = tokenized_prompt.shape[1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)


In [None]:
# The tutorial doesn't show how test_config is built. prompt_table() reads
# `test_config.max_new_tokens` and `test_config.gen_config.temperature` / `.top_p`,
# so both attributes must exist and `gen_config` must be the GenerationConfig.
test_config = SimpleNamespace(
    **vars(config),          # reuse the training settings instead of duplicating them
    max_new_tokens=256,      # cap on the length of each sampled generation; tune freely
    gen_config=gen_config,   # the GenerationConfig object, not the training `config` namespace
)

In [None]:
def prompt_table(prompts, log=True):
    """Generate a completion for each prompt and collect the results in a W&B table.

    Args:
        prompts: iterable of prompt strings.
        log: when true, the table is also logged to W&B under "predictions".

    Returns:
        The `wandb.Table` with one row per prompt: the prompt, its generation,
        their concatenation, and the generation settings read from `test_config`.
    """
    # `progress_bar` is not defined in the visible cells; tqdm is what the rest
    # of the notebook uses. Imported locally so this cell does not depend on
    # the later cell that imports it.
    from tqdm import tqdm

    table = wandb.Table(columns=["prompt", "generation", "concat", "max_new_tokens", "temperature", "top_p"])
    for prompt in tqdm(prompts):
        out = generate(prompt, test_config.max_new_tokens, test_config.gen_config)
        table.add_data(
            prompt,
            out,
            prompt + out,
            test_config.max_new_tokens,
            test_config.gen_config.temperature,
            test_config.gen_config.top_p,
        )
    if log:
        wandb.log({"predictions": table})
    return table

In [None]:
from tqdm import tqdm
@torch.no_grad()
def validate():
    """Run one pass over `eval_dataloader` and log the results to W&B.

    Logs "eval_loss" (mean of the per-batch losses) and "eval_accuracy", then
    logs a table of sample generations via `prompt_table`. The model is put in
    eval mode for the pass and switched back to train mode afterwards, even if
    the pass raises.

    NOTE(review): `Accuracy`, `to_gpu` and `eval_dataset` are not defined in the
    cells visible here — presumably they come from another cell; confirm.
    """
    model.eval()
    eval_acc = Accuracy()
    total_loss, n_batches = 0.0, 0
    try:
        for batch in tqdm(eval_dataloader):
            batch = to_gpu(batch)
            with torch.amp.autocast("cuda", dtype=torch.bfloat16):
                out = model(**batch)
                loss = loss_fn(out.logits, batch["labels"])  # you could use out.loss and not shift the dataset
            eval_acc.update(out.logits, batch["labels"])
            # Accumulate so the logged value covers the whole eval set rather
            # than just whichever batch happened to come last.
            total_loss += loss.item()
            n_batches += 1
        # we log results at the end; skip the metrics if the loader was empty
        if n_batches:
            wandb.log({"eval_loss": total_loss / n_batches,
                       "eval_accuracy": eval_acc.compute()})
        prompt_table(eval_dataset[:config.n_eval_samples], log=True)
    finally:
        model.train()
