In [1]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import os

#  [markdown]
# ## Fine-tune an LLM on an A100
#
# We will leverage the PEFT library from the Hugging Face ecosystem, as well as QLoRA, for more memory-efficient fine-tuning.

# %%
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
# %%
!nvidia-smi

Sun Nov 12 03:37:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    46W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
#  [markdown]
# ## Setup
#
# Run the cells below to set up and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and TRL to leverage the recent [`SFTTrainer`](https://huggingface.co/docs/trl/main/en/sft_trainer). We will use `bitsandbytes` to [quantize the base model into 4-bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes).

#
!pip install -U trl accelerate protobuf datasets bitsandbytes einops wandb sentencepiece
!pip install -U git+https://github.com/huggingface/peft
!pip install -U git+https://github.com/huggingface/transformers

Collecting trl
  Downloading trl-0.7.4-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.9/133.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf
  Downloading protobuf-4.25.0-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.4/294.4 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m

In [4]:
# %%
import torch
import pandas as pd
import copy
from datasets import load_dataset, Dataset
from peft import LoraConfig
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
)



In [5]:
# [markdown]
# ## Dataset
#
# For our experiment, we will use the `ehartford/dolphin` dataset to train a general-purpose instruct model.
# The dataset can be found [here](https://huggingface.co/datasets/ehartford/dolphin)
#

# %%
seed = 42


# %%
# grab the first 500000 entries of dolphin in an instruction format

dataset_name = "ehartford/dolphin"
# Number of leading rows taken from the stream.
num_dolphin_rows = 500000

print(f"\nLoading {dataset_name} dataset...")
# Open the train split in streaming mode and keep only the first rows.
dataset_dolphin = load_dataset(dataset_name, split="train", streaming=True)
dataset_dolphin = dataset_dolphin.take(num_dolphin_rows)

# One pass over the stream: the prompt is the instruction and the input joined
# by a single space, the response is the output field.
questions = []
responses = []
for row in dataset_dolphin:
    questions.append(f'{row["instruction"]} {row["input"]}')
    responses.append(row["output"])

# Build the frame column-wise. Constructing it from [questions, responses] and
# transposing first creates a 2 x N frame (one column per example), which is
# needlessly expensive for hundreds of thousands of rows.
pandas_dataset_dolphin = pd.DataFrame({"prompt": questions, "response": responses})

# NOTE: the Open-Platypus mix-in below is disabled (every statement that touches
# it is commented out). The only lines that still execute are the two list
# resets, which rebind `questions` and `responses` to fresh empty lists.
# grab the first 5000 entries of platypus in an instruction format

#dataset_name = "garage-bAInd/Open-Platypus"
#print(f"\nLoading {dataset_name} dataset...")
#dataset_platypus = load_dataset(dataset_name, split="train", streaming=True)
#dataset_platypus = dataset_platypus.take(5000)

questions = []
responses = []

#for row in dataset_platypus:
#    questions.append(
#        f'You are a helpful AI assistant. Write a response that appropriately completes the request. {row["instruction"]}'
#    )
#    responses.append(row["output"])

#pandas_dataset_platypus = pd.DataFrame([questions, responses]).T
#pandas_dataset_platypus.columns = ["prompt", "response"]



Loading ehartford/dolphin dataset...


Downloading readme:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

In [6]:

# Number of leading rows used for training; every row after that is used for evaluation.
train_size = 400000

# Work on a copy so the source frame stays untouched.
# (Previously considered: pd.concat([pandas_dataset_platypus, pandas_dataset_dolphin]))
pandas_train_dataset = pandas_dataset_dolphin.copy()


def _build_split(frame, label):
    """Turn a slice of the prompt/response frame into a shuffled `Dataset`.

    Args:
        frame: pandas DataFrame slice holding the rows of this split.
        label: split name used in the printed messages ("train" or "eval").

    Returns:
        The split as a `Dataset`, restricted to the "prompt" and "response"
        columns and shuffled with the notebook-level `seed`.
    """
    split = Dataset.from_pandas(frame)
    # remove old text cols
    split = split.remove_columns(
        [col for col in split.column_names if col not in ["prompt", "response"]]
    )

    print(f"Print an example in the {label} dataset:")
    print(split)
    print(split[0])

    print(f"Final {label} dataset:")
    split = split.shuffle(seed=seed)
    print(split)
    print(split[0])
    print(split[-1])
    return split


train_dataset = _build_split(pandas_train_dataset.iloc[:train_size, :], "train")
eval_dataset = _build_split(pandas_train_dataset.iloc[train_size:, :], "eval")

# let's now write a function to format the dataset for instruction fine-tuning


def formatting_prompts_func(dataset):
    """Join every prompt with its response into a single training string.

    Args:
        dataset: mapping with parallel "prompt" and "response" sequences.

    Returns:
        A list with one string per prompt, in order: the prompt, a newline,
        then the response at the same position.
    """
    prompts = dataset["prompt"]
    responses = dataset["response"]
    return [f"{prompts[idx]}\n{responses[idx]}" for idx in range(len(prompts))]

Print an example in the train dataset:
Dataset({
    features: ['prompt', 'response'],
    num_rows: 400000
})
{'prompt': 'You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old. Continue the following story.\n\nEmily held and rocked Kayla as they sobbed because \nSelect from the following.\n +Emily really needed a good friend.\n +Kayla really needed a good friend.', 'response': 'Once upon a time, Emily held and rocked Kayla as they both sobbed because Kayla really needed a good friend. You see, little Kayla was feeling very lonely and sad because she didn\'t have anyone to play with or talk to.\n\nEmily wanted to help Kayla and be her friend, so she did something truly special. Emily decided to throw a magical friendship party for Kayla. Emily knew that this would cheer her up and help her make some new friends!\n\nEmily worked really hard, preparing for the big day. She bought balloons, baked cookies, and even created a treasure h

In [7]:
# from huggingface_hub import notebook_login

# notebook_login()

In [8]:
# [markdown]
# ## Loading the model
# [markdown]

# %%
model_name = "01-ai/Yi-6B"

# Quantization settings: 4-bit NF4 weights with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Keyword arguments for loading the base checkpoint. trust_remote_code=True
# allows the modeling code shipped in the model repository to be executed.
model_load_kwargs = dict(
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Switch off use_cache on the model config before training.
model.config.use_cache = False


# %%
model

Downloading (…)lve/main/config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

Downloading (…)/configuration_yi.py:   0%|          | 0.00/5.47k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/01-ai/Yi-6B:
- configuration_yi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)/main/modeling_yi.py:   0%|          | 0.00/40.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/01-ai/Yi-6B:
- modeling_yi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)fetensors.index.json:   0%|          | 0.00/22.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/2.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

YiForCausalLM(
  (model): YiModel(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x YiDecoderLayer(
        (self_attn): YiAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=512, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): YiRotaryEmbedding()
        )
        (mlp): YiMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLU()
        )
        (ln1): YiRMSNorm()
        (ln2): YiRMSNorm()
      )
    )
    (norm): YiRMSNorm()
  )
  (lm_head): Linear(in_features=409

In [9]:
# [markdown]
# Let's also load the tokenizer below

# %%
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/199 [00:00<?, ?B/s]

Downloading (…)n/tokenization_yi.py:   0%|          | 0.00/8.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/01-ai/Yi-6B:
- tokenization_yi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading tokenizer.model:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

In [None]:
# [markdown]
# Below we will load the configuration file in order to create the LoRA model. According to the QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. Here we target only the attention projection layers (`q_proj`, `k_proj`, `v_proj`, `o_proj`); to cover every linear layer in the block, also add the MLP projections (`gate_proj`, `up_proj`, `down_proj`) to the target modules.

# %%
# LoRA hyper-parameters.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Modules that receive adapters: the four attention projections.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

# [markdown]
# ## Loading the trainer
# [markdown]
# Here we will use the [`SFTTrainer` from the TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer), which provides a wrapper around the transformers `Trainer` to easily fine-tune models on instruction-based datasets using PEFT adapters. Let's first load the training arguments below.

# %%
# --- output / checkpointing ---
output_dir = "./results"
save_strategy = "epoch"

# --- epochs and batching ---
num_train_epochs = 1
auto_find_batch_size = True
gradient_accumulation_steps = 1

# --- optimisation ---
optim = "paged_adamw_32bit"
learning_rate = 3e-4
lr_scheduler_type = "constant_with_warmup"
warmup_ratio = 0.03
bf16 = True

# --- logging ---
logging_strategy = "steps"
logging_steps = 50

# --- evaluation ---
do_eval = True
evaluation_strategy = "steps"
# NOTE(review): a float below 1 is presumably read as a fraction of the total
# training steps -- confirm against the TrainingArguments documentation.
eval_steps = 0.5
prediction_loss_only = True

training_arguments = TrainingArguments(
    # output / checkpointing
    output_dir=output_dir,
    save_strategy=save_strategy,
    # epochs and batching
    num_train_epochs=num_train_epochs,
    auto_find_batch_size=auto_find_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optimisation
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    bf16=bf16,
    # logging
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    # evaluation
    do_eval=do_eval,
    evaluation_strategy=evaluation_strategy,
    eval_steps=eval_steps,
    prediction_loss_only=prediction_loss_only,
)

# [markdown]
# Then finally pass everything to the trainer

# %%
# Maximum sequence length handed to the trainer.
max_seq_length = 1024

# Wire the quantized model, tokenizer, datasets, prompt formatter, LoRA config
# and training arguments into a single trainer object.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    formatting_func=formatting_prompts_func,
    max_seq_length=max_seq_length,
)

# [markdown]
# We will also pre-process the model by upcasting the layer norms to float32 for more stable training

# %%
# Upcast every normalization layer to float32.
# Match on the module's class name as well as on its attribute path: in this
# model the per-layer norms are registered as `ln1` / `ln2` (class `YiRMSNorm`),
# so a check on the path alone would only catch the final `norm` module.
for name, module in trainer.model.named_modules():
    if "norm" in name or "norm" in type(module).__name__.lower():
        # nn.Module.to() converts the module's parameters in place.
        module.to(torch.float32)

# [markdown]
# ## Train the model
# [markdown]
# Now let's train the model! Simply call `trainer.train()`

# %%
trainer.train()

In [11]:
model

YiForCausalLM(
  (model): YiModel(
    (embed_tokens): Embedding(64000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x YiDecoderLayer(
        (self_attn): YiAttention(
          (q_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
          )
          (k_proj): Linear4bit(
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096

In [18]:
trainer.save_model("/content/results/runs/weights/")

### Test model and push to hub

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [4]:
peft_model_id = "/content/results/runs/weights"

# The saved adapter config names the base checkpoint to load underneath it.
config = PeftConfig.from_pretrained(peft_model_id)
base_model_id = config.base_model_name_or_path

tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Base model in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Inference prompt template: a fixed system-style instruction, one space, the
# user's `{query}`, then a trailing newline after which the model continues.
format_template = "You are a helpful assistant. Write a response that appropriately completes the request. {query}\n"

In [6]:

# First, format the prompt
query = "What is a good recipe for vegan banana bread?"
prompt = format_template.format(query=query)

# Inference can be done using model.generate
print("\n\n*** Generate:")

# Sampling settings for this single generation.
generation_kwargs = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.3,
    repetition_penalty=1.2,
    return_dict_in_generate=True,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

encoded_prompt = tokenizer(prompt, return_tensors="pt")
input_ids = encoded_prompt.input_ids.cuda()
with torch.autocast("cuda", dtype=torch.bfloat16):
    output = model.generate(input_ids=input_ids, **generation_kwargs)

# Decode the first (only) returned sequence, dropping special tokens.
generated_text = tokenizer.decode(output["sequences"][0], skip_special_tokens=True)
print(generated_text)




*** Generate:
You are a helpful assistant. Write a response that appropriately completes the request. What is a good recipe for vegan banana bread?
A: A simple and delicious vegan banana bread can be made by combining 3 cups of mashed ripe bananas (about three medium-sized) with ½ cup melted coconut oil, 1 teaspoon baking soda, ½ teaspoon vanilla extract, and 2½ cups all-purpose flour in a large bowl. Mix until well combined, then pour into an ungreased loaf pan lined with parchment paper or aluminum foil. Bake at 350°F for about 40 minutes, checking after half way through to prevent overbaking. Cool completely before slicing and serving! This easy vegan banana bread will satisfy your sweet tooth while being kinder on the planet too. Enjoy it warm from the oven as a snack, or let it cool fully before storing leftovers in an airtight container for up to two weeks. Remember to double check if you're using gluten free ingredients when making this treat. It would also taste great topped 

In [7]:

## Inf runtime test
import tqdm
import time

query = "Write me a long list of things to do in San Francisco."
prompt = format_template.format(query=query)

# Seconds per end-to-end request (tokenisation + generation), one entry per run.
runtimes = []
for _ in tqdm.tqdm(range(25)):
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() follows the system clock and can jump if it is adjusted.
    start = time.perf_counter()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    with torch.autocast("cuda", dtype=torch.bfloat16):
        output = model.generate(
            input_ids=input_ids,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            return_dict_in_generate=True,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.2,
        )

    # CUDA kernels are launched asynchronously; wait for outstanding GPU work
    # so that it is included in the measured interval.
    torch.cuda.synchronize()
    end = time.perf_counter()
    runtimes.append(end - start)

# Plain Python mean: avoids the round-trip through a float32 tensor.
avg_runtime = sum(runtimes) / len(runtimes)
print(f"Runtime avg in seconds: {avg_runtime}")  # time in seconds

100%|██████████| 25/25 [01:25<00:00,  3.42s/it]

Runtime avg in seconds: 3.4173331260681152





In [16]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# push to hub
model_id_load = "dfurman/Yi-6B-instruct-v0.1"

# `token=True` uses the locally stored Hugging Face credential (e.g. from
# `notebook_login()`); it replaces the deprecated `use_auth_token` argument.

# tokenizer
tokenizer.push_to_hub(model_id_load, token=True)
# safetensors
model.push_to_hub(model_id_load, token=True, safe_serialization=True)
# torch tensors
# model.push_to_hub(model_id_load, token=True)



adapter_model.safetensors:   0%|          | 0.00/210M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/dfurman/Yi-6B-instruct-v0.1/commit/3410362d05838ab6b68881c1743f5a60ac783f2c', commit_message='Upload model', commit_description='', oid='3410362d05838ab6b68881c1743f5a60ac783f2c', pr_url=None, pr_revision=None, pr_num=None)