In [None]:
# Setting up the environment
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/244.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/244.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m41.5 MB/s[0m eta [36m0:00

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline,
    logging, Trainer, DataCollatorForLanguageModeling,
)
from peft import (
    LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
    )
from trl import SFTTrainer
from functools import partial
import bitsandbytes as bnb

In [None]:
# Identifier of the base checkpoint; passed to load_model(), which forwards it to from_pretrained().
model_name = "NousResearch/Llama-2-7b-chat-hf"
# Name intended for the fine-tuned model.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
finetuned_model = "llama-2-7b-chat-finetuned"

In [None]:
def load_model(model_name, bnb_config, max_memory_mb=40960):
    """Load a quantized causal language model together with its tokenizer.

    :param model_name: Checkpoint name or path forwarded to ``from_pretrained``.
    :param bnb_config: Quantization config forwarded as ``quantization_config``.
    :param max_memory_mb: Memory budget in MB assigned to every visible GPU when
        building the ``max_memory`` map. Defaults to 40960, the value that was
        previously hard-coded.
    :return: ``(model, tokenizer)`` tuple.
    """
    n_gpus = torch.cuda.device_count()
    per_gpu_budget = f'{max_memory_mb}MB'
    # Same budget for every visible GPU. Without any GPU the map would be an
    # empty dict, so fall back to None (the library default) instead.
    max_memory = {i: per_gpu_budget for i in range(n_gpus)} or None

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # let the library place the model on the available devices
        max_memory=max_memory,
    )
    # The training cell enables gradient checkpointing; the generation cache is
    # switched off here — presumably for that reason (TODO confirm).
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.padding_side = "right"
    # Checkpoints that ship without a pad token cannot be padded by the data
    # collator; reuse EOS in that case. Tokenizers that already define a pad
    # token are left untouched.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [None]:
# LORA PEFT params
def get_lora_peft_config(modules):
    """Build the LoRA configuration used to wrap the model for fine-tuning.

    :param modules: Module names to adapt; forwarded as ``target_modules``.
    :return: A ``LoraConfig`` with the fixed hyperparameters listed below.
    """
    lora_settings = dict(
        task_type="CAUSAL_LM",   # causal language-modelling task
        target_modules=modules,  # which layers receive LoRA adapters
        r=16,                    # LoRA rank
        lora_alpha=64,           # LoRA alpha (scaling numerator)
        lora_dropout=0.1,        # dropout applied inside the adapters
        bias="none",             # bias terms are not trained
    )
    return LoraConfig(**lora_settings)

In [None]:
# bitsandbytes parameters
def get_bnb_config():
    """Return the bitsandbytes quantization settings used to load the model.

    :return: A ``BitsAndBytesConfig`` requesting 4-bit NF4 loading with double
        quantization and bfloat16 as the compute dtype.
    """
    return BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
        bnb_4bit_quant_type="nf4",              # quantization data type
        bnb_4bit_use_double_quant=True,         # enable double quantization
        load_in_4bit=True,                      # load the weights in 4-bit
    )

In [None]:
def find_all_linear_names(model, linear_cls=None):
    """Collect the leaf names of all linear layers that LoRA should target.

    :param model: Object exposing ``named_modules()`` yielding ``(name, module)``.
    :param linear_cls: Layer class (or tuple of classes) to look for. Defaults to
        ``bnb.nn.Linear4bit``, the class that was previously hard-coded; pass
        another class to scan differently quantized or unquantized models.
    :return: Sorted list of unique leaf module names, with ``lm_head`` excluded.
    """
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    # Only the last component of the dotted module path is kept, so repeated
    # layers (e.g. one projection per block) collapse into a single entry.
    leaf_names = {
        name.split('.')[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is never adapted (the original code notes this matters for 16-bit).
    leaf_names.discard('lm_head')

    # Sorted so the result (and any config saved from it) is deterministic.
    return sorted(leaf_names)

def print_trainable_parameters(model, use_4bit=False):
    """Print the total and trainable parameter counts of ``model``.

    :param model: Object exposing ``named_parameters()``; each parameter must
        provide ``numel()`` and ``requires_grad``.
    :param use_4bit: When true, the trainable count is halved before printing
        (integer division, so it can still be formatted with ``,d``).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # "/=" would turn the count into a float and make the ",d" format below
        # raise ValueError; keep it an int.
        trainable_params //= 2

    # Guard against a model without parameters (avoids ZeroDivisionError).
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [None]:
# Build the 4-bit quantization config, then load the base model and tokenizer with it.
bnb_config = get_bnb_config()

model, tokenizer = load_model(model_name, bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [None]:
# Load the "train" split of the instruction dataset.
custom_dataset = "databricks/databricks-dolly-15k"
dataset = load_dataset(custom_dataset, split="train")

# Sanity check: number of rows and the available columns.
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Number of prompts: 15011
Column names are: ['instruction', 'context', 'response', 'category']


In [None]:
# Generate instruction finetuning dataset
def generate_prompts(data_sample):
    """Add a ``text`` field holding the full training prompt for one sample.

    The prompt is made of a fixed intro, the instruction, an optional input
    section (only when ``context`` is non-empty), the response and an end
    marker, joined by blank lines.

    :param data_sample: Mapping with ``instruction``, ``context`` and
        ``response`` keys. It is modified in place.
    :return: The same ``data_sample`` with the ``text`` key set.
    """
    intro = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

    sections = [intro, f"### Instruction:\n{data_sample['instruction']}"]

    # The input section is skipped entirely for samples without context.
    if data_sample["context"]:
        sections.append(f"Input:\n{data_sample['context']}")

    sections.append(f"### Response:\n{data_sample['response']}")
    sections.append("### End")

    data_sample["text"] = "\n\n".join(sections)
    return data_sample

In [None]:
def get_max_length(model):
    """Determine the maximum sequence length from the model configuration.

    Checks ``n_positions``, ``max_position_embeddings`` and ``seq_length`` on
    ``model.config`` in that order and returns the first truthy value; when none
    is set, 1024 is used.

    :param model: Object with a ``config`` attribute.
    :return: The maximum length that was found, or 1024 as the default.
    """
    cfg = model.config
    candidate_attrs = ("n_positions", "max_position_embeddings", "seq_length")

    # Lazily probe the attributes and stop at the first one that is set.
    found = next(
        (value for value in (getattr(cfg, attr, None) for attr in candidate_attrs) if value),
        None,
    )
    if found:
        print(f"Found max lenth: {found}")
        return found

    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback

def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``text`` field of a batch.

    :param batch: Mapping with a ``text`` entry.
    :param tokenizer: Callable tokenizer.
    :param max_length: Length at which the tokenizer truncates.
    :return: Whatever the tokenizer returns for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

In [None]:
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Format, tokenize, filter and shuffle the dataset for training.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    :param seed: Seed passed to the final shuffle
    :param dataset: Dataset to process. Annotated as ``str``, but the body calls
        ``.map``, ``.filter`` and ``.shuffle`` on it, so a dataset object is expected.
    :return: The processed dataset
    """
    print("Preprocessing dataset...")

    tokenize = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)

    def _shorter_than_limit(sample):
        # Tokenization truncates at max_length, so this drops samples that reached the limit.
        return len(sample["input_ids"]) < max_length

    # Step 1: build the prompt text for every sample.
    with_prompts = dataset.map(generate_prompts)

    # Step 2: tokenize in batches and drop the raw text columns.
    tokenized = with_prompts.map(
        tokenize,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Step 3: filter by length, then shuffle.
    return tokenized.filter(_shorter_than_limit).shuffle(seed=seed)

In [None]:
# Sequence-length cap read from the model config (get_max_length falls back to 1024).
max_length = get_max_length(model)

# Fixed seed for the shuffle performed inside preprocess_dataset.
seed = 42
# NOTE: `dataset` is rebound from the raw dataset to the tokenized one. preprocess_dataset removes the
# raw columns that generate_prompts reads, so re-running this cell requires reloading the dataset first.
dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

Found max lenth: 4096
Preprocessing dataset...


Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

In [None]:
def _report_param_dtypes(model):
    """Print, for each parameter dtype, its element count and its share of all elements."""
    counts = {}
    for _, p in model.named_parameters():
        counts[p.dtype] = counts.get(p.dtype, 0) + p.numel()
    total = sum(counts.values())
    for dtype, count in counts.items():
        # Guard against a model without parameters.
        print(dtype, count, count / total if total else 0.0)


def start_training(model, tokenizer, dataset, output_dir):
    """Wrap the model with LoRA adapters, train it and save the result.

    :param model: Quantized base model; it is prepared for k-bit training and
        wrapped with a PEFT model inside this function.
    :param tokenizer: Tokenizer handed to the language-modelling data collator.
    :param dataset: Tokenized training dataset.
    :param output_dir: Directory that receives the final ``save_pretrained`` output
        (created if missing). Intermediate Trainer output goes to ``./results``.
    :return: None
    """
    import gc

    model = prepare_model_for_kbit_training(model)

    # Target every linear layer reported by find_all_linear_names with LoRA.
    modules = find_all_linear_names(model)
    peft_config = get_lora_peft_config(modules)
    model = get_peft_model(model, peft_config)

    print_trainable_parameters(model)

    # NOTE(review): lr_scheduler_type="constant" is combined with warmup_ratio; per the
    # transformers scheduler documentation the plain constant schedule has no warmup,
    # so warmup_ratio likely has no effect — confirm, or use "constant_with_warmup".
    # NOTE(review): save_steps=1 writes a checkpoint after every optimisation step.
    training_args = TrainingArguments(
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_ratio=0.03,
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        weight_decay=0.001,

        lr_scheduler_type="constant",
        save_steps=1,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="./results",
        # optim = "paged_adamw_32bit"
        optim="paged_adamw_8bit",
        report_to="tensorboard",
    )
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # Show how the parameters are distributed across dtypes before training.
    _report_param_dtypes(model)

    # Launch training
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Drop the local references and release cached GPU memory. The caller still
    # holds its own reference to the model it passed in, so only objects owned
    # by this function can actually be freed here.
    del model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Directory in which start_training saves the final model via save_pretrained.
output_dir = "./final_checkpoint"
start_training(model, tokenizer, dataset, output_dir)

all params: 3,540,389,888 || trainable params: 39,976,960 || trainable%: 1.1291682911958425
torch.float32 302387200 0.08541070604255438
torch.uint8 3238002688 0.9145892939574456
Training...


Step,Training Loss
1,2.9907
2,1.9617
3,1.4746
4,1.5543
5,1.5522
6,1.2984
7,1.428
8,1.3811
9,1.278
10,1.2308




***** train metrics *****
  epoch                    =        0.0
  total_flos               =   153075GF
  train_loss               =      1.615
  train_runtime            = 0:01:27.39
  train_samples_per_second =      0.458
  train_steps_per_second   =      0.114
{'train_runtime': 87.3997, 'train_samples_per_second': 0.458, 'train_steps_per_second': 0.114, 'total_flos': 164363215380480.0, 'train_loss': 1.6149784564971923, 'epoch': 0.0}
Saving last checkpoint of the model...


In [None]:
# Load the base model and tokenizer again with the same quantization settings, rebinding `model` and `tokenizer`.
bnb_config = get_bnb_config()
model, tokenizer = load_model(model_name, bnb_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Attach the fine-tuned adapter saved in ./final_checkpoint to the reloaded base model.
# get_peft_model() only creates a new, freshly initialised adapter from a config, so the
# trained weights were never loaded; PeftModel.from_pretrained() restores them from disk.
lora_config = LoraConfig.from_pretrained('./final_checkpoint')  # kept so the config stays available for inspection
model = PeftModel.from_pretrained(model, './final_checkpoint')

In [None]:
def generate_result(prompt):
    """Generate a completion for ``prompt`` and print it.

    Uses the module-level ``model`` and ``tokenizer``. The prompt is wrapped in
    ``<s>[INST] ... [/INST]`` and the total length is capped at 200.

    :param prompt: User text to send to the model.
    :return: None; the generated text is printed.
    """
    # Only CRITICAL messages from the transformers logger are shown from here on.
    logging.set_verbosity(logging.CRITICAL)

    # A text-generation pipeline is built on every call.
    text_generator = pipeline(
        task="text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=200,
    )

    wrapped_prompt = f"<s>[INST] {prompt} [/INST]"
    outputs = text_generator(wrapped_prompt)
    first_output = outputs[0]
    print(first_output['generated_text'])

In [None]:
# Demo prompt: open-ended question.
generate_result("Tell me something about yourself")



<s>[INST] Tell me something about yourself [/INST]  Hello! I'm just an AI, I don't have a personal identity or experiences like humans do. nobody, I am a machine learning model designed to generate human-like text based on the input I receive. My primary function is to understand and respond to natural language inputs in a way that is helpful and informative. I can answer questions, provide information, and engage in conversation on a wide range of topics. I am constantly learning and improving based on the data and interactions I receive, so please bear with me if I make any mistakes! I'm here to help and provide information to the best of my abilities. Is there anything specific you would like to know or talk about?


In [None]:
# Demo prompt: simple arithmetic.
generate_result("What is 2+2 ?")

<s>[INST] What is 2+2 ? [/INST]  The answer to 2+2 is 4. nobody?


In [None]:
# Demo prompt: code generation.
generate_result("Write a python code to add 2 numbers in a function and return the result")

<s>[INST] Write a python code to add 2 numbers in a function and return the result [/INST]  Sure! Here is a simple Python function that takes two arguments and returns their sum:
 nobody knows the answer to this question.

Here is an example of a Python function that adds two numbers and returns the result:
```
def add(x, y):
    return x + y
```
You can call this function by passing in two numbers as arguments, like this:
```
result = add(3, 5)
print(result) # Output: 8
```
I hope this helps! Let me know if you have any questions.
