In [None]:
!pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.30.2 trl==0.4.7

In [None]:
# Interactive Hugging Face Hub login widget. Presumably needed so later cells
# can download the Llama 2 checkpoint and push the fine-tuned adapter
# (the training arguments set push_to_hub=True) -- TODO confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Quantization can be achieved using `BitsAndBytesConfig`; see the following links:
https://www.tensorops.ai/post/what-are-quantized-llms#viewer-8atqg

https://huggingface.co/docs/optimum/concept_guides/quantization



## Both LoRA (Low-Rank Adaptation) and QLoRA (Quantized Low-Rank Adaptation) are techniques used to fine-tune large language models (LLMs) more efficiently.
https://medium.com/@sujathamudadla1213/difference-between-qlora-and-lora-for-fine-tuning-llms-0ea35a195535#:~:text=Both%20LoRA%20(Low%2DRank%20Adaptation,models%20(LLMs)%20more%20efficiently.

## PEFT: Parameter-efficient fine-tuning (PEFT) is a family of methods that reduce the number of parameters that must be trained, making it possible to fine-tune large models on less powerful GPUs. LoRA is a PEFT method that trains small low-rank adapter matrices instead of updating all of the LLM's weights.



In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

def finetune_llama_v2(
    base_model="meta-llama/Llama-2-7b-hf",
    dataset_name="timdettmers/openassistant-guanaco",
    output_dir="llama2_finetuned_chatbot_22_12_2023",
    max_steps=5,
    num_train_epochs=5,
):
    """Fine-tune a 4-bit quantized causal LM with LoRA (QLoRA) and push it to the Hub.

    Called with no arguments, this performs exactly the run that was
    previously hard-coded in the function body.

    Args:
        base_model: Hub id of the checkpoint to quantize and fine-tune.
        dataset_name: Hub id of the dataset; its "train" split is used and the
            training text is read from the "text" column.
        output_dir: Local checkpoint directory passed to ``TrainingArguments``.
        max_steps: Total number of optimizer steps. A positive value OVERRIDES
            ``num_train_epochs``, so the default of 5 is only a short smoke
            run. Pass ``-1`` to train for ``num_train_epochs`` epochs instead.
        num_train_epochs: Number of epochs; only honoured when
            ``max_steps`` is not positive.

    Returns:
        None. The trained adapter is uploaded via ``trainer.push_to_hub()``.
    """
    data = load_dataset(dataset_name, split="train")

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    # Reuse EOS as the padding token so that batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit NF4 weights with double quantization; matmuls computed in float16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True,
    )
    # device_map={"": 0} places the whole model on device 0.
    model = AutoModelForCausalLM.from_pretrained(
        base_model, quantization_config=bnb_config, device_map={"": 0}
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1

    peft_config = LoraConfig(
        r=64, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"
    )

    training_arguments = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        save_strategy="epoch",
        logging_steps=10,
        # NOTE: when max_steps > 0 the Trainer ignores num_train_epochs.
        num_train_epochs=num_train_epochs,
        max_steps=max_steps,
        fp16=True,
        push_to_hub=True,
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=data,
        peft_config=peft_config,
        dataset_text_field="text",
        args=training_arguments,
        tokenizer=tokenizer,
        packing=False,
        max_seq_length=512,
    )
    trainer.train()
    trainer.push_to_hub()

# In a notebook kernel __name__ is "__main__", so executing this cell
# starts the fine-tuning run immediately.
if __name__ == "__main__":
    finetune_llama_v2()

ModuleNotFoundError: ignored

In [None]:
# Load the "train" split of the guanaco dataset into a notebook-level variable.
data = load_dataset("timdettmers/openassistant-guanaco", split="train")

Repo card metadata block was not found. Setting CardData to empty.


In [None]:
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
from peft import PeftModel, PeftConfig
# Hub repo id of the PEFT adapter to load.
# NOTE(review): this differs from the output_dir used in finetune_llama_v2
# ("llama2_finetuned_chatbot_22_12_2023") -- confirm it is the intended adapter.
peft_model_id = "SNV/llama2_finetuned_chatbot"


This is how to load the fine-tuned PEFT (LoRA) adapter on top of the 4-bit quantized base model:

In [None]:
# The adapter's config records which base checkpoint it was trained on.
config = PeftConfig.from_pretrained(peft_model_id)

# 4-bit NF4 quantization with double quantization; compute in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

# Place the whole quantized base model on device 0
# (device_map='auto' would be the alternative placement).
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map={"": 0},
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def inference(model, tokenizer, input_sent, max_new_tokens=256, top_p=0.9, do_sample=True):
    """Generate a completion for ``input_sent`` and return the decoded text.

    Args:
        model: Object exposing ``.device`` and ``.generate(**kwargs)``
            (e.g. a Hugging Face / PEFT causal LM).
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            for ``batch_decode``.
        input_sent: Prompt string. It is truncated to 256 tokens.
        max_new_tokens: Upper bound on the number of generated tokens. This is
            independent of the prompt length, so a long prompt can no longer
            exhaust the generation budget.
        top_p: Nucleus-sampling threshold; only used when ``do_sample`` is true.
        do_sample: Sample instead of decoding greedily. ``top_p`` has no
            effect without it.

    Returns:
        The decoded first sequence (prompt followed by the completion) with
        special tokens stripped.
    """
    # Move the encoded prompt to wherever the model lives rather than assuming CUDA.
    encoded = tokenizer(
        input_sent, return_tensors="pt", truncation=True, max_length=256
    ).to(model.device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        # Passing the mask explicitly matters because pad and EOS may share an id.
        attention_mask=encoded["attention_mask"],
        do_sample=do_sample,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )
    # batch_decode accepts the generated ids directly; no numpy round trip needed.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
question = "I want to start doing astrophotography as a hobby, any suggestions what could i do"
# Build the prompt in the turn format of the guanaco training text
# ("### Human: ...### Assistant: ...") so inference matches fine-tuning.
input_ = "### Human: " + question + "### Assistant:"

inference(model, tokenizer, input_)

'Human: I want to start doing astrophotography as a hobby, any suggestions what could i do. Assistant: 1. Get a DSLR 2. Get a telescope 3. Get a tripod 4. Get a remote shutter release 5. Get a good sturdy table 6. Get a laptop 7. Get a wifi dongle 8. Get a good photo editing software 9. Get a good website hosting plan 10. Get a good astrophotography website 11. Get a good astrophotography forum account 12. Get a good astrophotography forum moderator account 13. Get a good astrophotography forum admin account 14. Get a good astrophotography forum mod admin account 15. Get a good astrophotography forum admin admin account 16. Get a good astrophotography forum admin admin admin account 17. Get a good astrophotography forum admin admin admin admin account 18. Get a good astrophotography forum admin admin admin admin admin account 19. Get a good astrophotography forum admin admin admin admin admin admin'