# Using PEFT & bitsandbytes to fine-tune a LoRA checkpoint




In [None]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q peft transformers

In [None]:
!pip install torch --index-url https://download.pytorch.org/whl/cu118 --user

Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
import torch
# Smoke test: moving a tensor to the GPU fails right here if CUDA is not usable,
# instead of later in the middle of model loading.
torch.zeros(1).cuda()

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login widget; needed before the Hub upload further down,
# which is called with use_auth_token=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Show the GPU model, driver / CUDA version and current memory usage.
!nvidia-smi

Mon Nov 27 11:32:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 527.27       Driver Version: 527.27       CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro GV100       WDDM  | 00000000:73:00.0  On |                  Off |
| 29%   40C    P2    33W / 250W |   2425MiB / 32768MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Set up the model

In [None]:
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [None]:
!pip install tokenizers>=0.13.3

In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Base checkpoint shared by the model and its tokenizer
# (EleutherAI/pythia-70m is a much smaller stand-in for quick experiments).
base_checkpoint = "tiiuae/falcon-7b"

# Half-precision weights, with layer placement on the available devices left to `device_map='auto'`.
load_options = dict(torch_dtype=torch.float16, device_map='auto')
model = AutoModelForCausalLM.from_pretrained(base_checkpoint, **load_options)

tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Freezing the original weights


In [None]:
# Freeze every pre-trained weight; only the adapters attached later are trained.
for weight in model.parameters():
  weight.requires_grad_(False)
  # 1-D parameters (e.g. layernorm) are upcast to fp32 for numerical stability.
  if weight.ndim == 1:
    weight.data = weight.data.float()

# Trade compute for memory: recompute activations in the backward pass instead of storing them.
model.gradient_checkpointing_enable()
# Make the embedding outputs require grad so gradients can still flow with the base weights frozen.
model.enable_input_require_grads()

# class CastOutputToFloat(nn.Sequential):
#   def forward(self, x): return super().forward(x).to(torch.float32)
# model.lm_head = CastOutputToFloat(model.lm_head)

### Setting up the LoRA adapters

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs whose parameters provide ``numel()``
            and ``requires_grad``.

    Prints one line with the trainable count, the total count and the
    trainable percentage. A model without parameters reports ``0.0`` percent
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16, # LoRA rank: inner dimension of the low-rank update matrices (not a number of attention heads)
    lora_alpha=32, # scaling factor; the adapter update is scaled by lora_alpha / r
    # target_modules=["q_proj", "v_proj"], # optional: names of the modules to adapt, if you know them; when omitted PEFT falls back to its per-architecture defaults
    lora_dropout=0.05, # dropout applied on the adapter path only
    bias="none", # bias terms are not trained
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

# Wrap the frozen base model with the trainable adapters and report how small the trainable share is.
model = get_peft_model(model, config)
print_trainable_parameters(model)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: C:\Users\ncheruk2\AppData\Local\anaconda3\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary C:\Users\ncheruk2\AppData\Local\anaconda3\Lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...
trainable params: 4718592 || all params: 6926439296 || trainable%: 0.06812435363037071


## Data

In [None]:
import transformers
from datasets import load_dataset
# NOTE(review): this public quotes dataset looks like a leftover from the template this
# notebook started from — the Trainer cell trains on `huggingface_dataset`, not on `data`.
data = load_dataset("Abirate/english_quotes")


In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

In [None]:
import pandas as pd
# Course README / summary pairs. Per the preview, `Instruction_prompt` holds the prompt text,
# `Readme_File` the raw README and `summary` the reference summary.
# NOTE(review): the two `Unnamed: 0*` columns look like index columns written by earlier to_csv calls.
df = pd.read_csv("Readme2Vid_Dataset_instr.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,filename,summary,Readme_File,Instruction_prompt
0,0,COMP_SCI 110_ Intro to Computer Programming,Description: This introductory programming...,﻿\nDescription\nIntroduction to programming pr...,### Instruction:\n\nGive a shortened concise s...
1,1,CMU 10601,Description: 10-601: Introduction to Machi...,10-601: Introduction to Machine Learning\n| Ca...,### Instruction:\n\nGive a shortened concise s...
2,2,CMU 10605,Description: 10-605: Machine Learning with...,10-605: Machine Learning with Large Datasets\n...,### Instruction:\n\nGive a shortened concise s...
3,3,CMU 10701,Description: 10-701: Introduction to Machi...,10-701: Introduction to Machine Learning (PhD)...,### Instruction:\n\nGive a shortened concise s...
4,4,CMU 11411,Description: 11-411: Natural Language Proc...,11-411: Natural Language Processing\n| Categor...,### Instruction:\n\nGive a shortened concise s...


In [None]:
from datasets import Dataset

MAX_LENGTH = 512            # total tokens per example: prompt + summary + EOS, right-padded
MAX_RESPONSE_TOKENS = 256   # share of MAX_LENGTH reserved for the summary (EOS included)
IGNORE_INDEX = -100         # label id that the cross-entropy loss skips

# Reuse EOS as the padding token (the same choice the original cell made).
tokenizer.pad_token = tokenizer.eos_token


def build_causal_lm_example(prompt, response, max_length=MAX_LENGTH,
                            max_response_tokens=MAX_RESPONSE_TOKENS):
    """Build one fixed-length causal-LM training example.

    A causal LM is trained on a single sequence in which ``labels[i]`` refers to
    the same position as ``input_ids[i]``. Tokenizing the prompt and the summary
    as two separate padded sequences (the previous approach) breaks that
    alignment, so the prompt and the summary are concatenated here instead.

    Args:
        prompt: instruction text (the model input).
        response: reference summary the model should learn to produce.
        max_length: length of every returned list.
        max_response_tokens: upper bound on summary tokens, EOS included.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a list
        of ``max_length`` ints. Prompt and padding positions carry
        ``IGNORE_INDEX`` in ``labels``.
    """
    response_budget = min(max_response_tokens, max_length)
    response_ids = tokenizer(response, add_special_tokens=False)["input_ids"]
    # Leave one slot for EOS so the model learns where the summary ends.
    response_ids = response_ids[: response_budget - 1] + [tokenizer.eos_token_id]

    # The prompt gets whatever room is left. Like the original `truncation=True`,
    # this cuts the tail of over-long prompts.
    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    prompt_ids = prompt_ids[: max_length - len(response_ids)]

    input_ids = prompt_ids + response_ids
    labels = [IGNORE_INDEX] * len(prompt_ids) + response_ids
    n_pad = max_length - len(input_ids)
    return {
        "input_ids": input_ids + [tokenizer.pad_token_id] * n_pad,
        "attention_mask": [1] * len(input_ids) + [0] * n_pad,
        "labels": labels + [IGNORE_INDEX] * n_pad,
    }


examples = [
    build_causal_lm_example(prompt_text, summary_text)
    for prompt_text, summary_text in zip(df["Instruction_prompt"].tolist(), df["summary"].tolist())
]

# NOTE: a collator that rebuilds `labels` from `input_ids`
# (DataCollatorForLanguageModeling with mlm=False) discards the prompt masking and
# trains on the whole prompt+summary sequence; a collator that keeps the `labels`
# column (e.g. transformers.default_data_collator) trains on the summary only.
custom_data = {
    "input_ids": [example["input_ids"] for example in examples],
    "attention_mask": [example["attention_mask"] for example in examples],
    "labels": [example["labels"] for example in examples],
}

# Create a Hugging Face Dataset
huggingface_dataset = Dataset.from_dict(custom_data)

# Optionally, you can specify the format and columns
# huggingface_dataset = Dataset.from_dict(custom_data, format="pandas", columns=["input_column", "output_column"])

# Save the dataset to disk
# huggingface_dataset.save_to_disk("./finetune_dataset")

# # Load the dataset from disk (optional, just to verify)
# loaded_dataset = Dataset.load_from_disk("./finetune_dataset")

# # Print the loaded dataset
# print(loaded_dataset)


In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer
import pandas as pd
class MyCSVDataset(Dataset):
    """Map-style dataset of (prompt, summary) pairs read from a CSV file.

    Each item is one fixed-length causal-LM example: the prompt tokens, followed
    by the target tokens and an EOS token, right-padded to ``max_length``.
    ``labels`` is aligned position-by-position with ``input_ids`` and holds
    ``IGNORE_INDEX`` on prompt and padding positions, so only the target
    contributes to the loss.

    Compared with the previous version:
      * prompt and target were tokenized as two separate padded sequences, which
        a causal LM cannot consume (labels must line up with input_ids);
      * the target was read from ``Readme_File``; it now defaults to ``summary``,
        matching the other dataset cell in this notebook;
      * padding no longer depends on the tokenizer having a pad token.

    Args:
        file_path: CSV file to read.
        tokenizer: callable returning ``{"input_ids": [...]}`` for a string; must
            expose ``eos_token_id`` and ``pad_token_id``.
        max_length: length of every returned tensor.
        prompt_column: CSV column holding the prompt text.
        response_column: CSV column holding the target text.
        max_response_tokens: upper bound on target tokens, EOS included.
    """

    IGNORE_INDEX = -100  # label id that the cross-entropy loss skips

    def __init__(self, file_path, tokenizer, max_length=512,
                 prompt_column="Instruction_prompt", response_column="summary",
                 max_response_tokens=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.prompt_column = prompt_column
        self.response_column = response_column
        self.max_response_tokens = max_response_tokens
        self.data = self.load_data(file_path)

    def load_data(self, file_path):
        """Read the CSV at ``file_path`` into a DataFrame."""
        df = pd.read_csv(file_path)
        return df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as 1-D long tensors of ``max_length``."""
        row = self.data.iloc[idx]
        prompt_text = str(row[self.prompt_column])
        response_text = str(row[self.response_column])

        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            # No pad token configured: pad with EOS; those positions are masked anyway.
            pad_id = eos_id

        response_budget = min(self.max_response_tokens, self.max_length)
        response_ids = list(self.tokenizer(response_text, add_special_tokens=False)["input_ids"])
        # Leave one slot for EOS so the model learns where the target ends.
        response_ids = response_ids[: response_budget - 1] + [eos_id]

        # The prompt gets the remaining room; over-long prompts lose their tail.
        prompt_ids = list(self.tokenizer(prompt_text, add_special_tokens=False)["input_ids"])
        prompt_ids = prompt_ids[: self.max_length - len(response_ids)]

        input_ids = prompt_ids + response_ids
        labels = [self.IGNORE_INDEX] * len(prompt_ids) + response_ids
        attention_mask = [1] * len(input_ids)

        n_pad = self.max_length - len(input_ids)
        input_ids = input_ids + [pad_id] * n_pad
        attention_mask = attention_mask + [0] * n_pad
        labels = labels + [self.IGNORE_INDEX] * n_pad

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

# Example usage
# tokenizer = AutoTokenizer.from_pretrained("TheBloke/Llama-2-13B-GPTQ")
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
# Reloading the tokenizer discards the pad token assigned in an earlier cell; without
# one, any `padding=...` call raises "Asking to pad but the tokenizer does not have a padding token".
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

dataset = MyCSVDataset(file_path="Readme2Vid_Dataset_instr.csv", tokenizer=tokenizer)
train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)


### Training

In [None]:
!pip install transformers



In [None]:
import transformers

# Optimisation schedule: 4 samples per device x 4 accumulation steps = 16 samples
# per optimizer step, for a fixed 200 steps, the first 100 of which are warm-up.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    max_steps=200,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=5,
    output_dir='outputs'
)

# Causal-LM collator (mlm=False): it derives each batch's `labels` from `input_ids`.
lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    train_dataset=huggingface_dataset,
    args=training_args,
    data_collator=lm_collator
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mncheruk2[0m ([33mgeniai[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call `model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations.


Step,Training Loss
5,2.6153
10,2.617
15,2.6198
20,2.6059
25,2.5707
30,2.5252
35,2.4561
40,2.3625
45,2.2138
50,2.0185


TrainOutput(global_step=200, training_loss=1.2749407017230987, metrics={'train_runtime': 1276.0016, 'train_samples_per_second': 2.508, 'train_steps_per_second': 0.157, 'total_flos': 6.194620569550848e+16, 'train_loss': 1.2749407017230987, 'epoch': 53.33})

In [None]:
# Write the trainer's model to a local directory; the adapter-loading cell reads it back by this name.
# NOTE(review): for a PEFT-wrapped model this is presumably only the adapter weights — confirm.
trainer.save_model("./falcon_1ft_20231127_134856-z4pob5q6")

In [None]:
!pip install bitsandbytes-windows

Collecting bitsandbytes-windows
  Downloading bitsandbytes_windows-0.37.5-py3-none-any.whl (2.0 MB)
     ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
     ---------------------------------------- 0.0/2.0 MB ? eta -:--:--
      --------------------------------------- 0.0/2.0 MB 435.7 kB/s eta 0:00:05
     - -------------------------------------- 0.1/2.0 MB 544.7 kB/s eta 0:00:04
     ----- ---------------------------------- 0.3/2.0 MB 2.0 MB/s eta 0:00:01
     ------------------------- -------------- 1.3/2.0 MB 6.7 MB/s eta 0:00:01
     ---------------------------------------- 2.0/2.0 MB 9.7 MB/s eta 0:00:00
Installing collected packages: bitsandbytes-windows
Successfully installed bitsandbytes-windows-0.37.5


In [None]:
import torch
from tqdm import tqdm
from transformers import get_scheduler
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# NOTE: this rebinds `model` to a fresh, fully trainable base checkpoint; any
# LoRA-wrapped model built earlier in the notebook is no longer reachable via `model`.
# `device_map="auto"` already places the weights, so no extra `.to(device)` is applied
# (moving a model that accelerate has dispatched is unsupported).
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",
#     load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16
)

# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay
# are set to the old class's defaults so the optimisation itself is unchanged.
# NOTE(review): stepping fp16 weights directly is numerically fragile — consider
# adapters or mixed precision if the loss turns to NaN.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Reuse EOS for padding. Adding a brand-new '[PAD]' token extends the vocabulary and
# needs model.resize_token_embeddings(len(tokenizer)); without it the new id is
# outside the embedding matrix.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
num_epochs = 1
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
losses=[]
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Store a plain float: appending the tensor would keep it alive for the whole run.
        losses.append(loss.item())
        progress_bar.set_postfix(loss=losses[-1])
        progress_bar.update(1)
        del loss
        del outputs

## Share adapters on the 🤗 Hub

In [None]:
# NOTE(review): the repo id names a BLOOM tagger under the `samwit` namespace, while this
# notebook trains tiiuae/falcon-7b — confirm the intended target repository before running.
# NOTE(review): `model` is whatever the most recently executed cell bound it to; the manual
# training-loop cell rebinds it to the full base model rather than the LoRA-wrapped one.
model.push_to_hub("samwit/bloom-7b1-lora-tagger",
                  use_auth_token=True,
                  commit_message="basic training",
                  private=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/31.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/samwit/bloom-7b1-lora-tagger/commit/62cfae6c87a7d657b2bd3e6e2abac2d5a7d07caf', commit_message='basic training', commit_description='', oid='62cfae6c87a7d657b2bd3e6e2abac2d5a7d07caf', pr_url=None, pr_revision=None, pr_num=None)

## Load adapters from the Hub

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# Adapter directory (same name the save cell wrote to); its config records the base checkpoint.
peft_model_id = "falcon_1ft_20231127_134856-z4pob5q6"
config = PeftConfig.from_pretrained(peft_model_id)
base_name = config.base_model_name_or_path

# Tokenizer and half-precision base model both come from the recorded base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_name)
model = AutoModelForCausalLM.from_pretrained(
    base_name,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map='auto',
)

# Attach the saved LoRA adapter weights on top of the base model.
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Inference

In [None]:
# The tokenizer returns CPU tensors; move them to the model's device so generate()
# does not mix CPU inputs with GPU weights.
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt').to(model.device)

# Autocast keeps the fp16 base weights and the adapter weights compatible during generation.
with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))



 “Training models with PEFT and LoRa is cool” ->:  ['training', 'teaching']

A:

I think the best way to describe it is to say that it is a combination of the two. The idea is that you can train a model on a dataset, and then you can use the


In [None]:
# `text` was never defined anywhere in the notebook, so this cell raised NameError on a
# fresh kernel. Use the prompt of the CMU 10-601 row (index 1 in the `df` preview).
# NOTE(review): swap in whichever prompt you actually want to test.
text = df["Instruction_prompt"].iloc[1]

# Send the inputs to wherever the model lives instead of hard-coding 'cuda'.
model_inputs = tokenizer(text, return_tensors='pt').to(model.device)
output = model.generate(**model_inputs, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=1024)
print(tokenizer.decode(output[0]))

In [None]:
"""
### Instruction:

Give a shortened concise summary of the course based readme file text into the format given in ``` seperators:
```
Description: a brief description of the course
course content: syllabus of the course and a brief of topics that will be covered
grading: info regarding the grading, homeworks and grades, exams, midterms etc
prerequisites: prerequisites of the course if any
office hours: time and days at which students can meet the professor
location/time: location and time of the course
class structure: include if there are going to be seminars, lectures, homeworks, assignments etc
Teaching assistant info/additional info: any info related to the teaching assistant's office hours,  and other info related to teaching assistant
communication: Any communications, ways to reach the professor or any links or grps they can join
additional info: any additional info relevant to the course and course success criteria
```
if you do not find the content for any of these fields, strictly skip that field rather than writing no info against them. -> "
### Input:
10-601: Introduction to Machine Learning
| Category | Difficulty |
|:-:       | :-:        |
| HW       | 4     |
| Exams    | 5|
Intro to ML is one of the most popular CS electives at CMU, as ML has been a very hot topic in the last few years.
The class will cover a good variety of ML concepts, but does not go too much into detail since it is
just an introductory class.
Topics covered

Decision Trees
k-Nearest Neighbors
Perceptron
Linear Regression
Logistic Regression
Neural Networks
PAC Learning
MLE/MAP
Naive Bayes
Markov Models
Bayesian Networks
Reinforcement Learning

Class structure
The class is very well organized, and follows the approximate structure:

Lecture about [ML method]
Hand calculations to derive an important theorem about ML method
Algorithm

Applications


Written and coding homework about topic

Perform hand calculations on your own
Implement algorithm in code

Homeworks
Homeworks are straightforward implementations of algorithms that are covered in class. Lecture slides and the lectures themselves go into algorithms in a great depth, so the homeworks should not be too bad. In addition, since the algorithms are often well-known in the ML field, you can
find lots of support for them online as well. The handouts provided for the homework is detailed, so make sure to read that as well. Definitely make sure to write your own tests for your program if it is possible, since that can help you uncover bugs you can't find on Autolab.
As a trick, sometimes your program might take a while to run on your computer. If you run your program on the Andrew machines at CMU instead, you might find a considerable boost in your program speed.
There are often also short-answers for homeworks. They give you LaTeX
templates to fill in, but you don't need to know that much LaTeX to work
with it.
How to study for exams
One of the trickiest parts of the class is that there is a wide
breadth of material so it is hard to learn everything in depth.
For exams, it is important to know all your equations and algorithms.
Fortunately, you have a cheatsheet so you can put those on there!
In addition, you can also do other school's exams for extra practice on the topic.
External materials for the class
The good news about ML is that it is a very widely taught course, so there are lots of good materials about it outside of CMU. The most notable source for ML education is Andrew Ng's CS229 course at Stanford. His course notes are great and his class has lots of great content.
You can find course materials here (this looks like an archive of the old Github, which is now deleted.).
### Response:
---

Here is the response of the shortened version of the response template.
```
Description: Introduction to Machine Learning
Course Info: 10-601
Credits: 3
Prerequisites: 15-601 or 18-201 Intro to Computing
Teaching Style: Lectures and Recitation
Frequency: LUNES
Course website: link
Andrew referral required? No
Andrew office hours? Yes
Current TA list? Here
TAs: Prof. Andrew Ng and TAs TBD
Course CAT: Computer Algorithms and Their Applications, plus some of your own time
Class Timing: Semester
Class Meeting Times and Locations: TWR 11:45 AM-1:00 PM E-107
Course CAT: Computer Algorithms and Their Applications, plus some of your own time
Audio/Video Recording restriction? No
Regular or need based advising? Need based advising
lec 1: What is Machine Learning? Introduction. Overview of the course. Help develop the skills of computational thinking, software engineering, and communication. Intro to some of the following: linear algebra, algorithms, and discrete mathematics, machine learning, probabilistic thinking, and artificial intelligence. Use Matlab, R, or another programming language. Attend at least 2 out of 3 recitation sessions per week.
lec 2: Machine Learning Algorithms and Applications. Understand how to use Matlab, R, or another programming language to implement algorithms for machine learning. Understand how to analyze data to decide which algorithm to use. Understand how to evaluate the performance of a learned algorithm.
lec 3: Reading Assignments. Readings are optional. If you do not like reading, go to the course webpage and click on the titles to read the short summaries. Matlab, R, or another programming language. Problem sets. Syllabus, lectures, and/or lectures notes, homework sets, and/or homework assignments, lectures, homeworks. Lots of content. Anytime. If you do not want to spend too much time on the course, do not worry. The course is not that hard, and the course is not a hard course to do well on. If you do not want to spend too much time on the course, you can do the following: go to class, check out the materials, do not do the homework, and not talk to the professor or students or any problems. Other students might want to go to class, check out the materials, and do the homework. Other students might want to talk to you or the professor or students about the course.
Any additional info?: Any additional info relevant to the course and course success criteria
---
```

PS: If you are interested in the course, you might want to read this reply I wrote to a comment on the course's review page:
>
> "I think the course is a great introduction to ML. It is a very well organized course that goes over a lot of ML topics. The course is not a very hard course, so you can get a good GPA without too much work. The course is very well taught. Prof Ng teaches the course, and he does so with great lectures and a great lecture note system. He also has a very detailed and well written lecture notes. It is very easy to do well on the lecture and not have to spend a lot of time on it. The only downside is that the lectures are recorded and are available online, so you might not want to go to class and just watch the lectures online. [...] If you do not want to spend too much time on the course, you can do the following: go to class, check out the materials, do not do the homework, and not talk to the professor or other students or any problems."
>
> "Other students might want to go to class, check out the materials, do not do the homework, and not talk to the professor or any problems. Other students might want to go to class, check out the materials, and do not do the homework, and not talk to the professor or any problems. Other students might want to go to class, check out the materials, and not do the homework. Other students might want to talk to you or the professor or students about the course and the success criteria."
>
> "Any additional info relevant to the course and course success criteria":
>
> "Any additional info relevant to the course and the course success criteria":
>
> "Any additional info relevant to the lecture and lecture success criteria":
> "Any additional info relevant to the homeworks and homework success criteria":
> "Any additional info relevant to the course if you do not like to talk to the professor or any problems":
> "Any additional info relevant to the course if you do not like to go to class, check out the materials, do not do the homework, and not talk to the professor or any problems":
​
"""