In [2]:
import torch
import os
from datasets import load_dataset
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments)
from trl import SFTTrainer
from huggingface_hub import notebook_login

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import json
from datasets import DatasetDict, Dataset

# Load the JSON file
with open(r"C:\Users\BMSCE CSE\Downloads\HPEquestionsOurOwn.json", 'r') as json_file:
    data = json.load(json_file)

# Extract the "questions" data
questions_data = data["text"]

# Create a new list to store the reformatted data
formatted_data = []

# Iterate through each item in the original data and reformat it
for item in questions_data:
    # Check if both question and answer are strings before concatenating
    if isinstance(item["question"], str) and isinstance(item["answer"], str):
        formatted_item = {
            "text": "### Human: " + item["question"] + " " + "###Assistant: " +item["answer"]
        }
        formatted_data.append(formatted_item)
    else:
        print("Skipping item with non-string question or answer:", item)


formatted_dataset = Dataset.from_dict(data)

# Print the Dataset summary
dataset = formatted_dataset
print(dataset)
print(dataset['text'][0])

Dataset({
    features: ['text'],
    num_rows: 311
})
{'answer': 'Management switches on cluster systems are paired into switch stacks, with two switches per stack. The top switch is typically the master switch, while the bottom switch is typically the slave switch.', 'question': 'How are management switches typically paired in cluster systems, and what roles do the top and bottom switches in a stack usually play?'}


## Loading the model

In this section we will load the [Falcon 7B model](https://huggingface.co/tiiuae/falcon-7b), quantize it in 4bit and attach LoRA adapters on it. Let's get started!

In [5]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "ybelkada/falcon-7b-sharded-bf16"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # new
    trust_remote_code=True
)
model.config.use_cache = False



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Let's also load the tokenizer below

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance. Therefore we will add `dense`, `dense_h_to_4_h` and `dense_4h_to_h` layers in the target modules in addition to the mixed query key value layer.

In [7]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        # "dense",
        # "dense_h_to_4h",
        # "dense_4h_to_h",
    ]
)

## Loading the trainer

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [8]:
from transformers import TrainingArguments

output_dir = "./ShardedonOurData"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 200
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True,
)
model.config.use_cache = False


Then finally pass everthing to the trainer

In [11]:
from trl import SFTTrainer

max_seq_length = 512
dataset = dataset.map(lambda x: {'text': str(x['text'])})

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

Map:   0%|          | 0/311 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


We will also pre-process the model by upcasting the layer norms in float 32 for more stable training

In [12]:
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

## Train the model

Now let's train the model! Simply call `trainer.train()`

In [13]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maditi-cs21[0m ([33mbmsce[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/200 [00:00<?, ?it/s]

  attn_output = F.scaled_dot_product_attention(


{'loss': 2.4898, 'grad_norm': 0.2644895315170288, 'learning_rate': 0.0002, 'epoch': 0.51}




{'loss': 2.2835, 'grad_norm': 0.508341372013092, 'learning_rate': 0.0002, 'epoch': 1.03}




{'loss': 2.0747, 'grad_norm': 0.4628939926624298, 'learning_rate': 0.0002, 'epoch': 1.54}




{'loss': 1.9776, 'grad_norm': 0.31026020646095276, 'learning_rate': 0.0002, 'epoch': 2.05}




{'loss': 1.8715, 'grad_norm': 0.4888463616371155, 'learning_rate': 0.0002, 'epoch': 2.56}




{'loss': 1.8304, 'grad_norm': 0.5135347843170166, 'learning_rate': 0.0002, 'epoch': 3.08}




{'loss': 1.7761, 'grad_norm': 0.685931921005249, 'learning_rate': 0.0002, 'epoch': 3.59}




{'loss': 1.7307, 'grad_norm': 0.625741183757782, 'learning_rate': 0.0002, 'epoch': 4.1}




{'loss': 1.6551, 'grad_norm': 1.0800304412841797, 'learning_rate': 0.0002, 'epoch': 4.62}




{'loss': 1.5911, 'grad_norm': 0.8324030637741089, 'learning_rate': 0.0002, 'epoch': 5.13}




{'loss': 1.5249, 'grad_norm': 0.709463357925415, 'learning_rate': 0.0002, 'epoch': 5.64}




{'loss': 1.5035, 'grad_norm': 0.9293894171714783, 'learning_rate': 0.0002, 'epoch': 6.15}




{'loss': 1.3779, 'grad_norm': 0.6214763522148132, 'learning_rate': 0.0002, 'epoch': 6.67}




{'loss': 1.4101, 'grad_norm': 1.135507583618164, 'learning_rate': 0.0002, 'epoch': 7.18}




{'loss': 1.2732, 'grad_norm': 0.7719298601150513, 'learning_rate': 0.0002, 'epoch': 7.69}




{'loss': 1.2514, 'grad_norm': 1.4488328695297241, 'learning_rate': 0.0002, 'epoch': 8.21}




{'loss': 1.1717, 'grad_norm': 1.1668577194213867, 'learning_rate': 0.0002, 'epoch': 8.72}




{'loss': 1.1652, 'grad_norm': 1.5100066661834717, 'learning_rate': 0.0002, 'epoch': 9.23}




{'loss': 1.0979, 'grad_norm': 1.6807385683059692, 'learning_rate': 0.0002, 'epoch': 9.74}




{'loss': 1.0179, 'grad_norm': 1.0873857736587524, 'learning_rate': 0.0002, 'epoch': 10.26}




{'train_runtime': 568.4241, 'train_samples_per_second': 5.63, 'train_steps_per_second': 0.352, 'train_loss': 1.6037031602859497, 'epoch': 10.26}


TrainOutput(global_step=200, training_loss=1.6037031602859497, metrics={'train_runtime': 568.4241, 'train_samples_per_second': 5.63, 'train_steps_per_second': 0.352, 'total_flos': 1.3570649769371136e+16, 'train_loss': 1.6037031602859497, 'epoch': 10.256410256410255})

In [14]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
trainer.push_to_hub()



Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Aditi25/ShardedonOurData/commit/9ffebecc9267f10b58e4fca7d8d4fc2b7db405f4', commit_message='End of training', commit_description='', oid='9ffebecc9267f10b58e4fca7d8d4fc2b7db405f4', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
# Loading PEFT model
from peft import PeftConfig, PeftModel


PEFT_MODEL = "Aditi25/ShardedonOurData"

config = PeftConfig.from_pretrained(PEFT_MODEL)
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)

peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_tokenizer.pad_token = peft_tokenizer.eos_token

adapter_config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

In [17]:
from transformers import GenerationConfig

# Function to generate responses from both original model and PEFT model and compare their answers.
def generate_answer(query):
  system_prompt = """Answer the following question truthfully.
  If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
  If the question is too complex, respond 'Kindly, consult the documentation for further queries.'."""

  user_prompt = f"""###HUMAN: {query}
  ###ASSISTANT: """

  final_prompt = system_prompt + "\n" + user_prompt

  device = "cuda:0"
  dashline = "-".join("" for i in range(50))


  peft_encoding = peft_tokenizer(final_prompt, return_tensors="pt").to(device)
  peft_outputs = peft_model.generate(input_ids=peft_encoding.input_ids, generation_config=GenerationConfig(max_new_tokens=256, pad_token_id = peft_tokenizer.eos_token_id, \
                                                                                                                     eos_token_id = peft_tokenizer.eos_token_id, attention_mask = peft_encoding.attention_mask, \
                                                                                                                     temperature=0.4, top_p=0.6, repetition_penalty=1.3, num_return_sequences=1,))
  peft_text_output = peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=True)

  start_token = "###ASSISTANT:"
  end_token = "###"

  start_idx = peft_text_output.find(start_token)
  end_idx = peft_text_output.find(end_token, start_idx + len(start_token))

  if start_idx != -1 and end_idx != -1:
      print(peft_text_output[start_idx + len(start_token):end_idx].strip())
  else:
      print("No answer found.")


In [18]:
query = "What is the recommended action after entering a ping command?"
generate_answer(query)



'Please enter y and press Enter to continue.'


In [19]:
query = "Can you give me command used to monitor the switch configuration process? "
generate_answer(query)

'Please enter the command used to monitor the switch configuration process.'


In [20]:
query = "How many times should the cm node add command be run to configure the compute nodes into the cluster?"
generate_answer(query)

2 times.


In [None]:
query = "Can you name the command used to generate a cluster definition file?"
generate_answer(query)

In [None]:
query = "How many SU leaders are there in a group for gluster purposes?"
generate_answer(query)

In [None]:
query = "Are compute nodes associated with a physical SU leader?"
generate_answer(query)

In [None]:
query = "How many compute nodes can each SU leader node manage?"
generate_answer(query)

In [None]:
query = "How are compute node images provisioned to the compute nodes?"
generate_answer(query)

In [None]:
query = "What are compute nodes associated with in the SU leader pool?"
generate_answer(query)

In [None]:
query = "How many SU leader nodes can the cluster manager support in a cluster?"
generate_answer(query)

In [None]:
query = "In what format are cm node provision commands used to assign new images to nodes?"
generate_answer(query)

During training, the model should converge nicely as follows:

![image](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/loss-falcon-7b.png)

The `SFTTrainer` also takes care of properly saving only the adapters during training instead of saving the entire model.