In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
import transformers
import torch
from torch.utils.data import DataLoader, Dataset

# Load Quantized Model

In [19]:
# Define the model ID for the sharded FALCON model by vilsonrodrigues
model_id = "vilsonrodrigues/falcon-7b-instruct-sharded"

# Configure BitsAndBytesConfig for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Initialize the tokenizer using the model ID and set the pad token to be the same as the end of sentence token
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Initialize the pre-trained model using AutoModelForCausalLM
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
)



Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [3]:
from peft import prepare_model_for_kbit_training

pretrained_model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(pretrained_model)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [4]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 4718592 || all params: 3613463424 || trainable%: 0.13058363808693696


In [6]:
from datasets import load_dataset

"""
We'll be using a dataset of Mayo Clinic symptoms and diseases.
It contains information about 1,058 rows (symptoms) and related diseases.
The file weughts 626 kB
"""

data = load_dataset("celikmus/mayo_clinic_symptoms_and_diseases_v1", split="train")

In [7]:
data

Dataset({
    features: ['text', 'label'],
    num_rows: 1058
})

In [8]:
tokenizer.pad_token = tokenizer.eos_token

train_dataset = data.map(
    lambda x: {
        "input_text": f"symptoms: {x['text']}; most likely explanation: {x['label']}"
    }
)

# Tokenize the datasets
train_encodings = tokenizer(
    train_dataset["input_text"],
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors="pt",
)

In [9]:
class TextDataset(Dataset):
    def __init__(self, encodings):
        """
        Initialize a custom dataset for text inputs and encodings.

        Parameters:
            encodings (dict): A dictionary containing the encoded inputs.
        """
        self.encodings = encodings

    def __getitem__(self, idx):
        """
        Get an item from the dataset by index.

        Parameters:
            idx (int): The index of the item to retrieve.

        Returns:
            dict: A dictionary containing the encoded input and labels.
        """
        # Create an item dictionary with tensors for each encoding key
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        # Copy input_ids to labels for tasks like language modeling
        item["labels"] = item["input_ids"].clone()

        return item

    def __len__(self):
        """
        Get the length of the dataset.

        Returns:
            int: The number of items in the dataset.
        """
        return len(self.encodings["input_ids"])

In [10]:
# Convert the encodings to PyTorch datasets
train_dataset = TextDataset(train_encodings)

In [11]:
request_text = """
In crowded places, I feel cold in the tips of my fingers, I sweat with dizziness.
What is happening?
"""
# hint: agoraphobia

In [12]:
encoding = tokenizer(request_text, return_tensors="pt").to("cuda:0")
pretrained_model_output = pretrained_model.generate(
    input_ids=encoding.input_ids,
    attention_mask=encoding.attention_mask,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.25,
    eos_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
  attn_output = F.scaled_dot_product_attention(


In [13]:
import pprint

pprint.pprint(tokenizer.decode(pretrained_model_output[0], skip_special_tokens=True))

('\n'
 'In crowded places, I feel cold in the tips of my fingers, I sweat with '
 'dizziness.\n'
 'What is happening?\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone.\n'
 'I am not alone')


In [14]:
request_text = "I started to feel swelling  and itching around the mouth and throat after a salad with peanuts, cherry tomatoes and cheese. What may be the reason?"

In [15]:
encoding = tokenizer(request_text, return_tensors="pt").to("cuda:0")
pretrained_model_output = pretrained_model.generate(
    input_ids=encoding.input_ids,
    attention_mask=encoding.attention_mask,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.25,
    eos_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


# Training the model

In [16]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    # eval_dataset=val_dataset,
    args=transformers.TrainingArguments(
        num_train_epochs=10,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        warmup_ratio=0.05,
        max_steps=40,
        learning_rate=2.5e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

max_steps is given, it will override any value given in num_train_epochs


In [17]:
trainer.train()

  0%|          | 0/40 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'loss': 2.1058, 'grad_norm': 0.17878128588199615, 'learning_rate': 0.000125, 'epoch': 0.03}
{'loss': 2.0841, 'grad_norm': 0.16140981018543243, 'learning_rate': 0.00025, 'epoch': 0.06}
{'loss': 2.2015, 'grad_norm': 0.1694914549589157, 'learning_rate': 0.0002495730616258337, 'epoch': 0.09}
{'loss': 2.1433, 'grad_norm': 0.2552121579647064, 'learning_rate': 0.0002482951629253403, 'epoch': 0.12}
{'loss': 2.1502, 'grad_norm': 0.30767083168029785, 'learning_rate': 0.0002461750332424163, 'epoch': 0.15}
{'loss': 2.0273, 'grad_norm': 0.28413626551628113, 'learning_rate': 0.00024322715521257933, 'epoch': 0.18}
{'loss': 1.9846, 'grad_norm': 0.3133431673049927, 'learning_rate': 0.00023947166583188217, 'epoch': 0.21}
{'loss': 2.0909, 'grad_norm': 0.37438762187957764, 'learning_rate': 0.00023493421890081112, 'epoch': 0.24}
{'loss': 2.049, 'grad_norm': 0.44594240188598633, 'learning_rate': 0.00022964580978281608, 'epoch': 0.27}
{'loss': 2.0225, 'grad_norm': 0.5678523778915405, 'learning_rate': 0.0002

TrainOutput(global_step=40, training_loss=1.8569317221641541, metrics={'train_runtime': 380.6954, 'train_samples_per_second': 3.362, 'train_steps_per_second': 0.105, 'total_flos': 1.297590694772736e+16, 'train_loss': 1.8569317221641541, 'epoch': 1.2030075187969924})

# Inference

In [20]:
# Save model:
trained_model = (
    trainer.model.module if hasattr(trainer.model, "module") else trainer.model
)  # Take care of distributed/parallel training
trained_model.save_pretrained("outputs")

In [21]:
# Now we can inference our model:
lora_config = LoraConfig.from_pretrained("outputs")
loaded_model = get_peft_model(
    prepare_model_for_kbit_training(pretrained_model), lora_config
)

You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [22]:
loaded_model.config.use_cache = True
loaded_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): FalconForCausalLM(
      (transformer): FalconModel(
        (word_embeddings): Embedding(65024, 4544)
        (h): ModuleList(
          (0-31): 32 x FalconDecoderLayer(
            (self_attention): FalconAttention(
              (maybe_rotary): FalconRotaryEmbedding()
              (query_key_value): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4544, out_features=4672, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4544, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4672, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_

In [23]:
# Empty VRAM
del model
del trained_model
del trainer
import gc

gc.collect()

15317

In [24]:
model_id = (
    "vilsonrodrigues/falcon-7b-instruct-sharded"  # sharded model by vilsonrodrigues
)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map={"": 0}, trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
def generate_answer(query):
    """
    Generate responses from both the original model and PEFT model and compare their answers.

    Parameters:
        query (str): The user query for which responses are generated.

    Returns:
        None
    """
    # System and user prompts
    system_prompt = """Answer the following question truthfully.
          If you don't know the answer or the question is too complex,
          respond 'Kindly, consult a doctor for further queries.'."""
    user_prompt = f"""<HUMAN>: {query}
      <ASSISTANT>: """
    final_prompt = system_prompt + "\n" + user_prompt

    # Device and dashline
    device = "cuda:0"
    dashline = "-" * 50

    # Encode prompt and generate response from the original model
    encoding = tokenizer(final_prompt, return_tensors="pt").to(device)
    output = pretrained_model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.25,
        repetition_penalty=1.3,
        eos_token_id=tokenizer.eos_token_id,
    )
    text_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # Print original model response
    pprint.pprint(dashline)
    pprint.pprint(f"ORIGINAL MODEL RESPONSE:\n{text_output}")
    pprint.pprint(dashline)

    # Encode prompt and generate response from the PEFT model
    peft_encoding = tokenizer(final_prompt, return_tensors="pt").to(device)
    peft_output = loaded_model.generate(
        input_ids=peft_encoding.input_ids,
        attention_mask=peft_encoding.attention_mask,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.25,
        repetition_penalty=1.3,
        eos_token_id=tokenizer.eos_token_id,
    )
    peft_text_output = tokenizer.decode(peft_output[0], skip_special_tokens=True)

    # Print PEFT model response
    pprint.pprint(f"PEFT MODEL RESPONSE:\n{peft_text_output}")
    pprint.pprint(dashline)

In [26]:
query = """In crowded places, I feel cold in the tips of my fingers, I sweat with dizziness. What may be happening?"""
generate_answer(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'--------------------------------------------------'
('ORIGINAL MODEL RESPONSE:\n'
 'Answer the following question truthfully.\n'
 "          If you don't know the answer or the question is too complex,\n"
 "          respond 'Kindly, consult a doctor for further queries.'.\n"
 '<HUMAN>: In crowded places, I feel cold in the tips of my fingers, I sweat '
 'with dizziness. What may be happening?\n'
 '      <ASSISTANT>: <HUMAN> may be experiencing symptoms of hypothermia. It '
 'is advised to seek medical attention immediately.')
'--------------------------------------------------'
('PEFT MODEL RESPONSE:\n'
 'Answer the following question truthfully.\n'
 "          If you don't know the answer or the question is too complex,\n"
 "          respond 'Kindly, consult a doctor for further queries.'.\n"
 '<HUMAN>: In crowded places, I feel cold in the tips of my fingers, I sweat '
 'with dizziness. What may be happening?\n'
 '      <ASSISTANT>: <HUMAN> may be experiencing symptoms of hypother

In [27]:
query = """I started to feel swelling  and itching around the mouth and throat after a salad with peanuts, cherry tomatoes and cheese. What may be the reason?"""
generate_answer(query)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


'--------------------------------------------------'
('ORIGINAL MODEL RESPONSE:\n'
 'Answer the following question truthfully.\n'
 "          If you don't know the answer or the question is too complex,\n"
 "          respond 'Kindly, consult a doctor for further queries.'.\n"
 '<HUMAN>: I started to feel swelling  and itching around the mouth and throat '
 'after a salad with peanuts, cherry tomatoes and cheese. What may be the '
 'reason?\n'
 "      <ASSISTANT>:  It's possible that the swelling and itching are due to "
 'an allergic reaction to peanuts. Have you had any previous allergic '
 'reactions to peanuts or similar foods?')
'--------------------------------------------------'
('PEFT MODEL RESPONSE:\n'
 'Answer the following question truthfully.\n'
 "          If you don't know the answer or the question is too complex,\n"
 "          respond 'Kindly, consult a doctor for further queries.'.\n"
 '<HUMAN>: I started to feel swelling  and itching around the mouth and throat '
 'a