# Fine Tuning

## Supervised

### Classification

In [3]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np




In [4]:
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

Downloading readme:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/836k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [5]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["text"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [19]:
dataset["train"][20]

{'label': 0,
 'text': "As with most of the reviewers, I saw this on Starz! OnDemand. After watching the preview with my girlfriend, she decided not to watch it from how bad the preview watched. I, on the other hand, thought it looked weird enough to warrant a watching. I mean, the design of Dr. Meso alone warranted at least a brief sweep over this title. After watching it, I can say that while there are some interesting aspects to it (namely the browsing over the notebooks and trying to figure out the incomprehensible story), it's best to pass over this one.<br /><br />*Major Spoilers Ahead* After making their first video for their as-yet-unfinished CD, the lead singer, Cassidy, kills herself in an attempt to get her boyfriend Neil to notice her. 3 months later, the band is trying to decide if they're going to finish the album or not. To try and see what Cassidy would have wanted, they go to see an old psychic friend of hers, Dr. Meso, and try to contact her through him. In his card re

In [15]:
# tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [20]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [21]:
# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [22]:
# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [23]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Negative
Not a fan, don't recommed. - Negative
Better than the first one. - Negative
This is not worth watching even once. - Negative
This one is a pass. - Negative


In [24]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

In [25]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [26]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [27]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [28]:

# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [29]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.869}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.4813419282436371, 'eval_accuracy': {'accuracy': 0.869}, 'eval_runtime': 9.3308, 'eval_samples_per_second': 107.172, 'eval_steps_per_second': 26.793, 'epoch': 1.0}
{'loss': 0.4519, 'grad_norm': 7.42149543762207, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.86}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.5163654685020447, 'eval_accuracy': {'accuracy': 0.86}, 'eval_runtime': 9.2411, 'eval_samples_per_second': 108.212, 'eval_steps_per_second': 27.053, 'epoch': 2.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.866}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.7012303471565247, 'eval_accuracy': {'accuracy': 0.866}, 'eval_runtime': 9.2503, 'eval_samples_per_second': 108.105, 'eval_steps_per_second': 27.026, 'epoch': 3.0}
{'loss': 0.2217, 'grad_norm': 1.067376732826233, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.887}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.6214185953140259, 'eval_accuracy': {'accuracy': 0.887}, 'eval_runtime': 9.2484, 'eval_samples_per_second': 108.127, 'eval_steps_per_second': 27.032, 'epoch': 4.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.89}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.7877678871154785, 'eval_accuracy': {'accuracy': 0.89}, 'eval_runtime': 9.2755, 'eval_samples_per_second': 107.811, 'eval_steps_per_second': 26.953, 'epoch': 5.0}
{'loss': 0.0887, 'grad_norm': 5.88487928325776e-05, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.897}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.7541839480400085, 'eval_accuracy': {'accuracy': 0.897}, 'eval_runtime': 8.8697, 'eval_samples_per_second': 112.743, 'eval_steps_per_second': 28.186, 'epoch': 6.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.885}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.8369969725608826, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 8.8303, 'eval_samples_per_second': 113.247, 'eval_steps_per_second': 28.312, 'epoch': 7.0}
{'loss': 0.0351, 'grad_norm': 3.622229996835813e-05, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.885}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9117058515548706, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 8.8263, 'eval_samples_per_second': 113.298, 'eval_steps_per_second': 28.324, 'epoch': 8.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.891}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9155431389808655, 'eval_accuracy': {'accuracy': 0.891}, 'eval_runtime': 8.9612, 'eval_samples_per_second': 111.592, 'eval_steps_per_second': 27.898, 'epoch': 9.0}
{'loss': 0.0163, 'grad_norm': 0.002734925365075469, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/250 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'accuracy': 0.889}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9408891797065735, 'eval_accuracy': {'accuracy': 0.889}, 'eval_runtime': 8.8213, 'eval_samples_per_second': 113.362, 'eval_steps_per_second': 28.34, 'epoch': 10.0}
{'train_runtime': 302.5714, 'train_samples_per_second': 33.05, 'train_steps_per_second': 8.263, 'train_loss': 0.16274697799682616, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.16274697799682616, metrics={'train_runtime': 302.5714, 'train_samples_per_second': 33.05, 'train_steps_per_second': 8.263, 'total_flos': 1112883852759936.0, 'train_loss': 0.16274697799682616, 'epoch': 10.0})

In [32]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive


In [36]:
model(inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0972,  0.2844]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)
hf_name = 'shawhin' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want
model.push_to_hub(model_id) # save model
trainer.push_to_hub(model_id) # save trainer
Optional: load peft model
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)

# Devs

In [1]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

In [2]:
# Import the necessary library for loading datasets
from datasets import load_dataset

# Specify the name of the dataset
dataset_name = "timdettmers/openassistant-guanaco"

# Load the dataset from the specified name and select the "train" split
dataset = load_dataset(dataset_name, split="train")
dataset[0].keys()

Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 20.9M/20.9M [00:02<00:00, 7.78MB/s]
Downloading data: 100%|██████████| 1.11M/1.11M [00:00<00:00, 3.09MB/s]


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

dict_keys(['text'])

In [4]:
dataset[1]

{'text': '### Human: ¿CUales son las etapas del desarrollo y en qué consisten según Piaget?### Assistant: Jean Piaget fue un psicólogo suizo que propuso una teoría sobre el desarrollo cognitivo humano que consta de cuatro etapas:\n\nEtapa sensoriomotora (0-2 años): Durante esta etapa, el niño aprende a través de sus sentidos y movimientos. Descubre que sus acciones pueden tener un impacto en el entorno y comienza a formarse una idea básica de objetividad y continuidad.\n\nEtapa preoperatoria (2-7 años): En esta etapa, el niño comienza a desarrollar un pensamiento simbólico y a comprender que las cosas pueden representar a otras cosas. También comienzan a desarrollar un pensamiento lógico y a comprender conceptos como la causa y el efecto.\n\nEtapa de operaciones concretas (7-12 años): Durante esta etapa, el niño desarrolla un pensamiento lógico y comprende las relaciones causales. Empiezan a comprender que las cosas pueden tener múltiples perspectivas y que los conceptos pueden ser más

In [5]:
model_name = "ybelkada/falcon-7b-sharded-bf16" # sharded falcon-7b model

# Configuring the BitsAndBytes quantization
bnb_config_base = BitsAndBytesConfig(
    load_in_4bit=True,            # load model in 4-bit precision
    bnb_4bit_quant_type="nf4",    # pre-trained model should be quantized in 4-bit NF format
    bnb_4bit_use_double_quant=True, # Using double quantization as mentioned in QLoRA paper
    bnb_4bit_compute_dtype=torch.bfloat16, # During computation, pre-trained model should be loaded in BF16 format
)

# Loading the Falcon model with quantization configuration
model_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config_base, # Use bitsandbytes config
    device_map="auto",  # Specifying device_map="auto" so that HF Accelerate will determine which GPU to put each layer of the model on
    trust_remote_code=True, # Set trust_remote_code=True to use falcon-7b model with custom code
)

config.json:   0%|          | 0.00/581 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/921M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [4]:
# Disabling cache usage in the model configuration
model_base.config.use_cache = False

# Load the tokenizer for the Falcon 7B model with remote code trust
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set the padding token to be the same as the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

In [5]:
# Import the necessary module for LoRA configuration
from peft import LoraConfig

# controlling out of memory error
model_base = prepare_model_for_kbit_training(model_base)

# we will import the configuration file to construct the LoRA model. It's crucial to incorporate all linear layers within the transformer block for optimal results.
# For this reason, we will include the dense, dense_h_to_4_h, and dense_4h_to_h layers as target modules alongside the mixed query key value layer. This comprehensive approach aims to achieve the highest level of performance.
# Define the parameters for LoRA configuration
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

# Create the LoRA configuration object
#  For LLaMa model, you have to remove target_modules parameter from LoraConfig.
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

peft_model = get_peft_model(model_base, peft_config)

In [6]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
from transformers import TrainingArguments
import mlflow
# pointing mlflow to my local machine. We have to serve first with mlflow ui
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("falcon-7b")

# Define the directory to save training results
output_dir = "./train/custom_llm"

# Set the batch size per device during training
per_device_train_batch_size = 2  # reduce batch size by 2x if out-of-memory error

# Number of steps to accumulate gradients before updating the model
gradient_accumulation_steps = 2  # increase gradient accumulation steps by 2x if batch size is reduced

# Choose the optimizer type (e.g., "paged_adamw_32bit")
optim = "paged_adamw_32bit" # activates the paging for better memory management

# Interval to save model checkpoints (every 10 steps)
save_strategy="steps" # checkpoint save strategy to adopt during training
save_steps = 10 # number of updates steps before two checkpoint saves

# Interval to log training metrics (every 10 steps)
logging_steps = 2  # number of update steps between two logs if logging_strategy="steps"

# Learning rate for optimization
learning_rate = 2e-4  # learning rate for AdamW optimizer

# Maximum gradient norm for gradient clipping
max_grad_norm = 0.3 # maximum gradient norm (for gradient clipping)

# Maximum number of training steps
max_steps = 50

# Warmup ratio for learning rate scheduling
warmup_ratio = 0.03  # number of steps used for a linear warmup from 0 to learning_rate

# Type of learning rate scheduler (e.g., "constant")
lr_scheduler_type = "cosine"  # learning rate scheduler

# Create a TrainingArguments object to configure the training process
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    bf16=False,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    tf32=False,
)

# Import the SFTTrainer from the TRL library
from trl import SFTTrainer

# Set the maximum sequence length
max_seq_length = 512

# Create a trainer instance using SFTTrainer
trainer = SFTTrainer(
    model=model_base,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

# Iterate through the named modules of the trainer's model
for name, module in trainer.model.named_modules():
    # print(f"TRAIN: {name}")
    # Check if the name contains "norm"
    if "norm" in name:
        # Convert the module to use torch.float32 data type
        module = module.to(torch.float32)

result = trainer.train()

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/50 [00:00<?, ?it/s]



In [16]:
# Loading PEFT model
PEFT_MODEL = "Magody/custom_llm"
# PEFT_MODEL = <Username>/YOUR_MODEL_URL_REPO. 

bnb_config_peft = BitsAndBytesConfig(
    load_in_4bit=True,            # load model in 4-bit precision
    bnb_4bit_quant_type="nf4",    # pre-trained model should be quantized in 4-bit NF format
    bnb_4bit_use_double_quant=True, # Using double quantization as mentioned in QLoRA paper
    bnb_4bit_compute_dtype=torch.bfloat16, # During computation, pre-trained model should be loaded in BF16 format
    load_in_8bit_fp32_cpu_offload=True,
)

config = PeftConfig.from_pretrained(PEFT_MODEL)
peft_base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config_peft,
    device_map="cuda:0",
    trust_remote_code=True
)

peft_model = PeftModel.from_pretrained(peft_base_model, PEFT_MODEL)

peft_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
peft_tokenizer.pad_token = peft_tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/522M [00:00<?, ?B/s]

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig

# Function to generate responses from both original model and PEFT model and compare their answers.
def generate_answer(query):
    system_prompt = """Answer the following question truthfully.
    If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
    If the question is too complex, respond 'Kindly, consult a psychiatrist for further queries.'."""

    

    device = "cuda:0"
    dashline = "-".join("" for i in range(50))

    encoding = tokenizer(final_prompt, return_tensors="pt").to(device)
    outputs = model_base.generate(input_ids=encoding.input_ids, generation_config=GenerationConfig(max_new_tokens=256, pad_token_id = tokenizer.eos_token_id, \
                                                                                                                        eos_token_id = tokenizer.eos_token_id, attention_mask = encoding.attention_mask, \
                                                                                                                        temperature=0.4, top_p=0.6, repetition_penalty=1.3, num_return_sequences=1,))
    text_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(dashline)
    print(f'ORIGINAL MODEL RESPONSE:\n{text_output}')
    print(dashline)

    peft_encoding = peft_tokenizer(final_prompt, return_tensors="pt").to(device)
    peft_outputs = peft_model.generate(input_ids=peft_encoding.input_ids, generation_config=GenerationConfig(max_new_tokens=256, pad_token_id = peft_tokenizer.eos_token_id, \
                                                                                                                        eos_token_id = peft_tokenizer.eos_token_id, attention_mask = peft_encoding.attention_mask, \
                                                                                                                        temperature=0.4, top_p=0.6, repetition_penalty=1.3, num_return_sequences=1,))
    peft_text_output = peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=True)

    print(f'PEFT MODEL RESPONSE:\n{peft_text_output}')
    print(dashline)

In [21]:
query = 'What to do if you have mental illnes?'

device = 'cuda:0'

# create your own prompt  
system_prompt = f"""Answer the following question truthfully.
If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
If the question is too complex, respond 'Kindly, consult a psychiatrist for further queries.'.""".strip()

user_prompt = f"""Human: {query}
    Assistant: """

final_prompt = system_prompt + "\n" + user_prompt
peft_encoding = peft_tokenizer(final_prompt, return_tensors="pt").to(device)
# print(peft_encoding)

peft_outputs = peft_model.generate(input_ids=peft_encoding.input_ids, generation_config=GenerationConfig(max_new_tokens=256, pad_token_id = peft_tokenizer.eos_token_id, \
                                                                                                                        eos_token_id = peft_tokenizer.eos_token_id, attention_mask = peft_encoding.attention_mask, \
                                                                                                                        temperature=0.4, top_p=0.6, repetition_penalty=1.3, num_return_sequences=1,))
peft_text_output = peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=True)

#  with torch.inference_mode():??
print(peft_text_output)

Answer the following question truthfully.
If you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.
If the question is too complex, respond 'Kindly, consult a psychiatrist for further queries.'.
Human: What to do if you have mental illnes?
    Assistant: '


In [24]:
peft_tokenizer.decode(peft_outputs[0], skip_special_tokens=False)

"Answer the following question truthfully.\nIf you don't know the answer, respond 'Sorry, I don't know the answer to this question.'.\nIf the question is too complex, respond 'Kindly, consult a psychiatrist for further queries.'.\nHuman: What to do if you have mental illnes?\n    Assistant: '>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TITLE<<>>TIT

# Experiments

In [16]:
import transformers
from datasets import load_dataset, Dataset
import pandas as pd


def gen_prompt(text_input):
    return f"""
    <human>: {text_input["input"]}
    <assistant>: {text_input["output"]}
    """.strip()


def gen_and_tok_prompt(text_input):
    full_input = gen_prompt(text_input)
    tok_full_prompt = tokenizer(full_input, padding = True , truncation =True)
    return tok_full_prompt

df_faq = pd.read_csv("./data/input/chat.csv", delimiter='|')
df_faq["text"] = df_faq.apply(gen_prompt, axis=1)
data = Dataset.from_pandas(df_faq[['input', 'output', 'text']])
data = data.map(gen_and_tok_prompt)
print(data)
df_faq.head()

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})


Unnamed: 0,input,output,text
0,¿Quién es Vedal?,Es el creador de Neuro sama,<human>: ¿Quién es Vedal?\n <assistant>: Es...
1,¿Qué hace Vedal?,Hace contenido de IA y vtubers,<human>: ¿Qué hace Vedal?\n <assistant>: Ha...


In [22]:
output_dir_base = "./train/falcon-7b-sharded-bf16-finetuned-test"

In [38]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

config = PeftConfig.from_pretrained(training_model_paths)
trained_model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    quantization_config=bnb_config, # Use bitsandbytes config
    device_map="auto",  # Specifying device_map="auto" so that HF Accelerate will determine which GPU to put each layer of the model on
    trust_remote_code=True, # Set trust_remote_code=True to use falcon-7b model with custom code
)

trained_model = PeftModel.from_pretrained(trained_model, training_model_paths)

trained_model_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
trained_model_tokenizer.pad_token = trained_model_tokenizer.eos_token

gen_config = trained_model.generation_config
gen_config.max_new_tokens = 200
gen_config.temperature = 0.1
gen_config.top_p = 0.2
gen_config.num_return_sequences = 1
gen_config.pad_token_id = trained_model_tokenizer.eos_token_id
gen_config.eos_token_id = trained_model_tokenizer.eos_token_id

ValueError: Can't find 'adapter_config.json' at './train/falcon-7b-sharded-bf16-finetuned-test/models'

In [8]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [13]:
print_trainable_parameters(peft_model)

trainable params: 2359296 || all params: 3611104128 || trainable%: 0.06533447711203746


In [None]:
device = 'cuda:0'

# create your own prompt  
prompt = f"""
    <human>: qué hace es Vedal? y quién es neuro-sama?
    <assistant>: 
    """.strip()

# encode the prompt 
encoding = trained_model_tokenizer(prompt, return_tensors= "pt").to(device)
# print(encoding)

# do the inference 
with torch.inference_mode():
    outputs = trained_model.generate(
        input_ids = encoding.input_ids,
        attention_mask = encoding.attention_mask,
        generation_config = gen_config
    )

# print(outputs)

outputs_decode = trained_model_tokenizer.decode(outputs[0], skip_special_tokens = True )
print(outputs_decode)

RuntimeError: value cannot be converted to type at::Half without overflow

# TT

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "tiiuae/falcon-7b"

tokenizer = AutoTokenizer.from_pretrained(model)
pipeline:transformers.Pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
sequences = pipeline(
    "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")





A new version of the following files was downloaded from https://huggingface.co/tiiuae/falcon-7b:
- configuration_falcon.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.




ValueError: Could not load model tiiuae/falcon-7b with any of the following classes: (<class 'transformers.models.auto.modeling_tf_auto.TFAutoModelForCausalLM'>,). See the original errors:

while loading with TFAutoModelForCausalLM, an error is thrown:
Traceback (most recent call last):
  File "c:\Users\usuario\anaconda3\envs\env_ai\Lib\site-packages\transformers\pipelines\base.py", line 279, in infer_framework_load_model
    model = model_class.from_pretrained(model, **kwargs)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\usuario\anaconda3\envs\env_ai\Lib\site-packages\transformers\models\auto\auto_factory.py", line 566, in from_pretrained
    raise ValueError(
ValueError: Unrecognized configuration class <class 'transformers_modules.tiiuae.falcon-7b.898df1396f35e447d5fe44e0a3ccaaaa69f30d36.configuration_falcon.FalconConfig'> for this kind of AutoModel: TFAutoModelForCausalLM.
Model type should be one of BertConfig, CamembertConfig, CTRLConfig, GPT2Config, GPT2Config, GPTJConfig, OpenAIGPTConfig, OPTConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoFormerConfig, TransfoXLConfig, XGLMConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.


