In [1]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [2]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    TextStreamer,
    GenerationConfig,
    logging,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling,
    StoppingCriteria,
    StoppingCriteriaList,
    pipeline,
)
from transformers.trainer_callback import (
    EarlyStoppingCallback,
    TrainerCallback,
    TrainerControl,
    TrainerState,
)
from transformers.trainer_utils import (
    IntervalStrategy,
)
import torch
from trl import SFTTrainer

import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import wandb
from torch.utils.data import DataLoader, Dataset, IterableDataset, random_split
from pymilvus import Collection, db, connections
import torch
import datasets
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdevasheeshmishra[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
load_dotenv("datasets/synthetic-mt/.env")

True

In [5]:
logging.set_verbosity_info()

## Load Model and Tokenizer


In [6]:
# _BASE_MODEL_PATH = Path('../models/zephyr-7b-beta/')
_BASE_MODEL_PATH = Path('../models/Mistral-7B-Instruct-v0.2/')
# _BASE_MODEL_PATH = Path('../models/phi-1_5/')
# _BASE_MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
_LORA_OUTPUT_PATH = Path('output/loras')

In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    _BASE_MODEL_PATH,
    padding_side="right",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False
)
# model_config = AutoConfig.from_pretrained(_BASE_MODEL_PATH)

loading file tokenizer.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file tokenizer.json


In [8]:
model = AutoModelForCausalLM.from_pretrained(
    _BASE_MODEL_PATH,
    # config=base_model_config,
    device_map='auto',
    # torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
)

loading configuration file ../models/Mistral-7B-Instruct-v0.2/config.json
Model config MistralConfig {
  "_name_or_path": "../models/Mistral-7B-Instruct-v0.2",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.38.0.dev0",
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file ../models/Mistral-7B-Instruct-v0.2/pytorch_model.bin.index.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing MistralForCausalLM.

All the weights of MistralForCausalLM were initialized from the model checkpoint at ../models/Mistral-7B-Instruct-v0.2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MistralForCausalLM for predictions without further training.
loading configuration file ../models/Mistral-7B-Instruct-v0.2/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



In [9]:
if torch.cuda.device_count() > 1:  # If more than 1 GPU
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    setattr(model, 'is_model_parallel', True)

Let's use 2 GPUs!


In [10]:
model.config.use_cache = False
for param in model.parameters():
    # Turning off gradient calculation for base model as we want to train lora, not base model
    param.requires_grad = False

In [11]:
{tokenizer.bos_token: tokenizer.bos_token_id}, \
{tokenizer.eos_token: tokenizer.eos_token_id}, \
{tokenizer.pad_token: tokenizer.pad_token_id}, \
{tokenizer.unk_token: tokenizer.unk_token_id},

({'<s>': 1}, {'</s>': 2}, {None: None}, {'<unk>': 0})

In [12]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [13]:
model.config.do_sample = False
model.generation_config.do_sample = False
# model.config.temperature=0.0
# model.generation_config.temperature=0.0

model.generation_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}

In [14]:
vocabulary = tokenizer.get_vocab().keys()
new_tokens = ["<calculator>", "</calculator>", "<stop>"]
for token in new_tokens:
    if token not in vocabulary:
        tokenizer.add_tokens(token)
        print(f"Token: {token}, ID: {tokenizer.encode(token)}")

model.resize_token_embeddings(len(tokenizer))
model.config.vocab_size = len(tokenizer)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32003. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Token: <calculator>, ID: [1, 32000, 2]
Token: </calculator>, ID: [1, 32001, 2]
Token: <stop>, ID: [1, 32002, 2]


## HYPER PARAMETERS


In [15]:
# SYSTEM_PROMPT = "You are a Personal Assistant named JARVIS. You are installed in Devasheesh Mishra's House and running on his servers. You are integrated in his smart home system. If he is generally talking about something, just talk to him but if he is asking to control the state of a device, you first need to ask the system to give you the functions needed to control the device, then you can do the appropriate function call to control devices. You can also ask user for more information if needed. You can also use tools like 1. calculator (for doing basic mathematical calculations i.e. addition, substraction, multiplication, division) 2. calendar (to find todays date) 3. clock (to find todays day and current time). You can also use Google to search for Information online. Give short responses as much as possible. Be attentive to user commands and inquiries, ensuring a seamless and efficient smart home experience. You are designed to make his life easier and better."
SYSTEM_PROMPT = ''

In [16]:
BATCH_SIZE_PER_DEVICE = 1
GRADIENT_ACCUMULATION_STEPS = 1
EPOCHS = 10
WARMUP_STEPS = 0
LEARNING_RATE = 2e-4
MAX_LENGTH = 256

## Setup Dataset


Base Zephyr Model Prompt Template:

```text
<|system|>
You are a friendly chatbot who always responds in the style of a pirate.</s>
<|user|>
How many helicopters can a human eat in one sitting?</s>
<|assistant|>
Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!
```


In [17]:
class MilvusDataset(Dataset):
    def __init__(self):
        connections.connect(
            alias="default",
            host=os.getenv("MILVUS_HOST"),
            port=19530,
            user=os.getenv("MILVUS_USER"),
            password=os.getenv("MILVUS_PASSWORD"),
        )
        db.using_database("JARVIS")
        self.collection = Collection("calculator")
        self.data_itrator = self.collection.query(
            # batch_size=1,
            output_fields=["conversation"],
            expr="conversation_id > 0",
        )

    def __len__(self):
        return self.collection.num_entities

    def __getitem__(self, idx):
        # conversation: dict = self.data_itrator.next()[0]
        conversation: dict = self.data_itrator[idx]
        conversation = self.apply_prompt_template(conversation)
        # return conversation
        return {
                'input_ids': conversation['input_ids'].squeeze(),
                'attention_mask': conversation['attention_mask'].squeeze(),
            }
        # return {
        #         'input_ids': torch.tensor(conversation['input_ids'], device=base_model.device),
        #         'attention_mask': torch.tensor(conversation['attention_mask'], device=base_model.device)
        #     }

    def apply_prompt_template(self, conversation: dict):
        # add </s> at end of lines which do not end with </s>
        # conversation["conversation"]["data"] = ''.join([x + '</s>\n' if not x.strip().endswith("</s>") else x + '\n' for x in conversation["conversation"]["data"].split("\n")]).strip()

        # apply zephyr-7b-beta chat template
        prompt = f"<|system|>\n{SYSTEM_PROMPT}\n{conversation["conversation"]["data"].replace("USER: ", "<|user|>\n").replace("ASSISTANT: ", "<|assistant|>\n")}" \
            .replace('</s>', '<stop>').replace('  ', ' ')

        # return prompt
        # return tokenizer(prompt, return_tensors="pt", padding=True).to(base_model.device)
        return tokenizer(prompt, return_tensors="pt", padding="max_length", truncation=True, max_length=MAX_LENGTH)

## Train Model


### [Gradient Accumulation](https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-accumulation)

    The idea behind gradient accumulation is to instead of calculating the gradients for the whole batch at once to do it in smaller steps. The way we do that is to calculate the gradients iteratively in smaller batches by doing a forward and backward pass through the model and accumulating the gradients in the process. When enough gradients are accumulated we run the model’s optimization step. This way we can easily increase the overall batch size to numbers that would never fit into the GPU’s memory. In turn, however, the added forward and backward passes can slow down the training a bit.

### [Gradient Checkpointing](https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing)

    Even when we set the batch size to 1 and use gradient accumulation we can still run out of memory when working with large models. In order to compute the gradients during the backward pass all activations from the forward pass are normally saved. This can create a big memory overhead. Alternatively, one could forget all activations during the forward pass and recompute them on demand during the backward pass. This would however add a significant computational overhead and slow down training.

    Gradient checkpointing strikes a compromise between the two approaches and saves strategically selected activations throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. See this great article explaining the ideas behind gradient checkpointing.


In [18]:
# base_model.gradient_checkpointing_enable()
# base_model.enable_input_require_grads()

In [27]:
lora_config = LoraConfig(
    # peft_type: str | PeftType = None,
    # auto_mapping: dict | None = None,
    base_model_name_or_path=_BASE_MODEL_PATH.name,
    # revision: str = None,
    # task_type = TaskType.CAUSAL_LM,
    task_type="CAUSAL_LM",
    # inference_mode: bool = False,
    r=32,  # ! 8, 16, 32, 64
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_alpha=32,  # ! 8, 16, 32
    lora_dropout=0.02,
    # fan_in_fan_out: bool = False,
    bias="none",
    # modules_to_save: List[str] | None = None,
    # init_lora_weights: bool = True,
    # layers_to_transform: List[int] | int | None = None,
    # layers_pattern: str | None = None
)
model = get_peft_model(model, lora_config, adapter_name='jarvis-calculator-v0_1')
model.print_trainable_parameters()

trainable params: 27,262,976 || all params: 7,269,019,648 || trainable%: 0.37505712352147985


In [None]:
# for parm in model.named_parameters():
#     print(parm[0] + '\n' if parm[1].requires_grad else '', end='')

In [28]:
dataset = MilvusDataset()

In [None]:
# total_size = len(dataset)
# train_size = int(0.95 * total_size)
# test_size = total_size - train_size
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
# print(len(train_dataset), len(test_dataset))

# train_dataset, test_dataset = list(train_dataset), list(test_dataset)

# train_dataset, test_dataset = [{"text": text} for text in train_dataset], [{"text": text} for text in test_dataset]

# train_dataset, test_dataset = datasets.Dataset.from_list(train_dataset), datasets.Dataset.from_list(test_dataset)

In [None]:
# print(tokenizer.decode(train_dataset[4]['input_ids']))

In [None]:
# # If you ran this function, you will need to rerun the cell above to get the train_dataset and test_dataset
# def plot_data_lengths(dataset):
#     lengths = [len(x['input_ids']) for x in dataset]
#     print(len(lengths))

#     # Plotting the histogram
#     plt.figure(figsize=(10, 6))
#     plt.hist(lengths, bins=40, alpha=0.7, color='blue')
#     plt.xlabel('Length of input_ids')
#     plt.ylabel('Frequency')
#     plt.title('Distribution of Lengths of input_ids')
#     plt.show()

# plot_data_lengths(dataset)

In [29]:
run_name = "calculator-lora"
training_args = TrainingArguments(
    #! Training
    # gradient_checkpointing=True,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # gradient_checkpointing_kwargs={"use_reentrant": True},
    per_device_train_batch_size=BATCH_SIZE_PER_DEVICE,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    warmup_steps=WARMUP_STEPS,

    #! Datatypes
    fp16=True if model.parameters().__next__().dtype == torch.float16 else False,
    fp16_full_eval=True if model.parameters().__next__().dtype == torch.float16 else False,
    bf16=True if model.parameters().__next__().dtype == torch.bfloat16 else False,
    bf16_full_eval=True if model.parameters().__next__().dtype == torch.bfloat16 else False,

    #! Evaluation
    # evaluation_strategy=IntervalStrategy.NO,
    # eval_steps=10,
    report_to="wandb",
    run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    metric_for_best_model="train_loss",
    greater_is_better=False,


    #! Logging
    logging_strategy=IntervalStrategy.STEPS,
    logging_steps=len(dataset) // (BATCH_SIZE_PER_DEVICE * GRADIENT_ACCUMULATION_STEPS * EPOCHS),

    #! Checkpointing
    save_strategy=IntervalStrategy.STEPS,
    save_steps=10,
    output_dir=_LORA_OUTPUT_PATH,
    overwrite_output_dir=True,

    # load_best_model_at_end=True,
)

PyTorch: setting up devices


In [None]:
# class CustomTrainer(Trainer):
#     # def __init__(self, *args, **kwargs):
#     #     super().__init__(*args, **kwargs)

#     def compute_loss(self, model, inputs, return_outputs=False):
#         return super().compute_loss(model, inputs, return_outputs)

In [None]:
# os.environ["WANDB_DISABLED"] = "false"
# wandb.init(project="jarvis-calculator-lora", name=run_name, config=training_args)
# trainer = CustomTrainer(
#     model=model,
#     args=training_args,
#     # data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
#     tokenizer=tokenizer,
#     train_dataset=dataset,
#     # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
# )
# trainer.train()

In [None]:
# sample_test = dataset[0]
# sample_test = tokenizer.decode(sample_test['input_ids'], skip_special_tokens=False)

In [30]:
class CalculatorErrorCallback(TrainerCallback):
    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        model = state.model
        tokenizer = state.tokenizer
        if not model.training:
            raise Exception("Model is not in training mode")
        
        # return super().on_step_end(args, state, control, **kwargs)

In [31]:
# os.environ["WANDB_DISABLED"] = "false"
wandb.init(project="jarvis-calculator-lora", name=run_name, config=training_args)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset,
    callbacks=[
        # EarlyStoppingCallback(early_stopping_patience=3),
        # CalculatorErrorCallback(),
    ],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)
trainer.train()

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
***** Running training *****
  Num examples = 54
  Num Epochs = 10
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 540
  Number of trainable parameters = 27,262,976
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss
5,1.9517
10,1.3182
15,0.8834
20,0.7123
25,0.829
30,0.8668
35,0.6349
40,0.7845
45,0.7026
50,0.72


Saving model checkpoint to output/loras/tmp-checkpoint-10
tokenizer config file saved in output/loras/tmp-checkpoint-10/tokenizer_config.json
Special tokens file saved in output/loras/tmp-checkpoint-10/special_tokens_map.json
added tokens file saved in output/loras/tmp-checkpoint-10/added_tokens.json
Saving model checkpoint to output/loras/tmp-checkpoint-20
tokenizer config file saved in output/loras/tmp-checkpoint-20/tokenizer_config.json
Special tokens file saved in output/loras/tmp-checkpoint-20/special_tokens_map.json
added tokens file saved in output/loras/tmp-checkpoint-20/added_tokens.json
Saving model checkpoint to output/loras/tmp-checkpoint-30
tokenizer config file saved in output/loras/tmp-checkpoint-30/tokenizer_config.json
Special tokens file saved in output/loras/tmp-checkpoint-30/special_tokens_map.json
added tokens file saved in output/loras/tmp-checkpoint-30/added_tokens.json
Saving model checkpoint to output/loras/tmp-checkpoint-40
tokenizer config file saved in outpu

TrainOutput(global_step=540, training_loss=0.26137542092689764, metrics={'train_runtime': 722.2141, 'train_samples_per_second': 0.748, 'train_steps_per_second': 0.748, 'total_flos': 5920489104998400.0, 'train_loss': 0.26137542092689764, 'epoch': 10.0})

In [None]:
# wandb.init(project="jarvis-calculator-lora", name=run_name, config=training_args)
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     dataset_text_field='text',
#     peft_config=lora_config,
#     args=training_args,
#     tokenizer=tokenizer,
#     packing=False,
#     max_seq_length=MAX_LENGTH,
#     # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
# )
# trainer.train()

In [None]:
model = model.merge_and_unload()
model.config.use_cache = True

In [None]:
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
tokenizer.padding_side = "right"

In [None]:
print(tokenizer.decode(tokenizer(dataset['text'][0])['input_ids']))

In [1]:
prompt = """<|system|>
<|user|>
Hey, I am trying to calculate the total cost of 10 items that cost 5 rupees each. Can you help me?
<|assistant|>
Of course! Please provide me with the cost of each item and the number of items.  <stop>
<|user|>
There are 10 items in total.
<|assistant|>
Great! And the cost of each item is five rupees?   <stop>
<|user|>
No, the cost of each item is 130 rupees.
<|assistant|>
I see. So, the total cost of all the items would be  <calculator> 10*130    <stop>
<|user|>
what is the total height of the chair including its legs ?
<|assistant|>
To find the total height of the chair including its legs, I'll need to know the height of the chair's seat and the length of each leg.   <stop>
<|user|>
the chair height is two and a half feet without legs and each leg is half a foot tall.
<|assistant|>
"""
prompt = tokenizer(prompt, return_tensors="pt").to(model.device)

NameError: name 'tokenizer' is not defined

In [None]:
class CustomStoppingCriteria(StoppingCriteria):
    def __init__(self, stops: list = []):
        StoppingCriteria.__init__(self)
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for token in self.stops:
            if input_ids[0][-1].cpu().numpy() == token:
                return True
        return False

In [None]:
output = model.generate(
    **prompt,
    max_new_tokens=128,
    do_sample=False,
    # temperature=0.6,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    stopping_criteria=StoppingCriteriaList([
        CustomStoppingCriteria(stops=[tokenizer.encode("<stop>", add_special_tokens=False)[-1]])
    ]),
)
print(tokenizer.decode(output[0], skip_special_tokens=False))

In [None]:
tokenizer.decode(torch.LongTensor([2287]))

In [None]:
output