# Fine-Tuning a Hugging Face Model (Gemma-7B) to Generate Mermaid.js Diagrams

In [6]:
from dataclasses import dataclass, field
from typing import Optional

import torch

from transformers import AutoTokenizer, HfArgumentParser, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer
from datasets import Dataset
from dotenv import load_dotenv
load_dotenv()
import traceback
import os

In [7]:
import sys
import site
from pathlib import Path

# Install pinned versions of the fine-tuning stack with the kernel's own interpreter.
# NOTE(review): the recorded output of this cell shows pip failing to resolve
# bigdl-core-xe-esimd-21==2.5.0b20240318 (only 2.5.0b1, 2.5.0b2 and builds from
# 2.5.0b20240401 onward are listed), so the bigdl-llm pin below likely needs updating.
# NOTE(review): this cell comes after the import cell at the top of the notebook;
# on a fresh kernel those imports only succeed if the packages are already installed.
!echo "Installation in progress, please wait..."
!{sys.executable} -m pip cache purge > /dev/null
!{sys.executable} -m pip install --pre --upgrade "bigdl-llm[xpu]==2.5.0b20240318" -f https://developer.intel.com/ipex-whl-stable-xpu
!{sys.executable} -m pip install "peft==0.10.0"  #> /dev/null
!{sys.executable} -m pip install "accelerate==0.27.2" --no-warn-script-location #> /dev/null
!{sys.executable} -m pip install "transformers==4.40.0" --no-warn-script-location #> /dev/null 
!{sys.executable} -m pip install "datasets==2.19.0" --no-warn-script-location #> /dev/null 2>&1 
!{sys.executable} -m pip install "bitsandbytes==0.43.1" "scipy==1.13.0" #> /dev/null  2>&1
!echo "Installation completed."

def get_python_version():
    """Return the running interpreter's version as ``pythonX.Y`` (major.minor only)."""
    major, minor = sys.version_info[:2]
    return f"python{major}.{minor}"

def set_local_bin_path():
    """Make packages installed with ``pip --user`` importable in this kernel.

    Appends ``~/.local/bin`` to ``sys.path`` and moves the per-user
    site-packages directory (as reported by ``site.getusersitepackages()``)
    to the front of ``sys.path``.

    The user site directory is taken from ``site`` rather than rebuilt by hand
    from ``~/.local/lib/pythonX.Y/site-packages``: looking up a hand-built path
    with ``sys.path.index`` raises ``ValueError`` whenever the two differ.
    The function is idempotent, so re-running the cell does not accumulate
    duplicate ``sys.path`` entries.
    """
    local_bin = str(Path.home() / ".local" / "bin")
    user_site = site.getusersitepackages()

    if local_bin not in sys.path:
        sys.path.append(local_bin)

    # Remove any existing occurrences first so the entry ends up exactly once,
    # at the highest-priority position.
    sys.path[:] = [entry for entry in sys.path if entry != user_site]
    sys.path.insert(0, user_site)

set_local_bin_path()

Installation in progress, please wait...
Defaulting to user installation because normal site-packages is not writeable
Looking in links: https://developer.intel.com/ipex-whl-stable-xpu
Collecting bigdl-llm[xpu]==2.5.0b20240318
  Downloading bigdl_llm-2.5.0b20240318-py3-none-manylinux2010_x86_64.whl (13.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
[31mERROR: Could not find a version that satisfies the requirement bigdl-core-xe-esimd-21==2.5.0b20240318; platform_system == "Linux" and extra == "xpu" (from bigdl-llm[xpu]) (from versions: 2.5.0b1, 2.5.0b2, 2.5.0b20240401, 2.5.0b20240402, 2.5.0b20240404, 2.5.0b20240405, 2.5.0b20240406, 2.5.0b20240407, 2.5.0b20240408, 2.5.0b20240409, 2.5.0b20240410, 2.5.0b20240411, 2.5.0b20240412, 2.5.0b20240413, 2.5.0b20240414, 2.5.0b20240415, 2.5.0b20240416, 2.5.0b20240417, 2.5.0

In [8]:
import logging
import os
import warnings

# Warning filters, registered in this order; every entry uses the "ignore" action.
_IGNORED_WARNINGS = (
    dict(category=UserWarning, module="intel_extension_for_pytorch"),
    dict(category=UserWarning, module="torchvision.io.image", lineno=13),
    dict(
        message="The installed version of bitsandbytes was compiled without GPU support.*",
        category=UserWarning,
        module='bitsandbytes.cextension',
    ),
    dict(),  # no criteria: ignores every warning
    dict(
        category=FutureWarning,
        message="This implementation of AdamW is deprecated",
    ),
)
for _filter_kwargs in _IGNORED_WARNINGS:
    warnings.filterwarnings("ignore", **_filter_kwargs)

# Environment switches read by the libraries imported below.
os.environ.update(
    {
        "TOKENIZERS_PARALLELISM": "false",
        "NUMEXPR_MAX_THREADS": "28",
        "ENABLE_SDP_FUSION": "true",
        "SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS": "1",
    }
)

# Only show errors from these libraries' loggers.
for _logger_name in ("transformers", "bigdl"):
    logging.getLogger(_logger_name).setLevel(logging.ERROR)


import torch
import intel_extension_for_pytorch as ipex
from datasets import load_dataset
from datasets import Dataset
from bigdl.llm.transformers import AutoModelForCausalLM
from bigdl.llm.transformers.qlora import (
    get_peft_model,
    prepare_model_for_kbit_training as prepare_model,
)
from peft import LoraConfig
from bigdl.llm.transformers.qlora import PeftModel
import transformers
from transformers import (
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    LlamaTokenizer,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Use the transformers library's own logging helper to limit its output to errors.
transformers.logging.set_verbosity_error()

In [9]:
# Function to check available disk space in the Hugging Face cache directory
import os
import shutil

def check_disk_space(path="~/.cache/huggingface/"):
    """Print total, used and free disk space (whole GiB) for the disk holding ``path``.

    Parameters:
        path (str): Directory to inspect; ``~`` is expanded. Defaults to the
            Hugging Face cache directory.

    If ``path`` does not exist yet (for example before the first model
    download), the nearest existing parent directory is inspected instead of
    raising ``FileNotFoundError``, and a note naming that directory is printed.
    """
    abs_path = os.path.abspath(os.path.expanduser(path))

    # Walk up until we reach a directory that exists; the filesystem root
    # terminates the loop because it is its own parent.
    probe = abs_path
    while not os.path.exists(probe):
        parent = os.path.dirname(probe)
        if parent == probe:
            break
        probe = parent
    if probe != abs_path:
        print(f"{abs_path} does not exist yet; reporting disk usage for {probe}")

    total, used, free = shutil.disk_usage(probe)
    gib = 2**30
    print(f"Total: {total // gib} GiB")
    print(f"Used: {used // gib} GiB")
    print(f"Free: {free // gib} GiB")

# Example usage
check_disk_space()

Total: 3574 GiB
Used: 168 GiB
Free: 3406 GiB


In [10]:
# Model identifier and output locations shared by the cells below.
BASE_MODEL = "google/gemma-7b"
MODEL_PATH = "./final_model"
ADAPTER_PATH = "./lora_adapters"

# Use the "xpu" device when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("xpu") if torch.xpu.is_available() else torch.device("cpu")

# LoRA adapter settings: rank 16, scaling factor 32, applied to the
# q/k/v projection modules.
LORA_CONFIG = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Summary banner of the configuration chosen above.
print(
    "\n".join(
        [
            "=" * 80,
            f"Using Device: {DEVICE}",
            f"Final model will be saved to: {MODEL_PATH}",
            f"LoRA adapters will be saved to: {ADAPTER_PATH}",
            f"Finetuning Model: {BASE_MODEL}",
            "=" * 80,
        ]
    )
)

Using Device: cpu
Final model will be saved to: ./final_model
LoRA adapters will be saved to: ./lora_adapters
Finetuning Model: google/gemma-7b


In [11]:
def setup_model_and_tokenizer(base_model_id: str):
    """Download (or load from the local cache) the base model and tokenizer for training.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable (``None`` when unset). No quantization config is passed to
    ``from_pretrained``.

    NOTE: a later cell in this notebook defines another function with the same
    name for inference; after that cell runs, this definition is shadowed.

    Parameters:
        base_model_id (str): Hub id or local path of the pre-trained model.

    Returns:
        tuple: ``(model, tokenizer)``; the tokenizer is configured with
        ``pad_token_id = 0`` and left-side padding.

    Raises:
        Exception: whatever ``from_pretrained`` raised, after the traceback
        has been printed. Previously the error was swallowed and the function
        then failed with an unrelated ``UnboundLocalError`` on ``tokenizer``.
    """
    hf_token = os.environ.get("HF_TOKEN", None)

    print("Downloading model and tokenizer..." + base_model_id)
    try:
        model = AutoModelForCausalLM.from_pretrained(
            base_model_id,
            token=hf_token,
        )
        # Model ids containing "llama" use LlamaTokenizer; everything else uses AutoTokenizer.
        tokenizer_class = LlamaTokenizer if "llama" in base_model_id.lower() else AutoTokenizer
        tokenizer = tokenizer_class.from_pretrained(base_model_id, token=hf_token)
    except Exception as e:
        print(traceback.format_exc())
        print(f"Error downloading model and tokenizer: {e}")
        raise
    print("Downloaded model and tokenizer successfully.")

    tokenizer.pad_token_id = 0
    tokenizer.padding_side = "left"
    return model, tokenizer

In [12]:
from datasets import Dataset

# Training data in column-oriented form: parallel lists of prompts ("input") and their Mermaid.js diagrams ("output")
data = {
    "input": [
        "How does a computer execute a program?",
        "What is the process for finding the maximum subarray sum using Kadane's Algorithm?",
        "How does the Quick Sort algorithm process an array to sort its elements?",
        "How does a binary search algorithm find an element in a sorted array?",
        "What is the process of inserting a node into a binary search tree?",
        "How does a hash table handle collisions using chaining?",
        "What is the process of performing a depth-first search on a graph?",
        "How does the merge sort algorithm sort an array?"
    ],
    "output": [
        """graph LR;
            A[Start] --> B[Load Program]
            B --> C[Execute Instructions]
            C --> D[End]""",
        """flowchart LR;
            A[Start] --> B[Init max_sum, current_sum = 0]
            B --> C[For each element]
            C --> D[Add to current_sum]
            D --> E[current_sum < 0?]
            E -- Yes --> F[Reset current_sum]
            E -- No --> G[current_sum > max_sum?]
            G -- Yes --> H[Update max_sum]
            G -- No --> I[Next element]
            F --> J[End array?]
            H --> J
            I --> J
            J -- No --> C
            J -- Yes --> K[Return max_sum]
            K --> L[End]""",
        """sequenceDiagram
            participant S as Start
            participant P as Select Pivot
            participant PA as Partition Array
            participant SL as Sort Left Subarray
            participant SR as Sort Right Subarray
            participant E as End

            S->>P: Begin Sorting
            P->>PA: Choose pivot element
            PA->>SL: Partition left of pivot
            SL->>SL: Recurse left
            PA->>SR: Partition right of pivot
            SR->>SR: Recurse right
            SL->>E: Left sorted
            SR->>E: Right sorted""",
        """flowchart LR;
            A[Start] --> B[Initialize pointers]
            B --> C[Check middle element]
            C --> D{Is it the target?}
            D -- Yes --> E[Found]
            D -- No --> F{Is target less than middle?}
            F -- Yes --> G[Move right pointer]
            F -- No --> H[Move left pointer]
            G --> C
            H --> C
            E --> I[End]""",
        """flowchart LR;
            A[Start] --> B[Find position]
            B --> C{Found spot?}
            C -- Yes --> D[Insert node]
            C -- No --> E{Is node smaller?}
            E -- Yes --> F[Go left]
            E -- No --> G[Go right]
            F --> B
            G --> B
            D --> H[End]""",
        """flowchart LR;
            A[Start] --> B[Compute hash]
            B --> C{Collision?}
            C -- Yes --> D[Link new entry]
            C -- No --> E[Insert normally]
            D --> F[End]
            E --> F""",
        """flowchart LR;
            A[Start] --> B[Mark as visited]
            B --> C[Visit first unvisited neighbor]
            C --> D{All neighbors visited?}
            D -- No --> B
            D -- Yes --> E[Backtrack]
            E --> C
            C --> F[End]""",
        """flowchart LR;
            A[Start] --> B[Split array into halves]
            B --> C[Sort each half]
            C --> D[Merge halves]
            D --> E{Is array fully sorted?}
            E -- Yes --> F[End]
            E -- No --> B"""
    ]
}

# Build a Hugging Face Dataset from the column-oriented `data` dict
# (one row per input/output pair).
dataset = Dataset.from_dict(data)

# Turn one dataset row into the single training string stored under "text".
def formatting_func(example):
    """Render an ``input``/``output`` pair as a ``### USER`` / ``### ASSISTANT`` transcript.

    Parameters:
        example (dict): Row with an ``"input"`` prompt and its ``"output"`` Mermaid code.

    Returns:
        dict: ``{"text": <formatted training string>}``.

    NOTE(review): this template differs from the one produced by
    ``generate_prompt_mermaid`` later in the notebook, which is what inference
    uses — confirm the mismatch is intended.
    """
    template = (
        "### USER: You are an experienced Javascript engineer in charge of "
        "converting educational prompts into valid MermaidJS code. Make sure "
        "to only return valid MermaidJS code and nothing else. "
        "Prompt: {prompt}\n### ASSISTANT: {answer}"
    )
    rendered = template.format(prompt=example["input"], answer=example["output"])
    return {"text": rendered}

# Add the "text" column to every row by mapping formatting_func over the dataset.
formatted_dataset = dataset.map(formatting_func)

# Print the first formatted example as a quick check of the template.
print(formatted_dataset[0]['text'])


Map: 100%|██████████| 8/8 [00:00<00:00, 1649.27 examples/s]

### USER: You are an experienced Javascript engineer in charge of converting educational prompts into valid MermaidJS code. Make sure to only return valid MermaidJS code and nothing else. Prompt: How does a computer execute a program?
### ASSISTANT: graph LR;
            A[Start] --> B[Load Program]
            B --> C[Execute Instructions]
            C --> D[End]





In [13]:
class FineTuner:
    """A class to handle the fine-tuning of LLM models."""

    def __init__(self, base_model_id: str, model_path: str, device: torch.device):
        """
        Initialize the FineTuner with base model, model path, and device.

        Loads the model and tokenizer immediately via ``setup_model_and_tokenizer``.

        Parameters:
            base_model_id (str): Id of pre-trained model to use for fine-tuning.
            model_path (str): Path to save the fine-tuned model.
            device (torch.device): Device to run the model on.
        """
        self.base_model_id = base_model_id
        self.model_path = model_path
        self.device = device
        self.model, self.tokenizer = setup_model_and_tokenizer(base_model_id)

    def tokenize_data(self, example, add_eos_token=True, train_on_inputs=False, cutoff_len=512):
        """
        Tokenizes a MermaidJS data example.

        Parameters:
            example (dict): A data example containing 'text'.
            add_eos_token (bool): Whether to add an EOS token at the end of each tokenized sequence.
            train_on_inputs (bool): Accepted for interface compatibility; it currently has no
                effect because the labels are always a copy of the full ``input_ids``.
            cutoff_len (int): The maximum length for each tokenized sequence.

        Returns:
            dict: A dictionary containing tokenized 'input_ids', 'attention_mask', and 'labels'.

        Raises:
            Exception: Any tokenization error, re-raised after being logged.
        """
        try:
            tokenized = self.tokenizer(
                example["text"],
                truncation=True,
                max_length=cutoff_len,
                padding=False,
                return_tensors=None,
            )
            input_ids = tokenized["input_ids"]
            # Guard the [-1] lookup: an empty token list must not raise IndexError.
            ends_with_eos = bool(input_ids) and input_ids[-1] == self.tokenizer.eos_token_id
            # Only append EOS when there is still room below the cutoff.
            if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
                input_ids.append(self.tokenizer.eos_token_id)
                tokenized["attention_mask"].append(1)
            tokenized["labels"] = input_ids.copy()
            return tokenized
        except Exception as e:
            logging.error(f"Error in tokenization: {e}")
            raise

    def prepare_data(self, dataset, val_set_size=2, seed=42):
        """
        Prepare training and validation datasets.

        Parameters:
            dataset (Dataset): Dataset with a 'text' column.
            val_set_size (int | float): Size of the validation split, passed to
                ``train_test_split`` as ``test_size``.
            seed (int): Seed used for the split and for both shuffles so that
                repeated runs see the same ordering.

        Returns:
            tuple: ``(train_data, val_data)``, both tokenized with ``tokenize_data``.

        Raises:
            Exception: Any error from splitting or tokenizing, re-raised after being logged.
        """
        try:
            train_val_split = dataset.train_test_split(test_size=val_set_size, shuffle=True, seed=seed)
            train_data = train_val_split["train"].shuffle(seed=seed).map(self.tokenize_data)
            val_data = train_val_split["test"].shuffle(seed=seed).map(self.tokenize_data)
            return train_data, val_data
        except Exception as e:
            logging.error(f"Error in preparing data: {e}")
            raise

    def train_model(self, train_data, val_data, training_args):
        """
        Fine-tune the model with the given training and validation data.

        Moves the model to ``self.device``, enables gradient checkpointing,
        runs ``prepare_model`` and wraps the model with ``get_peft_model`` using
        ``LORA_CONFIG``, trains with ``Trainer`` and finally saves the model to
        ``self.model_path``.

        Parameters:
            train_data (Dataset): Training data.
            val_data (Optional[Dataset]): Validation data.
            training_args (TrainingArguments): Training configuration.

        Raises:
            Exception: Any training error, re-raised after being logged so the
            caller can tell that training did not complete.
        """
        try:
            self.model = self.model.to(self.device)
            self.model.gradient_checkpointing_enable()
            self.model = prepare_model(self.model)
            self.model = get_peft_model(self.model, LORA_CONFIG)
            trainer = Trainer(
                model=self.model,
                train_dataset=train_data,
                eval_dataset=val_data,
                args=training_args,
                data_collator=DataCollatorForSeq2Seq(
                    self.tokenizer,
                    pad_to_multiple_of=8,
                    return_tensors="pt",
                    padding=True,
                ),
            )
            self.model.config.use_cache = False
            trainer.train()
            self.model.save_pretrained(self.model_path)
            print("saved")
        except Exception as e:
            logging.error(f"Error in model training: {e}")
            raise

    def finetune(self, dataset, training_args):
        """
        Execute the fine-tuning pipeline.

        The ``dataset`` argument is the data that gets trained on. If it has no
        'text' column yet, the notebook's ``formatting_func`` is applied first
        to build it, so both raw and pre-formatted datasets are accepted.

        A ``KeyboardInterrupt`` saves the current model to
        ``<model_path>_interrupted``. Any other error is logged and not re-raised.

        Parameters:
            dataset (Dataset): Dataset for fine-tuning.
            training_args (TrainingArguments): Training configuration.
        """
        try:
            if "text" not in dataset.column_names:
                dataset = dataset.map(formatting_func)
            train_data, val_data = self.prepare_data(dataset)
            self.train_model(train_data, val_data, training_args)
        except KeyboardInterrupt:
            print("Interrupt received, saving model...")
            self.model.save_pretrained(f"{self.model_path}_interrupted")
            print(f"Model saved to {self.model_path}_interrupted")
        except Exception as e:
            logging.error(f"Error in finetuning: {e}")

In [9]:
def lets_finetune(
    device=DEVICE,
    model=BASE_MODEL,
    per_device_batch_size=4,
    warmup_steps=20,
    learning_rate=2e-5,
    max_steps=200,
    gradient_accum_steps=4,
):
    """Print the training configuration, build a FineTuner and run fine-tuning.

    Parameters:
        device (torch.device | str): Device to train on.
        model (str): Id of the base model to fine-tune.
        per_device_batch_size (int): Training batch size per device.
        warmup_steps (int): Number of learning-rate warmup steps.
        learning_rate (float): Initial learning rate.
        max_steps (int): Total number of training steps.
        gradient_accum_steps (int): Gradient accumulation steps.

    The banner reports the ``device`` and ``model`` arguments actually used,
    checkpoints go to ``ADAPTER_PATH``, and the pre-formatted
    ``formatted_dataset`` (the one with the 'text' column) is what gets
    passed to the fine-tuner. Any error is logged and not re-raised.
    """
    try:
        device = torch.device(device)

        # Training parameters
        save_steps = 20
        eval_steps = 20
        # NOTE(review): printed in the banner below, but the matching
        # TrainingArguments option is commented out, so this value is not
        # applied to training — confirm which is intended.
        max_grad_norm = 0.3
        save_total_limit = 3
        logging_steps = 20

        print("\n" + "\033[1;34m" + "=" * 60 + "\033[0m")
        print("\033[1;34mTraining Parameters:\033[0m")
        param_format = "\033[1;34m{:<25} {}\033[0m"
        print(param_format.format("Foundation model:", model))
        print(param_format.format("Model save path:", MODEL_PATH))
        print(param_format.format("Device used:", device))
        if device.type.startswith("xpu"):
            print(param_format.format("Intel GPU:", torch.xpu.get_device_name()))
        print(param_format.format("Batch size per device:", per_device_batch_size))
        print(param_format.format("Gradient accum. steps:", gradient_accum_steps))
        print(param_format.format("Warmup steps:", warmup_steps))
        print(param_format.format("Save steps:", save_steps))
        print(param_format.format("Evaluation steps:", eval_steps))
        print(param_format.format("Max steps:", max_steps))
        print(param_format.format("Learning rate:", learning_rate))
        print(param_format.format("Max gradient norm:", max_grad_norm))
        print(param_format.format("Save total limit:", save_total_limit))
        print(param_format.format("Logging steps:", logging_steps))
        print("\033[1;34m" + "=" * 60 + "\033[0m\n")

        # Initialize the finetuner with the model and device information
        finetuner = FineTuner(
            base_model_id=model, model_path=MODEL_PATH, device=device
        )

        training_args = TrainingArguments(
            per_device_train_batch_size=per_device_batch_size,
            gradient_accumulation_steps=gradient_accum_steps,
            warmup_steps=warmup_steps,
            save_steps=save_steps,
            save_strategy="steps",
            eval_steps=eval_steps,
            evaluation_strategy="steps",
            max_steps=max_steps,
            learning_rate=learning_rate,
            #max_grad_norm=max_grad_norm,
            bf16=True,
            use_ipex=True,
            #lr_scheduler_type="cosine",
            load_best_model_at_end=True,
            ddp_find_unused_parameters=False,
            group_by_length=True,
            save_total_limit=save_total_limit,
            logging_steps=logging_steps,
            optim="adamw_hf",
            # Same location the config banner announces for the LoRA adapters.
            output_dir=ADAPTER_PATH,
            logging_dir="./logs",
        )
        # Start fine-tuning on the dataset that carries the formatted 'text' column.
        finetuner.finetune(formatted_dataset, training_args)
    except Exception as e:
        logging.error(f"Error occurred: {e}")

In [10]:
lets_finetune()


[1;34mTraining Parameters:[0m
[1;34mFoundation model:         google/gemma-7b[0m
[1;34mModel save path:          ./final_model[0m
[1;34mDevice used:              cpu[0m
[1;34mBatch size per device:    4[0m
[1;34mGradient accum. steps:    4[0m
[1;34mWarmup steps:             20[0m
[1;34mSave steps:               20[0m
[1;34mEvaluation steps:         20[0m
[1;34mMax steps:                200[0m
[1;34mLearning rate:            2e-05[0m
[1;34mMax gradient norm:        0.3[0m
[1;34mSave total limit:         3[0m
[1;34mLogging steps:            20[0m

Downloading model and tokenizer...google/gemma-7b


Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.03it/s]


Downloaded model and tokenizer successfully.


Map: 100%|██████████| 6/6 [00:00<00:00, 696.11 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 425.99 examples/s]


{'loss': 2.6506, 'grad_norm': 4.90625, 'learning_rate': 2e-05, 'epoch': 20.0}
{'eval_loss': 3.0582573413848877, 'eval_runtime': 6.9468, 'eval_samples_per_second': 0.288, 'eval_steps_per_second': 0.144, 'epoch': 20.0}
{'loss': 2.3537, 'grad_norm': 5.125, 'learning_rate': 1.7777777777777777e-05, 'epoch': 40.0}
{'eval_loss': 3.0582573413848877, 'eval_runtime': 5.644, 'eval_samples_per_second': 0.354, 'eval_steps_per_second': 0.177, 'epoch': 40.0}
{'loss': 2.3554, 'grad_norm': 5.28125, 'learning_rate': 1.555555555555556e-05, 'epoch': 60.0}
{'eval_loss': 3.0582573413848877, 'eval_runtime': 11.0751, 'eval_samples_per_second': 0.181, 'eval_steps_per_second': 0.09, 'epoch': 60.0}
{'loss': 2.3524, 'grad_norm': 5.25, 'learning_rate': 1.3333333333333333e-05, 'epoch': 80.0}
{'eval_loss': 3.0582573413848877, 'eval_runtime': 5.4312, 'eval_samples_per_second': 0.368, 'eval_steps_per_second': 0.184, 'epoch': 80.0}
{'loss': 2.3555, 'grad_norm': 5.21875, 'learning_rate': 1.1111111111111113e-05, 'epoch':

In [14]:
# Presumably turns off Weights & Biases reporting via its environment switch.
# NOTE(review): this assignment sits in a cell placed after the training cell,
# so on a top-to-bottom run it is set only once training has finished — confirm
# whether it should move to the configuration cell.
os.environ["WANDB_DISABLED"] = "true"
# Device used by every inference helper defined below.
INFERENCE_DEVICE = torch.device("cpu")  # change this to `xpu` to use Intel GPU for inference  

def generate_prompt_mermaid(input_text, output=""):
    """
    Build the text-to-Mermaid.js prompt for a question.

    The prompt consists of a fixed instruction, a ``Prompt:`` section holding
    ``input_text`` and a trailing ``MermaidJS code:`` section. With the default
    empty ``output`` the prompt ends right after that header, ready for the
    model to continue.

    Parameters:
        input_text (str): The input text or question to be converted to Mermaid.js code.
        output (str, optional): Mermaid.js code to place after the ``MermaidJS code:`` header.

    Returns:
        str: The assembled prompt.
    """
    # The instruction deliberately keeps its trailing space; it is part of the template.
    instruction = (
        "You are an experienced Javascript engineer in charge of converting "
        "educational prompts into valid MermaidJS code. Make sure to only "
        "return valid MermaidJS code and nothing else. "
    )
    sections = (
        instruction,
        "",
        f"Prompt: {input_text}",
        "",
        "MermaidJS code:",
        f"{output}",
    )
    return "\n".join(sections)


def setup_model_and_tokenizer(base_model_path: str):
    """Load a model and tokenizer for inference and move the model to INFERENCE_DEVICE.

    NOTE: this redefines the ``setup_model_and_tokenizer`` from the training
    section of the notebook; once this cell has run, this is the version every
    caller gets.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable (``None`` when unset), matching how the training loader
    authenticates against the same base model.

    Parameters:
        base_model_path (str): Hub id or local path of the model to load.

    Returns:
        tuple: ``(model, tokenizer)``; the tokenizer is configured with
        ``pad_token_id = 0`` and left-side padding.

    Raises:
        Exception: Any loading error, re-raised after being logged.
    """
    hf_token = os.environ.get("HF_TOKEN", None)
    try:
        model = AutoModelForCausalLM.from_pretrained(base_model_path, token=hf_token)
        tokenizer = AutoTokenizer.from_pretrained(base_model_path, token=hf_token)
        tokenizer.pad_token_id = 0
        tokenizer.padding_side = "left"
        return model.to(INFERENCE_DEVICE), tokenizer
    except Exception as e:
        logging.error(f"Exception occurred during model loading: {e}")
        raise

class TextToMermaidGenerator:
    """Handles Mermaid.js code generation for a given text prompt."""

    def __init__(
        self, base_model_id=BASE_MODEL, use_adapter=False, lora_checkpoint=None, loaded_base_model=None
    ):
        """
        Initialize the TextToMermaidGenerator class.
        Parameters:
            base_model_id (str): Id or path of the model to load when no
                ``loaded_base_model`` is given.
            use_adapter (bool): When True, wrap the model with the LoRA adapter
                found at ``lora_checkpoint``.
            lora_checkpoint (Optional[str]): Path to the saved LoRA adapter;
                required when ``use_adapter`` is True.
            loaded_base_model (Optional[object]): Object exposing ``.model`` and
                ``.tokenizer`` attributes to reuse instead of loading again.
        Raises:
            ValueError: If ``use_adapter`` is True but no ``lora_checkpoint`` is given.
            Exception: Any model loading error, re-raised after being logged.
        """
        # Fail fast, before the (expensive) model load, on an unusable combination.
        if use_adapter and not lora_checkpoint:
            raise ValueError("lora_checkpoint must be provided when use_adapter=True")

        try:
            if loaded_base_model:
                self.model = loaded_base_model.model
                self.tokenizer = loaded_base_model.tokenizer
            else:
                self.model, self.tokenizer = setup_model_and_tokenizer(base_model_id)
            if use_adapter:
                self.model = PeftModel.from_pretrained(self.model, lora_checkpoint)
        except Exception as e:
            logging.error(f"Exception occurred during model initialization: {e}")
            raise

        self.model.to(INFERENCE_DEVICE)
        self.max_length = 512

    def generate(self, prompt, **kwargs):
        """Generates Mermaid.js code based on the given prompt.
        Parameters:
            prompt (str): The input prompt.
            **kwargs: Extra generation options forwarded to ``self.model.generate``;
                they override the defaults (``do_sample=True``,
                ``max_length=self.max_length``, ``temperature=0.3``,
                ``repetition_penalty=1.2``). Passing ``max_new_tokens`` drops the
                default ``max_length`` so the two limits do not conflict.
        Returns:
            str: The decoded output of ``self.model.generate`` with special tokens removed.
        Raises:
            Exception: Any generation error, re-raised after being logged.
        """
        try:
            encoded = self.tokenizer(
                prompt,
                truncation=True,
                max_length=self.max_length,
                padding=False,
                return_tensors="pt",
            )
            model_inputs = {"input_ids": encoded["input_ids"].to(INFERENCE_DEVICE)}
            # Pass the attention mask explicitly when the tokenizer provides one.
            attention_mask = encoded.get("attention_mask")
            if attention_mask is not None:
                model_inputs["attention_mask"] = attention_mask.to(INFERENCE_DEVICE)

            generation_args = {
                "do_sample": True,
                "max_length": self.max_length,
                "temperature": 0.3,
                "repetition_penalty": 1.2,
            }
            if "max_new_tokens" in kwargs:
                generation_args.pop("max_length")
            generation_args.update(kwargs)

            # NOTE(review): the autocast context is the CPU one even if
            # INFERENCE_DEVICE is switched to xpu — confirm that is intended.
            with torch.no_grad(), torch.cpu.amp.autocast():
                outputs = self.model.generate(**model_inputs, **generation_args)
            generated = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            return generated
        except Exception as e:
            logging.error(f"Exception occurred during Mermaid.js code generation: {e}")
            raise

In [23]:
# Baseline for comparison: the base model with no LoRA adapter attached.
base_model = TextToMermaidGenerator(use_adapter=False, lora_checkpoint="")

# Fine-tuned variant: the base model wrapped with the adapter stored under checkpoint-200.
finetuned_model = TextToMermaidGenerator(
    use_adapter=True,
    lora_checkpoint=f"{ADAPTER_PATH}/checkpoint-200",
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.16it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.16it/s]


In [24]:
import json
from IPython.display import display, HTML

# A few sample prompts with their expected Mermaid.js output, stored as a JSON string (parsed with json.loads further down)
samples = """
[
{
"prompt": "How does a computer execute a program?",
"expected_output": "flowchart LR;\\n    A[Start] --> B[Load Program]\\n    B --> C[Execute Instructions]\\n    C --> D[End]"
},
{
"prompt": "What is the process for finding the maximum subarray sum using Kadane's Algorithm?",
"expected_output": "flowchart LR;\\n    A[Start] --> B[Init max_sum, current_sum = 0]\\n    B --> C[For each element]\\n    C --> D[Add to current_sum]\\n    D --> E[current_sum < 0?]\\n    E -- Yes --> F[Reset current_sum]\\n    E -- No --> G[current_sum > max_sum?]\\n    G -- Yes --> H[Update max_sum]\\n    G -- No --> I[Next element]\\n    F --> J[End array?]\\n    H --> J\\n    I --> J\\n    J -- No --> C\\n    J -- Yes --> K[Return max_sum]\\n    K --> L[End]"
},
{
"prompt": "How does the Quick Sort algorithm process an array to sort its elements?",
"expected_output": "sequenceDiagram\\n    participant S as Start\\n    participant P as Select Pivot\\n    participant PA as Partition Array\\n    participant SL as Sort Left Subarray\\n    participant SR as Sort Right Subarray\\n    participant E as End\\n\\n    S->>P: Begin Sorting\\n    P->>PA: Choose pivot element\\n    PA->>SL: Partition left of pivot\\n    SL->>SL: Recurse left\\n    PA->>SR: Partition right of pivot\\n    SR->>SR: Recurse right\\n    SL->>E: Left sorted\\n    SR->>E: Right sorted"
}
]
"""

def _extract_sections(output):
    input_section = output.split("Prompt:")[1].split("MermaidJS code:")[0]
    response_section = output.split("MermaidJS code:")[1]
    return input_section, response_section

def run_inference(sample_data, model, finetuned=False):
    """Run the model on every sample and display prompt, generated and expected code.

    Parameters:
        sample_data (list[dict]): Samples with ``"prompt"`` and ``"expected_output"`` keys.
        model: Object with a ``generate(prompt)`` method returning the generated text.
        finetuned (bool): Selects the label and colour used in the rendered output.

    Generated and expected text is HTML-escaped before being embedded, so
    characters such as ``<`` and ``>`` are shown literally instead of being
    interpreted as markup. A failure on one sample is logged and the loop
    continues with the next sample.
    """
    import html  # stdlib; imported here so the function does not depend on another cell's imports

    if INFERENCE_DEVICE.type.startswith("xpu"):
        torch.xpu.empty_cache()

    print(f"Running inference on {INFERENCE_DEVICE}...")

    color = "#4CAF52" if finetuned else "#2196F4"
    model_type = "finetuned" if finetuned else "base"

    display(HTML(f"<div style='color:{color};'>Processing prompts on {INFERENCE_DEVICE} please wait...</div>"))

    for index, row in enumerate(sample_data):
        try:
            prompt = generate_prompt_mermaid(row["prompt"])
            output = model.generate(prompt)
            input_section, response_section = _extract_sections(output)

            # Escape everything that originates from the model or the sample data.
            input_html = html.escape(input_section)
            response_html = html.escape(response_section)
            expected_html = html.escape(row["expected_output"])

            tabbed_output = f"""
            <details>
            <summary style='color: {color};'><b>{model_type} model - Sample {index+1}</b> (Click to expand)</summary>
            <div style='padding-left: 20px;'>
            <p><b>Expected prompt 📝:</b><br>{input_html}</p>
            <p><b>Generated Mermaid.js code 💡:</b><br><pre>{response_html}</pre></p>
            <p><b>Expected Mermaid.js code 📈:</b><br><pre>{expected_html}</pre></p>
            </div>
            </details>
            <hr style='border-top: 1px solid #bbb;'>""" # Subtle separator

            display(HTML(tabbed_output))
        except Exception as e:
            logging.error(f"Exception occurred during sample processing: {e}")

# Checkpoints are saved under ADAPTER_PATH (`./lora_adapters`).
# Update USING_CHECKPOINT to the one you want to use.
USING_CHECKPOINT=200

# If the kernel was interrupted, the latest adapter is in `./final_model_interrupted/`;
# otherwise the final model is in `./final_model`.
LORA_CHECKPOINT = f"{ADAPTER_PATH}/checkpoint-{USING_CHECKPOINT}"

if os.path.exists(LORA_CHECKPOINT):
    sample_data = json.loads(samples)

    run_inference(sample_data, model=base_model)

    # Build the fine-tuned generator only if an earlier cell has not already done so.
    # `globals().get` avoids a NameError when the name is missing, and the adapter is
    # attached explicitly so the fallback really is the fine-tuned model.
    # NOTE(review): the fallback reuses base_model's underlying model object; the
    # baseline run above has already finished, but confirm that attaching the adapter
    # does not affect later use of base_model.
    if globals().get("finetuned_model") is None:
        finetuned_model = TextToMermaidGenerator(
            use_adapter=True,
            lora_checkpoint=LORA_CHECKPOINT,
            loaded_base_model=base_model,
        )

    run_inference(sample_data, model=finetuned_model, finetuned=True)
else:
    print(f"LoRA checkpoint not found at {LORA_CHECKPOINT}; skipping inference.")

# To conserve memory we can delete the model
#del finetuned_model
#del base_model

Running inference on cpu...


Running inference on cpu...


In [15]:
# Reload the base model and tokenizer, then wrap the model with the LoRA adapter
# stored in the chosen checkpoint directory.
# NOTE(review): this rebinds `finetuned_model`, which an earlier cell assigned a
# TextToMermaidGenerator, to the object returned by PeftModel.from_pretrained.
USING_CHECKPOINT=200
LORA_CHECKPOINT = f"./lora_adapters/checkpoint-{USING_CHECKPOINT}"
finetuned_model, tokenizer = setup_model_and_tokenizer(BASE_MODEL)
finetuned_model = PeftModel.from_pretrained(finetuned_model, LORA_CHECKPOINT)

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.10it/s]


In [16]:
# Presumably folds the LoRA adapter into the base weights so a standalone model
# can be uploaded — TODO confirm against the PEFT documentation.
model = finetuned_model.merge_and_unload()
# The repo id (spelled "gemmma") matches the commit URL recorded in this cell's output.
model.push_to_hub("Maelstrome/mermaid-gemmma-7b")

model-00001-of-00008.safetensors:   0%|          | 0.00/4.76G [00:00<?, ?B/s]

[A[A


[A[A[A
[A



[A[A[A[A

model-00001-of-00008.safetensors:   0%|          | 8.19k/4.76G [00:01<265:01:16, 4.99kB/s]
[A


[A[A[A

model-00001-of-00008.safetensors:   0%|          | 967k/4.76G [00:01<1:42:50, 771kB/s]    
[A


[A[A[A

model-00001-of-00008.safetensors:   0%|          | 1.56M/4.76G [00:01<1:02:39, 1.26MB/s]
[A


[A[A[A

model-00001-of-00008.safetensors:   0%|          | 3.78M/4.76G [00:01<20:32, 3.86MB/s]  
[A


[A[A[A

[A[A


[A[A[A
model-00001-of-00008.safetensors:   0%|          | 4.96M/4.76G [00:02<18:45, 4.22MB/s]


[A[A[A

[A[A
model-00001-of-00008.safetensors:   0%|          | 6.11M/4.76G [00:02<14:53, 5.32MB/s]


model-00001-of-00008.safetensors:   0%|          | 7.12M/4.76G [00:02<12:59, 6.09MB/s]
[A

[A[A


model-00001-of-00008.safetensors:   0%|          | 8.11M/4.76G [00:02<12:11, 6.49MB/s]
[A


[A[A[A

model-00001-of-00008.safetensors:

CommitInfo(commit_url='https://huggingface.co/Maelstrome/mermaid-gemmma-7b/commit/eed59a89c191354ff380e35dc96cefa9a7e07dec', commit_message='Upload GemmaForCausalLM', commit_description='', oid='eed59a89c191354ff380e35dc96cefa9a7e07dec', pr_url=None, pr_revision=None, pr_num=None)

In [17]:

# Upload the tokenizer to the same Hub repository as the merged model.
tokenizer.push_to_hub("Maelstrome/mermaid-gemmma-7b")

CommitInfo(commit_url='https://huggingface.co/Maelstrome/mermaid-gemmma-7b/commit/d6e28c25fda600d02a67180db0b1fbdb1ce1079e', commit_message='Upload tokenizer', commit_description='', oid='d6e28c25fda600d02a67180db0b1fbdb1ce1079e', pr_url=None, pr_revision=None, pr_num=None)