In [8]:
from datasets import load_dataset
from colorama import Fore
from accelerate import Accelerator

import mlflow
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import bitsandbytes
import torch
import dvc.api
import os
from dotenv import load_dotenv
load_dotenv() # loads all tokens from .env file that must be created

True

In [9]:
# Where the DVC-tracked dataset lives and which revision (tag) to read.
path = 'data/synthetic_database.json'
repo = '.'
version = 'dv2'  # Always remember the tags!

# Resolve the URL of the tracked file at the chosen revision.
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

# Load the JSON file as a single "train" split and show one sample row.
dataset = load_dataset("json", data_files=data_url, split="train")
print(f"{Fore.YELLOW}{dataset[2]}{Fore.RESET}")

# Hugging Face access token, read from the environment.
hf_token = os.environ.get("HF_TOKEN")

Downloading data:   0%|          | 0.00/398k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

{'business_description': 'A neighborhood bakery crafting sourdough breads and seasonal pastries using local grains.', 'domain_names': ['hearthgrainbakery.com', 'localsourdough.co', 'seasonalcrumbs.shop']}


In [13]:
# --------------------
# Chat Template Function
# --------------------
def format_chat_template(batch, tokenizer):
    """Render a batch of (description, domain names) pairs as Llama-3-style chat text.

    Args:
        batch: Mapping with "business_description" and "domain_names" columns,
            each a list holding one entry per example (the batched
            ``Dataset.map`` layout).
        tokenizer: Object with a writable ``chat_template`` attribute and an
            ``apply_chat_template(messages, tokenize=False)`` method. Its
            ``chat_template`` is overwritten by this function.

    Returns:
        dict with three equally long lists: "instruction" (the descriptions),
        "response" (the answers exactly as received) and "text" (one rendered
        conversation string per example).
    """
    # NOTE(review): the template's `trim` only strips the outer whitespace, so
    # the inner newline + indentation of this literal ends up in the rendered
    # text. Left byte-identical on purpose: changing it alters the training
    # data and must be mirrored wherever the prompt is used at inference time.
    system_prompt =  """
                    You are a domain name generator. 
                    Your sole purpose is to generate creative, brandable, and available-sounding domain names based on user input.
                    """

    questions = batch["business_description"]
    answers = batch["domain_names"]

    # The template does not depend on the row, so install it once per call
    # instead of once per example.
    tokenizer.chat_template = (
        "{% set loop_messages = messages %}"
        "{% for message in loop_messages %}"
        "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
        "{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}"
        "{{ content }}"
        "{% endfor %}"
        "{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
    )

    samples = []
    for question, answer in zip(questions, answers):
        # Message content must be text. Answers arrive as lists of domain
        # names; convert explicitly rather than relying on the template
        # engine to stringify a list implicitly.
        answer_text = answer if isinstance(answer, str) else str(answer)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer_text},
        ]
        samples.append(tokenizer.apply_chat_template(messages, tokenize=False))

    return {
        "instruction": questions,
        "response": answers,
        "text": samples,
    }

# --------------------
# Model + Tokenizer
# --------------------
# Hub id of the checkpoint that gets fine-tuned.
base_model = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True, token=hf_token)

# Add the rendered "text" column (plus "instruction"/"response") to every row,
# processing 10 rows per call across 8 worker processes.
# NOTE(review): the rendered text already starts with the tokenizer's
# bos_token; confirm the trainer's tokenization does not prepend a second one.
train_dataset = dataset.map(
    function=lambda x: format_chat_template(x, tokenizer),
    batched=True,
    batch_size=10,
    num_proc=8,
)
print(f"{Fore.LIGHTMAGENTA_EX}{train_dataset[0]}{Fore.RESET}")

# 4-bit NF4 weights with double quantisation; compute runs in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised base model onto the first GPU, caching files in ./workspace.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="cuda:0",
    cache_dir="./workspace",
    token=hf_token,
)

# Turn on gradient checkpointing, then apply PEFT's k-bit training preparation.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings, targeting every linear layer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=256,
    lora_alpha=512,
    lora_dropout=0.05,
)

# 30 epochs, writing a checkpoint every 1500 steps into output_dir.
training_args = SFTConfig(
    output_dir="meta-llama/Llama-3.2-1B-SFT",
    num_train_epochs=30,
    save_strategy="steps",
    save_steps=1500,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    peft_config=peft_config,
)


# --------------------
# MLflow Tracking
# --------------------
# The tracking server is taken from the MLFLOW_TRACKING_URI environment
# variable, so runs are not necessarily stored in a local ./mlruns folder.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))
mlflow.set_experiment("llama-domain-generator")

with mlflow.start_run(run_name="llama-sft-run") as run:
    # Log parameters
    mlflow.log_param("base_model", base_model)
    mlflow.log_param("epochs", training_args.num_train_epochs)
    mlflow.log_params({
        "r": peft_config.r,
        "lora_alpha": peft_config.lora_alpha,
        "lora_dropout": peft_config.lora_dropout,
    })

    # Record which data version produced this run
    mlflow.log_param('data_url', data_url)
    mlflow.log_param('data_version', version)

    # Train
    trainer.train()

    # Save checkpoints
    trainer.save_model("complete_checkpoint")
    trainer.model.save_pretrained("trained_model")

    # Replay the trainer's log history into MLflow; only records that carry
    # a "loss" / "eval_loss" entry are logged.
    for record in trainer.state.log_history:
        if "loss" in record:
            mlflow.log_metric("loss", record["loss"], step=record["step"])
        if "eval_loss" in record:
            mlflow.log_metric("eval_loss", record["eval_loss"], step=record["step"])

    # Log tokenizer and artifacts
    #tokenizer.save_pretrained("final_model")
    mlflow.log_artifacts("complete_checkpoint", artifact_path="checkpoints")

    accelerator = Accelerator()
    unwrapped_model = accelerator.unwrap_model(trainer.model)

    # if model is PEFT-wrapped, get its base model.
    # Use a dedicated name: rebinding `base_model` here would replace the
    # model-id string with a model object and break any re-run of the cells
    # that read it (tokenizer/model loading, the "base_model" param above).
    try:
        model_to_log = unwrapped_model.base_model.model
    except AttributeError:
        model_to_log = unwrapped_model

    mlflow.pytorch.log_model(model_to_log, "mlflow_model")


# Report the tracking URI that was actually used rather than assuming ./mlruns.
print(f"✅ Training complete. Logs saved to {mlflow.get_tracking_uri()}, final model in ./trained_model")


{'business_description': 'An eco-friendly cleaning service specializing in biodegradable products for urban households.', 'domain_names': ['greencleaners.com', 'ecourbanclean.io', 'biofreshservices.net'], 'instruction': 'An eco-friendly cleaning service specializing in biodegradable products for urban households.', 'response': ['greencleaners.com', 'ecourbanclean.io', 'biofreshservices.net'], 'text': "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a domain name generator. \n                    Your sole purpose is to generate creative, brandable, and available-sounding domain names based on user input.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAn eco-friendly cleaning service specializing in biodegradable products for urban households.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n['greencleaners.com', 'ecourbanclean.io', 'biofreshservices.net']<|eot_id|>"}


  return fn(*args, **kwargs)


Step,Training Loss
10,3.5591
20,2.1995
30,1.9801
40,1.919
50,1.8224
60,1.8099
70,1.7556
80,1.6755
90,1.6959
100,1.6499


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


🏃 View run llama-sft-run at: http://ec2-35-180-181-209.eu-west-3.compute.amazonaws.com:5000/#/experiments/843519438225732159/runs/52637809ac2247ad81c87dfeec3b58ae
🧪 View experiment at: http://ec2-35-180-181-209.eu-west-3.compute.amazonaws.com:5000/#/experiments/843519438225732159
✅ Training complete. Logs saved in ./mlruns, final model in ./trained_model


In [7]:
# Once training is done and you are satisfied with the result, use DVC to version the saved adapters and add a git tag for easy retrieval.
