In [1]:
from datasets import load_dataset
from colorama import Fore

import mlflow
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, prepare_model_for_kbit_training
import bitsandbytes
import torch
import dvc.api
import os
from dotenv import load_dotenv
load_dotenv() # loads all tokens from .env file that must be created

True

In [2]:
path = 'data/synthetic_database.json'
repo = '.'
version = 'dv1' # Always remember the tags!

data_url = dvc.api.get_url(
	path=path,
	repo=repo,
	rev=version
	)

dataset = load_dataset("json", data_files=data_url, split="train")
print(Fore.YELLOW + str(dataset[2]) + Fore.RESET) 

# Grab the API key
hf_token = os.getenv("HF_TOKEN")

{'business_description': 'A neighborhood bakery crafting sourdough breads and seasonal pastries using local grains.', 'domain_names': ['hearthgrainbakery.com', 'localsourdough.co', 'seasonalcrumbs.shop']}


In [3]:
# --------------------
# Chat Template Function
# --------------------
def format_chat_template(batch, tokenizer):

    system_prompt =  """
                    You are a domain name generator. 
                    Your sole purpose is to generate creative, brandable, and available-sounding domain names based on user input.
                    
                    Rules:
                    - Always respond in strict JSON format.
                    - The JSON should be an object with a single key "domains" containing a list of domain name strings.
                    - Do not include explanations, commentary, or text outside of the JSON.
                    - Each domain should be concise, memorable, and in lowercase.
                    - Generate only domain names, nothing else.
                    - If the user demands inappropriate content, instead of domain names return: Access refused.
                    """

    samples = []
    questions = batch["business_description"]
    answers = batch["domain_names"]

    for i in range(len(questions)):
        row_json = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": questions[i]},
            {"role": "assistant", "content": answers[i]}
        ]

        tokenizer.chat_template = (
            "{% set loop_messages = messages %}"
            "{% for message in loop_messages %}"
            "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
            "{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}"
            "{{ content }}"
            "{% endfor %}"
            "{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
        )
        text = tokenizer.apply_chat_template(row_json, tokenize=False)
        samples.append(text)

    return {
        "instruction": questions,
        "response": answers,
        "text": samples
    }

# --------------------
# Model + Tokenizer
# --------------------
base_model = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(
    base_model, 
    trust_remote_code=True,
    token=hf_token,
)

train_dataset = dataset.map(lambda x: format_chat_template(x, tokenizer), num_proc=8, batched=True, batch_size=10)
print(Fore.LIGHTMAGENTA_EX + str(train_dataset[0]) + Fore.RESET) 

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="cuda:0",
    quantization_config=quant_config,
    token=hf_token,
    cache_dir="./workspace",
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=32, 
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules="all-linear",
    task_type="CAUSAL_LM",
)

training_args = SFTConfig(
    output_dir="meta-llama/Llama-3.2-1B-SFT", 
    num_train_epochs=5
)

trainer = SFTTrainer(
    model,
    train_dataset=train_dataset,
    args=training_args,
    peft_config=peft_config,
)

# --------------------
# MLflow Tracking
# --------------------
mlflow.set_tracking_uri("file:./mlruns")  # logs locally in ./mlruns
mlflow.set_experiment("llama-domain-generator")

with mlflow.start_run(run_name="llama-sft-run") as run:
    # Log parameters
    mlflow.log_param("base_model", base_model)
    mlflow.log_param("epochs", training_args.num_train_epochs)
    mlflow.log_params({
        "r": peft_config.r,
        "lora_alpha": peft_config.lora_alpha,
        "lora_dropout": peft_config.lora_dropout,
    })

    # More parameters to track data version too

    mlflow.log_param('data_url', data_url)
    mlflow.log_param('data_version', version)

    # Train
    trainer.train()

    # Save checkpoints
    trainer.save_model("complete_checkpoint")
    trainer.model.save_pretrained("trained_model")

    # Log metrics if available
    metrics = trainer.state.log_history
    for record in metrics:
        if "loss" in record:
            mlflow.log_metric("loss", record["loss"], step=record["step"])
        if "eval_loss" in record:
            mlflow.log_metric("eval_loss", record["eval_loss"], step=record["step"])

    # Log tokenizer and artifacts
    #tokenizer.save_pretrained("final_model")
    mlflow.log_artifacts("complete_checkpoint", artifact_path="checkpoints")
     
    # Log model to MLflow (NOT the final_model, since DVC will track it)
    mlflow.pytorch.log_model(trainer.model, "mlflow_model")

print("✅ Training complete. Logs saved in ./mlruns, final model in ./trained_model")


Map (num_proc=8):   0%|          | 0/792 [00:00<?, ? examples/s]

{'business_description': 'An eco-friendly cleaning service specializing in biodegradable products for urban households.', 'domain_names': ['greencleaners.com', 'ecourbanclean.io', 'biofreshservices.net'], 'instruction': 'An eco-friendly cleaning service specializing in biodegradable products for urban households.', 'response': ['greencleaners.com', 'ecourbanclean.io', 'biofreshservices.net'], 'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a domain name generator. \n                    Your sole purpose is to generate creative, brandable, and available-sounding domain names based on user input.\n\n                    Rules:\n                    - Always respond in strict JSON format.\n                    - The JSON should be an object with a single key "domains" containing a list of domain name strings.\n                    - Do not include explanations, commentary, or text outside of the JSON.\n                    - Each domain should be concise, memora

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/792 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/792 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/792 [00:00<?, ? examples/s]

2025/09/15 08:47:45 INFO mlflow.tracking.fluent: Experiment with name 'llama-domain-generator' does not exist. Creating a new experiment.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.8318
20,2.8067
30,1.814
40,1.212
50,1.1077
60,1.0332
70,0.9988
80,0.969
90,0.9388
100,0.9143




PicklingError: Cannot pickle a prepared model with automatic mixed precision, please unwrap the model with `Accelerator.unwrap_model(model)` before pickling it.