In [None]:
! pip install comet-ml>=3.43.2 trl transformers peft bitsandbytes accelerate datasets

In [None]:
import os
from pathlib import Path
import torch
import warnings
from typing import Any, List, Literal, Optional  # noqa: E402
from datasets import concatenate_datasets, load_dataset  # noqa: E402
from huggingface_hub import HfApi, login  # noqa: E402
from trl import SFTTrainer, SFTConfig, DPOTrainer, DPOConfig  # noqa: E402
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from llm_engineering.settings import settings  # noqa: E402

comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


In [None]:
# Process-wide environment configuration for the training run.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only GPU 0
os.environ["UNSLOTH_SKIP_TOKEN_FIX"] = "1"  # Skip problematic token fixing
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
# Assign the allocator config exactly once. A second assignment to the same
# variable would simply overwrite the first, so only this value is effective.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Export the Comet key only when one is configured: os.environ accepts only
# strings, so an unset (None) key would otherwise raise a TypeError here.
comet_api_key = settings.COMET_API_KEY
if comet_api_key:
    os.environ["COMET_API_KEY"] = comet_api_key
else:
    print("COMET_API_KEY is not configured in settings; Comet logging will not be available.")

# Disable warnings
warnings.filterwarnings("ignore")

# Verify GPU is available
if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available!")

print(f"Using GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Using GPU: Tesla T4
GPU Memory: 15.8 GB


In [None]:
# Authenticate this session against the Hugging Face Hub, then build an API
# client that uses the same token from the project settings.
login(token=settings.HF_TOKEN)  # noqa: F821
hf_api = HfApi(token=settings.HF_TOKEN)  # noqa: F821

In [5]:
class Config:
    """Run configuration for the fine-tuning notebook.

    Every setting is a keyword parameter whose default equals the value that
    used to be hard-coded, so ``Config()`` yields the same object as before
    while individual settings can now be overridden, e.g.
    ``Config(is_dummy=True)``.

    Args:
        num_train_epochs: Number of training epochs.
        per_device_train_batch_size: Training batch size per device.
        learning_rate: Optimizer learning rate.
        dataset_huggingface_workspace: Hugging Face workspace the datasets
            are loaded from.
        model_output_huggingface_workspace: Hugging Face workspace the models
            are pushed to.
        is_dummy: Whether to run in dummy mode.
        finetuning_type: Either ``"sft"`` or ``"dpo"``.
        output_data_dir: Directory for output data.
        model_dir: Directory for model artifacts.
        n_gpus: Number of GPUs, kept as a string.

    Raises:
        ValueError: If ``finetuning_type`` is neither ``"sft"`` nor ``"dpo"``.
    """

    def __init__(
        self,
        num_train_epochs: int = 1,
        per_device_train_batch_size: int = 2,
        learning_rate: float = 3e-4,
        dataset_huggingface_workspace: str = "K-1303",
        model_output_huggingface_workspace: str = "K-1303",
        is_dummy: bool = False,
        finetuning_type: str = "sft",
        output_data_dir: str = "/content/output",
        model_dir: str = "/content/model",
        n_gpus: str = "1",
    ):
        if finetuning_type not in ("sft", "dpo"):
            raise ValueError(
                f"finetuning_type must be 'sft' or 'dpo', got {finetuning_type!r}"
            )

        self.num_train_epochs = num_train_epochs
        self.per_device_train_batch_size = per_device_train_batch_size
        self.learning_rate = learning_rate
        self.dataset_huggingface_workspace = dataset_huggingface_workspace
        self.model_output_huggingface_workspace = model_output_huggingface_workspace
        self.is_dummy = is_dummy
        self.finetuning_type = finetuning_type  # "sft" or "dpo"
        self.output_data_dir = output_data_dir
        self.model_dir = model_dir
        self.n_gpus = n_gpus

In [6]:
# Create the configuration object that the cells below read their settings from.
args = Config()

In [7]:
# Echo the active run configuration, one setting per line.
config_report = [
    f"Num training epochs: '{args.num_train_epochs}'",
    f"Per device train batch size: '{args.per_device_train_batch_size}'",
    f"Learning rate: {args.learning_rate}",
    f"Datasets will be loaded from Hugging Face workspace: '{args.dataset_huggingface_workspace}'",
    f"Models will be saved to Hugging Face workspace: '{args.model_output_huggingface_workspace}'",
    f"Training in dummy mode? '{args.is_dummy}'",
    f"Finetuning type: '{args.finetuning_type}'",
    f"Output data dir: '{args.output_data_dir}'",
    f"Model dir: '{args.model_dir}'",
    f"Number of GPUs: '{args.n_gpus}'",
]
for report_line in config_report:
    print(report_line)

Num training epochs: '1'
Per device train batch size: '2'
Learning rate: 0.0003
Datasets will be loaded from Hugging Face workspace: 'K-1303'
Models will be saved to Hugging Face workspace: 'K-1303'
Training in dummy mode? 'False'
Finetuning type: 'sft'
Output data dir: '/content/output'
Model dir: '/content/model'
Number of GPUs: '1'


In [8]:
# Load model function

def load_model(
    model_name: str,
    max_seq_length: int,
    load_in_4bit: bool,
    lora_rank: int,
    lora_alpha: int,
    lora_dropout: float,
    target_modules: List[str],
    chat_template: str,
) -> tuple:
    """Load a causal LM, wrap it with a LoRA adapter, and load its tokenizer.

    Args:
        model_name: Hub id or path given to ``from_pretrained`` for both the
            model and the tokenizer.
        max_seq_length: Accepted for interface compatibility; not used inside
            this function.
        load_in_4bit: When true, load the base model with 4-bit NF4
            quantization (double quantization enabled) and prepare it for
            k-bit training.
        lora_rank: LoRA rank ``r``.
        lora_alpha: LoRA alpha.
        lora_dropout: LoRA dropout.
        target_modules: Module names the LoRA adapter is attached to.
        chat_template: When equal to ``"chatml"``, a ChatML Jinja template is
            installed on the tokenizer; any other value leaves the
            tokenizer's template untouched.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model.
    """
    # Configure 4-bit quantization
    if load_in_4bit:
        # Choose the compute dtype with the same hardware check the trainers
        # use for their fp16/bf16 flags, so quantized matmuls and mixed
        # precision training agree instead of always assuming bfloat16.
        bf16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16 if bf16_supported else torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = None

    print(bnb_config)

    # Load model
    # NOTE(review): trust_remote_code=True lets custom code shipped in the
    # model repository run locally; only use with repositories you trust.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16 if not load_in_4bit else None,
    )

    # Prepare model for k-bit training if using quantization
    if load_in_4bit:
        model = prepare_model_for_kbit_training(model)

    # Configure LoRA
    peft_config = LoraConfig(
        r=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply PEFT
    model = get_peft_model(model, peft_config)

    # Print trainable parameters
    model.print_trainable_parameters()

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )

    # Set padding token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Apply chat template
    if chat_template == "chatml":
        tokenizer.chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

    return model, tokenizer

In [17]:
def inference(
    model: Any,
    tokenizer: Any,
    prompt: str = "Write a paragraph to introduce supervised fine-tuning.",
    max_new_tokens: int = 256,
) -> None:
    """Generate a completion for ``prompt`` and stream it to standard output.

    The prompt is formatted with the module-level ``alpaca_template``, which
    is looked up when this function is called and therefore must already be
    defined. It is formatted as ``alpaca_template.format(prompt, "")``.

    Args:
        model: Model exposing ``eval()``, ``parameters()``, ``to()``,
            ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the stream.
        prompt: Instruction text inserted into the template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        None. The generated text is only streamed, not returned.

    Side effects:
        Switches ``model`` to evaluation mode. When the parameters have more
        than one dtype, or include float32, the model is cast to a half
        precision dtype first.
        NOTE(review): ``model.to(dtype=...)`` presumably casts the caller's
        model in place - confirm before reusing the model for training.
    """
    from transformers import TextStreamer

    # Ensure the model is in evaluation mode
    model.eval()

    try:
        # Check if model has mixed dtypes
        dtypes = {p.dtype for p in model.parameters()}
        if len(dtypes) > 1 or torch.float32 in dtypes:
            # Resolve the target first so the message names the dtype that is
            # actually applied (bfloat16 or float16).
            target_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
            print(f"Warning: Model has mixed dtypes {dtypes}. Converting to {target_dtype}...")
            model = model.to(dtype=target_dtype)
    except Exception as e:
        # Best effort: generation is still attempted with the model as-is.
        print(f"Could not check/fix dtypes: {e}")

    # Format the prompt; the second argument fills the template's response
    # slot (when it has one) with an empty string.
    message = alpaca_template.format(prompt, "")
    inputs = tokenizer([message], return_tensors="pt")

    # Move inputs to the same device as model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Create text streamer for real-time token output
    text_streamer = TextStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Generate text
    with torch.no_grad():
        _ = model.generate(
            **inputs,
            streamer=text_streamer,
            max_new_tokens=max_new_tokens,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

In [10]:
# Save model function
def save_model(
    model: Any,
    tokenizer: Any,
    output_dir: str,
    push_to_hub: bool = False,
    repo_id: Optional[str] = None
):
    """Merge the LoRA adapter into the base model and optionally push it.

    Nothing is written to local disk: ``output_dir`` is accepted for
    interface compatibility but is not used by this function.

    Args:
        model: PEFT model exposing ``merge_and_unload()``.
        tokenizer: Tokenizer pushed alongside the merged model.
        output_dir: Currently unused.
        push_to_hub: When true, upload the merged model and the tokenizer to
            ``repo_id``.
        repo_id: Target Hugging Face Hub repository. Required when
            ``push_to_hub`` is true.

    Raises:
        ValueError: If ``push_to_hub`` is true but ``repo_id`` is missing.

    NOTE(review): if the base model was loaded in 4-bit, the merged weights
    are presumably still quantized - confirm that the uploaded checkpoint is
    the precision you expect.
    """
    # Fail before the merge rather than silently skipping the upload after it.
    if push_to_hub and not repo_id:
        raise ValueError("repo_id is required when push_to_hub=True")

    print("Merging LoRA weights with base model...")

    # Merge LoRA adapter weights with the base model
    model = model.merge_and_unload()

    if push_to_hub:
        print(f"Pushing merged model to Hugging Face Hub: '{repo_id}'")
        model.push_to_hub(
            repo_id,
            safe_serialization=True,  # Upload in safetensors format
        )
        tokenizer.push_to_hub(repo_id)
        print(f"Model pushed successfully to '{repo_id}'")

# Supervised Fine-Tuning (LoRA)

In [None]:
# Template for training data.
# Alpaca-style prompt with two positional `{}` slots: the instruction and the
# response. finetune() fills both via `.format(instruction, output)`, while
# inference() fills the response slot with an empty string.

alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

In [None]:
# Base checkpoint passed to finetune() as `model_name` for the SFT stage.
base_model_name = "unsloth/Meta-Llama-3.1-8B"

In [None]:
# Trainer output directory for the SFT run, located under the configured model dir.
output_dir_sft = Path(args.model_dir) / "output_sft"

In [None]:
# Hugging Face Hub repository id the merged SFT model is pushed to.
sft_output_model_repo_id = f"{args.model_output_huggingface_workspace}/TwinLlama-3.1-8B-KD-16bit"

In [None]:
# Fine-tuning function
def finetune(
    finetuning_type: Literal["sft", "dpo"],
    model_name: str,
    output_dir: str,
    dataset_huggingface_workspace: str,
    max_seq_length: int = 2048,
    load_in_4bit: bool = False,
    lora_rank: int = 32,
    lora_alpha: int = 32,
    lora_dropout: float = 0.0,
    target_modules: List[str] = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    chat_template: str = "chatml",
    learning_rate: float = 3e-4,
    num_train_epochs: int = 3,
    per_device_train_batch_size: int = 2,
    gradient_accumulation_steps: int = 8,
    beta: float = 0.5,  # Only for DPO
    is_dummy: bool = False,
) -> tuple:
    """Run supervised fine-tuning with LoRA and return the trained model.

    The training set is ``{dataset_huggingface_workspace}/llmtwin`` (train
    split) concatenated with the first 1000 rows of
    ``mlabonne/FineTome-Alpaca-100k``. Each row is rendered with the
    module-level ``alpaca_template`` (which must provide an instruction slot
    and a response slot) followed by the tokenizer's EOS token, and 5% of the
    rows are held out for evaluation.

    Args:
        finetuning_type: Accepted for interface compatibility; not used here.
        model_name: Base model forwarded to ``load_model``.
        output_dir: Trainer output directory.
        dataset_huggingface_workspace: Workspace hosting the ``llmtwin`` dataset.
        max_seq_length: Forwarded to ``load_model`` only; it is not passed to
            the trainer configuration.
        load_in_4bit: Forwarded to ``load_model``.
        lora_rank: Forwarded to ``load_model``.
        lora_alpha: Forwarded to ``load_model``.
        lora_dropout: Forwarded to ``load_model``.
        target_modules: Forwarded to ``load_model``. The default list is
            shared between calls and is never mutated here.
        chat_template: Forwarded to ``load_model``.
        learning_rate: Trainer learning rate.
        num_train_epochs: Number of epochs; forced to 1 in dummy mode.
        per_device_train_batch_size: Used for both the train and the eval
            batch size.
        gradient_accumulation_steps: Trainer gradient accumulation steps.
        beta: Not used for SFT.
        is_dummy: When truthy, train for one epoch on at most 400 rows.

    Returns:
        ``(model, tokenizer)`` after training.
    """
    print("--Run QLORA--")
    print(load_in_4bit)

    # load_model() already reports the trainable-parameter count, so it is
    # not printed a second time here.
    model, tokenizer = load_model(
        model_name,
        max_seq_length,
        load_in_4bit,
        lora_rank,
        lora_alpha,
        lora_dropout,
        target_modules,
        chat_template
    )

    EOS_TOKEN = tokenizer.eos_token
    print(f"Setting EOS_TOKEN to {EOS_TOKEN}")

    # One seed for both the train/test split and the trainer, so repeated
    # runs evaluate on the same held-out rows.
    seed = 0

    if is_dummy:
        num_train_epochs = 1
        print(f"Training in dummy mode. Setting num_train_epochs to '{num_train_epochs}'")

    def format_samples_sft(examples):
        """Render a batch of rows into template text terminated by EOS."""
        text = []
        for instruction, output in zip(examples["instruction"], examples["output"], strict=False):
            message = alpaca_template.format(instruction, output) + EOS_TOKEN
            text.append(message)
        return {"text": text}

    # Load datasets
    dataset1 = load_dataset(f"{dataset_huggingface_workspace}/llmtwin", split="train")
    dataset2 = load_dataset("mlabonne/FineTome-Alpaca-100k", split="train[:1000]")
    dataset = concatenate_datasets([dataset1, dataset2])

    if is_dummy:
        # Clamp to the dataset size so a dataset with fewer than 400 rows is
        # used whole instead of failing the selection.
        dummy_size = min(400, len(dataset))
        dataset = dataset.select(range(dummy_size))
        print(f"Training in dummy mode. Reducing dataset size to '{dummy_size}'.")

    print(f"Loaded dataset with {len(dataset)} samples.")

    dataset = dataset.map(format_samples_sft, batched=True, remove_columns=dataset.column_names)
    dataset = dataset.train_test_split(test_size=0.05, seed=seed)

    print("Training dataset example:")
    print(dataset["train"][0])

    # Check if bfloat16 is supported
    is_bfloat16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Create SFTTrainer
    trainer = SFTTrainer(
        model=model,
        args=SFTConfig(
            learning_rate=learning_rate,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            fp16=not is_bfloat16_supported,
            bf16=is_bfloat16_supported,
            logging_steps=10,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            per_device_eval_batch_size=per_device_train_batch_size,
            warmup_steps=10,
            output_dir=output_dir,
            report_to="comet_ml",
            seed=seed,
            save_strategy="epoch",
            dataloader_pin_memory=False,
            eval_strategy="epoch",
            save_total_limit=2,
            packing=True,
            gradient_checkpointing=True,
            group_by_length=True,
            dataloader_num_workers=2
        ),
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        processing_class=tokenizer
    )

    trainer.train()

    return model, tokenizer

In [None]:
# Run supervised fine-tuning with the hyperparameters from `args`.
# `is_dummy` is forwarded so the dummy-mode switch on the configuration
# actually reaches finetune().
model, tokenizer = finetune(
    finetuning_type="sft",
    model_name=base_model_name,
    output_dir=str(output_dir_sft),
    dataset_huggingface_workspace=args.dataset_huggingface_workspace,
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.per_device_train_batch_size,
    load_in_4bit=True,
    learning_rate=args.learning_rate,
    is_dummy=args.is_dummy,
)

--Run QLORA--
True


config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338
Setting EOS_TOKEN to <|end_of_text|>


README.md:   0%|          | 0.00/408 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/139k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/896 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/408 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/89.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Loaded dataset with 1896 samples.


Map:   0%|          | 0/1896 [00:00<?, ? examples/s]



Training dataset example:
{'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDescribe the role of a Remotion motion designer.\n\n### Response:\nA Remotion motion designer is an expert responsible for generating TemplateModels. They must work with specific guidelines related to Brand, Description, and Style. Their key output is a single valid TemplateModel in minified JSON format.<|end_of_text|>'}


Adding EOS to train dataset:   0%|          | 0/1801 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1801 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/1801 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/95 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/95 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/95 [00:00<?, ? examples/s]

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/kavya-dua/general/36e0a3a2303a4c32ab2ddf052ea36326

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,0.9197,0.907365,0.912878,516433.0,0.755347


comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


In [None]:
# Smoke-test the fine-tuned model using inference()'s default prompt; the
# generated text is streamed to the cell output.
inference(model, tokenizer)

Supervised fine-tuning is a method for training a model using a labeled dataset. The model is first initialized with pre-trained weights, which are then fine-tuned using the labeled data. This approach is often used when the dataset is too small for unsupervised learning or when the model needs to be trained for a specific task.


In [None]:
# Merge the LoRA adapter into the base model and push the result to the Hub
# repository named by `sft_output_model_repo_id`. The "model_sft" output_dir
# argument is not used by save_model(), so nothing is written locally.
save_model(model, tokenizer, "model_sft", push_to_hub=True, repo_id=sft_output_model_repo_id)

Merging LoRA weights with base model...
Pushing merged model to Hugging Face Hub: 'K-1303/TwinLlama-3.1-8B-KD-16bit'


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0002-of-00002.safetensors:   1%|          | 8.37MB / 1.05GB            

  ...0001-of-00002.safetensors:   0%|          | 30.0kB / 4.65GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mpq77o_efn/tokenizer.json: 100%|##########| 17.2MB / 17.2MB            

Model pushed successfully to 'K-1303/TwinLlama-3.1-8B-KD-16bit'


# DPO

In [20]:
# Prompt-only variant of the Alpaca template for DPO: it has a single `{}`
# slot (the instruction) and ends right after the "### Response:" header.
# NOTE: this rebinds the same global name used by the SFT section. While this
# definition is active, finetune()'s `.format(instruction, output)` would
# silently drop the output, so re-run the SFT template cell before re-running
# SFT. inference() is unaffected: its extra "" argument is ignored.
alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
"""

In [12]:
# The DPO stage starts from the SFT checkpoint pushed to the Hub earlier in
# this notebook (rebinds `base_model_name`).
base_model_name = "K-1303/TwinLlama-3.1-8B-KD-16bit"

In [13]:
# Same path as in the SFT section; the name is reused for the DPO call below.
output_dir_sft = Path(args.model_dir) / "output_sft"

In [14]:
# Despite the `sft_` prefix, from here on this holds the Hub repository id
# that the DPO-tuned model is pushed to.
sft_output_model_repo_id = f"{args.model_output_huggingface_workspace}/TwinLlama-3.1-8B-KD-DPO"

In [15]:
# DPO function
def dpo(
    finetuning_type: Literal["sft", "dpo"],
    model_name: str,
    output_dir: str,
    dataset_huggingface_workspace: str,
    max_seq_length: int = 2048,
    load_in_4bit: bool = True,
    lora_rank: int = 32,
    lora_alpha: int = 32,
    lora_dropout: float = 0.0,
    target_modules: List[str] = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    chat_template: str = "chatml",
    learning_rate: float = 3e-4,
    num_train_epochs: int = 3,
    per_device_train_batch_size: int = 2,
    gradient_accumulation_steps: int = 8,
    beta: float = 0.5,  # Only for DPO
    is_dummy: bool = False,
) -> tuple:
    """Run DPO preference tuning with LoRA and return the trained model.

    Trains on at most the first 2000 rows of
    ``{dataset_huggingface_workspace}/llmtwin-preference`` (train split).
    Each prompt is rendered with the module-level ``alpaca_template`` (which
    must provide a single instruction slot), the chosen and rejected answers
    get the tokenizer's EOS token appended, and 5% of the rows are held out
    for evaluation.

    Args:
        finetuning_type: Accepted for interface compatibility; not used here.
        model_name: Base model forwarded to ``load_model``.
        output_dir: NOTE(review): currently not applied - the trainer writes
            to the fixed directory ``"output"``.
        dataset_huggingface_workspace: Workspace hosting the preference dataset.
        max_seq_length: Forwarded to ``load_model``; half of it is used for
            both ``max_length`` and ``max_prompt_length``.
        load_in_4bit: Forwarded to ``load_model``.
        lora_rank: Forwarded to ``load_model``.
        lora_alpha: Forwarded to ``load_model``.
        lora_dropout: Forwarded to ``load_model``.
        target_modules: Forwarded to ``load_model``. The default list is
            shared between calls and is never mutated here.
        chat_template: Forwarded to ``load_model``.
        learning_rate: NOTE(review): currently not applied - fixed at 2e-6.
        num_train_epochs: NOTE(review): currently not applied - fixed at 1.
        per_device_train_batch_size: NOTE(review): currently not applied -
            fixed at 1.
        gradient_accumulation_steps: NOTE(review): currently not applied -
            fixed at 16.
        beta: DPO beta passed to the trainer configuration.
        is_dummy: When truthy, train on at most 400 rows.

    Returns:
        ``(model, tokenizer)`` after training.
    """
    print("--Run QLORA--")
    print(load_in_4bit)

    # load_model() already reports the trainable-parameter count, so it is
    # not printed a second time here.
    model, tokenizer = load_model(
        model_name,
        max_seq_length,
        load_in_4bit,
        lora_rank,
        lora_alpha,
        lora_dropout,
        target_modules,
        chat_template
    )

    model.gradient_checkpointing_enable()

    EOS_TOKEN = tokenizer.eos_token
    print(f"Setting EOS_TOKEN to {EOS_TOKEN}")

    # One seed for both the train/test split and the trainer, so repeated
    # runs evaluate on the same held-out rows.
    seed = 0

    def format_samples_dpo(example):
        """Render one preference row: templated prompt, EOS-terminated answers."""
        return {
            "prompt": alpaca_template.format(example["prompt"]),
            "chosen": example["chosen"] + EOS_TOKEN,
            "rejected": example["rejected"] + EOS_TOKEN,
        }

    # Load the preference dataset, capped at 2000 rows
    dataset = load_dataset(f"{dataset_huggingface_workspace}/llmtwin-preference", split="train")
    dataset = dataset.select(range(min(2000, len(dataset))))

    if is_dummy:
        # Clamp to the dataset size so a dataset with fewer than 400 rows is
        # used whole instead of failing the selection.
        dummy_size = min(400, len(dataset))
        dataset = dataset.select(range(dummy_size))
        print(f"Training in dummy mode. Reducing dataset size to '{dummy_size}'.")
    print(f"Loaded dataset with {len(dataset)} samples.")

    # Format the rows and hold out an evaluation split
    dataset = dataset.map(format_samples_dpo)
    dataset = dataset.train_test_split(test_size=0.05, seed=seed)

    print("Training dataset example:")
    print(dataset["train"][0])

    # Check if bfloat16 is supported
    is_bfloat16_supported = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

    # Create DPOTrainer
    trainer = DPOTrainer(
        model=model,
        ref_model=None,
        processing_class=tokenizer,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        args=DPOConfig(
            # The optimisation settings below are fixed on purpose for now;
            # the same-named function parameters are not applied (see docstring).
            learning_rate=2e-6,
            lr_scheduler_type="linear",
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=16,
            num_train_epochs=1,
            fp16=not is_bfloat16_supported,
            bf16=is_bfloat16_supported,
            optim="adamw_8bit",
            weight_decay=0.01,
            warmup_steps=10,
            output_dir="output",
            eval_strategy="steps",
            eval_steps=0.2,
            logging_steps=1,
            report_to="comet_ml",
            seed=seed,
            max_length=max_seq_length // 2,
            max_prompt_length=max_seq_length // 2,
            beta=beta,
        ),
    )

    trainer.train()

    return model, tokenizer

In [16]:
# Run DPO on top of the SFT checkpoint.
# `is_dummy` is forwarded so the dummy-mode switch on the configuration
# actually reaches dpo().
# NOTE(review): dpo() fixes learning rate, epochs, batch size and output
# directory inside its DPOConfig, so the corresponding arguments passed here
# are currently not applied.
model, tokenizer = dpo(
    finetuning_type="dpo",
    model_name=base_model_name,
    output_dir=str(output_dir_sft),
    dataset_huggingface_workspace=args.dataset_huggingface_workspace,
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.per_device_train_batch_size,
    load_in_4bit=True,
    learning_rate=args.learning_rate,
    is_dummy=args.is_dummy,
)

--Run QLORA--
True
BitsAndBytesConfig {
  "_load_in_4bit": true,
  "_load_in_8bit": false,
  "bnb_4bit_compute_dtype": "bfloat16",
  "bnb_4bit_quant_storage": "uint8",
  "bnb_4bit_quant_type": "nf4",
  "bnb_4bit_use_double_quant": true,
  "llm_int8_enable_fp32_cpu_offload": false,
  "llm_int8_has_fp16_weight": false,
  "llm_int8_skip_modules": null,
  "llm_int8_threshold": 6.0,
  "load_in_4bit": true,
  "load_in_8bit": false,
  "quant_method": "bitsandbytes"
}



config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/240 [00:00<?, ?B/s]

trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/196 [00:00<?, ?B/s]

trainable params: 83,886,080 || all params: 8,114,147,328 || trainable%: 1.0338
Setting EOS_TOKEN to <|end_of_text|>


README.md:   0%|          | 0.00/345 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/205k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1012 [00:00<?, ? examples/s]

Loaded dataset with 1012 samples.


Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Training dataset example:
{'prompt': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is the teaching philosophy of fast.ai?\n\n### Response:\n', 'rejected': 'The fast.ai team aims to make AI accessible to everyone, irrespective of their background, language preference, or data and application needs, by favoring a hands-on approach over an initial overload of theory.<|end_of_text|>', 'chosen': 'Their goal is to make AI accessible to everyone, regardless of your background, your preferred language, or your data and applications. Instead of being confronted with an overwhelming amount of theory at the start, they advocate a very hands on approach.<|end_of_text|>'}


Extracting prompt in train dataset:   0%|          | 0/961 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/961 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/961 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/51 [00:00<?, ? examples/s]

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/kavya-dua/general/a865648077d74e7aab312ed955853a5d

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
13,0.57,0.612968,0.124944,-0.057898,0.764706,0.182842,-229.693069,-123.624939,-1.40569,-1.373462
26,0.4568,0.484896,0.391041,-0.172555,0.901961,0.563596,-229.160904,-123.85424,-1.406316,-1.373633
39,0.4836,0.418154,0.587796,-0.230586,0.941176,0.818382,-228.76738,-123.970322,-1.407916,-1.375507
52,0.3824,0.398061,0.646032,-0.276096,0.921569,0.922128,-228.650925,-124.061333,-1.408959,-1.375838


comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


In [25]:
# Stream a completion for a custom prompt from the current `model`.
inference(model, tokenizer, prompt="What are the core entities involved in the coffee vending machine system?")

The core entities involved in the coffee vending machine system are:

- Customer: The person who purchases coffee from the vending machine.
- Coffee: The type of coffee that can be purchased from the vending machine.
- Vending Machine: The physical machine that dispenses coffee.
- Payment: The method by which the customer pays for the coffee.
- Inventory: The stock of coffee that is available in the vending machine.
- Order: The request for coffee that is made by the customer.
- Transaction: The exchange of coffee for payment that occurs when the customer purchases coffee.
- Customer Profile: The information that is stored about the customer, such as their name, address, and purchase history.
- Order History: The record of all the orders that have been made by the customer.
- Payment History: The record of all the payments that have been made by the customer.
- Inventory History: The record of all the coffee that has been dispensed from the vending machine.

### Explanation:
In this sy

In [24]:
# Merge the LoRA adapter and push the result to the Hub repository named by
# `sft_output_model_repo_id` (rebound above to the DPO repository). The
# "model_sft" output_dir argument is not used by save_model().
save_model(model, tokenizer, "model_sft", push_to_hub=True, repo_id=sft_output_model_repo_id)

Merging LoRA weights with base model...
Pushing merged model to Hugging Face Hub: 'K-1303/TwinLlama-3.1-8B-KD-DPO'




Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0002-of-00002.safetensors:   3%|3         | 33.5MB / 1.05GB            

  ...0001-of-00002.safetensors:   0%|          | 30.0kB / 4.65GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mplvrfedaf/tokenizer.json:  96%|#########5| 16.5MB / 17.2MB            

Model pushed successfully to 'K-1303/TwinLlama-3.1-8B-KD-DPO'
