In [None]:
! pip install unsloth trl comet-ml>=3.43.2  protobuf==3.20.3

In [None]:
import os
from pathlib import Path
import torch
import warnings

from typing import Any, List, Literal, Optional  # noqa: E402

from datasets import concatenate_datasets, load_dataset  # noqa: E402
from huggingface_hub import HfApi, login  # noqa: E402
from transformers import TextStreamer, TrainingArguments  # noqa: E402
from trl import SFTTrainer  # noqa: E402
from llm_engineering.settings import settings  # noqa: E402

In [None]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Use only GPU 0
os.environ["UNSLOTH_SKIP_TOKEN_FIX"] = "1"  # Skip problematic token fixing
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"  # Better memory management
os.environ["COMET_API_KEY"] = settings.COMET_API_KEY  # Set your Comet API key here

# Disable warnings
warnings.filterwarnings("ignore")

# Verify GPU is available
if not torch.cuda.is_available():
    raise RuntimeError("CUDA not available!")

print(f"Using GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Using GPU: Tesla T4
GPU Memory: 15.8 GB


In [22]:
from unsloth import FastLanguageModel, is_bfloat16_supported  # noqa: E402
from unsloth.chat_templates import get_chat_template  # noqa: E402

In [None]:
hf_api = HfApi(token=settings.HF_TOKEN)  # noqa: F821
login()

In [24]:
class Config:
    def __init__(self):
        self.num_train_epochs = 3
        self.per_device_train_batch_size = 2
        self.learning_rate = 3e-4
        self.dataset_huggingface_workspace = "K-1303"
        self.model_output_huggingface_workspace = "K-1303"
        self.is_dummy = False
        self.finetuning_type = "sft"  # "sft" or "dpo"
        self.output_data_dir = "/content/output"
        self.model_dir = "/content/model"
        self.n_gpus = "1"

In [25]:
args = Config()

In [26]:
print(f"Num training epochs: '{args.num_train_epochs}'")  # noqa
print(f"Per device train batch size: '{args.per_device_train_batch_size}'")  # noqa
print(f"Learning rate: {args.learning_rate}")  # noqa
print(f"Datasets will be loaded from Hugging Face workspace: '{args.dataset_huggingface_workspace}'")  # noqa
print(f"Models will be saved to Hugging Face workspace: '{args.model_output_huggingface_workspace}'")  # noqa
print(f"Training in dummy mode? '{args.is_dummy}'")  # noqa
print(f"Finetuning type: '{args.finetuning_type}'")  # noqa

print(f"Output data dir: '{args.output_data_dir}'")  # noqa
print(f"Model dir: '{args.model_dir}'")  # noqa
print(f"Number of GPUs: '{args.n_gpus}'")  # noqa

Num training epochs: '3'
Per device train batch size: '2'
Learning rate: 0.0003
Datasets will be loaded from Hugging Face workspace: 'K-1303'
Models will be saved to Hugging Face workspace: 'K-1303'
Training in dummy mode? 'False'
Finetuning type: 'sft'
Output data dir: '/content/output'
Model dir: '/content/model'
Number of GPUs: '1'


In [27]:
# Load model function

def load_model(
    model_name: str,
    max_seq_length: int,
    load_in_4bit: bool,
    lora_rank: int,
    lora_alpha: int,
    lora_dropout: float,
    target_modules: List[str],
    chat_template: str,
) -> tuple:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template=chat_template,
    )

    return model, tokenizer

In [28]:
# Inference function

def inference(
    model: Any,
    tokenizer: Any,
    prompt: str = "Write a paragraph to introduce supervised fine-tuning.",
    max_new_tokens: int = 256,
) -> None:
    model = FastLanguageModel.for_inference(model)
    message = alpaca_template.format(prompt, "")
    inputs = tokenizer([message], return_tensors="pt").to("cuda")

    text_streamer = TextStreamer(tokenizer)
    _ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=max_new_tokens, use_cache=True)

In [29]:
def save_model(model: Any, tokenizer: Any, output_dir: str, push_to_hub: bool = False, repo_id: Optional[str] = None):
    model.save_pretrained_merged(output_dir, tokenizer, save_method="merged_16bit")

    if push_to_hub and repo_id:
        print(f"Saving model to '{repo_id}'")  # noqa
        model.push_to_hub_merged(repo_id, tokenizer, save_method="merged_16bit")

# Supervised Fine Tuning(LORA)

In [30]:
# Template for training data

alpaca_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

In [31]:
base_model_name = "meta-llama/Llama-3.1-8B"

In [32]:
output_dir_sft = Path(args.model_dir) / "output_sft"

In [33]:
sft_output_model_repo_id = f"{args.model_output_huggingface_workspace}/TwinLlama-3.1-8B"

In [34]:
# Fine-tuning function

def finetune(
    finetuning_type: Literal["sft", "dpo"],
    model_name: str,
    output_dir: str,
    dataset_huggingface_workspace: str,
    max_seq_length: int = 2048,
    load_in_4bit: bool = False,
    lora_rank: int = 32,
    lora_alpha: int = 32,
    lora_dropout: float = 0.0,
    target_modules: List[str] = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],  # noqa: B006
    chat_template: str = "chatml",
    learning_rate: float = 3e-4,
    num_train_epochs: int = 3,
    per_device_train_batch_size: int = 2,
    gradient_accumulation_steps: int = 8,
    beta: float = 0.5,  # Only for DPO
    is_dummy: bool = True,
) -> tuple:
    print("--Run QLORA--")
    print(load_in_4bit)
    model, tokenizer = load_model(
        model_name, max_seq_length, load_in_4bit, lora_rank, lora_alpha, lora_dropout, target_modules, chat_template
    )
    EOS_TOKEN = tokenizer.eos_token
    print(f"Setting EOS_TOKEN to {EOS_TOKEN}")  # noqa

    if is_dummy is True:
        num_train_epochs = 1
        print(f"Training in dummy mode. Setting num_train_epochs to '{num_train_epochs}'")  # noqa
        print(f"Training in dummy mode. Reducing dataset size to '400'.")  # noqa

    def format_samples_sft(examples):
        text = []
        for instruction, output in zip(examples["instruction"], examples["output"], strict=False):
            message = alpaca_template.format(instruction, output) + EOS_TOKEN
            text.append(message)

        return {"text": text}

    dataset1 = load_dataset(f"{dataset_huggingface_workspace}/llmtwin", split="train")
    dataset2 = load_dataset("mlabonne/FineTome-Alpaca-100k", split="train[:10000]")
    dataset = concatenate_datasets([dataset1, dataset2])
    if is_dummy:
        try:
            dataset = dataset.select(range(400))
        except Exception:
            print("Dummy mode active. Failed to trim the dataset to 400 samples.")  # noqa
    print(f"Loaded dataset with {len(dataset)} samples.")  # noqa

    dataset = dataset.map(format_samples_sft, batched=True, remove_columns=dataset.column_names)
    dataset = dataset.train_test_split(test_size=0.05)

    print("Training dataset example:")  # noqa
    print(dataset["train"][0])  # noqa

    trainer = SFTTrainer(
      model=model,
      args=TrainingArguments(
          learning_rate=learning_rate,
          num_train_epochs=num_train_epochs,
          per_device_train_batch_size=per_device_train_batch_size,
          gradient_accumulation_steps=gradient_accumulation_steps,
          fp16=not is_bfloat16_supported(),
          bf16=is_bfloat16_supported(),
          logging_steps=1,
          optim="adamw_8bit",
          weight_decay=0.01,
          lr_scheduler_type="linear",
          per_device_eval_batch_size=per_device_train_batch_size,
          warmup_steps=10,
          output_dir=output_dir,
          report_to="comet_ml",
          seed=0,
          save_strategy="epoch",
          dataloader_pin_memory=False,
          fp16_full_eval=True,
          dataloader_num_workers=0,
      ),
      train_dataset=dataset["train"],
      eval_dataset=dataset["test"],
      processing_class=tokenizer,  # Use processing_class instead of tokenizer
      peft_config=None,  # Let Unsloth handle PEFT configuration
      formatting_func=None,  # Use default formatting since dataset has 'text' field
      data_collator=None,  # Use default data collator
  )


    trainer.train()

    return model, tokenizer

In [35]:
model, tokenizer = finetune(
            finetuning_type="sft",
            model_name=base_model_name,
            output_dir=str(output_dir_sft),
            dataset_huggingface_workspace=args.dataset_huggingface_workspace,
            num_train_epochs=args.num_train_epochs,
            per_device_train_batch_size=args.per_device_train_batch_size,
            load_in_4bit=True,
            learning_rate=args.learning_rate,
        )

--Run QLORA--
True
==((====))==  Unsloth 2025.9.9: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Unsloth 2025.9.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Unsloth: Will map <|im_end|> to EOS = <|end_of_text|>.


Setting EOS_TOKEN to <|im_end|>
Training in dummy mode. Setting num_train_epochs to '1'
Training in dummy mode. Reducing dataset size to '400'.


README.md:   0%|          | 0.00/403 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/6.65k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/14 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/408 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/89.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Loaded dataset with 400 samples.


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Training dataset example:
{'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nProvide me an example about how to use the pythagorean theorem \n\n### Response:\n1. The instruction is asking for an example of how to use the Pythagorean theorem.\n2. The Pythagorean theorem is a fundamental principle in geometry, used to calculate the length of one side of a right triangle when the lengths of the other two sides are known.\n3. The theorem is expressed as c^2 = a^2 + b^2, where c is the hypotenuse and a and b are the other two sides.\n4. To provide an example, I need to choose specific lengths for sides a and b. I\'ll choose 4 units and 5 units for simplicity.\n5. Substituting these values into the theorem gives c^2 = 4^2 + 5^2.\n6. Calculating the squares and adding them together gives c^2 = 16 + 25 = 41.\n7. To find c, I need to take the square root of 41, which is approximately 6.4.\n8. Therefore, the le

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/380 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/20 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 380 | Num Epochs = 1 | Total steps = 24
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/kavya-dua/general/964fc0645077423cb46cf8940facd1be

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/content' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.1161
2,1.1283
3,1.3054
4,1.1477
5,1.0577
6,1.096
7,1.3677
8,0.9343
9,0.781
10,0.9262


comet_ml is installed but the Comet API Key is not configured. Please set the `COMET_API_KEY` environment variable to enable Comet logging. Check out the documentation for other ways of configuring it: https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#set-the-api-key


In [36]:
inference(model, tokenizer)

<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Write a paragraph to introduce supervised fine-tuning.

### Response:
Supervised fine-tuning is a method for training deep learning models on new data by leveraging the knowledge gained from pre-trained models. It involves updating the weights of a pre-trained model using the new data, while preserving the general knowledge learned from the pre-training process. This approach allows the model to quickly adapt to new tasks, reducing the need for extensive training from scratch.<|im_end|>


In [43]:
save_model(model, tokenizer, "model_sft", push_to_hub=True, repo_id=sft_output_model_repo_id)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [00:00<00:00, 20789.61it/s]
Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [08:19<00:00, 124.98s/it]


Unsloth: Merge process complete.
Saving model to 'K-1303/TwinLlama-3.1-8B'


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...ama-3.1-8B/tokenizer.json:   2%|1         |  295kB / 17.2MB            

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [04:25<13:16, 265.48s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [06:16<05:48, 174.38s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [07:32<02:09, 129.48s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [07:40<00:00, 115.22s/it]
Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0001-of-00004.safetensors:   1%|          | 41.9MB / 4.98GB            

Unsloth: Merging weights into 16bit:  25%|██▌       | 1/4 [03:22<10:07, 202.56s/it]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0002-of-00004.safetensors:   0%|          |  611kB / 5.00GB            

Unsloth: Merging weights into 16bit:  50%|█████     | 2/4 [06:12<06:07, 183.53s/it]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0003-of-00004.safetensors:   0%|          |  613kB / 4.92GB            

Unsloth: Merging weights into 16bit:  75%|███████▌  | 3/4 [08:55<02:53, 173.98s/it]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...0004-of-00004.safetensors:   4%|3         | 41.9MB / 1.17GB            

Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [09:35<00:00, 143.85s/it]


Unsloth: Merge process complete.
