In [None]:
import os

from google.colab import drive

# Mount Google Drive so every artifact of this run is written to Drive storage.
drive.mount('/content/drive')

# Directory layout for the run: a root folder plus sub-folders for
# TensorBoard logs and trainer checkpoints.
base_path = "/content/drive/MyDrive/SmolLM2-1.7B-UltraChat_200k"
logging_path = f"{base_path}/logs"
checkpoint_path = f"{base_path}/checkpoints"

# Create the folders up front; exist_ok=True makes re-running this cell harmless.
for _run_dir in (base_path, logging_path, checkpoint_path):
    os.makedirs(_run_dir, exist_ok=True)

In [None]:
%pip install transformers==4.45.2
%pip install datasets
%pip install trl
%pip install peft
%pip install accelerate
%pip install bitsandbytes>0.37.0
%pip install --upgrade flash-attn

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM, setup_chat_format
import torch
# Hub identifiers for the base checkpoint and the fine-tuning corpus.
model_name = "HuggingFaceTB/SmolLM2-1.7B"
dataset_name = "HuggingFaceH4/ultrachat_200k"

# Load the "train_sft" split and keep only the "messages" column.
dataset = load_dataset(dataset_name, split="train_sft")
_non_message_columns = [name for name in dataset.column_names if name != "messages"]
dataset = dataset.remove_columns(_non_message_columns)

# 4-bit NF4 quantization with double quantization, computing in bfloat16
# (this is the "Q" in QLoRA).
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Load the base model with the quantization settings above and
# FlashAttention-2 as the attention implementation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    use_cache=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# setup_chat_format returns the (model, tokenizer) pair to use from here on.
model, tokenizer = setup_chat_format(model, tokenizer)

In [None]:
from peft import LoraConfig
# Hyper-parameters for the supervised fine-tuning run.
config = SFTConfig(
    # Output and logging: checkpoints and TensorBoard event files both go to
    # the Drive-backed folders created in the setup cell.
    output_dir=checkpoint_path,
    logging_dir=logging_path,
    report_to="tensorboard",
    log_level="info",
    logging_steps=10,

    # Precision and optimizer.
    optim="adamw_bnb_8bit",
    bf16=True,

    # Batching: 16 sequences per device, accumulated over 2 steps, gives an
    # effective batch of 32 per device for each optimizer update.
    gradient_accumulation_steps=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Schedule: a single epoch with a linear schedule, warming up over the
    # first 10% of training.
    num_train_epochs=1,
    max_seq_length=512,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,

    # Checkpointing: save every 500 steps and keep at most 3 checkpoints.
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
)

# LoRA adapter settings: rank-8 updates on the q_proj and v_proj modules only.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Tie the model, data, tokenizer, training config and LoRA config together.
trainer = SFTTrainer(
    model=model,
    args=config,
    train_dataset=dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

In [None]:
# Free cached CUDA memory left over from earlier cells before training starts.
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# memory_summary() only returns the report as a string; print it so the report
# is actually displayed (a bare call in the middle of a cell shows nothing).
print(torch.cuda.memory_summary(device=None, abbreviated=False))

# NOTE: the former `torch.set_default_dtype(torch.float16)` call was removed.
# It does not convert any existing layer; it only changes the dtype of
# floating-point tensors created afterwards, which would mix float16 into a
# run that is configured for bfloat16 (bf16=True in the training config).

from transformers.trainer_utils import get_last_checkpoint

# Look for a checkpoint from an interrupted run; None means start from scratch.
last_checkpoint = get_last_checkpoint(checkpoint_path)

if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")
else:
    print("No checkpoint found. Starting training from scratch.")

trainer.train(resume_from_checkpoint=last_checkpoint)

# Persist the results to Drive: once through the trainer, and once directly
# from the model object held by the trainer.
trainer.save_model(base_path + "/final_model_1.7B")
trainer.model.save_pretrained(base_path + "/peft_model_1.7B")

In [None]:
%load_ext tensorboard
%tensorboard --logdir "/content/drive/MyDrive/SmolLM2-1.7B-UltraChat_200k/logs/"

In [None]:
# Release the Colab runtime once the cells above have finished, so the session
# stops consuming compute credits.
# NOTE(review): presumably anything not already written to Drive is lost when
# the runtime is released — confirm before running this cell.
from google.colab import runtime
runtime.unassign()