In [1]:
!pip install datasets
!pip install flash-attn --no-build-isolation
!pip install wandb
!pip install trl

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
# Authenticate with Weights & Biases through its command-line client.
!wandb login

In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub. No token is passed here, so the
# credentials are presumably requested interactively — confirm for headless runs.
login()

In [None]:
import wandb

# Open the experiment-tracking run for this notebook: the run is called
# "instruction_tuning" and is filed under the "gaokerena" project.
wandb.init(name="instruction_tuning", project="gaokerena")

In [3]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainerCallback,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig

# --- Repositories ------------------------------------------------------------
# Base checkpoint that the tokenizer and model weights are loaded from.
BASE_MODEL_ID = "CohereForAI/aya-expanse-8b"
# Adapter repository that is loaded on top of the base model before training.
PRETRAINED_MODEL_ID = "gaokerena/pretrained"
# Question/answer dataset and the split used for fine-tuning.
DATASET_REPO = "gaokerena/MF3QA"
DATASET_SPLIT = "train"
# Hub repository that receives the checkpoints produced by this notebook.
WORKING_REPO_ID = "gaokerena/instruction_tuned"

# --- Training configuration ----------------------------------------------------
# Keyword arguments that are unpacked into the SFT training configuration.
HYPER_PARAMS = {
    # Run identity. This notebook is the instruction-tuning stage, so the run
    # name matches the tracking run name and the target hub repository.
    "run_name": "instruction_tuning",
    "output_dir": "outputs",
    # Schedule and batch shape: 2 samples per device x 16 accumulation steps
    # gives 32 samples per optimizer step on each device.
    "num_train_epochs": 1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 16,
    # Optimizer and learning-rate schedule.
    "optim": "adamw_torch",
    "learning_rate": 5e-4,
    "max_grad_norm": 0.3,
    "warmup_ratio": 0.03,
    "lr_scheduler_type": "cosine",
    "weight_decay": 0.5,
    # Precision and memory.
    "bf16": True,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    # Logging and checkpointing. No "save_steps" is given here, so the save
    # interval is whatever the training library defaults to.
    "logging_steps": 4,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "report_to": "wandb",
    "hub_model_id": WORKING_REPO_ID,
    # Data loading.
    "dataloader_persistent_workers": True,
    "dataloader_num_workers": 4,
    # Sequence handling.
    "max_seq_length": 1024,
    "packing": False,
}

In [None]:
from datasets import Dataset

# Pull the configured split, then re-express every record as a two-turn
# conversation: the "Question" field becomes the user turn and the "Answer"
# field becomes the assistant turn.
dataset = load_dataset(DATASET_REPO, split=DATASET_SPLIT)
messages = {
    "messages": [
        [
            {"content": row["Question"], "role": "user"},
            {"content": row["Answer"], "role": "assistant"},
        ]
        for row in dataset
    ]
}

# Rebuild the dataset so that it holds only the single "messages" column.
dataset = Dataset.from_dict(messages)
dataset

In [None]:
# Load the tokenizer and the bf16 weights of the base checkpoint onto the GPU,
# using the FlashAttention-2 attention implementation.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    low_cpu_mem_usage=True,
    attn_implementation="flash_attention_2",
)

# Attach the adapter from PRETRAINED_MODEL_ID and merge it into the base model,
# so the LoRA adapter trained in this notebook starts from the merged weights.
model = PeftModel.from_pretrained(model, PRETRAINED_MODEL_ID)
model = model.merge_and_unload()

# Fall back to EOS as the padding token only when the tokenizer has none.
# Overwriting an existing, dedicated pad token with EOS makes the two
# indistinguishable, so anything that ignores padding would also ignore EOS.
# NOTE(review): this notebook does not pass `tokenizer` to SFTTrainer, so this
# setting may not reach training at all — confirm which tokenizer is used.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter settings for causal-LM fine-tuning: rank 2 with alpha 2,
# dropout 0.4, no bias parameters, applied to the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=2,
    lora_alpha=2,
    lora_dropout=0.4,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

In [None]:
class PushToHubCallback(TrainerCallback):
    """Trainer callback that uploads the model to the Hub on every checkpoint save."""

    def on_save(self, args, state, control, **kwargs):
        """Push the current model to ``WORKING_REPO_ID`` after a checkpoint is saved.

        Args:
            args: Training arguments supplied by the trainer (unused).
            state: Trainer state; ``global_step`` is written into the commit
                message and ``is_world_process_zero`` gates the upload.
            control: Trainer control object (unused, left unchanged).
            **kwargs: Extra event payload; must contain the ``"model"`` entry.
        """
        # In a multi-process run every process receives this event; only the
        # main one uploads so the same checkpoint is not pushed several times.
        if not state.is_world_process_zero:
            return
        kwargs["model"].push_to_hub(
            repo_id=WORKING_REPO_ID,
            commit_message=f"Checkpoint at step {state.global_step}",
        )

In [None]:
# Expand the shared hyper-parameter dict into the SFT training configuration.
args = SFTConfig(**HYPER_PARAMS)

# Assemble the trainer from the merged model, the conversation dataset and the
# LoRA settings. The hub-push callback is handed over as a class, not as an
# instance.
# NOTE(review): no tokenizer is passed here, so the trainer presumably resolves
# one on its own — confirm it matches the tokenizer configured in this notebook.
trainer = SFTTrainer(
    model=model,
    args=args,
    peft_config=lora_config,
    train_dataset=dataset,
    callbacks=[PushToHubCallback],
)

In [None]:
# Start fine-tuning with the trainer assembled in the previous cell.
trainer.train()