In [25]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

In [26]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Load models and datasets

In [27]:
model_id = "HuggingFaceTB/SmolLM2-360M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

model.gradient_checkpointing_enable()
model.enable_input_require_grads()

In [3]:
# Prepare the bom dataset
bom_dataset = load_dataset("burman-ai/The-Book-of-Mormon")
# The dataset only comes with a train split, so let's add a test set
bom_dataset = bom_dataset["train"].train_test_split(test_size=0.1)
bom_dataset_train = bom_dataset["train"]
bom_dataset_test = bom_dataset["test"]

tokenizer.pad_token = tokenizer.eos_token

def bom_tokenize_function(examples):
    return tokenizer(examples["scripture_text"], truncation=True, padding="max_length", max_length=2048)

bom_dataset_train = bom_dataset_train.map(bom_tokenize_function, batched=True)
bom_dataset_test = bom_dataset_test.map(bom_tokenize_function, batched=True)

bom_dataset_train = bom_dataset_train.remove_columns([col for col in bom_dataset_train.column_names if col not in ["input_ids", "attention_mask"]])
bom_dataset_test = bom_dataset_test.remove_columns([col for col in bom_dataset_test.column_names if col not in ["input_ids", "attention_mask"]])

# we add a labels field that is identical to input_ids for causal language modeling
# HF automatically shifts the labels over one position when computing the loss
bom_dataset_train = bom_dataset_train.map(
    lambda examples: {"labels": examples["input_ids"]},
    batched=True
)
bom_dataset_test = bom_dataset_test.map(
    lambda examples: {"labels": examples["input_ids"]},
    batched=True
)

Map:   0%|          | 0/4716 [00:00<?, ? examples/s]

Map:   0%|          | 0/524 [00:00<?, ? examples/s]

Map:   0%|          | 0/4716 [00:00<?, ? examples/s]

Map:   0%|          | 0/524 [00:00<?, ? examples/s]

In [28]:
# Prepare the arxiv dataset
arxiv_dataset = load_dataset("gfissore/arxiv-abstracts-2021")
# This dataset has 2 million rows; let's select a random subset of 5k rows to match the other dataset
arxiv_dataset = arxiv_dataset["train"].shuffle(seed=42).select(range(5000))

# The dataset only comes with a train split, so let's add a test set
arxiv_dataset = arxiv_dataset.train_test_split(test_size=0.1)
arxiv_dataset_train = arxiv_dataset["train"]
arxiv_dataset_test = arxiv_dataset["test"]

tokenizer.pad_token = tokenizer.eos_token

def arxiv_tokenize_function(examples):
    return tokenizer(examples["abstract"], truncation=True, padding="max_length", max_length=2048)

arxiv_dataset_train = arxiv_dataset_train.map(arxiv_tokenize_function, batched=True)
arxiv_dataset_test = arxiv_dataset_test.map(arxiv_tokenize_function, batched=True)

arxiv_dataset_train = arxiv_dataset_train.remove_columns([col for col in arxiv_dataset_train.column_names if col not in ["input_ids", "attention_mask"]])
arxiv_dataset_test = arxiv_dataset_test.remove_columns([col for col in arxiv_dataset_test.column_names if col not in ["input_ids", "attention_mask"]])

# we add a labels field that is identical to input_ids for causal language modeling
# HF automatically shifts the labels over one position when computing the loss
arxiv_dataset_train = arxiv_dataset_train.map(
    lambda examples: {"labels": examples["input_ids"]},
    batched=True
)
arxiv_dataset_test = arxiv_dataset_test.map(
    lambda examples: {"labels": examples["input_ids"]},
    batched=True
)

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

# Model

In [29]:
# Define lora config
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r = 64,
    lora_alpha = 128,
    lora_dropout = 0.05,
    bias = "none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
)

In [30]:
# Freeze the model and add LoRA adapters
model = get_peft_model(model, peft_config)

In [31]:
model.print_trainable_parameters()

trainable params: 34,734,080 || all params: 396,555,200 || trainable%: 8.7590


In [32]:
from transformers import TrainerCallback

class SampleGenerationCallback(TrainerCallback):
    def __init__(self, tokenizer, sample_prompt, max_new_tokens=50):
        self.tokenizer = tokenizer
        self.sample_prompt = sample_prompt
        self.max_new_tokens = max_new_tokens

    def generate_sample(self, model):
        model.eval()
        inputs = self.tokenizer(self.sample_prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=self.max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("\nSample output:\n", text)
        model.train()

    def on_train_begin(self, args, state, control, **kwargs):
        print(f"\n=== Starting training ===")
        self.generate_sample(kwargs["model"])

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None and "loss" in logs:
            print(f"\nStep {state.global_step} | Loss: {logs['loss']:.4f}")
            self.generate_sample(kwargs["model"])


Training bom model

In [10]:
training_args = TrainingArguments(
    output_dir="./bom-lora",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate = 2e-4,
    num_train_epochs=1,
    save_steps=20,
    logging_steps=20,
    bf16=True,
    gradient_checkpointing=True,
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=bom_dataset_train,
    eval_dataset=bom_dataset_test,
    tokenizer=tokenizer,
    callbacks=[SampleGenerationCallback(tokenizer, sample_prompt="And it came to")]
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [12]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
[34m[1mwandb[0m: Currently logged in as: [33mjay-orten[0m ([33mjay-o[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



=== Starting training ===


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.



Sample output:
 And it came to pass that when Jesus was on the mount of Olives, the first day of the feast, that he taught the people, and they were astonished at his doctrine.


The next day, when Jesus was walking by the sea of Galilee,


Step,Training Loss
20,2.1814
40,0.0668
60,0.0625
80,0.0603
100,0.0611
120,0.0572
140,0.0565
160,0.0605
180,0.058
200,0.0598



Step 20 | Loss: 2.1814

Sample output:
 And it came to pass that after the Lord had made the world, the Lord sent forth his spirit into the world, and the Lord made the world.

Step 40 | Loss: 0.0668

Sample output:
 And it came to pass that the sons of the women, the sons of the women, did not fear to go forth and to seek for the king, and to seek for the land; and they went forth and came in unto the land, and they did seek for

Step 60 | Loss: 0.0625

Sample output:
 And it came to pass that the Lord God commanded that the people should be numbered, and that they should be counted in all their tribes, and their fathers' houses, and their houses, and their lands, and their cities, and their towns, and their villages,

Step 80 | Loss: 0.0603

Sample output:
 And it came to pass that the people of Judah were filled with anger against their brethren the children of Ammon.

Step 100 | Loss: 0.0611

Sample output:
 And it came to pass that there were many of them who were of the priests 

TrainOutput(global_step=295, training_loss=0.2031243421263614, metrics={'train_runtime': 788.9198, 'train_samples_per_second': 5.978, 'train_steps_per_second': 0.374, 'total_flos': 2.024602244481024e+16, 'train_loss': 0.2031243421263614, 'epoch': 1.0})

In [13]:
model.save_pretrained("final_lora_adapter")

In [24]:
model.push_to_hub("royal42/final_bom_lora_adapter", private=True)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 29.7kB /  139MB            

CommitInfo(commit_url='https://huggingface.co/royal42/final_bom_lora_adapter/commit/50eba2b215fa9d2c62b24e65f27db3bba3da1ea9', commit_message='Upload model', commit_description='', oid='50eba2b215fa9d2c62b24e65f27db3bba3da1ea9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/royal42/final_bom_lora_adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='royal42/final_bom_lora_adapter'), pr_revision=None, pr_num=None)

Training code model

In [33]:
training_args = TrainingArguments(
    output_dir="./arxiv-lora",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate = 2e-4,
    num_train_epochs=1,
    save_steps=20,
    logging_steps=20,
    bf16=True,
    gradient_checkpointing=True,
)

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=arxiv_dataset_train,
    eval_dataset=arxiv_dataset_test,
    tokenizer=tokenizer,
    callbacks=[SampleGenerationCallback(tokenizer, sample_prompt="And it came to")]
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [35]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.



=== Starting training ===

Sample output:
 And it came to pass that when Jesus was on the mount of Olives, the first day of the feast, that he taught the people, and they were astonished at his doctrine.


The next day, when Jesus was walking by the sea of Galilee,


Step,Training Loss
20,2.5792
40,0.2841
60,0.281
80,0.2702
100,0.2625
120,0.278
140,0.2693
160,0.2659
180,0.2581
200,0.2583



Step 20 | Loss: 2.5792

Sample output:
 And it came to pass that after the Lord had made the world, the Lord sent out his spirit and made all things. And it came to pass, that after the Lord had made all things, the Lord sent out his spirit and made all things. And it came

Step 40 | Loss: 0.2841

Sample output:
 And it came to pass, that after these things had been fulfilled, Jesus went down to the sea of Tiberias, and there was casting a net into the sea, and he drew up the net full of young fishes, and he let it down again into the sea

Step 60 | Loss: 0.2810

Sample output:
 And it came to pass, that when they had been in the camp for a long time, there came into the camp a man that was a prophet, who spoke as the Spirit of God had appointed him; and the Lord said to him, "Go and tell the

Step 80 | Loss: 0.2702

Sample output:
 And it came to pass that the great King of the East, who was called the Lord of Hosts, came to the city of Jerusalem with a great army, and he had a great

TrainOutput(global_step=282, training_loss=0.43116534453757266, metrics={'train_runtime': 755.0019, 'train_samples_per_second': 5.96, 'train_steps_per_second': 0.374, 'total_flos': 1.931872370688e+16, 'train_loss': 0.43116534453757266, 'epoch': 1.0})

In [37]:
model.save_pretrained("final_lora_adapter")

In [36]:
model.push_to_hub("royal42/final_arxiv_lora_adapter", private=True)

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          | 29.7kB /  139MB            

CommitInfo(commit_url='https://huggingface.co/royal42/final_arxiv_lora_adapter/commit/72ee4abc0bde098671cf95cbbbb6cdd53ad690f7', commit_message='Upload model', commit_description='', oid='72ee4abc0bde098671cf95cbbbb6cdd53ad690f7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/royal42/final_arxiv_lora_adapter', endpoint='https://huggingface.co', repo_type='model', repo_id='royal42/final_arxiv_lora_adapter'), pr_revision=None, pr_num=None)