<a href="https://colab.research.google.com/github/JulianSchwabCommits/colab-google-files/blob/main/birthdate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers accelerate peft trl datasets bitsandbytes -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import json
from datetime import datetime, timedelta
import random

def calculate_age(birthdate, current_date):
    """Return the age in completed years on ``current_date``.

    Args:
        birthdate: Object exposing ``year``, ``month`` and ``day`` (e.g. a
            ``date`` or ``datetime``) for the day of birth.
        current_date: Object with the same attributes for the day on which
            the age is evaluated.

    Returns:
        int: The difference in calendar years, reduced by one when the
        birthday has not yet been reached in ``current_date``'s year.
    """
    years_elapsed = current_date.year - birthdate.year

    # Tuple comparison orders by month first, then by day within the month.
    today_md = (current_date.month, current_date.day)
    birthday_md = (birthdate.month, birthdate.day)
    if today_md < birthday_md:
        # Birthday is still ahead this year, so the last year is incomplete.
        years_elapsed -= 1

    return years_elapsed

# Build a synthetic instruction dataset: for random "current dates", record
# the prompt context (current date + birthdate) and the correct age answer.

NUM_SAMPLES = 500  # Change this number if you want more/less data
SEED = 42  # fixed so a fresh "Restart & Run All" regenerates the same dataset

birthdate = datetime(2008, 5, 21)

# Sampling window for the simulated current date; both ends are inclusive.
start = datetime(2025, 1, 1)
end = datetime(2050, 12, 31)
delta = end - start

# Dedicated generator: reproducible draws without reseeding the global
# `random` module state that other cells might rely on.
rng = random.Random(SEED)

samples = []
for _ in range(NUM_SAMPLES):
    # randrange excludes its upper bound, so +1 lets `end` itself be drawn.
    random_days = rng.randrange(delta.days + 1)
    current_date = start + timedelta(days=random_days)

    age = calculate_age(birthdate, current_date)

    sample = {
        "system": f"Current Date: {current_date.date()}\nBirthdate: {birthdate.date()}",
        "instruction": "How old is Julian?",
        "response": f"Julian is {age} years old."
    }
    samples.append(sample)

# Save JSONL (one JSON object per line); explicit encoding keeps the file
# independent of the platform's default locale.
with open("julian_age_dataset.jsonl", "w", encoding="utf-8") as f:
    for s in samples:
        f.write(json.dumps(s) + "\n")

print("Generated julian_age_dataset.jsonl with", len(samples), "samples")


Generated julian_age_dataset.jsonl with 500 samples


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Base chat model to fine-tune, and the JSONL file to train on.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
dataset_path = "julian_age_dataset.jsonl"

# Load the JSONL file through the generic "json" builder as the train split.
dataset = load_dataset("json", data_files=dataset_path, split="train")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit quantization so Colab doesn't explode. Passing `load_in_4bit=True`
# straight to `from_pretrained` is deprecated; the supported way is a
# `BitsAndBytesConfig` handed over as `quantization_config`.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

# Last expression of the cell: display the loaded dataset's summary.
dataset


Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

# LoRA adapter settings; passed to the trainer below as `peft_config`.
lora = LoraConfig(
    task_type="CAUSAL_LM",
    # Only modules with these names receive adapter weights.
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

def format_sample(sample):
    """Render one dataset record as a single tagged training string.

    Args:
        sample: Mapping with the keys ``"system"``, ``"instruction"`` and
            ``"response"``.

    Returns:
        str: Three newline-separated sections, in the order system, user,
        assistant. Each section is ``<tag>``, the text, then ``</tag>``, each
        on its own line; there is no trailing newline.
    """
    # (tag, text) pairs in output order; "instruction" is emitted under the
    # <user> tag and "response" under the <assistant> tag.
    sections = (
        ("system", sample["system"]),
        ("user", sample["instruction"]),
        ("assistant", sample["response"]),
    )
    return "\n".join(f"<{tag}>\n{text}\n</{tag}>" for tag, text in sections)

# Training hyper-parameters. The sequence-length cap and the packing switch
# live on SFTConfig: current TRL releases no longer accept them as SFTTrainer
# keyword arguments.
# NOTE(review): `max_length` is the name used by recent TRL; older releases
# called this field `max_seq_length` — adjust if an old TRL version is pinned.
training_args = SFTConfig(
    output_dir="./julian-age-lora",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # 4 x 2 = 8 samples per optimizer step per device
    warmup_steps=20,
    max_steps=300,
    learning_rate=2e-4,
    logging_steps=10,
    fp16=True,
    max_length=256,
    packing=False,
)

# Supervised fine-tuning with the LoRA adapter attached via `peft_config`.
# The tokenizer goes in as `processing_class`, which replaces the removed
# `tokenizer=` argument. `format_sample` turns each record into training text.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    formatting_func=format_sample,
    peft_config=lora,
)

NameError: name 'model' is not defined

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Persist the trained model. No path is passed, so the trainer picks the
# destination itself — presumably the `output_dir` given in its config
# ("./julian-age-lora"); confirm against the TRL/Transformers Trainer docs.
trainer.save_model()

In [None]:
from transformers import pipeline

# Text-generation pipeline loaded from the fine-tuning output directory,
# reusing the tokenizer that is already in memory.
pipe = pipeline(
    task="text-generation",
    model="./julian-age-lora",
    tokenizer=tokenizer,
)

# Prompt in the same tagged layout used for training, stopping after the user
# turn so the model has to produce the assistant section itself.
prompt = (
    "<system>\n"
    "Current Date: 2033-11-04\n"
    "Birthdate: 2008-05-21\n"
    "</system>\n"
    "<user>\n"
    "How old is Julian?\n"
    "</user>\n"
)

# The pipeline returns a list of candidates; take the text of the first one.
generations = pipe(prompt, max_new_tokens=50)
result = generations[0]["generated_text"]
print(result)
