In [4]:
# Installing all the software tools we need:
# - transformers, datasets, accelerate: For working with LLMs
# - peft, bitsandbytes: For efficient memory training (QLoRA)
!pip install -q transformers datasets accelerate peft bitsandbytes trl


In [5]:
from datasets import load_dataset, DatasetDict

# --- Load a new, reliable instruction dataset (we'll use 'databricks-dolly-15k') ---
# This dataset is already formatted with 'instruction' and 'response' columns,
# making it perfect for fine-tuning! We are loading a smaller version for quick training.
print("Loading the Databricks Dolly 15k dataset...")

# Load the full dataset first
dolly_data = load_dataset("databricks/databricks-dolly-15k", split="train")

# Filter the data to keep only useful instructions (like writing tasks)
# and take only the first 1000 examples for speed.
dolly_data = dolly_data.filter(lambda x: x["category"] in ["creative_writing", "summarization", "open_qa"])
train_data_subset = dolly_data.select(range(1000))

# --- Format the Data for the LLM ---
def format_data_for_llm(example):
    # This creates the INSTRUCTION/RESPONSE format required by the model:
    # [INST] Write a short story about... [/INST] The story begins: 'Once upon a time...'
    return {
        "text": f"[INST] {example['instruction']} [/INST] {example['response']}"
    }

# Apply the formatting to the subset
formatted_data = train_data_subset.map(format_data_for_llm)

print("Dataset is prepared and ready!")
# Show the first example to see the format:
print("\nExample Story Format:")
print(formatted_data[0]['text'])

Loading the Databricks Dolly 15k dataset...


README.md: 0.00B [00:00, ?B/s]

databricks-dolly-15k.jsonl:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15011 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset is prepared and ready!

Example Story Format:
[INST] Why can camels survive for long without water? [/INST] Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.


In [6]:
# Split the dataset into 90% for training and 10% for testing
# We use 'formatted_data' now, which is the variable we created in the step above!
data_split = formatted_data.train_test_split(test_size=0.1)

train_data = data_split["train"]
test_data = data_split["test"]

print(f"Total training examples: {len(train_data)}")
print(f"Total testing examples: {len(test_data)}")

Total training examples: 900
Total testing examples: 100


In [8]:
from huggingface_hub import login

# When you run this, a pop-up window will ask for your Hugging Face Token.
# Paste the token (the long string you just copied) into the box and press Enter.
login()

print("Successfully logged into Hugging Face!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Successfully logged into Hugging Face!


In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# 1. Configuration for memory saving (QLoRA)
# This lets us train the giant model on the free GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, # Load it in 4-bit to save space
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

MODEL_ID = "google/gemma-2b-it"

# 2. Load the Model (the brain)
print(f"Loading Model: {MODEL_ID}. This may take a few minutes...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto" # This automatically puts the model on the GPU
)
# 3. Load the Tokenizer (the translator)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
print("Model and Tokenizer loaded successfully.")

Loading Model: google/gemma-2b-it. This may take a few minutes...


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Model and Tokenizer loaded successfully.


In [24]:
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

# 1. LORA Configuration (Your settings are correct here)
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 2. Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=1, # This must remain 1
    # === FIX IS HERE: Add Gradient Accumulation ===
    gradient_accumulation_steps=4,
    # ==============================================
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
)

# Define a Formatting Function (This is correct)
def formatting_function(examples):
    # This function tells the SFTTrainer exactly how to find the text column.
    return examples['text']

# 3. Set up the SFT Trainer
# We are only keeping the arguments that are absolutely required.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    peft_config=lora_config,
    formatting_func=formatting_function,
    # REMOVED: tokenizer=tokenizer (because of the previous error)
    # REMOVED: max_seq_length=512 (because of the new error)
)

print("Training setup is complete. Ready to bake!")



Applying formatting function to train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Training setup is complete. Ready to bake!


In [25]:
print("Starting the fine-tuning process...")
# This is the step that actually uses the GPU for training!
trainer.train()
print("Training is complete! The model has learned new story skills.")

Starting the fine-tuning process...


Step,Training Loss
10,3.1899
20,2.4641
30,2.5333
40,2.4224
50,2.191
60,2.4806
70,2.188
80,2.2654
90,2.3496
100,2.2631


Training is complete! The model has learned new story skills.


In [30]:
from transformers import pipeline
import torch
import time

# Create a text generation pipeline
# We must include the tokenizer and set torch_dtype
story_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
)

# --- USER INPUT AREA ---
# 1. Define your story request here.
#    The variable 'prompt_text' is what your professor will ask for.
prompt_text = "Write a story about a brave astronaut cat who visits the moon."

# 2. Format the request with the tags the model was trained on.
formatted_prompt = f"[INST] {prompt_text} [/INST]"
# -----------------------

# Set up generation parameters
generation_args = {
    "max_new_tokens": 300,
    "do_sample": True,
    "temperature": 0.8,
    "top_k": 50,
    # Stop the generation immediately after the model's response is complete
    "eos_token_id": tokenizer.encode('[/INST]')[0],
}

print(f"\nYour request: {prompt_text}")

# Generate the story
# Using torch.no_grad() speeds up generation by disabling training calculations
with torch.no_grad():
    result = story_generator(
        formatted_prompt,
        **generation_args
    )

# Clean the Output for Presentation
generated_text = result[0]['generated_text']

# Find the end of your input prompt and take only the model's response part
if "[/INST]" in generated_text:
    final_story = generated_text.split("[/INST]")[1].strip()
else:
    final_story = generated_text

print("\n--- GENERATED STORY ---")
print(final_story)
print("-----------------------")

Device set to use cuda:0



Your request: Write a story about a brave astronaut cat who visits the moon.

--- GENERATED STORY ---
A brave astronaut cat named Cleo was ready to explore the world. She was ready to give the best of her meow to all the world's humans. Her first stop was the moon. 

Cleo was scared to go to the moon, but she had to remember that she had plenty of meow-plified energy inside her to get her there. 

Finally, Cleo landed on the Moon and the first thing she did was explore all the cats' meow-velous planet! She made friends with the lunar felines and ate all the lunar cats' favorite things.

Cleo was able to meow-dify her energy all around the world so that all the humans would meow-nally love her. Cleo was a cat astronaut who traveled to the moon to meow-nally give the best of her meow-titude to the world.
-----------------------


In [28]:
# Save the small learned parts (the LoRA adapter weights)
print("Saving LoRA adapter weights...")
# This saves the model's new skills into a folder on the Colab server.
trainer.model.save_pretrained("./fine_tuned_story_model")
print("Adapter saved successfully to: ./fine_tuned_story_model")

Saving LoRA adapter weights...
Adapter saved successfully to: ./fine_tuned_story_model


In [29]:
# Create a zip file of your trained model adapters
!zip -r story_model_adapters.zip fine_tuned_story_model/
print("Model folder has been compressed into 'story_model_adapters.zip'.")


  adding: fine_tuned_story_model/ (stored 0%)
  adding: fine_tuned_story_model/adapter_config.json (deflated 58%)
  adding: fine_tuned_story_model/adapter_model.safetensors (deflated 8%)
  adding: fine_tuned_story_model/README.md (deflated 65%)
Model folder has been compressed into 'story_model_adapters.zip'.
