<a href="https://colab.research.google.com/github/IslamiTP/Intro-to-LLM-Homework-2/blob/main/run_ollama_in_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Run Ollama in Colab

### Code to initialize llama using Unsloth was retrieved from

https://colab.research.google.com/drive/1T5-zKWM_5OD21QHwXHiV9ixTRR7k3iB9?usp=sharing#scrollTo=2eSvM9zX_2d3

In [12]:
%%capture
!pip install unsloth

In [13]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # NEW! Llama 3.3 70B!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [14]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 1, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [15]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.2",
)

def formatting_train_prompts(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

def tokenize_function(examples):
    return tokenizer(
        examples["text"],  # Use only text
        padding=True,
        truncation=True,
        max_length=2048,
        return_tensors="pt"
    )

from datasets import load_dataset

# Load the full dataset
full_dataset = load_dataset("mlabonne/FineTome-100k", split="train")

# Split the dataset into training and testing sets (e.g., 80% train, 20% test)
split_datasets = full_dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and test splits
train_dataset = split_datasets['train']
test_dataset = split_datasets['test']

print(f"Train dataset: {train_dataset}")
print(f"Test dataset: {test_dataset}")

Train dataset: Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 80000
})
Test dataset: Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 20000
})


In [16]:
from unsloth.chat_templates import standardize_sharegpt
train_dataset = standardize_sharegpt(train_dataset)
train_dataset = train_dataset.map(formatting_train_prompts, batched = True,)
test_dataset = standardize_sharegpt(test_dataset)
test_dataset = test_dataset.map(formatting_train_prompts, batched=True)
tokenized_datasets = test_dataset.map(tokenize_function, batched=True)

# Model Performance on 5 different categories outlined in question #1 before training

In [17]:
from unsloth.chat_templates import get_chat_template
import torch

# Assuming tokenizer and model are already loaded
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",
)

# Define a list of messages (each entry follows the chat template format)

message_list = [
    [# Text Summarization
     {"role": "user", "content": "Antibiotics are a type of medication used to treat bacterial infections. They work by either killing the bacteria or preventing them from reproducing, allowing the body’s immune system to fight off the infection. Antibiotics are usually taken orally in the form of pills, capsules, or liquid solutions, or sometimes administered intravenously. They are not effective against viral infections, and using them inappropriately can lead to antibiotic resistance.\n\n"},
     {"role": "user", "content": "Explain the above in one sentence.\n\n"}],

    [# Question Answering
     {"role": "user", "content": "Answer the question based on the context below. Keep the answer short and concise. Respond 'Unsure about answer' if not sure about the answer.\n\n"},
     {"role": "user", "content": "Context: Teplizumab traces its roots to a New Jersey drug company called Ortho Pharmaceutical. There, scientists generated an early version of the antibody, dubbed OKT3. Originally sourced from mice, the molecule was able to bind to the surface of T cells and limit their cell-killing potential. In 1986, it was approved to help prevent organ rejection after kidney transplants, making it the first therapeutic antibody allowed for human use.\n\n"},
     {"role": "user", "content": "Question: What was OKT3 originally sourced from?\n\n"}],

    [# Text Classification
     {"role": "user", "content": "Classify the text into neutral, negative or positive.\n\n"},
     {"role": "user", "content": "Text: I think the food was okay.\n\n"}],

    [# Role Playing
     {"role": "system", "content": "The following is a conversation with an AI research assistant. The assistant tone is technical and scientific.\n\n"},
     {"role": "user", "content": "Hello, who are you?\n\n"},
     {"role": "assistant", "content": "Greeting! I am an AI research assistant. How can I help you today?\n\n"},
     {"role": "user", "content": "Can you tell me about the creation of blackholes?\n\n"}],

    [# Reasoning
     {"role": "system", "content": "The odd numbers in this group add up to an even number: 15, 32, 5, 13, 82, 7, 1.\n\n"},
     {"role": "user", "content": "Solve by breaking the problem into steps. First, identify the odd numbers, add them, and indicate whether the result is odd or even.\n\n"}]
]

# Store results
generated_outputs = []

# Iterate through message list and generate responses
for messages in message_list:
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Required for generation
        return_tensors="pt",
    ).to("cuda")

    # Run inference without gradient tracking
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            use_cache=True,
            temperature=1.5,
            min_p=0.1
        )

    # Decode and store the output
    output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    generated_outputs.append(output_text)

prompt_types = ["Text Summarization", "Question Answering", "Text Classification", "Role Playing", "Reasoning"]

# Print all generated responses
for response, prompt_type in zip(generated_outputs, prompt_types):
    print(f"Prompt Type: {prompt_type}\n\n")

    print(f"Response: {response}\n\n\n")

Prompt Type: Text Summarization


Response: system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Antibiotics are a type of medication used to treat bacterial infections. They work by either killing the bacteria or preventing them from reproducing, allowing the body’s immune system to fight off the infection. Antibiotics are usually taken orally in the form of pills, capsules, or liquid solutions, or sometimes administered intravenously. They are not effective against viral infections, and using them inappropriately can lead to antibiotic resistance.

user

Explain the above in one sentence.

assistant

Antibiotics are a type of medication that kill or prevent bacteria from reproducing, allowing the body's immune system to fight off infections, but they are ineffective against viral infections and can lead to antibiotic resistance if used improperly.



Prompt Type: Question Answering


Response: system

Cutting Knowledge Date: December 2023
Today Date: 26 July 

# Model performance on the representative dataset before training

In [18]:
messages_list = [[{"role": conv["role"], "content": conv["content"] + "\n\n"} for conv in message] for message in test_dataset["conversations"][:5]] # test the first 5 samples of the representative dataset that will be trained

# Store results
generated_outputs = []

# Iterate through message list and generate responses
for messages in messages_list:
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Required for generation
        return_tensors="pt",
    ).to("cuda")

    # Run inference without gradient tracking
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            use_cache=True,
            temperature=1.5,
            min_p=0.1
        )

    # Decode and store the output
    output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    generated_outputs.append(output_text)

for response in generated_outputs:
    print(f"Response: {response}\n\n\n")

Response: system

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

user

Give three types of computer graphics.

assistant

1. Raster Graphics: These are also called bitmap graphics and are composed of pixels arranged in a grid. Each pixel can have a different color and shade. Raster graphics excel at representing photographic images and digital painting.

2. Vector Graphics: These graphics are constructed using mathematical formulas representing geometric shapes like lines, curves, and polygons. They are resolution-independent, meaning they can be scaled up or down in size without losing quality. Vector graphics are commonly used for logos, icons, typography and illustrations.

3. 3D Graphics: These graphics are used to create three-dimensional digital representations of objects. 3D graphics use techniques like modeling, rendering, and shading to simulate depth and surface properties. These graphics are used in animation, video games, architecture, engineering, and virt

# Fine-tuning using LORA and SFTTrainer which is a special trainer designed to fine-tune models.

In [19]:
import matplotlib.pyplot as plt
from transformers import TrainerCallback
import numpy as np
from unsloth import is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# Define a custom callback to capture the training and validation losses
class LossLoggerCallback(TrainerCallback):
    def __init__(self):
        self.train_losses = []
        self.val_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            # Capture training loss
            if "loss" in logs:
                self.train_losses.append(logs["loss"])

            # Capture validation loss (eval_loss)
            if "eval_loss" in logs:
                self.val_losses.append(logs["eval_loss"])

# Initialize the callback
loss_logger = LossLoggerCallback()

# Create a trainer object with the custom callback
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    eval_dataset=tokenized_datasets.select(range(100)),
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=256,
        warmup_steps=5,
        num_train_epochs=1,
        learning_rate=9e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1, # log training loss every 1 step
        evaluation_strategy="steps",  # Ensure validation happens periodically
        eval_steps=1,  # Evaluate every 1 step
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
    callbacks=[loss_logger]  # Attach the loss logger callback
)

# Train the model
trainer.train()

# Final plot after training is complete
plt.figure(figsize=(10, 5))
plt.plot(loss_logger.train_losses, label='Train Loss')
plt.plot(loss_logger.val_losses, label='Validation Loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Final Training and Validation Loss')
plt.legend()
plt.show()

Unsloth: We found double BOS tokens - we shall remove one automatically.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 80,000 | Num Epochs = 1 | Total steps = 156
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 256
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 256 x 1) = 512
 "-____-"     Trainable parameters = 1,519,616/3,214,269,440 (0.05% trained)


Step,Training Loss,Validation Loss
1,1.3848,1.506838
2,1.4124,1.452232
3,1.3696,1.308076
4,1.1986,1.188115
5,1.1113,1.105791
6,1.0126,1.0628
7,0.9794,1.030821
8,0.9442,1.012819
9,0.937,1.000356
10,0.9081,0.988569


Step,Training Loss,Validation Loss
1,1.3848,1.506838
2,1.4124,1.452232
3,1.3696,1.308076
4,1.1986,1.188115
5,1.1113,1.105791
6,1.0126,1.0628
7,0.9794,1.030821
8,0.9442,1.012819
9,0.937,1.000356
10,0.9081,0.988569


KeyboardInterrupt: 

#Model performance on representative examples from the dataset after training

In [None]:
messages_list = [[{"role": conv["role"], "content": conv["content"] + "\n\n"} for conv in message] for message in test_dataset["conversations"][:5]] # test the first 5 samples of the representative dataset that will be trained

# Store results
generated_outputs = []

# Iterate through message list and generate responses
for messages in messages_list:
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Required for generation
        return_tensors="pt",
    ).to("cuda")

    # Run inference without gradient tracking
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            use_cache=True,
            temperature=1.5,
            min_p=0.1
        )

    # Decode and store the output
    output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    generated_outputs.append(output_text)

for response in generated_outputs:
    print(f"Response: {response}\n\n\n")

# Model performance on the 5 different categories in question #1 after training

In [None]:
from unsloth.chat_templates import get_chat_template
import torch

# Assuming tokenizer and model are already loaded
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",
)

# Define a list of messages (each entry follows the chat template format)

# Used ChatGPT to come up with new prompts for each task in this section

message_list = [
    [{"role": "system", "content": "You are a helpful AI assistant that specializes in summarizing text.\n\n"}, # Text Summarization
     {"role": "user", "content": "Summarize the following article in 2-3 sentences:\n\nIn recent years, the importance of renewable energy has grown significantly due to increasing concerns about climate change. Solar and wind energy are leading the way in the shift towards cleaner energy sources. Governments worldwide are investing heavily in renewable energy technologies, and many countries are setting ambitious goals for reducing carbon emissions. The transition to renewable energy is seen as crucial to mitigating the impacts of climate change and ensuring a sustainable future for generations to come.\n\n"}],

    [{"role": "system", "content": "You are an AI assistant that answers questions based on provided passages.\n\n"}, # Question Answering
     {"role": "user", "content": "Read the following passage and answer the question:\n\nThe Eiffel Tower is located in Paris, France. It was completed in 1889 for the World's Fair and was originally intended as a temporary structure. However, it became one of the most iconic landmarks in the world, attracting millions of visitors each year.\n\nQuestion: When was the Eiffel Tower completed?\n\n"}],

    [{"role": "system", "content": "You are an AI that classifies text into categories: Technology, Health, Sports, or Politics.\n\n"}, # Text Classification
     {"role": "user", "content": "Classify the following text:\n\nThe latest advancements in AI and machine learning are revolutionizing industries. Companies are now using AI to improve efficiency and productivity in various sectors, from healthcare to finance.\n\n"}],

    [{"role": "system", "content": "You are a customer support agent for an online course company. Respond helpfully to customers.\n\n"}, # Role Playing
     {"role": "user", "content": "Hi, I’m having trouble accessing the course I purchased. Can you help me?\n\n"}],

    [{"role": "system", "content": "Answer the following reasoning question:\n\n"}, #Reasoning
     {"role": "user", "content": "If all roses are flowers, and some flowers are red, can we conclude that some roses are red?\n\n"}]
]

# Store results
generated_outputs = []

# Iterate through message list and generate responses
for messages in message_list:
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Required for generation
        return_tensors="pt",
    ).to("cuda")

    # Run inference without gradient tracking
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=2048,
            use_cache=True,
            temperature=1.5,
            min_p=0.1
        )

    # Decode and store the output
    output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    generated_outputs.append(output_text)

prompt_types = ["Text Summarization", "Question Answering", "Text Classification", "Role Playing", "Reasoning"]

# Print all generated responses
for i, (response, messages, prompt_type) in enumerate(zip(generated_outputs, message_list, prompt_types)):
    print(f"Prompt Type: {prompt_type}\n\n")

    print(f"Response: {response}\n\n\n")