In [1]:
# installing required libraries
!pip install transformers datasets accelerate peft trl bitsandbytes typing

Collecting trl
  Downloading trl-0.24.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting typing
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading trl-0.24.0-py3-none-any.whl (423 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: typing
  Building wheel for typing (setup.py) ... [?25l[?25hdone
  Created wheel for typing: filename=typing-3.7.4.3-py3-none-any.whl size=26304 sha256=377

In [1]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import(
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
# don't do typos!

In [2]:
# defining model, defining tokenizer and quantization
base_model_id = "microsoft/Phi-3-mini-4k-instruct"
new_model_name = "phi3-mini-persona-merged"
merged_model_name = "phi3-mini-persona-merged"

# 4-bit quantization configuration(for memory efficiency)
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
) # important piece for finetuning on consumer GPUs

#model
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

#tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# so phi 3 doesn't support {% generation %}
# this is a workaround here
tokenizer.chat_template = """{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' %}<|system|>
{{ message['content'] }}<|end|>
{% elif message['role'] == 'user' %}<|user|>
{{ message['content'] }}<|end|>
{% elif message['role'] == 'assistant' %}<|assistant|>
{% generation %}{{ message['content'] }}<|end|>
{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>
{% generation %}{% endif %}"""


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [3]:
#2000 for training and 100 for eval
raw_dataset= load_dataset("Salesforce/Webscale-RL",split="train[:2100]")

def get_field(ex, names):
  for n in names:
    if n in ex and ex[n]:
      return ex[n]
  return ""

# format Phi-3 chat template
def format_prompt(ex):
    domain = get_field(ex, ["domain", "domainstringclasses", "domain_class"])
    persona = get_field(ex, ["persona", "personastringlengths", "audience"])
    ctx = (get_field(ex, ["pretraining_text", "pretrain_text", "pretrain_textstringlengths"])).strip()
    q = get_field(ex, ["question", "questionstringlengths"])
    a = get_field(ex, ["answer", "answerstringlengths"])

    # This is the exact template from the Phi-3 model card
    system_prompt = f"You are an expert in {domain}. You are answering a question for a {persona}."
    user_prompt = f"Context:\n{ctx}\n\nQuestion:\n{q}"

    return {
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": a}
        ]
    }


dataset = raw_dataset.map(format_prompt, remove_columns=raw_dataset.column_names)

# simple dataset split
train_dataset = dataset.select(range(2000)) # train on 2000 samples
eval_raw = raw_dataset.select(range(2000,2100)) # evaluate on 100




README.md: 0.00B [00:00, ?B/s]

data/part-0.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

data/part-1.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

data/part-10.parquet:   0%|          | 0.00/80.7M [00:00<?, ?B/s]

data/part-11.parquet:   0%|          | 0.00/8.67M [00:00<?, ?B/s]

data/part-2.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

data/part-3.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

data/part-4.parquet:   0%|          | 0.00/136M [00:00<?, ?B/s]

data/part-5.parquet:   0%|          | 0.00/107M [00:00<?, ?B/s]

data/part-6.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

data/part-7.parquet:   0%|          | 0.00/48.0M [00:00<?, ?B/s]

data/part-8.parquet:   0%|          | 0.00/95.4M [00:00<?, ?B/s]

data/part-9.parquet:   0%|          | 0.00/81.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1110662 [00:00<?, ? examples/s]

Map:   0%|          | 0/2100 [00:00<?, ? examples/s]

In [4]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear"
)

sft_config = SFTConfig(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=1e-3,
    fp16=True,
    max_grad_norm=0.3,
    max_steps=-1,  # Train on all 2000 examples (approx 250 steps)
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    save_strategy="epoch",      # Save at end of each epoch
    save_only_model=True,       # Save only model weights, not optimizer states
    max_length=1024,
    # packing=True,
    # dataset_text_field="text",
    assistant_only_loss=False,   # Only compute loss on assistant responses
    gradient_checkpointing=True,
    # OPTIONAL BUT RECOMMENDED:
    report_to="none",
    load_best_model_at_end=False,  # Not needed for 1 epoch
)

print(f"  LoRA rank: {peft_config.r}, alpha: {peft_config.lora_alpha}")
print(f"  Effective batch size: {sft_config.per_device_train_batch_size * sft_config.gradient_accumulation_steps}")
print(f"  Approximate steps: ~{2000 // (sft_config.per_device_train_batch_size * sft_config.gradient_accumulation_steps)}")

  LoRA rank: 16, alpha: 32
  Effective batch size: 8
  Approximate steps: ~250


In [5]:
# was missing this
trainer = SFTTrainer(
    model=model,
    # tokenizer=tokenizer,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=sft_config,

)
trainer.tokenizer = tokenizer
trainer.train()
trainer.model.save_pretrained(new_model_name)
tokenizer.save_pretrained(new_model_name)

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (7335 > 4096). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.
  return fn(*args, **kwargs)


Step,Training Loss
25,2.0479
50,1.9118
75,1.8563
100,1.8152
125,1.8216
150,1.7464
175,1.7876
200,1.7548
225,1.6831
250,1.7305


('phi3-mini-persona-merged/tokenizer_config.json',
 'phi3-mini-persona-merged/special_tokens_map.json',
 'phi3-mini-persona-merged/chat_template.jinja',
 'phi3-mini-persona-merged/tokenizer.model',
 'phi3-mini-persona-merged/added_tokens.json',
 'phi3-mini-persona-merged/tokenizer.json')

In [6]:
# -----------------------------------------------------------------
# Step 7: Merge LoRA Adapters into Base Model
# -----------------------------------------------------------------
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gc

# Clear memory
torch.cuda.empty_cache()
gc.collect()

print(f"GPU memory available: {torch.cuda.mem_get_info()[0] / 1024**3:.2f} GB")

# Load base model in FP16
print("Loading base model in FP16...")
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Load LoRA adapters
print("Loading LoRA adapters from ./results/checkpoint-250...")
model_with_adapters = PeftModel.from_pretrained(
    base_model,
    "./results/checkpoint-250"
)

# Merge
print("Merging adapters into base model...")
merged_model = model_with_adapters.merge_and_unload()

# Save
print("Saving merged model to ./phi3-mini-persona-merged...")
merged_model.save_pretrained("./phi3-mini-persona-merged")

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    trust_remote_code=True
)
tokenizer.save_pretrained("./phi3-mini-persona-merged")

print("\n" + "="*60)
print("✅ MERGE COMPLETE!")
print("="*60)

# Cleanup
del base_model, model_with_adapters, merged_model
torch.cuda.empty_cache()
gc.collect()

GPU memory available: 10.75 GB
Loading base model in FP16...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LoRA adapters from ./results/checkpoint-250...
Merging adapters into base model...
Saving merged model to ./phi3-mini-persona-merged...

✅ MERGE COMPLETE!


544

In [7]:
# To avoid memory issue (OOM)
# cleanup
import torch
import gc
torch.cuda.empty_cache()
gc.collect()

# Check available memory
available_mem = torch.cuda.mem_get_info()[0] / 1024**3
print(f"--- [Step 8/9] GPU memory available: {available_mem:.2f} GB ---")

MERGED_MODEL_PATH = "./phi3-mini-persona-merged"

# Load merged model in FP16
print(f"--- [Step 8/9] Loading merged model from {MERGED_MODEL_PATH}... ---")
try:
    final_model = AutoModelForCausalLM.from_pretrained(
        MERGED_MODEL_PATH,
        device_map="cuda",
        torch_dtype=torch.float16
    )
    print("✅ Model loaded on GPU")
except Exception as e:
    print(f"⚠️  GPU loading failed, using auto device_map: {e}")
    final_model = AutoModelForCausalLM.from_pretrained(
        MERGED_MODEL_PATH,
        device_map="auto",  # Fallback to auto
        torch_dtype=torch.float16
    )

# Load tokenizer
final_tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
final_tokenizer.pad_token = final_tokenizer.eos_token
final_tokenizer.padding_side = "left"

print("--- [Step 8/9] Final model and tokenizer loaded successfully ---")

# Quick verification test
print("--- [Step 8/9] Running quick verification test... ---")
test_prompt = "<|system|>\nYou are an expert in healthcare.<|end|>\n<|user|>\nWhat is aspirin used for?<|end|>\n<|assistant|>\n"
test_inputs = final_tokenizer(test_prompt, return_tensors="pt").to(final_model.device)
with torch.no_grad():
    test_output = final_model.generate(
        **test_inputs,
        max_new_tokens=50,
        pad_token_id=final_tokenizer.pad_token_id
    )
test_text = final_tokenizer.decode(test_output[0], skip_special_tokens=True)
verification_response = test_text.split("<|assistant|>")[-1].strip()[:100]

print(f"✅ Verification response: {verification_response}...")
print("--- [Step 8/9] Model ready for evaluation ---")

--- [Step 8/9] Loading final merged model for evaluation ---
--- [Step 8/9] GPU memory available: 10.75 GB ---
--- [Step 8/9] Loading merged model from ./phi3-mini-persona-merged... ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model loaded on GPU
--- [Step 8/9] Final model and tokenizer loaded successfully ---
--- [Step 8/9] Running quick verification test... ---
✅ Verification response: You are an expert in healthcare. What is aspirin used for? Aspirin is used to reduce fever and relie...
--- [Step 8/9] Model ready for evaluation ---


In [9]:

import os

checkpoint_path = "./results/checkpoint-250"
print("Checkpoint files:")
for file in os.listdir(checkpoint_path):
    size_mb = os.path.getsize(os.path.join(checkpoint_path, file)) / (1024**2)
    print(f"  {file}: {size_mb:.2f} MB")



Checkpoint files:
  adapter_config.json: 0.00 MB
  added_tokens.json: 0.00 MB
  trainer_state.json: 0.00 MB
  chat_template.jinja: 0.00 MB
  README.md: 0.00 MB
  special_tokens_map.json: 0.00 MB
  adapter_model.safetensors: 96.03 MB
  tokenizer_config.json: 0.00 MB
  tokenizer.json: 3.45 MB
  training_args.bin: 0.01 MB
  tokenizer.model: 0.48 MB


In [10]:
# Simple test without any fancy formatting
simple_prompt = "The capital of France is"

inputs = final_tokenizer(simple_prompt, return_tensors="pt").to(final_model.device)
outputs = final_model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=False
)

input_len = inputs['input_ids'].shape[1]
new_tokens = outputs[0][input_len:]
print("Generated:", final_tokenizer.decode(new_tokens, skip_special_tokens=True))



Generated: Paris.


In [14]:
!    pip install tqdm



In [15]:
from tqdm import tqdm


def generate_clean(model, tokenizer, prompt, max_new_tokens=128):
    """Generate without echoing the prompt"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )


    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    response = response.replace("<|end|>", "").replace("<|assistant|>", "").strip()

    return response

# Test on a persona-specific example
test_prompt = """<|system|>
You are an expert in science. You are answering a question for a 10-year-old student.<|end|>
<|user|>
Why is the sky blue?<|end|>
<|assistant|>
"""

result = generate_clean(final_model, final_tokenizer, test_prompt)
print(f"Generated:\n{result}\n")

# If this looks good, run full eval
if len(result) > 10 and "<|system|>" not in result:



    gens_fixed = []
    for prompt in tqdm(prompts, desc="Generating responses"):
        response = generate_clean(final_model, final_tokenizer, prompt, max_new_tokens=128)
        gens_fixed.append(response)

    # Recalculate metrics
    em_fixed = np.mean([exact_match(g, a) for g, a in zip(gens_fixed, answers)])
    overlap_fixed = np.mean([char_overlap(g, a) for g, a in zip(gens_fixed, answers)])
    style_fixed = np.mean([persona_markers(g, p) for g, p in zip(gens_fixed, personas)])



    print(f"Exact Match (EM):          {em_fixed:.3f}")
    print(f"Semantic Overlap (Char):   {overlap_fixed:.3f}")
    print(f"Persona-Style Hit Rate:    {style_fixed:.3f}")

    for idx in [0, 1, 2]:
        print(f"[Example {idx}]")
        print(f"Persona:      {personas[idx]}")
        print(f"Question:     {get_field(eval_raw[idx], ['question', 'questionstringlengths'])[:80]}...")
        print(f"Ground Truth: {answers[idx][:150]}...")
        print(f"Generated:    {gens_fixed[idx][:150]}...")
        print("-" * 60 + "\n")
else:

    print(result)

✅ Checkpoint validated: 96MB adapter
✅ Generation test passed: 'Paris'
✅ Proceeding to fixed evaluation...

🧪 Testing fixed generation on your fine-tuned model...
Generated:
The sky looks blue because of a process called scattering. When sunlight reaches Earth's atmosphere, it's made up of different colors. Blue light gets scattered in all directions by the tiny oxygen and nitrogen molecules in our atmosphere, making the sky appear blue.

✅ Generation looks good! Running full evaluation...



Generating responses: 100%|██████████| 100/100 [03:29<00:00,  2.10s/it]


🎯 FIXED EVALUATION RESULTS
Exact Match (EM):          0.110
Semantic Overlap (Char):   0.783
Persona-Style Hit Rate:    0.000

📝 Sample Outputs:

[Example 0]
Persona:       educators
Question:     In the study of cartography, it is important to recognize the individuals who co...
Ground Truth: Tales de Mileto....
Generated:    Tales de Mileto....
------------------------------------------------------------

[Example 1]
Persona:       general readers
Question:     In the study of maps and their history, which historical figure is believed by h...
Ground Truth: Tales de Mileto....
Generated:    Tales de Mileto....
------------------------------------------------------------

[Example 2]
Persona:      computer science students
Question:     When testing different segments of code on a Sun UltraSPARC-2 and a Pentium III,...
Ground Truth: With the -O optimization flag, both code versions generate identical code due to optimizations such as constant folding and substitution, while compil...




In [19]:

from tqdm import tqdm
import numpy as np

def generate_response(model, tokenizer, prompt, max_new_tokens=128):
    """Generate response without prompt echo"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only new tokens
    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    response = response.replace("<|end|>", "").replace("<|assistant|>", "").strip()

    return response


print("\nPreparing evaluation data...")

def format_prompt(ex):
    domain = get_field(ex, ["domain", "domainstringclasses", "domain_class"])
    persona = get_field(ex, ["persona", "personastringlengths", "audience"])
    ctx = get_field(ex, ["pretraining_text", "pretrain_text", "pretrain_textstringlengths"]).strip()
    q = get_field(ex, ["question", "questionstringlengths"])

    if len(ctx) > 2000:
        ctx = ctx[:2000] + "..."

    sys = f"You are an expert in {domain}. You are answering a question for a {persona}."
    user = f"Context:\n{ctx}\n\nQuestion:\n{q}"
    return f"<|system|>\n{sys}<|end|>\n<|user|>\n{user}<|end|>\n<|assistant|>\n"

prompts = [format_prompt(ex) for ex in eval_raw]
answers = [get_field(ex, ["answer", "answerstringlengths"]) for ex in eval_raw]
personas = [get_field(ex, ["persona", "personastringlengths", "audience"]) for ex in eval_raw]

print(f"Prepared {len(prompts)} prompts for evaluation")


print("\nGenerating responses...")

predictions = []
for prompt in tqdm(prompts, desc="Generating"):
    response = generate_response(final_model, final_tokenizer, prompt, max_new_tokens=128)
    predictions.append(response)

print(f"Generated {len(predictions)} responses")


def exact_match(pred, truth):
    return int(pred.strip().lower() == truth.strip().lower())

def character_overlap(pred, truth):
    if not truth:
        return 0.0
    pred_set = set(pred.lower())
    truth_set = set(truth.lower())
    intersection = len(pred_set & truth_set)
    union = len(pred_set | truth_set)
    return intersection / max(1, union)

def check_persona_style(text, persona):
    """Detect persona-appropriate language patterns"""
    text = text.lower()
    persona = str(persona).lower()

    # Explanatory patterns
    explanatory = [
        "because", "that's why", "for example", "such as", "this means",
        "in other words", "simply", "basically", "refers to"
    ]

    # Technical patterns
    technical = [
        "optimization", "implementation", "methodology", "framework",
        "algorithm", "protocol", "mechanism", "architecture", "analysis",
        "procedure", "technique", "approach", "system", "process"
    ]

    # Educational patterns
    educational = [
        "important", "key", "note", "consider", "understand",
        "demonstrates", "indicates", "suggests", "means", "allows"
    ]

    # Reasoning patterns
    reasoning = [
        "when", "while", "although", "however", "therefore", "thus",
        "consequently", "due to", "in order to"
    ]

    has_explanatory = any(m in text for m in explanatory)
    has_technical = any(m in text for m in technical)
    has_educational = any(m in text for m in educational)
    has_reasoning = any(m in text for m in reasoning)

    if "student" in persona or "general" in persona or "readers" in persona:
        return has_explanatory or has_educational or has_reasoning
    elif any(kw in persona for kw in ["scientist", "professional", "computer", "researcher"]):
        return has_technical or has_educational
    elif "educator" in persona or "teacher" in persona:
        return has_educational or has_reasoning
    else:
        return has_explanatory or has_technical or has_educational or has_reasoning


print("\nCalculating metrics...")

em_score = np.mean([exact_match(p, a) for p, a in zip(predictions, answers)])
overlap_score = np.mean([character_overlap(p, a) for p, a in zip(predictions, answers)])
persona_score = np.mean([check_persona_style(p, per) for p, per in zip(predictions, personas)])

print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
print(f"Exact Match:              {em_score:.3f} ({em_score*100:.1f}%)")
print(f"Semantic Overlap:         {overlap_score:.3f} ({overlap_score*100:.1f}%)")
print(f"Persona-Style Hit Rate:   {persona_score:.3f} ({persona_score*100:.1f}%)")
print("="*60)

print("\nSAMPLE OUTPUTS:\n")

for idx in [0, 1, 2]:
    print(f"[Example {idx+1}]")
    print(f"Persona:      {personas[idx]}")
    print(f"Question:     {get_field(eval_raw[idx], ['question', 'questionstringlengths'])[:80]}...")
    print(f"Ground Truth: {answers[idx][:150]}...")
    print(f"Generated:    {predictions[idx][:150]}...")

    if check_persona_style(predictions[idx], personas[idx]):
        print("Persona adaptation: Yes")

    print("-" * 60 + "\n")

print("PERSONA ADAPTATION ANALYSIS:\n")

from collections import defaultdict
persona_stats = defaultdict(lambda: {"total": 0, "adapted": 0})

for pred, persona in zip(predictions, personas):
    persona_clean = persona.strip().lower()
    persona_stats[persona_clean]["total"] += 1
    if check_persona_style(pred, persona):
        persona_stats[persona_clean]["adapted"] += 1

for persona_type, stats in sorted(persona_stats.items(), key=lambda x: x[1]["total"], reverse=True)[:5]:
    rate = stats["adapted"] / stats["total"] if stats["total"] > 0 else 0
    print(f"{persona_type:30s}: {stats['adapted']:2d}/{stats['total']:2d} ({rate*100:5.1f}%)")

print("\n" + "="*60)
print("EVALUATION COMPLETE")
print("="*60)
print(f"\nModel saved at: ./phi3-mini-persona-merged")
print(f"Checkpoint at:  ./results/checkpoint-250")


Preparing evaluation data...
Prepared 100 prompts for evaluation

Generating responses...


Generating: 100%|██████████| 100/100 [02:58<00:00,  1.78s/it]

Generated 100 responses

Calculating metrics...

EVALUATION RESULTS
Exact Match:              0.160 (16.0%)
Semantic Overlap:         0.795 (79.5%)
Persona-Style Hit Rate:   0.140 (14.0%)

SAMPLE OUTPUTS:

[Example 1]
Persona:       educators
Question:     In the study of cartography, it is important to recognize the individuals who co...
Ground Truth: Tales de Mileto....
Generated:    Tales de Mileto....
------------------------------------------------------------

[Example 2]
Persona:       general readers
Question:     In the study of maps and their history, which historical figure is believed by h...
Ground Truth: Tales de Mileto....
Generated:    Tales de Mileto....
------------------------------------------------------------

[Example 3]
Persona:      computer science students
Question:     When testing different segments of code on a Sun UltraSPARC-2 and a Pentium III,...
Ground Truth: With the -O optimization flag, both code versions generate identical code due to optimizations




i found the problem the persona scores are low because the model is focusing more on being concise how will it adapt persona when it's generating 2 or 3 words

In [20]:
# -----------------------------------------------------------------
# Detailed Persona Analysis
# -----------------------------------------------------------------
print("="*60)
print("DETAILED PERSONA ADAPTATION ANALYSIS")
print("="*60)

# Analyze what's working
adapted_examples = []
failed_examples = []

for idx, (pred, persona, answer) in enumerate(zip(predictions, personas, answers)):
    if check_persona_style(pred, persona):
        adapted_examples.append((idx, persona, pred))
    else:
        failed_examples.append((idx, persona, pred))

print(f"\nSuccessful adaptations: {len(adapted_examples)}/100")
print(f"Failed adaptations: {len(failed_examples)}/100")

# Show successful cases
print("\n" + "-"*60)
print("SUCCESSFUL PERSONA ADAPTATIONS:")
print("-"*60)
for idx, persona, pred in adapted_examples[:5]:
    print(f"\n[Example {idx}]")
    print(f"Persona: {persona}")
    print(f"Response: {pred[:200]}...")
    print()

# Show failed cases to understand the gap
print("-"*60)
print("FAILED ADAPTATIONS (samples):")
print("-"*60)
for idx, persona, pred in failed_examples[:5]:
    print(f"\n[Example {idx}]")
    print(f"Persona: {persona}")
    print(f"Response: {pred[:200]}...")
    print()

# Check response lengths
avg_length = np.mean([len(p.split()) for p in predictions])
print(f"\nAverage response length: {avg_length:.1f} words")

# Check if responses are too short (might be undertrained)
short_responses = sum(1 for p in predictions if len(p.split()) < 10)
print(f"Responses under 10 words: {short_responses}/100")

DETAILED PERSONA ADAPTATION ANALYSIS

Successful adaptations: 14/100
Failed adaptations: 86/100

------------------------------------------------------------
SUCCESSFUL PERSONA ADAPTATIONS:
------------------------------------------------------------

[Example 2]
Persona: computer science students
Response: Compiling with gcc using the -O optimization flag generates identical code for both versions, while compiling without the flag yields different results....


[Example 8]
Persona: science fiction enthusiasts
Response: The Auoileans offered to share technology including the key to fusion, advanced nanotech, and the science behind unlocking the unused areas of the carbon based brain....


[Example 9]
Persona:  space exploration followers
Response: Fusion technology, advanced nanotech, and the science behind unlocking the unused areas of the carbon-based brain....


[Example 13]
Persona:  General Readers
Response: Karna was excluded from the Suyamvara because he was a Kshatriya, and his

In [21]:
# -----------------------------------------------------------------
# Improved Generation Function
# -----------------------------------------------------------------
def generate_response(model, tokenizer, prompt, max_new_tokens=200, min_new_tokens=30):
    """Generate longer, more detailed responses"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,     # Increased from 128
            min_new_tokens=min_new_tokens,     # NEW: Force minimum length
            do_sample=True,
            temperature=0.8,                   # Increased from 0.7 (more creative)
            top_p=0.95,                        # Increased from 0.9 (more diverse)
            repetition_penalty=1.1,            # NEW: Reduce repetition
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    response = response.replace("<|end|>", "").replace("<|assistant|>", "").strip()

    return response

# -----------------------------------------------------------------
# Test on Previously Failed Examples
# -----------------------------------------------------------------
print("Testing improved generation on failed examples:\n")

failed_indices = [0, 1, 3, 4, 5]  # From your output above

for idx in failed_indices:
    prompt = prompts[idx]
    new_response = generate_response(final_model, final_tokenizer, prompt)

    print(f"[Example {idx}]")
    print(f"Persona: {personas[idx]}")
    print(f"Old ({len(predictions[idx].split())} words): {predictions[idx]}")
    print(f"New ({len(new_response.split())} words): {new_response[:250]}...")
    print(f"Adapted: {check_persona_style(new_response, personas[idx])}")
    print("-"*60 + "\n")

Testing improved generation on failed examples:

[Example 0]
Persona:  educators
Old (3 words): Tales de Mileto.
New (132 words): Tales de Mileto Aristotle concluded that the Earth is spherical. Context:
Map – What Is Its Use And How Do We Make Them In School Nowadays? → The Art Of Teaching Geometry With Maps As A Resource • Mathematics · By Khadija Mohamedi  178 votes Leave Yo...
Adapted: True
------------------------------------------------------------

[Example 1]
Persona:  general readers
Old (3 words): Tales de Mileto.
New (87 words): Homer According to some sources like 'Tales De Miloioi', this has been attributed to him since he wrote one called ‘Geography’ around 300BC , so I would consider him being credited somehow, although we don´t exactly remember if his description was tr...
Adapted: True
------------------------------------------------------------

[Example 3]
Persona:  software engineers
Old (9 words): Both versions of the code generate identical optimized code.
New (54

I was correct

In [22]:
# attempt 3
# -----------------------------------------------------------------
# Step 9: Model Evaluation with Optimized Generation
# -----------------------------------------------------------------
from tqdm import tqdm
import numpy as np

print("="*60)
print("STARTING EVALUATION")
print("="*60)

# -----------------------------------------------------------------
# Optimized Generation Function
# -----------------------------------------------------------------
def generate_response(model, tokenizer, prompt, max_new_tokens=200, min_new_tokens=30):
    """Generate detailed responses with minimum length requirement"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            do_sample=True,
            temperature=0.8,
            top_p=0.95,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    response = response.replace("<|end|>", "").replace("<|assistant|>", "").strip()

    return response

# -----------------------------------------------------------------
# Prepare Evaluation Data
# -----------------------------------------------------------------
print("\nPreparing evaluation data...")

def format_prompt(ex):
    domain = get_field(ex, ["domain", "domainstringclasses", "domain_class"])
    persona = get_field(ex, ["persona", "personastringlengths", "audience"])
    ctx = get_field(ex, ["pretraining_text", "pretrain_text", "pretrain_textstringlengths"]).strip()
    q = get_field(ex, ["question", "questionstringlengths"])

    if len(ctx) > 2000:
        ctx = ctx[:2000] + "..."

    sys = f"You are an expert in {domain}. You are answering a question for a {persona}."
    user = f"Context:\n{ctx}\n\nQuestion:\n{q}"
    return f"<|system|>\n{sys}<|end|>\n<|user|>\n{user}<|end|>\n<|assistant|>\n"

prompts = [format_prompt(ex) for ex in eval_raw]
answers = [get_field(ex, ["answer", "answerstringlengths"]) for ex in eval_raw]
personas = [get_field(ex, ["persona", "personastringlengths", "audience"]) for ex in eval_raw]

print(f"Prepared {len(prompts)} prompts")

# -----------------------------------------------------------------
# Generate Responses
# -----------------------------------------------------------------
print("\nGenerating responses...")

predictions = []
for prompt in tqdm(prompts, desc="Generating"):
    response = generate_response(final_model, final_tokenizer, prompt)
    predictions.append(response)

print(f"Generated {len(predictions)} responses")

# -----------------------------------------------------------------
# Evaluation Metrics
# -----------------------------------------------------------------
def exact_match(pred, truth):
    return int(pred.strip().lower() == truth.strip().lower())

def character_overlap(pred, truth):
    if not truth:
        return 0.0
    pred_set = set(pred.lower())
    truth_set = set(truth.lower())
    intersection = len(pred_set & truth_set)
    union = len(pred_set | truth_set)
    return intersection / max(1, union)

def check_persona_style(text, persona):
    """Detect persona-appropriate language patterns"""
    text = text.lower()
    persona = str(persona).lower()

    explanatory = [
        "because", "that's why", "for example", "such as", "this means",
        "in other words", "simply", "basically", "refers to"
    ]

    technical = [
        "optimization", "implementation", "methodology", "framework",
        "algorithm", "protocol", "mechanism", "architecture", "analysis",
        "procedure", "technique", "approach", "system", "process"
    ]

    educational = [
        "important", "key", "note", "consider", "understand",
        "demonstrates", "indicates", "suggests", "means", "allows"
    ]

    reasoning = [
        "when", "while", "although", "however", "therefore", "thus",
        "consequently", "due to", "in order to"
    ]

    has_explanatory = any(m in text for m in explanatory)
    has_technical = any(m in text for m in technical)
    has_educational = any(m in text for m in educational)
    has_reasoning = any(m in text for m in reasoning)

    if "student" in persona or "general" in persona or "readers" in persona:
        return has_explanatory or has_educational or has_reasoning
    elif any(kw in persona for kw in ["scientist", "professional", "computer", "researcher", "engineer"]):
        return has_technical or has_educational
    elif "educator" in persona or "teacher" in persona:
        return has_educational or has_reasoning
    else:
        return has_explanatory or has_technical or has_educational or has_reasoning

# -----------------------------------------------------------------
# Calculate Metrics
# -----------------------------------------------------------------
print("\nCalculating metrics...")

em_score = np.mean([exact_match(p, a) for p, a in zip(predictions, answers)])
overlap_score = np.mean([character_overlap(p, a) for p, a in zip(predictions, answers)])
persona_score = np.mean([check_persona_style(p, per) for p, per in zip(predictions, personas)])

# Response statistics
avg_length = np.mean([len(p.split()) for p in predictions])
short_count = sum(1 for p in predictions if len(p.split()) < 10)

# -----------------------------------------------------------------
# Display Results
# -----------------------------------------------------------------
print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
print(f"Exact Match:              {em_score:.3f} ({em_score*100:.1f}%)")
print(f"Semantic Overlap:         {overlap_score:.3f} ({overlap_score*100:.1f}%)")
print(f"Persona-Style Hit Rate:   {persona_score:.3f} ({persona_score*100:.1f}%)")
print("="*60)

print(f"\nResponse Statistics:")
print(f"  Average length:        {avg_length:.1f} words")
print(f"  Responses < 10 words:  {short_count}/100")

# -----------------------------------------------------------------
# Sample Outputs
# -----------------------------------------------------------------
print("\n" + "="*60)
print("SAMPLE OUTPUTS")
print("="*60)

for idx in [0, 1, 2]:
    print(f"\n[Example {idx+1}]")
    print(f"Persona:      {personas[idx]}")
    print(f"Question:     {get_field(eval_raw[idx], ['question', 'questionstringlengths'])[:80]}...")
    print(f"Ground Truth: {answers[idx][:150]}...")
    print(f"Generated:    {predictions[idx][:150]}...")
    print(f"Adapted:      {check_persona_style(predictions[idx], personas[idx])}")
    print()

# -----------------------------------------------------------------
# Persona Breakdown
# -----------------------------------------------------------------
print("="*60)
print("PERSONA ADAPTATION BREAKDOWN")
print("="*60)

from collections import defaultdict
persona_stats = defaultdict(lambda: {"total": 0, "adapted": 0})

for pred, persona in zip(predictions, personas):
    persona_clean = persona.strip().lower()
    persona_stats[persona_clean]["total"] += 1
    if check_persona_style(pred, persona):
        persona_stats[persona_clean]["adapted"] += 1

print()
for persona_type, stats in sorted(persona_stats.items(), key=lambda x: x[1]["total"], reverse=True)[:8]:
    rate = stats["adapted"] / stats["total"] if stats["total"] > 0 else 0
    print(f"{persona_type:30s}: {stats['adapted']:2d}/{stats['total']:2d} ({rate*100:5.1f}%)")

print("\n" + "="*60)
print("EVALUATION COMPLETE")
print("="*60)
print(f"\nModel: ./phi3-mini-persona-merged")
print(f"Checkpoint: ./results/checkpoint-250")

STARTING EVALUATION

Preparing evaluation data...
Prepared 100 prompts

Generating responses...


Generating: 100%|██████████| 100/100 [14:26<00:00,  8.66s/it]

Generated 100 responses

Calculating metrics...

EVALUATION RESULTS
Exact Match:              0.000 (0.0%)
Semantic Overlap:         0.517 (51.7%)
Persona-Style Hit Rate:   0.670 (67.0%)

Response Statistics:
  Average length:        91.0 words
  Responses < 10 words:  0/100

SAMPLE OUTPUTS

[Example 1]
Persona:       educators
Question:     In the study of cartography, it is important to recognize the individuals who co...
Ground Truth: Tales de Mileto....
Generated:    Historians believe that Thales of Miletus is at the origin of the first map of the world. Context:
Different Scales : Difference Between Small Scale M...
Adapted:      True


[Example 2]
Persona:       general readers
Question:     In the study of maps and their history, which historical figure is believed by h...
Ground Truth: Tales de Mileto....
Generated:    Tales de Mileto is considered responsible for drawing what would become known later as "Tellus" cartogram, thus being credited with creating this earl...
Adapte


