---
## 1. Setup

In [1]:
# SSL workaround: switch off HTTPS certificate verification for this kernel.
# WARNING: every HTTPS request made through the stdlib default context from
# this process is no longer protected against man-in-the-middle attacks.
# Only run this on a trusted network.
import ssl
import os
# Make the stdlib's default HTTPS context factory return an unverified context.
ssl._create_default_https_context = ssl._create_unverified_context
# Blank the CA-bundle environment variables (presumably so curl/requests-based
# clients skip verification too; confirm against the library versions in use).
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['REQUESTS_CA_BUNDLE'] = ''

import httpx
# Force verify=False on every httpx.Client. The guard makes this cell safe to
# re-run: without it, each re-execution would capture the already-patched
# __init__ as the "original" and stack one more wrapper on top of it.
if not getattr(httpx.Client.__init__, "_ssl_verify_disabled", False):
    original_client_init = httpx.Client.__init__

    def patched_client_init(self, *args, **kwargs):
        """Call the real ``httpx.Client.__init__`` with certificate checks off."""
        kwargs['verify'] = False  # overrides whatever the caller passed
        return original_client_init(self, *args, **kwargs)

    # Marker read by the guard above on later executions of this cell.
    patched_client_init._ssl_verify_disabled = True
    httpx.Client.__init__ = patched_client_init

import sys
import json
import numpy as np
from pathlib import Path
from datetime import datetime

# Make the parent of the current working directory (presumably the project
# root) importable. Guarded so re-running the cell does not stack duplicates.
project_root = str(Path.cwd().parent)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Report the runtime environment so the outputs below can be interpreted:
# the PyTorch build and whether a CUDA device is visible to it.
print("‚úì SSL verification disabled")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.4.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/manthan-kamble/Documents/GitHub/LlmPostTraining/.venv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/manthan-kamble/Documents/GitHub/LlmPostTraining/.venv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/manthan-kamble/Documents/GitHub/LlmPostTraining/.venv/lib

‚úì SSL verification disabled
PyTorch version: 2.2.2
CUDA available: False


---
## 2. Define Test Queries

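A small test set covering five categories: factual knowledge, reasoning, instruction following, creative writing, and code generation. Most queries also carry paraphrased variants, which are used by the paraphrase robustness test.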

In [2]:
# Test queries organized by category.
# Each category name maps to a list of query records with these keys:
#   "id"       - unique identifier, used to look a query up across stages
#   "query"    - the canonical wording sent to the model
#   "expected" - the reference answer, or a description of an acceptable one
#                (stored with each result; no code in this notebook scores
#                responses against it)
#   "variants" - paraphrases of "query"; may be an empty list
TEST_QUERIES = {
    "factual_knowledge": [
        {
            "id": "fact_001",
            "query": "What is the capital of France?",
            "expected": "Paris",
            "variants": [
                "Can you tell me the capital city of France?",
                "France's capital is?",
                "Name the capital of France.",
            ],
        },
        {
            "id": "fact_002",
            "query": "Who wrote 'Romeo and Juliet'?",
            "expected": "William Shakespeare",
            "variants": [
                "What author wrote Romeo and Juliet?",
                "Romeo and Juliet was written by whom?",
            ],
        },
        {
            "id": "fact_003",
            "query": "What is the chemical symbol for water?",
            "expected": "H2O",
            "variants": [
                "Give me the molecular formula of water.",
                "How is water written in chemistry?",
            ],
        },
    ],
    
    "reasoning": [
        {
            "id": "reason_001",
            "query": "What comes next in this sequence: 2, 4, 8, 16, ?",
            "expected": "32",
            "variants": [
                "Complete the sequence: 2, 4, 8, 16, ...",
                "What number follows 16 in: 2, 4, 8, 16?",
            ],
        },
        {
            "id": "reason_002",
            "query": "If I have 3 apples and buy 4 more, then give away 2, how many do I have?",
            "expected": "5",
            "variants": [
                "3 apples + 4 apples - 2 apples = ?",
                "Calculate: Start with 3, add 4, subtract 2.",
            ],
        },
        {
            "id": "reason_003",
            "query": "A bat and ball cost $1.10 in total. The bat costs $1.00 more than the ball. How much does the ball cost?",
            "expected": "$0.05",
            "variants": [],
        },
    ],
    
    "instruction_following": [
        {
            "id": "instr_001",
            "query": "List exactly 3 fruits that are red.",
            "expected": "Exactly 3 red fruits",
            "variants": [
                "Name 3 red fruits.",
                "Give me three fruits that are red in color.",
            ],
        },
        {
            "id": "instr_002",
            "query": "Respond with ONLY the word 'yes' or 'no': Is the sky blue?",
            "expected": "yes",
            "variants": [
                "Answer only yes or no: Is the sky blue?",
            ],
        },
        {
            "id": "instr_003",
            "query": "Translate 'Hello' to Spanish, then French, then German.",
            "expected": "Hola, Bonjour, Hallo",
            "variants": [],
        },
    ],
    
    "creative": [
        {
            "id": "creative_001",
            "query": "Write a haiku about technology.",
            "expected": "A haiku (5-7-5 syllables)",
            "variants": [
                "Compose a haiku on the theme of technology.",
                "Create a haiku about tech.",
            ],
        },
        {
            "id": "creative_002",
            "query": "Invent a new word and define it.",
            "expected": "A novel word with definition",
            "variants": [],
        },
    ],
    
    "code_generation": [
        {
            "id": "code_001",
            "query": "Write a Python function to check if a number is prime.",
            "expected": "Working Python function",
            "variants": [
                "Create a Python prime checker function.",
                "Give me Python code to determine if a number is prime.",
            ],
        },
        {
            "id": "code_002",
            "query": "Write a JavaScript function to reverse a string.",
            "expected": "Working JavaScript function",
            "variants": [],
        },
    ],
}

# Tally base queries and paraphrase variants across every category
all_entries = [entry for group in TEST_QUERIES.values() for entry in group]
total_queries = len(all_entries)
total_variants = sum(len(entry.get("variants", [])) for entry in all_entries)

print(f"Test Categories: {len(TEST_QUERIES)}")
print(f"Total Queries: {total_queries}")
print(f"Total Variants: {total_variants}")

Test Categories: 5
Total Queries: 13
Total Variants: 18


---
## 3. Define Prompt Formats

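Each stage is prompted with its own template. Stages 0 and 1 share the plain `Answer:` format, stage 2 uses an instruction/response template, and stage 3 uses an `<|im_start|>`-delimited chat template.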

In [3]:
def format_prompt(query: str, stage: int) -> str:
    """Wrap ``query`` in the prompt template used for a given stage.

    Args:
        query: The raw question or instruction.
        stage: Stage index. 0 (base) and 1 (normal SFT) share the plain
            ``Answer:`` template, 2 uses the instruction/response template,
            and 3 uses the ``<|im_start|>``/``<|im_end|>`` delimited
            system/user/assistant template.

    Returns:
        The formatted prompt. Any other ``stage`` value returns ``query``
        unchanged.
    """
    # Base model and normal SFT are prompted identically
    if stage in (0, 1):
        return f"{query}\n\nAnswer:"

    # Instruction tuning
    if stage == 2:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{query}

### Response:"""

    # LoRA
    if stage == 3:
        return f"""<|im_start|>system
You are a helpful assistant.
<|im_end|>
<|im_start|>user
{query}
<|im_end|>
<|im_start|>assistant
"""

    # Unrecognized stage: fall back to the bare query
    return query

# Smoke-test the templates: print one formatted example per stage
test_query = "What is 2+2?"
for stage in range(4):
    print(f"\n=== Stage {stage} ===", format_prompt(test_query, stage), sep="\n")


=== Stage 0 ===
What is 2+2?

Answer:

=== Stage 1 ===
What is 2+2?

Answer:

=== Stage 2 ===
Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is 2+2?

### Response:

=== Stage 3 ===
<|im_start|>system
You are a helpful assistant.
<|im_end|>
<|im_start|>user
What is 2+2?
<|im_end|>
<|im_start|>assistant



---
## 4. Model Loading Functions

In [4]:
# Checkpoint locations (GPT-2 based), relative to the current working directory
BASE_MODEL_PATH = "../models/gpt2"

# Unmerged LoRA adapter weights (for use when no merged checkpoint exists)
LORA_ADAPTER_PATH = "../outputs/stage3_lora/adapter"

# Stage index -> directory holding that stage's checkpoint
MODEL_PATHS = {
    # Base GPT-2 model
    0: "../models/gpt2",
    # Stage 1
    1: "../outputs/stage1_sft/model",
    # Stage 2
    2: "../outputs/stage2_instruction/model",
    # Stage 3 (merged)
    3: "../outputs/stage3_lora/merged",
}

# Show which checkpoints are actually present on disk
print("Model Paths:")
for stage, path in MODEL_PATHS.items():
    if Path(path).exists():
        exists = "‚úì"
    else:
        exists = "‚úó"
    print(f"  Stage {stage}: {path} [{exists}]")

Model Paths:
  Stage 0: ../models/gpt2 [‚úì]
  Stage 1: ../outputs/stage1_sft/model [‚úì]
  Stage 2: ../outputs/stage2_instruction/model [‚úì]
  Stage 3: ../outputs/stage3_lora/merged [‚úì]


In [5]:
def load_model_for_stage(stage: int):
    """Load the model and tokenizer for one training stage.

    The checkpoint directory is looked up in ``MODEL_PATHS``. When the stage 3
    (merged LoRA) checkpoint is missing, the function falls back to
    ``load_lora_model()`` so that an unmerged adapter can still be evaluated.

    Args:
        stage: Key into ``MODEL_PATHS``.

    Returns:
        ``(model, tokenizer)``, or ``(None, None)`` when the stage is unknown
        or no checkpoint is found on disk.
    """
    model_path = MODEL_PATHS.get(stage)

    # Unknown stage: report and skip, the same way a missing checkpoint is
    # handled, instead of raising KeyError
    if model_path is None:
        print(f"Unknown stage {stage!r}; expected one of {sorted(MODEL_PATHS)}")
        return None, None

    # Check if path exists
    if not Path(model_path).exists():
        print(f"‚ö†Ô∏è Model not found: {model_path}")
        if stage == 3:
            # No merged checkpoint: try base model + LoRA adapter instead
            return load_lora_model()
        return None, None

    print(f"Loading Stage {stage} model from: {model_path}")

    # NOTE(review): trust_remote_code=True lets a checkpoint run its own Python
    # code on load; confirm it is really needed for these local checkpoints.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
    )

    # generate() is given pad_token_id later; fall back to EOS when the
    # tokenizer defines no pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): the recorded output shows a "`torch_dtype` is deprecated!
    # Use `dtype` instead!" warning; the old keyword is kept here so older
    # transformers releases keep working.
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float32,  # CPU compatible
        device_map="auto",
        trust_remote_code=True,
    )

    print(f"  Loaded. Parameters: {model.num_parameters():,}")

    return model, tokenizer


def load_lora_model():
    """Load the base model and attach the LoRA adapter to it.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the ``PeftModel`` wrapping
        the base model, or ``(None, None)`` when ``LORA_ADAPTER_PATH`` does
        not exist.
    """
    adapter_dir = Path(LORA_ADAPTER_PATH)
    if not adapter_dir.exists():
        print(f"‚ö†Ô∏è LoRA adapter not found: {LORA_ADAPTER_PATH}")
        return None, None

    print(f"Loading LoRA adapter from: {LORA_ADAPTER_PATH}")

    # Tokenizer and weights both come from the base checkpoint; the adapter
    # directory is only used for the LoRA weights
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    backbone = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_PATH,
        torch_dtype=torch.float32,  # CPU compatible
        device_map="auto",
        trust_remote_code=True,
    )
    peft_model = PeftModel.from_pretrained(backbone, LORA_ADAPTER_PATH)

    print(f"  LoRA loaded.")

    return peft_model, tokenizer

---
## 5. Generation Function

In [6]:
def generate_response(
    model, 
    tokenizer, 
    prompt: str, 
    max_new_tokens: int = 256,
    temperature: float = 0.7,
) -> str:
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Fully formatted prompt text.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding, because sampling needs a strictly positive temperature.

    Returns:
        The decoded continuation, without the prompt and stripped of
        surrounding whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns prompt + continuation
    prompt_len = inputs["input_ids"].shape[1]

    sampling = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": sampling,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if sampling:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = 0.9

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # Slice the prompt off at token level. Comparing decoded text against the
    # prompt string is fragile: decoding does not always reproduce the prompt
    # exactly, and then the whole prompt leaks into the response.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

---
## 6. Evaluate Single Model

In [7]:
def evaluate_model(model, tokenizer, stage: int, verbose: bool = True):
    """Run every base query in ``TEST_QUERIES`` through one model.

    Args:
        model: Model passed through to ``generate_response``.
        tokenizer: Tokenizer passed through to ``generate_response``.
        stage: Stage index; selects the prompt template and is stored on
            every record.
        verbose: When true, print each category, query and a response preview.

    Returns:
        A list with one dict per query, holding ``stage``, ``category``,
        ``query_id``, ``query``, ``expected`` and ``response``.
    """
    records = []

    for category, entries in TEST_QUERIES.items():
        if verbose:
            print(f"\nüìÇ Category: {category}")

        for entry in entries:
            query_id, query, expected = entry["id"], entry["query"], entry["expected"]

            # Apply the stage's template, then generate a completion
            response = generate_response(model, tokenizer, format_prompt(query, stage))

            records.append({
                "stage": stage,
                "category": category,
                "query_id": query_id,
                "query": query,
                "expected": expected,
                "response": response,
            })

            if not verbose:
                continue

            print(f"  [{query_id}] {query[:40]}...")
            # Long responses are previewed at 80 characters
            if len(response) > 80:
                print(f"       ‚Üí {response[:80]}...")
            else:
                print(f"       ‚Üí {response}")

    return records

---
## 7. Run Full Evaluation

⚠️ **Note**: This will load each model sequentially and free it before loading the next one. Adjust based on your available memory (GPU, or system RAM when running on CPU).

In [8]:
# Store all results
all_results = {}

# Generation is sampled, so fix the seed: re-seeding before each stage makes a
# re-run reproduce the same responses and starts every stage from the same
# random state.
EVAL_SEED = 42

# Evaluate each stage listed in MODEL_PATHS, in ascending stage order
for stage in sorted(MODEL_PATHS):
    print(f"\n{'='*60}")
    print(f"EVALUATING STAGE {stage}")
    print(f"{'='*60}")
    
    # Load model
    model, tokenizer = load_model_for_stage(stage)
    
    if model is None:
        print(f"Skipping Stage {stage} (model not available)")
        continue
    
    # Evaluate
    torch.manual_seed(EVAL_SEED)
    results = evaluate_model(model, tokenizer, stage, verbose=True)
    all_results[stage] = results
    
    # Clean up to free memory before the next stage is loaded
    del model
    del tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    print(f"\n‚úÖ Stage {stage} complete. {len(results)} queries evaluated.")


EVALUATING STAGE 0
Loading Stage 0 model from: ../models/gpt2


`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 148/148 [00:00<00:00, 510.36it/s, Materializing param=transformer.wte.weight]             
GPT2LMHeadModel LOAD REPORT from: ../models/gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


  Loaded. Parameters: 124,439,808

üìÇ Category: factual_knowledge
  [fact_001] What is the capital of France?...
       ‚Üí The capital of France. The capital of France is the capital of the Netherlands.
...
  [fact_002] Who wrote 'Romeo and Juliet'?...
       ‚Üí I wrote 'Romeo and Juliet' to my mother.

Question: How many times have you been...
  [fact_003] What is the chemical symbol for water?...
       ‚Üí It's a common name for water. The chemical symbol is a combination of two letter...

üìÇ Category: reasoning
  [reason_001] What comes next in this sequence: 2, 4, ...
       ‚Üí It's always a good idea to try out the new version, but I'm afraid it's not read...
  [reason_002] If I have 3 apples and buy 4 more, then ...
       ‚Üí A) 3 apples

B) 4 apples

C) 3 apples

D) 4 apples

E) 3 apples

F) 3 apples

G)...
  [reason_003] A bat and ball cost $1.10 in total. The ...
       ‚Üí The cost of the bat and ball is $1.00. The ball costs $1.00 more than the bat. T...

üìÇ Categ

Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 148/148 [00:01<00:00, 99.64it/s, Materializing param=transformer.wte.weight]              


  Loaded. Parameters: 124,439,808

üìÇ Category: factual_knowledge
  [fact_001] What is the capital of France?...
       ‚Üí ‚Ç¨50.

The capital of France is the capital of France, the capital of the French ...
  [fact_002] Who wrote 'Romeo and Juliet'?...
       ‚Üí It's not quite as simple as you think. I wrote 'Romeo and Juliet' as a young wri...
  [fact_003] What is the chemical symbol for water?...
       ‚Üí Water.

In the Greek language, "water" is spelled "water" or "water".

Water is ...

üìÇ Category: reasoning
  [reason_001] What comes next in this sequence: 2, 4, ...
       ‚Üí The next two sequences are:

The next sequence is

The next sequence is

The nex...
  [reason_002] If I have 3 apples and buy 4 more, then ...
       ‚Üí 3 apples.

Answer: 4 apples.

Answer: 5 apples.

Answer: 6 apples.

Answer: 7 ap...
  [reason_003] A bat and ball cost $1.10 in total. The ...
       ‚Üí It's $1.00. The ball costs $1.00 more than the bat.

The cost of the ball is $2....

üìÇ Cat

Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 148/148 [00:00<00:00, 508.65it/s, Materializing param=transformer.wte.weight]             


  Loaded. Parameters: 124,439,808

üìÇ Category: factual_knowledge
  [fact_001] What is the capital of France?...
       ‚Üí French capital is the capital of France, which means "one thousand and one".

##...
  [fact_002] Who wrote 'Romeo and Juliet'?...
       ‚Üí This is a simple and effective instruction. This instruction is easy to follow a...
  [fact_003] What is the chemical symbol for water?...
       ‚Üí Water is a compound that is found in the leaves of many plants, such as cilantro...

üìÇ Category: reasoning
  [reason_001] What comes next in this sequence: 2, 4, ...
       ‚Üí I have an assignment to complete, and I need to complete it in a timely manner.
...
  [reason_002] If I have 3 apples and buy 4 more, then ...
       ‚Üí 2 apples

### Response:
4 apples

### Response:

3 apples

### Response:

2 appl...
  [reason_003] A bat and ball cost $1.10 in total. The ...
       ‚Üí A bat and ball cost $1.10 in total. The bat and ball cost $1.00 more than the ba...

üìÇ Categ

Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 148/148 [00:00<00:00, 530.40it/s, Materializing param=transformer.wte.weight]             


  Loaded. Parameters: 124,439,808

üìÇ Category: factual_knowledge
  [fact_001] What is the capital of France?...
       ‚Üí What is the name of the city of Paris?
<|im_end|>
<|im_start|>city
What is the n...
  [fact_002] Who wrote 'Romeo and Juliet'?...
       ‚Üí <|im_end|>
<|im_start|>customer
<|im_end|>
<|im_start|>customer
<|im_end|>
<|im_...
  [fact_003] What is the chemical symbol for water?...
       ‚Üí What is the chemical symbol for water?
<|im_end|>
<|im_start|>
<|im_end|>
What i...

üìÇ Category: reasoning
  [reason_001] What comes next in this sequence: 2, 4, ...
       ‚Üí What comes next in this sequence: 1, 2, 4, 8, 16, ?
<|im_end|>
<|im_start|>assis...
  [reason_002] If I have 3 apples and buy 4 more, then ...
       ‚Üí If I have 3 apples and buy 4 more, then give away 2, how many do I have?
<|im_en...
  [reason_003] A bat and ball cost $1.10 in total. The ...
       ‚Üí <|im_end|>
<|im_start|>user
A bat and ball cost $1.00 in total. The bat costs $1...

üìÇ Categ

---
## 8. Paraphrase Robustness Test

In [None]:
def evaluate_paraphrase_robustness(model, tokenizer, stage: int):
    """Collect responses to each query and to its paraphrased variants.

    Queries with no variants are skipped. Nothing is scored here; the
    responses are only gathered so they can be compared.

    Args:
        model: Model passed through to ``generate_response``.
        tokenizer: Tokenizer passed through to ``generate_response``.
        stage: Stage index; selects the prompt template.

    Returns:
        A list with one dict per query that has variants, holding
        ``query_id``, ``original_query``, ``original_response`` and
        ``variants`` (a list of ``{"variant", "response"}`` dicts).
    """
    def _answer(text):
        # Format for this stage, then generate
        return generate_response(model, tokenizer, format_prompt(text, stage))

    report = []

    for entries in TEST_QUERIES.values():
        for entry in entries:
            paraphrases = entry.get("variants", [])
            if not paraphrases:
                continue

            query_id = entry["id"]
            original_query = entry["query"]

            # Canonical wording first, then each paraphrase in listed order
            original_response = _answer(original_query)
            variant_responses = [
                {
                    "variant": paraphrase,
                    "response": _answer(paraphrase),
                }
                for paraphrase in paraphrases
            ]

            report.append({
                "query_id": query_id,
                "original_query": original_query,
                "original_response": original_response,
                "variants": variant_responses,
            })

    return report


# Paraphrase robustness run: one pass per fine-tuned stage
print("\n" + "="*60)
print("PARAPHRASE ROBUSTNESS TEST")
print("="*60)

paraphrase_results = {}

# Stage 0 (base model) is skipped
for stage in (1, 2, 3):
    print(f"\n--- Stage {stage} ---")

    model, tokenizer = load_model_for_stage(stage)
    if model is None:
        continue

    results = evaluate_paraphrase_robustness(model, tokenizer, stage)
    paraphrase_results[stage] = results

    # Preview the first query and up to two of its variants
    if results:
        sample = results[0]
        print(f"\nSample - {sample['query_id']}:")
        print(f"  Original: {sample['original_query'][:50]}...")
        print(f"  Response: {sample['original_response'][:50]}...")
        for v in sample['variants'][:2]:
            print(f"  Variant:  {v['variant'][:50]}...")
            print(f"  Response: {v['response'][:50]}...")

    # Release the model before the next stage is loaded
    del model, tokenizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

---
## 9. Compare Responses Side-by-Side

In [9]:
def compare_stages(query_id: str):
    """Print each stage's response to one query, one after another.

    Reads the module-level ``all_results`` mapping. Stages with no record for
    ``query_id`` print nothing.

    Args:
        query_id: The ``query_id`` of the record to look up in every stage.
    """
    print(f"\n{'='*70}")
    print(f"COMPARISON FOR: {query_id}")
    print(f"{'='*70}")

    for stage, results in all_results.items():
        # Only the first matching record per stage is shown
        r = next((row for row in results if row["query_id"] == query_id), None)
        if r is None:
            continue

        print(f"\nüîπ Stage {stage}:")

        # Queries longer than 60 characters are truncated
        if len(r['query']) > 60:
            print(f"   Query: {r['query'][:60]}...")
        else:
            print(f"   Query: {r['query']}")

        print(f"   Expected: {r['expected']}")

        # Responses longer than 200 characters are truncated
        if len(r['response']) > 200:
            response = r['response'][:200] + "..."
        else:
            response = r['response']
        print(f"   Response: {response}")

# Side-by-side comparison for the first query of each category
for qid in ("fact_001", "reason_001", "instr_001", "creative_001", "code_001"):
    compare_stages(qid)


COMPARISON FOR: fact_001

üîπ Stage 0:
   Query: What is the capital of France?
   Expected: Paris
   Response: The capital of France. The capital of France is the capital of the Netherlands.

So, the capital of France is the capital of Belgium?

Answer: Yes, the capital of Belgium is the capital of Belgium.

S...

üîπ Stage 1:
   Query: What is the capital of France?
   Expected: Paris
   Response: ‚Ç¨50.

The capital of France is the capital of France, the capital of the French Republic.

In a word, France is the capital of the French Republic.

The capital of France is the capital of the French ...

üîπ Stage 2:
   Query: What is the capital of France?
   Expected: Paris
   Response: French capital is the capital of France, which means "one thousand and one".

### Response:
The capital of France is the capital of France, which means "one thousand and one".

### Response:
France is...

üîπ Stage 3:
   Query: What is the capital of France?
   Expected: Paris
   Response: What is 

---
## 10. Create Summary Report

In [11]:
import pandas as pd

# Build one table from all stages and print basic counts
if all_results:
    # One row per (stage, query) record
    all_data = [record for stage_records in all_results.values() for record in stage_records]
    df = pd.DataFrame(all_data)

    print("\n" + "="*60)
    print("EVALUATION SUMMARY")
    print("="*60)

    print(f"\nTotal evaluations: {len(df)}")

    print(f"\nEvaluations per stage:")
    stage_counts = df.groupby("stage").size()
    print(stage_counts)

    print(f"\nEvaluations per category:")
    category_counts = df.groupby("category").size()
    print(category_counts)

    # Response length in characters, averaged per stage
    df["response_length"] = df["response"].map(len)
    print(f"\nAverage response length per stage:")
    mean_length = df.groupby("stage")["response_length"].mean().round(0)
    print(mean_length)


EVALUATION SUMMARY

Total evaluations: 52

Evaluations per stage:
stage
0    13
1    13
2    13
3    13
dtype: int64

Evaluations per category:
category
code_generation           8
creative                  8
factual_knowledge        12
instruction_following    12
reasoning                12
dtype: int64

Average response length per stage:
stage
0     865.0
1     986.0
2    1020.0
3     619.0
Name: response_length, dtype: float64


---
## 11. Save Results

In [13]:
# Write all stage results to a timestamped JSON file
output_dir = Path("../outputs/evaluation")
output_dir.mkdir(parents=True, exist_ok=True)

# The timestamp is part of the file name, so earlier runs are not overwritten
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_path = output_dir / f"evaluation_results_{timestamp}.json"

# Main results; stage numbers are written as string keys
payload = {
    "timestamp": timestamp,
    "results": {str(k): v for k, v in all_results.items()},
}
with open(results_path, "w") as f:
    f.write(json.dumps(payload, indent=2))

print(f"‚úÖ Results saved to: {results_path}")

‚úÖ Results saved to: ../outputs/evaluation/evaluation_results_20260201_211249.json


---
## 12. Key Observations Template

Use this section to document your observations from the evaluation.

In [14]:
# Blank markdown checklist for recording findings by hand. The text is only
# printed; nothing below is filled in from the evaluation results.
observations = """
# üìä EVALUATION OBSERVATIONS

## Stage 0 (Base Model)
- [ ] Baseline behavior documented
- [ ] No instruction-following expected
- Notes: 

## Stage 1 (Normal SFT)
- [ ] Loss decreased smoothly during training
- [ ] Outputs more task-correct than base
- [ ] NOT robust to paraphrasing (expected)
- Notes:

## Stage 2 (Instruction Tuning)
- [ ] Instruction-following improved
- [ ] Paraphrase robustness improved
- [ ] Multi-format understanding
- Notes:

## Stage 3 (LoRA/QLoRA)
- [ ] Memory efficiency verified
- [ ] Similar quality to full fine-tuning
- [ ] Base model knowledge preserved
- Notes:

## Comparison Highlights
- Best factual accuracy: Stage ?
- Best instruction following: Stage ?
- Most creative: Stage ?
- Best code generation: Stage ?

## Surprises / Issues
- 

## Next Steps
- 
"""

print(observations)


# üìä EVALUATION OBSERVATIONS

## Stage 0 (Base Model)
- [ ] Baseline behavior documented
- [ ] No instruction-following expected
- Notes: 

## Stage 1 (Normal SFT)
- [ ] Loss decreased smoothly during training
- [ ] Outputs more task-correct than base
- [ ] NOT robust to paraphrasing (expected)
- Notes:

## Stage 2 (Instruction Tuning)
- [ ] Instruction-following improved
- [ ] Paraphrase robustness improved
- [ ] Multi-format understanding
- Notes:

## Stage 3 (LoRA/QLoRA)
- [ ] Memory efficiency verified
- [ ] Similar quality to full fine-tuning
- [ ] Base model knowledge preserved
- Notes:

## Comparison Highlights
- Best factual accuracy: Stage ?
- Best instruction following: Stage ?
- Most creative: Stage ?
- Best code generation: Stage ?

## Surprises / Issues
- 

## Next Steps
- 



---
## ✅ Evaluation Complete!

### What we evaluated:
- ✅ All 4 stages (Base, Stage 1, Stage 2, Stage 3)
- ✅ Multiple categories (Factual, Reasoning, Instructions, Creative, Code)
- ✅ Paraphrase robustness
- ✅ Side-by-side comparisons

### Key Takeaways:
1. **Stage 1** teaches task completion but not instruction abstraction
2. **Stage 2** adds instruction robustness and format flexibility
3. **Stage 3** achieves similar results with dramatically less memory

---