 # University Rubric-Aligned Feedback Generation using Unsloth

This notebook combines the complete pipeline for:
 1. Generating synthetic training data
 2. Preparing data for fine-tuning
 3. Fine-tuning Phi-3 with LoRA using Unsloth
 4. Testing and evaluating the model

 **Requirements:**
 ```bash
 pip install unsloth transformers datasets trl scikit-learn accelerate bitsandbytes
 ```


### 1. Setup and Imports

In [None]:
import os
import json
import csv
import random
import uuid
from datetime import datetime
from typing import List, Dict, Tuple

import torch
from torch.cuda import is_available
from datasets import Dataset, load_dataset
from sklearn.model_selection import train_test_split

# Unsloth imports
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# Fix every random number generator up front so data generation and
# splitting give the same results on each run
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)

# Seed all visible GPUs as well when CUDA is present
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    torch.cuda.manual_seed_all(SEED)

print("‚úì All imports successful")
print(f"‚úì Using device: {'CUDA' if cuda_ready else 'CPU'}")


### 2. Configuration

In [None]:
class Config:
    """Central settings for data generation, model loading, LoRA, training and evaluation splits."""

    # Data generation
    NUM_QUESTIONS = 4  # build_dataset uses only the first NUM_QUESTIONS entries of the question list
    INSTANCES_PER_QUESTION = 150  # synthetic answers generated per question
    OUTPUT_DIR = "output"  # raw generated records (JSON + CSV sample) are written here
    DATA_PROCESSED_DIR = "./data_processed"  # train/val/test splits are written here
    
    # Model settings
    model_name = "unsloth/Phi-3-mini-4k-instruct"  # base checkpoint passed to FastLanguageModel.from_pretrained
    max_seq_length = 2048  # passed to both the model loader and the trainer
    load_in_4bit = True  # forwarded as load_in_4bit when loading the base model
    
    # LoRA settings
    lora_r = 16  # forwarded as r to get_peft_model
    lora_alpha = 16
    lora_dropout = 0
    
    # Training settings
    learning_rate = 2e-4
    num_epochs = 3
    per_device_train_batch_size = 2
    per_device_eval_batch_size = 2
    gradient_accumulation_steps = 4
    warmup_steps = 10
    
    # Model output
    output_model_dir = './feedback_model_phi3'  # trainer output_dir; adapters and merged model are saved beneath it
    
    # Logging
    logging_steps = 10
    eval_steps = 50  # evaluation interval, in steps (eval_strategy is "steps")
    save_steps = 50
    save_total_limit = 3
    
    # Evaluation
    test_split_size = 0.15  # fraction of all examples held out as the test split
    val_split_size = 0.15  # fraction of all examples held out as the validation split

# Instantiate the settings object used by every later cell
config = Config()

# Ensure both output folders exist (no error when they are already present)
for _dir in (config.OUTPUT_DIR, config.DATA_PROCESSED_DIR):
    os.makedirs(_dir, exist_ok=True)

print("‚úì Configuration loaded")

### 3. Data Generation - Synthetic Dataset Creation

#### 3.1 Define Questions and Rubrics

In [None]:
# Question bank driving the synthetic data generation. Each entry holds:
#   id       - short question identifier copied into every generated record
#   course   - course name shown in the prompt
#   prompt   - the question text put to the student
#   seeds    - reference answers; one is picked at random and mutated per record
#   mistakes - descriptions of typical errors, mixed into lower-grade answers
#   rubric   - marking criteria (code, criterion, max_score, description)
QUESTIONS = [
    {
        "id": "q1",
        "course": "Intro to Machine Learning",
        "prompt": "What is overfitting in machine learning, and how can cross-validation help reduce it? (2-3 sentences)",
        "seeds": [
            "Overfitting happens when a model learns noise or patterns specific to the training data and fails to generalize to new data. Cross-validation helps by splitting data and validating across folds so we can detect models that don't generalize and select hyperparameters accordingly.",
            "When a model performs well on training data but poorly on unseen data, it is overfitting. Cross-validation estimates performance on unseen data and helps choose models or regularization settings to avoid overfitting."
        ],
        "mistakes": [
            "Describes overfitting vaguely as 'model does bad on test' without saying it learns noise.",
            "Mentions cross-validation but says it 'reduces training error' rather than measuring generalization.",
            "Confuses cross-validation with data augmentation or early stopping.",
            "Gives only definition, no method to reduce it."
        ],
        "rubric": [
            {"code": "C1", "criterion": "Definition correctness", "max_score": 5, "description": "Clear definition describing learning noise/poor generalization"},
            {"code": "C2", "criterion": "Cross-validation explanation", "max_score": 5, "description": "Explains how cross-validation helps detect or reduce overfitting"},
            {"code": "C3", "criterion": "Conciseness and clarity", "max_score": 5, "description": "Answer is concise (2-3 sentences) and uses correct terminology"}
        ]
    },
    {
        "id": "q2",
        "course": "Data Structures",
        "prompt": "Briefly compare BFS and DFS. When would you use one over the other? (2-3 sentences)",
        "seeds": [
            "BFS explores neighbors level by level and is useful for finding shortest paths in unweighted graphs. DFS goes deep along a branch before backtracking and is useful for topological ordering or searching for any path when memory is limited.",
            "Use BFS when you need shortest path or level information; use DFS for tasks like cycle detection or when you want to explore deep structure with less memory overhead."
        ],
        "mistakes": [
            "Says BFS is always faster than DFS or vice versa.",
            "Mixes up use-cases (e.g., says DFS finds shortest paths).",
            "Only describes one algorithm but not the other."
        ],
        "rubric": [
            {"code": "C1", "criterion": "Algorithm characteristics", "max_score": 5, "description": "Mentions traversal order and core property (level-order vs depth)"},
            {"code": "C2", "criterion": "Use-case justification", "max_score": 5, "description": "Gives correct reasons for choosing BFS or DFS"},
            {"code": "C3", "criterion": "Concise comparison", "max_score": 5, "description": "Clear, short comparison in 2-3 sentences"}
        ]
    },
    {
        "id": "q3",
        "course": "Databases",
        "prompt": "What does ACID mean in databases? Provide a short explanation of each property. (List and 1-line explanation)",
        "seeds": [
            "ACID stands for Atomicity, Consistency, Isolation, Durability. Atomicity means transactions are all-or-nothing; Consistency means DB moves between valid states; Isolation ensures concurrent transactions don't interfere; Durability ensures committed changes persist.",
            "Atomicity: either all operations of a transaction happen or none. Consistency: DB constraints hold before and after transactions. Isolation: concurrent transactions appear serial. Durability: once committed, data survive crashes."
        ],
        "mistakes": [
            "Mixes up Isolation and Consistency, or gives vague descriptions.",
            "Forgets one of the properties.",
            "Gives overly technical answer beyond short scope."
        ],
        "rubric": [
            {"code": "C1", "criterion": "Coverage of properties", "max_score": 5, "description": "Lists all four ACID properties correctly"},
            {"code": "C2", "criterion": "Correctness of explanations", "max_score": 5, "description": "Each property is explained correctly in one line"},
            {"code": "C3", "criterion": "Brevity and clarity", "max_score": 5, "description": "Concise list-style explanations"}
        ]
    },
    {
        "id": "q4",
        "course": "Computer Networks",
        "prompt": "Explain the TCP three-way handshake in one or two sentences.",
        "seeds": [
            "TCP uses SYN, SYN-ACK, ACK messages: client sends SYN, server replies SYN-ACK, client replies ACK, establishing a reliable connection. This ensures both sides are ready and agree on initial sequence numbers.",
            "A three-way handshake is: client SYN, server SYN-ACK, client ACK; it's used to synchronize sequence numbers and establish a TCP connection."
        ],
        "mistakes": [
            "Says handshake uses SYN, ACK only (missing SYN-ACK).",
            "Confuses UDP with TCP.",
            "Mentions extra steps not part of the handshake."
        ],
        "rubric": [
            {"code": "C1", "criterion": "Sequence correctness", "max_score": 5, "description": "Mentions SYN, SYN-ACK, ACK in the right order"},
            {"code": "C2", "criterion": "Purpose explanation", "max_score": 5, "description": "Explains why it's done (sync seq numbers, ensure readiness)"},
            {"code": "C3", "criterion": "Conciseness", "max_score": 5, "description": "Explanation within 1-2 short sentences"}
        ]
    }
]

# Report how many questions were defined
print(f"‚úì Defined {len(QUESTIONS)} questions with rubrics")

#### 3.2 Data Generation Helper Functions

In [None]:
# Grade category distribution: (category name, probability) pairs sampled by
# pick_grade_category. The probabilities sum to 1.0.
GRADE_CATEGORIES = [
    ("excellent", 0.7),
    ("good", 0.15),
    ("fair", 0.08),
    ("poor", 0.05),
    ("incorrect", 0.02)
]

# Feedback templates, keyed by situation. Placeholders ({missed}, {fix},
# {topic}) are filled in by generate_feedback_text. These strings become the
# training targets, so the dashes are real em dashes rather than the
# mis-decoded byte sequence that previously leaked into the generated data.
FEEDBACK_TEMPLATES = {
    "definition_missing": [
        "You described the topic but missed the key definition — make sure to include that next time.",
        "Good attempt, but the core definition was missing or unclear. Add a clear definition for full marks."
    ],
    "partial_correct": [
        "You partially answered the question — correct on some points but missed: {missed}.",
        "Partly correct; to improve, expand on: {missed}."
    ],
    "minor_fix": [
        "Small fix needed: {fix}. Then your answer will be complete.",
        "Minor correction — {fix}. Good otherwise."
    ],
    "excellent_short": [
        "Clear and correct—well done.",
        "Excellent answer; concise and accurate."
    ],
    "incorrect_short": [
        "The answer is incorrect or misunderstands the concept. Review {topic} and try again.",
        "Incorrect: there is a misunderstanding about {topic}. Revise the core concept."
    ]
}

def uid():
    """Return a freshly generated random UUID (version 4) as a string."""
    fresh = uuid.uuid4()
    return str(fresh)

def clamp(x, a, b):
    """Limit x to the range [a, b]: values above b become b, values below a become a."""
    capped = x if x < b else b
    return capped if capped > a else a

def jitter_score(base, jitter=1, min_score=0, max_score=5):
    """Perturb a base score with uniform noise and return a bounded integer.

    Noise is drawn uniformly from [-jitter, jitter], added to base, rounded to
    the nearest integer and finally clamped to [min_score, max_score].
    """
    noise = random.uniform(-jitter, jitter)
    noisy = int(round(base + noise))
    return clamp(noisy, min_score, max_score)

def pick_grade_category():
    """Sample a grade category name according to the GRADE_CATEGORIES weights.

    Walks the cumulative distribution and returns the first category whose
    cumulative weight reaches the random draw; if none does, the last
    category is returned.
    """
    draw = random.random()
    chosen = GRADE_CATEGORIES[-1][0]  # fallback when no cumulative weight reaches the draw
    upper = 0
    for label, share in GRADE_CATEGORIES:
        upper = upper + share
        if not draw > upper:
            chosen = label
            break
    return chosen

# Simple text transformation helpers
def paraphrase(s):
    """Return a crude rewording of s.

    With probability 0.6 (only when s contains a comma) the comma-separated
    pieces are shuffled and re-joined; afterwards 'model' and 'training data'
    are each replaced by a randomly chosen synonym.
    """
    pieces = s.split(',')
    if len(pieces) > 1 and random.random() < 0.6:
        random.shuffle(pieces)
        s = ','.join(map(str.strip, pieces))

    # Synonyms are drawn in the same order as the replacements are applied
    model_synonym = random.choice(['classifier', 'fit model'])
    s = s.replace('model', model_synonym)
    data_synonym = random.choice(['the data used for training', 'training set'])
    return s.replace('training data', data_synonym)

def shorten(s):
    """Keep only the text up to the first period (period included); return s unchanged if it has none."""
    head, dot, _rest = s.partition('.')
    return head + '.' if dot else s

def remove_detail(s):
    """Strip two specific explanatory clauses (from the overfitting seed answers) out of s."""
    droppable = (
        "so we can detect models that don't generalize and select hyperparameters accordingly",
        'and helps choose models or regularization settings to avoid overfitting',
    )
    for clause in droppable:
        s = s.replace(clause, '')
    return s

def swap_keywords(s):
    """Swap key terms for wrong ones: 'cross-validation' -> 'data augmentation', 'SYN-ACK' -> 'ACK'."""
    return s.replace('cross-validation', 'data augmentation').replace('SYN-ACK', 'ACK')

def mutate_answer(seed, mistakes, category):
    """Generate a synthetic student answer for the given grade category.

    Args:
        seed: Reference answer text to start from.
        mistakes: List of mistake descriptions for the question (may be empty).
        category: One of 'excellent', 'good', 'fair', 'poor', 'incorrect'.

    Returns:
        The mutated answer string. An unrecognised category returns the seed
        unchanged.
    """
    if category == "excellent":
        # Mostly verbatim; about 30% are lightly reworded
        if random.random() < 0.3:
            return paraphrase(seed)
        return seed

    if category == "good":
        # Decide up front whether to thin the answer out. remove_detail matches
        # literal clauses of the seed text, and paraphrase() rewrites the word
        # 'model' inside those clauses, so the detail has to be stripped BEFORE
        # paraphrasing -- applied afterwards it can never match.
        drop_detail = random.random() < 0.5 and bool(mistakes)
        base = remove_detail(seed) if drop_detail else seed
        s = paraphrase(base)
        if drop_detail:
            s += " " + random.choice(mistakes)
        return s

    if category == "fair":
        # Reworded, cut down to the first sentence, often with a mistake appended
        s = shorten(paraphrase(seed))
        if random.random() < 0.6 and mistakes:
            s += " " + random.choice(mistakes)
        return s

    if category == "poor":
        # The answer is a mistake description (or a shortened seed if none exist)
        s = random.choice(mistakes) if mistakes else shorten(seed)
        if random.random() < 0.4:
            s = "I think " + s
        return s

    if category == "incorrect":
        # A mistake description with key terms swapped for wrong ones
        s = random.choice(mistakes) if mistakes else "Incorrect description"
        return swap_keywords(s)

    return seed

def score_from_category(category, rubric):
    """Draw one jittered score per rubric criterion for a grade category.

    Each category maps to a fraction of the criterion's maximum score
    (unknown categories use 0.5); jitter_score then adds noise of up to
    +/-1.2 and bounds the result to [0, max_score].

    Returns:
        Dict mapping criterion code to its integer score.
    """
    fraction_by_category = {
        'excellent': 0.9,
        'good': 0.75,
        'fair': 0.5,
        'poor': 0.25,
        'incorrect': 0.05
    }
    fraction = fraction_by_category.get(category, 0.5)
    return {
        crit['code']: jitter_score(
            fraction * crit['max_score'],
            jitter=1.2,
            min_score=0,
            max_score=crit['max_score'],
        )
        for crit in rubric
    }

def generate_feedback_text(scores, rubric, question_prompt, student_answer, category):
    """Build a (short_comment, detailed_comment) pair aligned with the rubric.

    Criteria are sorted into "weak" (score at or below max(1, 30% of max)) and
    "improvable" (below 70% of max). Excellent answers with no weak criteria
    get praise; incorrect answers, or answers weak on every criterion, get a
    "review the topic" message; everything else gets a comment naming up to
    two weak and two improvable criteria plus a per-criterion score breakdown.

    student_answer is accepted but not used by the current logic.
    """
    weak, improvable = [], []
    for item in rubric:
        top = item['max_score']
        earned = scores.get(item['code'], 0)
        if earned <= max(1, int(0.3 * top)):
            weak.append(item['criterion'])
        elif earned < top and earned < int(0.7 * top):
            improvable.append(item['criterion'])

    # Case 1: excellent answer with nothing weak -> short praise per criterion
    if category == 'excellent' and not weak:
        headline = random.choice(FEEDBACK_TEMPLATES['excellent_short'])
        praise = "".join("Great: " + item['criterion'] + ". " for item in rubric)
        return headline, praise

    # Case 2: wrong answer, or every criterion is weak -> send back to the basics
    if category == 'incorrect' or len(weak) == len(rubric):
        template = random.choice(FEEDBACK_TEMPLATES['incorrect_short'])
        topic = question_prompt.split(':')[0]
        return template.format(topic=topic), "You should review the core concepts and definitions for this topic."

    # Case 3: mixed result -> mention at most two criteria from each list
    sentences = []
    if weak:
        missed = ', '.join(weak[:2])
        sentences.append(random.choice(FEEDBACK_TEMPLATES['partial_correct']).format(missed=missed))
    if improvable:
        fix = ', '.join(improvable[:2])
        sentences.append(random.choice(FEEDBACK_TEMPLATES['minor_fix']).format(fix=fix))

    if sentences:
        headline = ' '.join(sentences)
    else:
        headline = random.choice(FEEDBACK_TEMPLATES['excellent_short'])

    breakdown = "; ".join(
        f"{item['code']}({item['criterion']}): {scores[item['code']]}/{item['max_score']}"
        for item in rubric
    )
    return headline, "Details: " + breakdown

print("‚úì Helper functions defined")

#### 3.3 Generate Dataset

In [None]:
def build_dataset(questions, instances_per_q):
    """Generate the synthetic dataset of graded student answers.

    For each of the first config.NUM_QUESTIONS questions, instances_per_q
    records are produced: a grade category is sampled, a seed answer is
    mutated to match it, rubric scores are drawn and feedback text is built.

    Args:
        questions: List of question dicts (id, course, prompt, seeds,
            optional mistakes, rubric).
        instances_per_q: Number of records to generate per question.

    Returns:
        List of record dicts, one per generated answer.
    """
    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
    # timestamp instead and render it in the same "<naive ISO>Z" format.
    from datetime import timezone

    records = []
    for q in questions[:config.NUM_QUESTIONS]:
        rubric = q['rubric']
        # Identical for every record of this question, so computed once
        max_overall = sum(c['max_score'] for c in rubric)

        for _ in range(instances_per_q):
            # The order of these calls fixes the random-number sequence
            category = pick_grade_category()
            seed = random.choice(q['seeds'])
            ans = mutate_answer(seed, q.get('mistakes', []), category)
            scores = score_from_category(category, rubric)
            short_fb, detailed_fb = generate_feedback_text(scores, rubric, q['prompt'], ans, category)

            created_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + 'Z'

            records.append({
                'id': uid(),
                'question_id': q['id'],
                'course': q['course'],
                'prompt': q['prompt'],
                'student_answer': ans,
                'grade_category': category,
                'rubric_scores': scores,
                'rubric': rubric,
                'instructor_feedback': {
                    'short_comment': short_fb,
                    'detailed_comment': detailed_fb,
                    'overall_score': sum(scores.values()),
                    'max_overall': max_overall
                },
                'created_at': created_at
            })
    return records

# Build the synthetic records from the question bank
print("Generating synthetic dataset...")
records = build_dataset(QUESTIONS, config.INSTANCES_PER_QUESTION)

# Persist the full record set as pretty-printed JSON
out_json = os.path.join(config.OUTPUT_DIR, 'records.json')
with open(out_json, 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)

# Write a flattened CSV preview of at most the first 200 records
out_csv = os.path.join(config.OUTPUT_DIR, 'records_sample.csv')
sample_fields = ['id', 'question_id', 'course', 'student_answer', 'grade_category', 
                 'instructor_feedback_short', 'overall_score', 'max_overall']
with open(out_csv, 'w', newline='', encoding='utf-8') as csvf:
    writer = csv.DictWriter(csvf, fieldnames=sample_fields)
    writer.writeheader()
    writer.writerows(
        {
            'id': r['id'],
            'question_id': r['question_id'],
            'course': r['course'],
            # answers are truncated to 200 characters and kept on a single line
            'student_answer': r['student_answer'][:200].replace('\n', ' '),
            'grade_category': r['grade_category'],
            'instructor_feedback_short': r['instructor_feedback']['short_comment'],
            'overall_score': r['instructor_feedback']['overall_score'],
            'max_overall': r['instructor_feedback']['max_overall']
        }
        for r in records[:200]
    )

print(f"‚úì Generated {len(records)} training instances")
print(f"‚úì Saved to {out_json}")
print(f"‚úì CSV sample saved to {out_csv}")

# Print the first record as a quick sanity check
print("\n" + "="*80)
print("SAMPLE GENERATED RECORD")
print("="*80)
sample = records[0]
sample_feedback = sample['instructor_feedback']
print(f"Course: {sample['course']}")
print(f"Question: {sample['prompt']}")
print(f"Student Answer: {sample['student_answer']}")
print(f"Grade Category: {sample['grade_category']}")
print(f"Feedback: {sample_feedback['short_comment']}")
print(f"Score: {sample_feedback['overall_score']}/{sample_feedback['max_overall']}")


### 4. Data Preparation for Training

#### 4.1 Format Data for Model Training

In [None]:
def format_rubric_for_prompt(rubric):
    """Render a rubric as a Markdown bullet list under a 'Marking Rubric' heading.

    Each criterion becomes one line with its code, name, maximum marks and
    description; every line (heading included) ends with a newline.
    """
    bullet_lines = [
        f"- **[{c['code']}] {c['criterion']}** ({c['max_score']} marks): {c['description']}\n"
        for c in rubric
    ]
    return "**Marking Rubric:**\n" + "".join(bullet_lines)

def create_training_example(record, include_scores=False):
    """Turn one generated record into an instruction/response training example.

    The instruction contains the course, question, rendered rubric and the
    student answer; the response is the record's short instructor comment.
    A small metadata dict is carried along for later analysis.

    include_scores is accepted but currently has no effect.
    """
    rubric_text = format_rubric_for_prompt(record['rubric'])

    instruction = f"""You are a university teaching assistant providing constructive feedback on student answers.

**Course:** {record['course']}
**Question:** {record['prompt']}

{rubric_text}
**Student Answer:** {record['student_answer']}

Provide brief, constructive feedback (2-3 sentences):"""

    feedback = record['instructor_feedback']
    metadata = {
        'question_id': record['question_id'],
        'course': record['course'],
        'grade_category': record['grade_category'],
        'overall_score': feedback['overall_score'],
        'max_score': feedback['max_overall']
    }

    return {
        'instruction': instruction,
        'response': feedback['short_comment'],
        'metadata': metadata
    }

# Turn every generated record into an instruction/response pair
print("Converting records to training format...")
examples = [create_training_example(r) for r in records]

# Randomise the order before splitting
random.shuffle(examples)

# First carve off the combined validation+test share, then divide that
# holdout between validation and test in proportion to their configured sizes
holdout_fraction = config.test_split_size + config.val_split_size
train_examples, temp_examples = train_test_split(
    examples,
    test_size=holdout_fraction,
    random_state=42
)

test_share_of_holdout = config.test_split_size / (config.test_split_size + config.val_split_size)
val_examples, test_examples = train_test_split(
    temp_examples,
    test_size=test_share_of_holdout,
    random_state=42
)

print(f"‚úì Split: {len(train_examples)} train, {len(val_examples)} val, {len(test_examples)} test")

# Wrap each split in a HuggingFace Dataset
train_dataset = Dataset.from_list(train_examples)
val_dataset = Dataset.from_list(val_examples)
test_dataset = Dataset.from_list(test_examples)

# Write each split to <DATA_PROCESSED_DIR>/<split>.json
for split_name, split_dataset in (
    ('train', train_dataset),
    ('val', val_dataset),
    ('test', test_dataset),
):
    split_dataset.to_json(f'{config.DATA_PROCESSED_DIR}/{split_name}.json')

print(f"‚úì Saved datasets to {config.DATA_PROCESSED_DIR}/")

# Display the first training example for a visual check
first_example = train_dataset[0]
print("\n" + "="*80)
print("SAMPLE TRAINING EXAMPLE")
print("="*80)
print("\nINSTRUCTION:")
print(first_example['instruction'])
print("\nRESPONSE:")
print(first_example['response'])
print("="*80)


 ### 5. Model Fine-Tuning with Unsloth

#### 5.1 Load and Configure Model

In [None]:
def setup_model():
    """Load the base model, attach LoRA adapters and apply the Phi-3 chat template.

    All settings come from the module-level config object.

    Returns:
        (model, tokenizer) ready for fine-tuning.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("LOADING MODEL")
    print(banner)

    # Base checkpoint; dtype=None leaves the dtype choice to the loader
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=config.model_name,
        max_seq_length=config.max_seq_length,
        dtype=None,
        load_in_4bit=config.load_in_4bit,
    )
    print(f"‚úì Loaded {config.model_name}")

    # LoRA is attached to the attention and MLP projection layers
    lora_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    lora_options = dict(
        r=config.lora_r,
        target_modules=lora_targets,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
    )
    model = FastLanguageModel.get_peft_model(model, **lora_options)
    print(f"‚úì LoRA configured (r={config.lora_r}, alpha={config.lora_alpha})")

    # Make the tokenizer format conversations with the Phi-3 template
    tokenizer = get_chat_template(tokenizer, chat_template="phi-3")
    print("‚úì Chat template applied")

    return model, tokenizer

# Load model
model, tokenizer = setup_model()

#### 5.2 Format Datasets for Training

In [None]:
def formatting_prompts_func(examples, tokenizer):
    """Render a batch of instruction/response pairs as chat-template text.

    Args:
        examples: Batched columns with parallel "instruction" and "response" lists.
        tokenizer: Tokenizer whose apply_chat_template produces the final text.

    Returns:
        Dict with a single "text" column, one rendered conversation per pair.
    """
    def render(user_turn, assistant_turn):
        # Full conversation (user + assistant); no generation prompt because
        # the assistant reply is already present
        conversation = [
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn}
        ]
        return tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False
        )

    pairs = zip(examples["instruction"], examples["response"])
    return {"text": [render(question, answer) for question, answer in pairs]}

print("\n" + "="*80)
print("FORMATTING DATASETS")
print("="*80)

def _to_chat_text(batch):
    """Apply the chat template to one batch using the training tokenizer."""
    return formatting_prompts_func(batch, tokenizer)

# Replace the original columns with a single chat-formatted "text" column
train_dataset_formatted = train_dataset.map(
    _to_chat_text,
    batched=True,
    remove_columns=train_dataset.column_names
)
val_dataset_formatted = val_dataset.map(
    _to_chat_text,
    batched=True,
    remove_columns=val_dataset.column_names
)

print("‚úì Datasets formatted for training")

# Report GPU name, total memory and memory reserved so far (all in GB)
if torch.cuda.is_available():
    gpu_stats = torch.cuda.get_device_properties(0)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    print(f"\n‚úì GPU: {gpu_stats.name}")
    print(f"‚úì Max memory: {max_memory} GB")
    print(f"‚úì Reserved memory: {start_gpu_memory} GB")

#### 5.3 Train the Model

In [None]:
# Fine-tune the LoRA-wrapped model on the chat-formatted training split,
# evaluating on the validation split, then report runtime and peak GPU memory.
print("\n" + "="*80)
print("TRAINING")
print("="*80)

# Training arguments
# NOTE(review): eval_steps and save_steps are both 50 in Config. With packing
# enabled below, the total number of optimizer steps may be smaller than that,
# in which case no intermediate evaluation or checkpoint would happen -- confirm
# against the step count printed by the trainer.
training_args = TrainingArguments(
    output_dir=config.output_model_dir,
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=config.learning_rate,
    num_train_epochs=config.num_epochs,
    warmup_steps=config.warmup_steps,
    logging_steps=config.logging_steps,
    eval_strategy="steps",
    eval_steps=config.eval_steps,
    save_steps=config.save_steps,
    save_total_limit=config.save_total_limit,
    # Use bf16 when the GPU reports support for it, otherwise fall back to fp16
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # This seed (3407) differs from the notebook-level SEED (42) set in the imports cell
    seed=3407,
    report_to="none"  # Change to "wandb" if you want to use Weights & Biases
)

# Initialize trainer
# NOTE(review): packing=True is combined with DataCollatorForSeq2Seq here;
# whether the two work together presumably depends on the installed TRL
# version -- verify that batches contain labels and a loss is produced.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset_formatted,
    eval_dataset=val_dataset_formatted,
    dataset_text_field="text",
    max_seq_length=config.max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=True,
    args=training_args,
)

# Train
print("\nStarting training...")
trainer_stats = trainer.train()

# Show stats
print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)
# train_runtime is divided by 60 and labelled as minutes
print(f"‚úì Time: {round(trainer_stats.metrics['train_runtime']/60, 2)} minutes")

if torch.cuda.is_available():
    # Peak reserved CUDA memory so far, converted from bytes to GB
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    print(f"‚úì Peak GPU memory: {used_memory} GB")

#### 5.4 Save Model

In [None]:
print("\n" + "="*80)
print("SAVING MODEL")
print("="*80)

# Adapter weights and tokenizer files go into the same folder
adapter_dir = f"{config.output_model_dir}/lora_adapters"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

print(f"‚úì Model saved to {adapter_dir}")

# Also export a standalone copy using the "merged_16bit" save method
merged_dir = f"{config.output_model_dir}/merged_model"
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")

print(f"‚úì Merged model saved to {merged_dir}")


### 6. Model Testing and Evaluation

#### 6.1 Load Fine-tuned Model for Inference

In [None]:
def load_finetuned_model(model_path=f"{config.output_model_dir}/lora_adapters"):
    """Load the fine-tuned model for inference.

    Args:
        model_path: Directory holding the saved LoRA adapters and tokenizer.
            Defaults to the adapter directory under config.output_model_dir.

    Returns:
        (model, tokenizer) with the model switched to inference mode.
    """
    # Use the same sequence length and quantisation settings as training
    # rather than repeating them as literals, so the two cannot drift apart.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=config.max_seq_length,
        dtype=None,
        load_in_4bit=config.load_in_4bit,
    )

    # Enable fast inference
    FastLanguageModel.for_inference(model)

    print(f"‚úì Model loaded from {model_path}")
    return model, tokenizer

# Load the fine-tuned model
inference_model, inference_tokenizer = load_finetuned_model()


#### 6.2 Inference Function

In [None]:
def generate_feedback(model, tokenizer, question, rubric, student_answer, course=""):
    """Generate feedback for a student answer.

    Builds a prompt with the same layout that create_training_example uses for
    the fine-tuning data, samples a completion and returns only the newly
    generated text.

    Args:
        model: Fine-tuned model exposing generate() and a device attribute.
        tokenizer: Matching tokenizer with a chat template.
        question: Question text.
        rubric: List of criterion dicts (code, criterion, max_score, description).
        student_answer: The answer to give feedback on.
        course: Optional course name; the Course line is omitted when empty.

    Returns:
        The generated feedback string, stripped of surrounding whitespace.
    """
    rubric_text = format_rubric_for_prompt(rubric)

    instruction = "You are a university teaching assistant providing constructive feedback on student answers."

    # Training prompts put Course and Question on consecutive lines (a single
    # newline between them); mirror that so inference inputs match the
    # fine-tuning data.
    if course:
        instruction += f"\n\n**Course:** {course}\n**Question:** {question}"
    else:
        instruction += f"\n\n**Question:** {question}"

    instruction += f"""

{rubric_text}
**Student Answer:** {student_answer}

Provide brief, constructive feedback (2-3 sentences):"""

    # Render as a single user turn followed by the assistant generation prompt
    messages = [{"role": "user", "content": instruction}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        use_cache=True,
    )

    # The returned sequence starts with the prompt tokens. Decode only what
    # follows them: with skip_special_tokens=True the "<|assistant|>" marker is
    # removed from the text, so it cannot be used to locate the reply, and
    # splitting on the instruction text relies on an exact decode round-trip.
    completion_ids = outputs[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

print("‚úì Inference function ready")

#### 6.3 Test on Sample Examples

In [None]:
print("\n" + "="*80)
print("TESTING ON SAMPLE EXAMPLES")
print("="*80)

# Generate feedback for up to five test examples and print it next to the
# expected (synthetic) feedback for a side-by-side look
for i in range(min(5, len(test_dataset))):
    example = test_dataset[i]
    
    print(f"\n--- Example {i+1} ---")
    
    # Extract question (simple parsing): the text between the "**Question:**"
    # label and the rubric heading
    instruction = example['instruction']
    question_start = instruction.find("**Question:**") + len("**Question:**")
    question_end = instruction.find("**Marking Rubric:**")
    question = instruction[question_start:question_end].strip()
    
    print(f"Question: {question[:100]}...")
    print(f"\nExpected Feedback: {example['response']}")
    
    # Generate
    messages = [{"role": "user", "content": instruction}]
    prompt = inference_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = inference_tokenizer(prompt, return_tensors="pt").to(inference_model.device)
    outputs = inference_model.generate(
        **inputs, max_new_tokens=256, temperature=0.7,
        do_sample=True, repetition_penalty=1.1
    )
    
    # The output sequence begins with the prompt tokens. Decode only the tokens
    # after them: skip_special_tokens=True removes the "<|assistant|>" marker,
    # so the reply cannot be located by searching the decoded text for it, and
    # splitting on the instruction depends on an exact decode round-trip.
    prompt_length = inputs["input_ids"].shape[1]
    generated = inference_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    generated_feedback = generated.strip()
    
    print(f"Generated Feedback: {generated_feedback}")
    print("-" * 80)

#### 6.4 Interactive Testing

In [None]:
def interactive_test(model, tokenizer):
    """Prompt for questions and student answers on stdin and print generated feedback.

    Uses a fixed three-criterion example rubric and the course name
    "Computer Science". Entering 'quit' (any letter case) at either prompt
    ends the session.
    """
    print("\n" + "="*80)
    print("INTERACTIVE TESTING MODE")
    print("="*80)
    print("Enter question and student answer to get feedback")
    print("Type 'quit' to exit\n")

    # Generic rubric applied to every question typed in; each criterion is worth 5 marks
    rubric_rows = [
        ("C1", "Definition correctness", "Clear definition with key concepts"),
        ("C2", "Explanation completeness", "Complete explanation of the concept"),
        ("C3", "Clarity and conciseness", "Clear and concise answer"),
    ]
    example_rubric = [
        {"code": code, "criterion": criterion, "max_score": 5, "description": description}
        for code, criterion, description in rubric_rows
    ]

    def wants_exit(text):
        return text.lower() == 'quit'

    while True:
        print("\n" + "-"*80)

        question = input("Question: ")
        if wants_exit(question):
            break

        student_answer = input("Student answer: ")
        if wants_exit(student_answer):
            break

        feedback = generate_feedback(
            model, tokenizer,
            question=question,
            rubric=example_rubric,
            student_answer=student_answer,
            course="Computer Science"
        )

        print(f"\nüìù Generated Feedback:\n{feedback}\n")

# Uncomment to run interactive testing
# interactive_test(inference_model, inference_tokenizer)


#### 6.5 Quantitative Evaluation

In [None]:
print("\n" + "="*80)
print("QUANTITATIVE EVALUATION")
print("="*80)

# Generate feedback for every test example and keep it next to the expected text
results = []

for i, example in enumerate(test_dataset):
    instruction = example['instruction']
    
    # Generate
    messages = [{"role": "user", "content": instruction}]
    prompt = inference_tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = inference_tokenizer(prompt, return_tensors="pt").to(inference_model.device)
    outputs = inference_model.generate(
        **inputs, max_new_tokens=256, temperature=0.7,
        do_sample=True, repetition_penalty=1.1
    )
    
    # The output sequence begins with the prompt tokens. Decode only the tokens
    # after them: skip_special_tokens=True removes the "<|assistant|>" marker,
    # so the reply cannot be located by searching the decoded text for it, and
    # splitting on the instruction depends on an exact decode round-trip.
    prompt_length = inputs["input_ids"].shape[1]
    generated = inference_tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    generated_feedback = generated.strip()
    
    results.append({
        'example_id': i,
        'question_id': example['metadata']['question_id'],
        'grade_category': example['metadata']['grade_category'],
        'expected': example['response'],
        'generated': generated_feedback
    })

# Save results (explicit encoding so the file does not depend on the platform default)
with open('evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"‚úì Evaluated {len(results)} test examples")
print(f"‚úì Results saved to evaluation_results.json")

# Count how many test examples fall into each grade category
grade_categories = {}
for r in results:
    cat = r['grade_category']
    grade_categories[cat] = grade_categories.get(cat, 0) + 1

print("\nTest set distribution:")
for cat, count in sorted(grade_categories.items()):
    print(f"  {cat}: {count} examples")


### 7. Summary and Next Steps

In [None]:
# Closing banner: what was done, where the model lives, and what to try next
print("\n" + "="*80)
print("üéâ PIPELINE COMPLETE!")
print("="*80)

for accomplished_line in (
    "\nWhat we accomplished:",
    "‚úì Generated synthetic dataset with rubric-aligned feedback",
    "‚úì Prepared data for fine-tuning",
    "‚úì Fine-tuned Phi-3 model using Unsloth with LoRA",
    "‚úì Evaluated model on test set",
):
    print(accomplished_line)

print("\nModel location:", config.output_model_dir)

for next_step_line in (
    "\nNext steps:",
    "1. Review evaluation_results.json for quality assessment",
    "2. Try interactive testing with your own examples",
    "3. Experiment with different hyperparameters",
    "4. Add more questions and rubrics to expand dataset",
    "5. Implement automatic metrics (BLEU, ROUGE, BERTScore)",
    "6. Deploy model for production use",
):
    print(next_step_line)

print("="*80)