In [None]:
%%capture

%pip install pip3-autoremove
%pip-autoremove torch torchvision torchaudio -y
%pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
%pip install unsloth

In [2]:
import torch
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available.")

GPU available: Tesla T4


## Fine-tune the model

In [4]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
from transformers import TrainingArguments, EarlyStoppingCallback
from trl import SFTTrainer
import numpy as np

# CONFIGURATION FOR REDUCED OVERFITTING
max_seq_length = 4096
dtype = None
load_in_4bit = True

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# REDUCED LORA CONFIGURATION TO PREVENT OVERFITTING
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,  # REDUCED from 64 - smaller rank prevents overfitting
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],  # REMOVED lm_head
    lora_alpha = 16,  # REDUCED from 32 - lower alpha reduces adaptation strength
    lora_dropout = 0.1,  # INCREASED from 0.05 - more dropout for regularization
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

# IMPROVED PROMPT TEMPLATE
alpaca_prompt = """You are an expert essay grader. Grade the following essay based on the provided marking scheme and return only the numerical score.

### Marking Scheme:
{}

### Question:
{}

### Reference Answer:
{}

### Student Answer:
{}

### Score:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    texts = []
    for q, ra, sa, ms, score in zip(examples["question"], examples["reference_answer"],
                                   examples["student_answer"], examples["mark_scheme"], examples["score"]):
        mark_scheme_str = "\n".join([f"Criterion {i+1}: {k} - {v}"
                                   for i, (k, v) in enumerate(ms.items())])

        text = alpaca_prompt.format(
            mark_scheme_str,
            q,
            ra,
            sa,
            str(score)
        ) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Load and prepare dataset with LARGER validation split
dataset = load_dataset("sue888888888888/essay_grading_for_instruction_tuning", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

# LARGER validation split to better detect overfitting
dataset_split = dataset.train_test_split(test_size=0.2, seed=3407)
train_dataset = dataset_split['train']
val_dataset = dataset_split['test']

def compute_metrics(eval_pred):
    """Custom metrics to monitor overfitting"""
    predictions, labels = eval_pred
    # This is a placeholder - you'd implement actual scoring metrics here
    return {"custom_metric": 0.0}

def main():
    # ANTI-OVERFITTING TRAINING CONFIGURATION
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=2,
        packing=False,
        # MORE AGGRESSIVE early stopping
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # REDUCED from 3
        args=TrainingArguments(
            # SMALLER batch sizes and more frequent evaluation
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=4,  # REDUCED from 8

            # REDUCED training to prevent overfitting
            num_train_epochs=2,  # REDUCED from 3
            max_steps=500,  # ADD maximum steps as safety net
            learning_rate=2e-5,  # REDUCED from 5e-5 - slower learning
            warmup_ratio=0.1,

            # Precision and optimization
            fp16=not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_bf16_supported(),
            optim="adamw_8bit",
            weight_decay=0.1,  # INCREASED from 0.01 - more regularization
            lr_scheduler_type="cosine",

            # MORE FREQUENT monitoring to catch overfitting early
            logging_steps=5,    # REDUCED from 10
            eval_steps=25,      # REDUCED from 50
            save_steps=50,      # REDUCED from 100
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,

            # ADD: Save only best model to prevent using overfit checkpoints
            save_total_limit=2,

            # Output
            output_dir="outputs/essay_grader_regularized",
            report_to="none",
            seed=3407,

            # ADD: Gradient clipping to prevent exploding gradients
            max_grad_norm=1.0,
        ),
    )

    # Memory stats
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    # Train with monitoring
    print("Starting training with overfitting prevention measures...")
    trainer_stats = trainer.train()

        # VALIDATION: Check for overfitting by comparing train vs eval loss
    eval_results = trainer.evaluate()
    eval_loss = eval_results['eval_loss']

    # SAFELY get final training loss
    train_loss = None
    for record in reversed(trainer.state.log_history):
        if "loss" in record:
            train_loss = record["loss"]
            break

    if train_loss is None:
        print("⚠️ Could not retrieve training loss from log history.")
        train_loss = 0.0  # Fallback to avoid crash

    print(f"\nFinal Training Loss: {train_loss:.4f}")
    print(f"Final Validation Loss: {eval_loss:.4f}")
    print(f"Loss Difference (Eval - Train): {eval_loss - train_loss:.4f}")

    if eval_loss - train_loss > 0.5:
        print("⚠️  WARNING: Large gap between train and validation loss suggests overfitting!")
    else:
        print("✅ Train/validation loss gap looks reasonable.")


    # Final memory stats
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

    print(f"\n{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

    # ROBUST INFERENCE FUNCTION
    def grade_essay(question, reference_answer, student_answer, mark_scheme_dict, return_confidence=False):
        """Grade an essay with optional confidence scoring"""
        FastLanguageModel.for_inference(model)

        mark_scheme_str = "\n".join([f"Criterion {i+1}: {k} - {v}"
                                   for i, (k, v) in enumerate(mark_scheme_dict.items())])

        prompt = alpaca_prompt.format(
            mark_scheme_str,
            question,
            reference_answer,
            student_answer,
            ""
        )

        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        # Generate multiple samples to check consistency (reduces overfitting effects)
        scores = []
        for _ in range(3):  # Generate 3 samples
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=10,
                    temperature=0.3,    # Slightly higher temperature
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id
                )

            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            generated_text = decoded.split("### Score:")[-1].strip()

            try:
                score = float(generated_text.split()[0])
                scores.append(score)
            except:
                pass

        if scores:
            avg_score = np.mean(scores)
            confidence = 1.0 - (np.std(scores) / (np.mean(scores) + 1e-8))  # Higher std = lower confidence

            if return_confidence:
                return avg_score, confidence
            return avg_score
        else:
            return "Error: Could not parse score"

    # TEST WITH MULTIPLE EXAMPLES
    test_cases = [
        {
            "question": "What is photosynthesis?",
            "reference": "Photosynthesis is the process by which green plants make their own food using sunlight, carbon dioxide, and water. The process occurs in the chloroplasts and produces glucose and oxygen as end products.",
            "student": "Photosynthesis is when plants eat sunlight and turn it into food and air.",
            "mark_scheme": {
                "Defines photosynthesis correctly": "2 points",
                "Mentions sunlight as energy source": "1 point",
                "Includes CO2 and water as inputs": "1 point",
                "Mentions oxygen/glucose as products": "1 point"
            }
        },
        {
            "question": "Explain Newton's first law of motion.",
            "reference": "Newton's first law states that an object at rest stays at rest and an object in motion stays in motion with the same speed and in the same direction unless acted upon by an unbalanced force.",
            "student": "Things don't move unless you push them, and moving things keep moving unless something stops them.",
            "mark_scheme": {
                "States the law correctly": "3 points",
                "Mentions rest and motion": "1 point",
                "Mentions unbalanced force": "1 point"
            }
        }
    ]

    print("\n" + "="*50)
    print("TESTING MODEL WITH CONFIDENCE SCORES")
    print("="*50)

    for i, test_case in enumerate(test_cases, 1):
        score, confidence = grade_essay(
            test_case["question"],
            test_case["reference"],
            test_case["student"],
            test_case["mark_scheme"],
            return_confidence=True
        )
        print(f"\nTest Case {i}:")
        print(f"Score: {score:.2f}")
        print(f"Confidence: {confidence:.2f}")

        if confidence < 0.8:
            print("⚠️  Low confidence - model may be uncertain or overfitted")

    # Save the model
    model.save_pretrained("essay_grader_regularized")
    tokenizer.save_pretrained("essay_grader_regularized")
    print(f"\nModel saved to 'essay_grader_regularized' directory")

    return trainer, eval_loss - train_loss  # Return gap for monitoring

if __name__ == "__main__":
    trainer, loss_gap = main()

    if loss_gap > 0.5:
        print("\n" + "="*60)
        print("OVERFITTING DETECTED - RECOMMENDATIONS:")
        print("="*60)
        print("1. Reduce LoRA rank further (try r=8)")
        print("2. Increase dropout to 0.2")
        print("3. Reduce learning rate to 1e-5")
        print("4. Add more training data if possible")
        print("5. Consider using smaller model")
        print("6. Implement cross-validation")
    else:
        print("\n✅ Training completed successfully with good generalization!")

==((====))==  Unsloth 2025.5.7: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
GPU = Tesla T4. Max memory = 14.741 GB.
10.854 GB of memory reserved.
Starting training with overfitting prevention measures...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 46 | Num Epochs = 46 | Total steps = 500
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)


Step,Training Loss,Validation Loss
25,0.8245,0.670429
50,0.2776,0.232053
75,0.0932,0.176636
100,0.0632,0.192399
125,0.0344,0.173059
150,0.0301,0.167038
175,0.0271,0.186531
200,0.0258,0.187339



Final Training Loss: 0.0258
Final Validation Loss: 0.1670
Loss Difference (Eval - Train): 0.1412
✅ Train/validation loss gap looks reasonable.

841.1534 seconds used for training.
14.02 minutes used for training.
Peak reserved memory = 11.238 GB.
Peak reserved memory for training = 0.384 GB.
Peak reserved memory % of max memory = 76.236 %.
Peak reserved memory for training % of max memory = 2.605 %.

TESTING MODEL WITH CONFIDENCE SCORES

Test Case 1:
Score: 1.00
Confidence: 1.00

Test Case 2:
Score: 2.00
Confidence: 1.00

Model saved to 'essay_grader_regularized' directory

✅ Training completed successfully with good generalization!


## Test on unseen samples

In [3]:
import json

path = "generated_essay_grading_samples.json"

# Load JSON into a list of dicts
with open(path, "r", encoding="utf-8") as f:
    eval_dataset2 = json.load(f)  # Should be a list of dicts

print(f"Loaded {len(eval_dataset2)} examples from JSON file")

# Then call evaluate_model with the loaded data


Loaded 40 examples from JSON file


In [4]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, f1_score
import re
import pandas as pd
from tqdm import tqdm

# --- Load fine-tuned model ---
max_seq_length = 2048
dtype = None  # Auto detection
load_in_4bit = True

# Check GPU availability
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    device = "cuda"
else:
    print("No GPU available. Using CPU (will be slow).")
    device = "cpu"

# Path to your fine-tuned model (change if needed)
model_path = "essay_grader_regularized"  # The output_dir from your training script

try:
    # Load the model - if this fails, may need to specify the exact checkpoint
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    # Set model to evaluation mode
    FastLanguageModel.for_inference(model)

except Exception as e:
    print(f"Error loading fine-tuned model: {e}")
    print("Falling back to base model (unsloth/mistral-7b-instruct-v0.2-bnb-4bit)")

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# --- Prepare evaluation dataset ---
# Load test split or use a portion of train data if test not available
try:
    eval_dataset = val_dataset
    print(f"Loaded test split with {len(eval_dataset)} examples")
except:
    # If no test split, use a portion of train data
    dataset = load_dataset("sue888888888888/essay_grading_for_instruction_tuning", split="train")
    # Use 20% of data for evaluation
    train_size = int(0.8 * len(dataset))
    eval_dataset = dataset.select(range(train_size, len(dataset)))
    print(f"No test split found. Using {len(eval_dataset)} examples from train split for evaluation")

# --- Define prompt template ---
# Same template as used in training
alpaca_prompt = """Below is an instruction that describes how to grade an essay, paired with an input that provides the grading schema. Write a response that grades essays based on the mark schema provided.

### Instruction:
{}

### Input:
{}

### Response:
"""

# --- Evaluation function ---
def evaluate_model(model, tokenizer, dataset, num_samples=None):
    if num_samples is not None:
        if num_samples > len(dataset):
            num_samples = len(dataset)
        indices = np.random.choice(len(dataset), num_samples, replace=False)
        dataset = dataset.select(indices)

    results = []
    true_scores = []
    pred_scores = []

    # Process each example in the dataset
    for idx, example in enumerate(tqdm(dataset, desc="Evaluating")):
        # Format prompt
        mark_scheme_str = "\n".join([f"{k}: {v}" for k, v in example["mark_scheme"].items()])
        instruction = "Grade this essay based on the following mark scheme:\n" + mark_scheme_str
        input_text = f"Question: {example['question']}\nReference Answer: {example['reference_answer']}\nStudent Answer: {example['student_answer']}"

        # Generate score
        prompt = alpaca_prompt.format(instruction, input_text)

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate with modest parameters
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            temperature=0.1,  # Low temperature for more deterministic output
            top_p=0.9,
            do_sample=False,  # Greedy decoding for evaluation
            use_cache=True
        )

        # Decode the output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the generated score - look for the first number in the response
        response_part = generated_text.split("### Response:")[-1].strip()

        # Extract numeric score using regex
        score_match = re.search(r'\b(\d+)\b', response_part)
        pred_score = int(score_match.group(1)) if score_match else None

        true_score = example["score"]

        results.append({
            "index": idx,
            "question": example["question"],
            "student_answer": example["student_answer"][:100] + "...",  # Truncate for display
            "true_score": true_score,
            "pred_score": pred_score,
            "correct": pred_score == true_score if pred_score is not None else False,
            "full_response": response_part
        })

        if pred_score is not None:
            true_scores.append(true_score)
            pred_scores.append(pred_score)

    # Calculate metrics
    metrics = {}
    if true_scores and pred_scores:
        metrics["accuracy"] = accuracy_score([int(s) for s in true_scores], [int(s) for s in pred_scores])
        metrics["mae"] = mean_absolute_error([int(s) for s in true_scores], [int(s) for s in pred_scores])
        metrics["mse"] = mean_squared_error([int(s) for s in true_scores], [int(s) for s in pred_scores])
        metrics["rmse"] = np.sqrt(metrics["mse"])

        # For multi-class F1
        metrics["f1_macro"] = f1_score(
            [int(s) for s in true_scores],
            [int(s) for s in pred_scores],
            average='macro'
        )

    return results, metrics

# --- Run evaluation ---
# You can adjust the number of samples to evaluate if the dataset is large
num_eval_samples = None  # Change to None to evaluate on all samples
print(f"Starting evaluation on {num_eval_samples if num_eval_samples else len(eval_dataset2)} samples...")
results, metrics = evaluate_model(model, tokenizer, eval_dataset2, num_samples=num_eval_samples)

# --- Display results ---
# Summary metrics
print("\n=== EVALUATION METRICS ===")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")

# Create and display results dataframe
results_df = pd.DataFrame(results)
print("\n=== SAMPLE RESULTS ===")
print(results_df[["question", "true_score", "pred_score", "correct"]].head(10))

# Calculate distribution of scores
if results_df["pred_score"].notna().any():
    print("\n=== SCORE DISTRIBUTION ===")
    print("True scores distribution:")
    print(results_df["true_score"].value_counts().sort_index())
    print("\nPredicted scores distribution:")
    print(results_df["pred_score"].value_counts().sort_index())

# Save detailed results to CSV
results_df.to_csv("evaluation_results.csv", index=False)
print("\nDetailed results saved to 'evaluation_results.csv'")

# --- Error Analysis ---
if results_df["pred_score"].notna().any():
    print("\n=== ERROR ANALYSIS ===")

    # Find examples with largest errors
    results_df["error"] = abs(results_df["true_score"] - results_df["pred_score"])
    largest_errors = results_df.nlargest(5, "error")

    print("Examples with largest errors:")
    for _, row in largest_errors.iterrows():
        print(f"\nQuestion: {row['question']}")
        print(f"True score: {row['true_score']}, Predicted: {row['pred_score']}, Error: {row['error']}")


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
GPU available: Tesla T4
==((====))==  Unsloth 2025.5.7: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Will load essay_grader_regularized as a legacy tokenizer.
Unsloth 2025.5.7 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


No test split found. Using 12 examples from train split for evaluation
Starting evaluation on 40 samples...


Evaluating: 100%|██████████| 40/40 [01:45<00:00,  2.65s/it]


=== EVALUATION METRICS ===
accuracy: 0.6250
mae: 0.4000
mse: 0.4500
rmse: 0.6708
f1_macro: 0.5746

=== SAMPLE RESULTS ===
                                          question  true_score  pred_score  \
0    What is a Convolutional Neural Network (CNN)?           1           1   
1    What is a Convolutional Neural Network (CNN)?           2           1   
2    What is a Convolutional Neural Network (CNN)?           3           2   
3    What is a Convolutional Neural Network (CNN)?           4           4   
4  What is a Generative Adversarial Network (GAN)?           1           1   
5  What is a Generative Adversarial Network (GAN)?           2           1   
6  What is a Generative Adversarial Network (GAN)?           3           2   
7  What is a Generative Adversarial Network (GAN)?           4           4   
8                What is YOLO in machine learning?           1           1   
9                What is YOLO in machine learning?           2           1   

   correct  
0    


