In [None]:
%%capture

%pip install pip3-autoremove
%pip-autoremove torch torchvision torchaudio -y
%pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
%pip install unsloth

In [2]:
import torch
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available.")

GPU available: Tesla T4


In [3]:
# Commented out IPython magic to ensure Python compatibility.
# %%capture
# # Installs Unsloth, Xformers (Flash Attention) and all other packages!
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps xformers trl peft accelerate bitsandbytes

"""* We support Llama, Mistral, CodeLlama, TinyLlama, Vicuna, Open Hermes etc
* And Yi, Qwen ([llamafied](https://huggingface.co/models?sort=trending&search=qwen+llama)), Deepseek, all Llama, Mistral derived archs.
* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
* With [PR 26037](https://github.com/huggingface/transformers/pull/26037), we support downloading 4bit models **4x faster**! [Our repo](https://huggingface.co/unsloth) has Llama, Mistral 4bit models.
* [**NEW**] We make Gemma 6 trillion tokens **2.5x faster**! See our [Gemma notebook](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing)
"""

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,

)


"""We now add LoRA adapters so we only need to update 1 to 10% of all parameters!"""

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

from datasets import load_dataset

alpaca_prompt = """Below is an instruction that describes how to grade an essay, paired with an input that provides the grading schema. Write a response that grades essays based on the mark schema provided.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Add EOS token to stop generation

def formatting_prompts_func(examples):
    texts = []
    for q, ra, sa, ms, score in zip(examples["question"], examples["reference_answer"], examples["student_answer"], examples["mark_scheme"], examples["score"]):
        # Convert mark_scheme dict to string
        mark_scheme_str = "\n".join([f"{k}: {v}" for k, v in ms.items()])
        instruction = "Grade this essay based on the following mark scheme:\n" + mark_scheme_str
        input_text = f"Question: {q}\nReference Answer: {ra}\nStudent Answer: {sa}"
        output_text = str(score)

        # Format full prompt
        text = alpaca_prompt.format(instruction, input_text, output_text) + EOS_TOKEN
        texts.append(text)
    return { "text": texts }

# Load dataset and apply formatting
dataset = load_dataset("sue888888888888/essay_grading_for_instruction_tuning", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)




def main():
    from trl import SFTTrainer
    from transformers import TrainingArguments
    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = 5,
            max_steps = 60, # Set num_train_epochs = 1 for full training runs
            learning_rate = 2e-4,
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
        ),
    )

    #@title Show current memory stats
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")

    trainer_stats = trainer.train()

    #@title Show final memory and time stats
    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory         /max_memory*100, 3)
    lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
    print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
    print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


    # alpaca_prompt = Copied from above
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer(
    [
        alpaca_prompt.format(
            "Grade this essay based on the following mark scheme:\n1: Defines photosynthesis correctly\n2: Mentions sunlight as an energy source\n3: Includes carbon dioxide and water as inputs\n4: Mentions oxygen or glucose as products ", # instruction

            "What is photosynthesis?\nReference Answer: Photosynthesis is the process by which green plants make their own food using sunlight, carbon dioxide, and water. The process occurs in the chloroplasts and produces glucose and oxygen as end products.\nStudent Answer: Photosynthesis is when plants eat sunlight and turn it into food and air.", # input

            "", # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    tokenizer.batch_decode(outputs)

    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    print(decoded_output[0])


if __name__ == "__main__":
    main()

# %%


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.5.6: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.5.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


updated_data.json:   0%|          | 0.00/41.4k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/58 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/58 [00:00<?, ? examples/s]

GPU = Tesla T4. Max memory = 14.741 GB.
7.043 GB of memory reserved.


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 58 | Num Epochs = 9 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33ms12218588[0m ([33ms12218588-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.2771
2,2.5114
3,2.0812
4,1.7641
5,1.2466
6,1.0009
7,0.7438
8,0.7182
9,0.4382
10,0.3918


369.7277 seconds used for training.
6.16 minutes used for training.
Peak reserved memory = 7.359 GB.
Peak reserved memory for training = 0.316 GB.
Peak reserved memory % of max memory = 49.922 %.
Peak reserved memory for training % of max memory = 2.144 %.
Below is an instruction that describes how to grade an essay, paired with an input that provides the grading schema. Write a response that grades essays based on the mark schema provided.

### Instruction:
Grade this essay based on the following mark scheme:
1: Defines photosynthesis correctly
2: Mentions sunlight as an energy source
3: Includes carbon dioxide and water as inputs
4: Mentions oxygen or glucose as products 

### Input:
What is photosynthesis?
Reference Answer: Photosynthesis is the process by which green plants make their own food using sunlight, carbon dioxide, and water. The process occurs in the chloroplasts and produces glucose and oxygen as end products.
Student Answer: Photosynthesis is when plants eat sunlight a

In [4]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, f1_score
import re
import pandas as pd
from tqdm import tqdm

# --- Load fine-tuned model ---
max_seq_length = 2048
dtype = None  # Auto detection
load_in_4bit = True

# Check GPU availability
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    device = "cuda"
else:
    print("No GPU available. Using CPU (will be slow).")
    device = "cpu"

# Path to your fine-tuned model (change if needed)
model_path = "outputs"  # The output_dir from your training script

try:
    # Load the model - if this fails, may need to specify the exact checkpoint
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )

    # Set model to evaluation mode
    FastLanguageModel.for_inference(model)

except Exception as e:
    print(f"Error loading fine-tuned model: {e}")
    print("Falling back to base model (unsloth/mistral-7b-instruct-v0.2-bnb-4bit)")

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# --- Prepare evaluation dataset ---
# Load test split or use a portion of train data if test not available
try:
    eval_dataset = load_dataset("sue888888888888/essay_grading_for_instruction_tuning", split="test")
    print(f"Loaded test split with {len(eval_dataset)} examples")
except:
    # If no test split, use a portion of train data
    dataset = load_dataset("sue888888888888/essay_grading_for_instruction_tuning", split="train")
    # Use 20% of data for evaluation
    train_size = int(0.8 * len(dataset))
    eval_dataset = dataset.select(range(train_size, len(dataset)))
    print(f"No test split found. Using {len(eval_dataset)} examples from train split for evaluation")

# --- Define prompt template ---
# Same template as used in training
alpaca_prompt = """Below is an instruction that describes how to grade an essay, paired with an input that provides the grading schema. Write a response that grades essays based on the mark schema provided.

### Instruction:
{}

### Input:
{}

### Response:
"""

# --- Evaluation function ---
def evaluate_model(model, tokenizer, dataset, num_samples=None):
    if num_samples is not None:
        if num_samples > len(dataset):
            num_samples = len(dataset)
        indices = np.random.choice(len(dataset), num_samples, replace=False)
        dataset = dataset.select(indices)

    results = []
    true_scores = []
    pred_scores = []

    # Process each example in the dataset
    for idx, example in enumerate(tqdm(dataset, desc="Evaluating")):
        # Format prompt
        mark_scheme_str = "\n".join([f"{k}: {v}" for k, v in example["mark_scheme"].items()])
        instruction = "Grade this essay based on the following mark scheme:\n" + mark_scheme_str
        input_text = f"Question: {example['question']}\nReference Answer: {example['reference_answer']}\nStudent Answer: {example['student_answer']}"

        # Generate score
        prompt = alpaca_prompt.format(instruction, input_text)

        inputs = tokenizer(prompt, return_tensors="pt").to(device)

        # Generate with modest parameters
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            temperature=0.1,  # Low temperature for more deterministic output
            top_p=0.9,
            do_sample=False,  # Greedy decoding for evaluation
            use_cache=True
        )

        # Decode the output
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the generated score - look for the first number in the response
        response_part = generated_text.split("### Response:")[-1].strip()

        # Extract numeric score using regex
        score_match = re.search(r'\b(\d+)\b', response_part)
        pred_score = int(score_match.group(1)) if score_match else None

        true_score = example["score"]

        results.append({
            "index": idx,
            "question": example["question"],
            "student_answer": example["student_answer"][:100] + "...",  # Truncate for display
            "true_score": true_score,
            "pred_score": pred_score,
            "correct": pred_score == true_score if pred_score is not None else False,
            "full_response": response_part
        })

        if pred_score is not None:
            true_scores.append(true_score)
            pred_scores.append(pred_score)

    # Calculate metrics
    metrics = {}
    if true_scores and pred_scores:
        metrics["accuracy"] = accuracy_score([int(s) for s in true_scores], [int(s) for s in pred_scores])
        metrics["mae"] = mean_absolute_error([int(s) for s in true_scores], [int(s) for s in pred_scores])
        metrics["mse"] = mean_squared_error([int(s) for s in true_scores], [int(s) for s in pred_scores])
        metrics["rmse"] = np.sqrt(metrics["mse"])

        # For multi-class F1
        metrics["f1_macro"] = f1_score(
            [int(s) for s in true_scores],
            [int(s) for s in pred_scores],
            average='macro'
        )

    return results, metrics

# --- Run evaluation ---
# You can adjust the number of samples to evaluate if the dataset is large
num_eval_samples = 50  # Change to None to evaluate on all samples

print(f"Starting evaluation on {num_eval_samples if num_eval_samples else len(eval_dataset)} samples...")
results, metrics = evaluate_model(model, tokenizer, eval_dataset, num_samples=num_eval_samples)

# --- Display results ---
# Summary metrics
print("\n=== EVALUATION METRICS ===")
for metric_name, value in metrics.items():
    print(f"{metric_name}: {value:.4f}")

# Create and display results dataframe
results_df = pd.DataFrame(results)
print("\n=== SAMPLE RESULTS ===")
print(results_df[["question", "true_score", "pred_score", "correct"]].head(10))

# Calculate distribution of scores
if results_df["pred_score"].notna().any():
    print("\n=== SCORE DISTRIBUTION ===")
    print("True scores distribution:")
    print(results_df["true_score"].value_counts().sort_index())
    print("\nPredicted scores distribution:")
    print(results_df["pred_score"].value_counts().sort_index())

# Save detailed results to CSV
results_df.to_csv("evaluation_results.csv", index=False)
print("\nDetailed results saved to 'evaluation_results.csv'")

# --- Error Analysis ---
if results_df["pred_score"].notna().any():
    print("\n=== ERROR ANALYSIS ===")

    # Find examples with largest errors
    results_df["error"] = abs(results_df["true_score"] - results_df["pred_score"])
    largest_errors = results_df.nlargest(5, "error")

    print("Examples with largest errors:")
    for _, row in largest_errors.iterrows():
        print(f"\nQuestion: {row['question']}")
        print(f"True score: {row['true_score']}, Predicted: {row['pred_score']}, Error: {row['error']}")

GPU available: Tesla T4
Error loading fine-tuned model: Unsloth: Failed to load model. Both AutoConfig and PeftConfig loading failed.

AutoConfig error: Unrecognized model in outputs. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, din

Evaluating: 100%|██████████| 12/12 [00:47<00:00,  3.96s/it]


=== EVALUATION METRICS ===
accuracy: 0.3333
mae: 1.3333
mse: 4.0000
rmse: 2.0000
f1_macro: 0.2111

=== SAMPLE RESULTS ===
                                            question  true_score  pred_score  \
0                 What are renewable energy sources?           0         0.0   
1                           What is the water cycle?           0         1.0   
2                        Why is recycling important?           4         4.0   
3                 Describe how photosynthesis works.           0         NaN   
4                        Why is recycling important?           3         4.0   
5                                 What is democracy?           0         1.0   
6                         Explain how gravity works.           0         1.0   
7                            What is photosynthesis?           0         0.0   
8                            What is photosynthesis?           0         4.0   
9  What is the function of the heart in the human...           0         NaN 




In [6]:
from unsloth import FastLanguageModel

# Enable 2x faster inference
FastLanguageModel.for_inference(model)

# Define your input
instruction = """Grade this essay based on the following mark scheme:
1: Defines gravity
2: Mentions gravitational force
3: Provides an example like an apple falling or planetary motion"""

input_text = """Question: What is gravity?
Reference Answer: Gravity is the force that attracts objects with mass towards each other, like how the Earth pulls everything down.
Student Answer: Gravity is the force ."""

# Format the prompt
custom_prompt = alpaca_prompt.format(instruction, input_text, "")  # Leave output blank

# Tokenize and run inference
inputs = tokenizer([custom_prompt], return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)

# Decode and print the answer
answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print("Model's Response:\n", answer)


Model's Response:
 Below is an instruction that describes how to grade an essay, paired with an input that provides the grading schema. Write a response that grades essays based on the mark schema provided.

### Instruction:
Grade this essay based on the following mark scheme:
1: Defines gravity
2: Mentions gravitational force
3: Provides an example like an apple falling or planetary motion

### Input:
Question: What is gravity?
Reference Answer: Gravity is the force that attracts objects with mass towards each other, like how the Earth pulls everything down.
Student Answer: Gravity is the force .

### Response:
Based on the mark scheme provided, the student's answer would receive a grade of 1. The student has defined gravity as a force, which meets the first criterion of the mark scheme. However, the student's answer could be improved by adding more detail, such as mentioning that gravity attracts
