# Fine-Tuning T5 Small for Error Explanations in Google Colab

This notebook demonstrates how to fine-tune the Google T5 Small (60M parameters) model to generate structured error explanations in a universal format.

In [None]:
# Install Required Libraries
!pip install transformers torch datasets

In [None]:
# Mount Google Drive so the dataset can be read from, and the trained model
# written to, paths under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Load the pretrained "t5-small" checkpoint together with its tokenizer.
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
# Prepare the Training Dataset
from datasets import Dataset
import json

# Upload the JSON file to Colab or place it in your Drive
# For example, if uploaded: data_path = '/content/error_explanations_dataset.json'
# If in Drive: data_path = '/content/drive/MyDrive/error_explanations_dataset.json'
data_path = '/content/error_explanations_dataset.json'  # Change this path as needed

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fail early with a clear message: the preprocessing step reads the "input"
# and "output" fields of every record, and Dataset.from_list expects a list
# of dicts.
required_keys = {"input", "output"}
if not isinstance(data, list) or not data:
    raise ValueError(f"{data_path} must contain a non-empty JSON list of records")
bad_records = [
    idx for idx, record in enumerate(data)
    if not isinstance(record, dict) or not required_keys <= record.keys()
]
if bad_records:
    raise ValueError(
        f"Records at indices {bad_records[:10]} are missing one of the keys {sorted(required_keys)}"
    )

dataset = Dataset.from_list(data)

def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Args:
        examples: A batch mapping with an "input" list (error messages) and an
            "output" list (target explanations) of equal length.

    Returns:
        The tokenizer output for the prefixed inputs (padded/truncated to 512
        tokens) with an added "labels" entry holding the target token ids
        (padded/truncated to 1024 tokens), where every padding position is
        replaced by -100.
    """
    inputs = ["explain error: " + ex for ex in examples["input"]]
    targets = examples["output"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=1024, truncation=True, padding="max_length")

    # Padding positions must be set to -100 so the cross-entropy loss ignores
    # them; otherwise the loss is computed mostly on pad tokens and the model
    # is trained to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset in batches, then expose only the model-facing
# columns as torch tensors.
tokenized_datasets = dataset.map(preprocess_function, batched=True).with_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

In [None]:
# Define the Training Configuration
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",  # No evaluation split is used in this notebook
    # T5 needs a higher learning rate than typical BERT-style fine-tuning when
    # trained with AdamW; the T5 documentation recommends 1e-4 to 3e-4.
    learning_rate=3e-4,
    per_device_train_batch_size=2,  # Small batch size for Colab
    num_train_epochs=5,  # More epochs since dataset is small
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=2,
    logging_steps=10,
    # Disable external logging integrations (e.g. Weights & Biases) so the
    # training cell never blocks on an interactive login prompt.
    report_to="none",
)

In [None]:
# Fine-tune the model on the tokenized dataset, then persist the result.
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_datasets)
trainer.train()

# Write the model weights and the tokenizer files to the same Drive folder so
# the folder can be loaded later with from_pretrained.
save_dir = "/content/drive/MyDrive/t5_error_explainer"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

In [None]:
# Test the Fine-Tuned Model
input_text = "explain error: IndentationError: unexpected indent in Python"

# Training leaves the model in train mode; switch to eval mode so dropout is
# disabled while generating.
model.eval()

# The Trainer may have moved the model to a GPU, so the input tensor has to be
# placed on the model's device or generate() fails with a device mismatch.
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(input_ids, max_length=1024, num_beams=4, early_stopping=True)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded)

# Print a copy-pasteable snippet showing how to reload the saved model and
# run it on a new error message.
usage_lines = (
    "\nTo use this model in your code:",
    "from transformers import T5Tokenizer, T5ForConditionalGeneration",
    "tokenizer = T5Tokenizer.from_pretrained('/content/drive/MyDrive/t5_error_explainer')",
    "model = T5ForConditionalGeneration.from_pretrained('/content/drive/MyDrive/t5_error_explainer')",
    "input_text = 'explain error: <your error message>'",
    "input_ids = tokenizer.encode(input_text, return_tensors='pt')",
    "outputs = model.generate(input_ids, max_length=1024, num_beams=4, early_stopping=True)",
    "explanation = tokenizer.decode(outputs[0], skip_special_tokens=True)",
)
print("\n".join(usage_lines))