# 🔥 Firish T5-Small Fine-tuning - Simplified

Training T5-small on authentic Firish translation patterns for code-switching between English, French, and Irish.

In [None]:
# Install required packages
!pip install transformers datasets torch accelerate -q

import json
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [None]:
# Load training data
def _load_examples(path):
    """Return the list stored under the top-level 'data' key of a JSON file.

    The file is read as UTF-8 explicitly so accented French/Irish characters
    decode the same way regardless of the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)['data']


train_data = _load_examples('/kaggle/input/firish-training/firish_train.json')
val_data = _load_examples('/kaggle/input/firish-training/firish_val.json')

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")

In [None]:
# Fetch the pretrained checkpoint: tokenizer first, then the seq2seq model,
# both resolved from the same checkpoint name.
model_name = "t5-small"
tokenizer, model = (
    T5Tokenizer.from_pretrained(model_name),
    T5ForConditionalGeneration.from_pretrained(model_name),
)

# Report what was loaded and how large the network is.
print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")

In [None]:
# Simple preprocessing function
def preprocess_data(examples):
    """Tokenize a list of translation examples into model-ready features.

    Args:
        examples: Iterable of dicts, each providing an 'input_text' and a
            'target_text' string.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the tokenized targets. Padding positions in the labels are
        set to -100 so they are ignored by the loss.
    """
    inputs = [item['input_text'] for item in examples]
    targets = [item['target_text'] for item in examples]

    # Tokenize inputs and targets separately
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)
    labels = tokenizer(targets, max_length=128, truncation=True, padding=True)

    # Padded label positions must not contribute to the training loss.
    # The loss ignores the index -100, so swap every pad token id for it;
    # otherwise the model is also trained to emit padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits (train first, then validation)
train_inputs, val_inputs = (
    preprocess_data(split) for split in (train_data, val_data)
)

# Wrap the tokenized features in Dataset objects for the Trainer
train_dataset, val_dataset = (
    Dataset.from_dict(features) for features in (train_inputs, val_inputs)
)

print("Data preprocessing complete")

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./firish-t5-results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_steps=5,
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs a checkpoint for every evaluation it compares.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only sessions instead of failing at start-up.
    fp16=torch.cuda.is_available(),
)

# Create trainer
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favor of
# `processing_class=` — confirm against the installed version before changing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

print("Trainer initialized")

In [None]:
# Start training
# Runs the fine-tuning loop on the `trainer` configured earlier in the
# notebook; the two prints simply bracket the run so progress is visible.
print("🚀 Starting training...")
trainer.train()
print("✅ Training complete!")

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded as a single checkpoint.
final_dir = "./firish-t5-final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

print("💾 Model saved!")

# Test the model
test_input = "translate to firish [family, planning, medium]: We need to buy groceries"

# Switch off dropout so the sample output is deterministic for this input.
model.eval()

# After training the model may live on the GPU while freshly tokenized
# tensors are created on the CPU; move them to the model's device so
# generate() does not fail with a device mismatch.
inputs = tokenizer(test_input, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=50)
result = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"\n🧪 Test translation:")
print(f"Input: {test_input}")
print(f"Output: {result}")