In [5]:
# Install dependencies
!pip install transformers datasets accelerate torch torchvision --quiet

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -----------------------------
# Step 1: Load and preprocess dataset
# -----------------------------
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")  # Use a smaller dataset

tokenizer = T5Tokenizer.from_pretrained("t5-small")  # Use a smaller model

def preprocess_function(examples):
    # Tokenize text with padding and truncation
    tokenized = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)  # Reduced max_length
    # For T5, labels = input_ids in language modeling tasks
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

train_data = dataset['train'].map(preprocess_function, batched=True)
val_data = dataset['validation'].map(preprocess_function, batched=True)

# Convert to PyTorch tensors
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# -----------------------------
# Step 2: Load T5 Model
# -----------------------------
model = T5ForConditionalGeneration.from_pretrained("t5-small")  # Use a smaller model
model.to(device)

# -----------------------------
# Step 3: Training Setup
# -----------------------------
batch_size = 4  # Reduce batch size
epochs = 1  # Reduce epochs for faster training

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=5e-5)

# -----------------------------
# Step 4: Training Loop
# -----------------------------
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (step + 1) % 100 == 0:
            print(f"Epoch {epoch+1} | Step {step+1}/{len(train_dataloader)} | Loss: {total_loss / (step+1):.4f}")

    # Validation after each epoch
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

    print(f"Epoch {epoch+1} completed. Validation Loss: {val_loss / len(val_dataloader):.4f}")

# -----------------------------
# Step 5: Save model and tokenizer
# -----------------------------
model.save_pretrained("./t5_simplified_model")
tokenizer.save_pretrained("./t5_simplified_tokenizer")

print("Model and tokenizer saved successfully!")

# -----------------------------
# Step 6: Text Generation Example
# -----------------------------
prompt = "The future of AI is"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(inputs['input_ids'], max_length=50, num_beams=3, early_stopping=True)  # Reduce generation length
print("Generated Text:\n", tokenizer.decode(outputs[0], skip_special_tokens=True))


Using device: cuda


Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Epoch 1 | Step 100/9180 | Loss: 3.3883
Epoch 1 | Step 200/9180 | Loss: 2.1062
Epoch 1 | Step 300/9180 | Loss: 1.5684
Epoch 1 | Step 400/9180 | Loss: 1.2626
Epoch 1 | Step 500/9180 | Loss: 1.0619
Epoch 1 | Step 600/9180 | Loss: 0.9238
Epoch 1 | Step 700/9180 | Loss: 0.8212
Epoch 1 | Step 800/9180 | Loss: 0.7414
Epoch 1 | Step 900/9180 | Loss: 0.6776
Epoch 1 | Step 1000/9180 | Loss: 0.6256
Epoch 1 | Step 1100/9180 | Loss: 0.5813
Epoch 1 | Step 1200/9180 | Loss: 0.5437
Epoch 1 | Step 1300/9180 | Loss: 0.5108
Epoch 1 | Step 1400/9180 | Loss: 0.4813
Epoch 1 | Step 1500/9180 | Loss: 0.4554
Epoch 1 | Step 1600/9180 | Loss: 0.4323
Epoch 1 | Step 1700/9180 | Loss: 0.4113
Epoch 1 | Step 1800/9180 | Loss: 0.3920
Epoch 1 | Step 1900/9180 | Loss: 0.3744
Epoch 1 | Step 2000/9180 | Loss: 0.3581
Epoch 1 | Step 2100/9180 | Loss: 0.3430
Epoch 1 | Step 2200/9180 | Loss: 0.3296
Epoch 1 | Step 2300/9180 | Loss: 0.3167
Epoch 1 | Step 2400/9180 | Loss: 0.3046
Epoch 1 | Step 2500/9180 | Loss: 0.2934
Epoch 1 |