In [1]:
import pandas as pd
from datasets import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Trainer,
    TrainingArguments,
)


  from .autonotebook import tqdm as notebook_tqdm
2025-06-23 10:21:30.147135: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750666890.163418   83979 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750666890.167846   83979 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750666890.179865   83979 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750666890.179880   83979 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750666890.179882   83979

In [2]:
import torch
# Ask PyTorch's CUDA caching allocator to release cached-but-unused GPU memory
# before the model is loaded and trained in the following cells.
torch.cuda.empty_cache()


In [4]:
# STEP 1: Load the parallel corpus and split it
df = pd.read_csv("data/final.csv")  # Replace with your actual CSV path

# `rename` silently ignores columns that do not exist, which would only surface
# later as an opaque KeyError inside `dataset.map`. Fail early instead.
missing_columns = {"t", "og"} - set(df.columns)
if missing_columns:
    raise KeyError(f"CSV is missing expected column(s): {sorted(missing_columns)}")

# "t" holds the source sentences and "og" the target sentences.
df = df.rename(columns={"t": "input_text", "og": "target_text"})

dataset = Dataset.from_pandas(df)
# A fixed seed keeps the shuffled train/test partition identical across
# Restart & Run All, so training runs are comparable.
# NOTE(review): half of the rows are held out here although evaluation is
# disabled further down — consider a smaller test_size to train on more data.
dataset = dataset.train_test_split(test_size=0.5, seed=42)

# STEP 2: Load tokenizer and preprocess
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess(batch):
    """Tokenize one batch of sentence pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with the keys "input_text" and "target_text", each a
            list of strings (the layout `Dataset.map(..., batched=True)`
            passes in).

    Returns:
        The tokenizer output for the prefixed inputs, extended with a
        "labels" entry holding the target token ids in which every padding
        id has been replaced by -100.
    """
    input_texts = ["translate English to Shakespeare: " + text for text in batch["input_text"]]

    model_inputs = tokenizer(input_texts, max_length=64, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager and tokenizes the targets in a single call.
    labels = tokenizer(
        text_target=batch["target_text"],
        max_length=64,
        truncation=True,
        padding="max_length",
    )

    # -100 is the label value the loss skips, so padded positions do not
    # contribute to training.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both splits in batches.
tokenized_dataset = dataset.map(preprocess, batched=True)

# STEP 3: Load model
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.gradient_checkpointing_enable()

# STEP 4: Freeze all layers except lm_head and final encoder/decoder blocks
# A parameter stays trainable only when its name contains one of the markers
# below; every other parameter is frozen in the same pass.
for param_name, weight in model.named_parameters():
    weight.requires_grad = any(
        marker in param_name
        for marker in ("encoder.block.5", "decoder.block.5", "lm_head", "shared")
    )

# STEP 5: Training arguments (OOM-safe)
training_args = TrainingArguments(
    # Where outputs and logs are written.
    output_dir="./t5-shakespeare",
    logging_dir="./logs",
    report_to="none",
    # Small per-device batches; 2 x 4 accumulation steps = 8 samples per optimizer step.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=1,
    num_train_epochs=10,
    fp16=False,
    # Neither evaluate nor checkpoint during training.
    eval_strategy="no",  # Disable eval to save memory
    save_strategy="no",  # No checkpointing
)

# STEP 6: Trainer (omit eval and metrics for now)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # `tokenizer=` is deprecated in Trainer.__init__ (it emits a FutureWarning
    # and is slated for removal); `processing_class=` is its replacement.
    processing_class=tokenizer,
    # eval_dataset=tokenized_dataset["test"],     # ← DISABLED
    # compute_metrics=compute_metrics             # ← DISABLED
)

# STEP 7: Train
trainer.train()

# STEP 8: Save final model
# Model weights and tokenizer files go to the same directory so that it can
# be reloaded later with `from_pretrained("./t5-shakespeare")`.
trainer.save_model("./t5-shakespeare")
tokenizer.save_pretrained("./t5-shakespeare")



Map: 100%|██████████| 25893/25893 [00:11<00:00, 2200.83 examples/s]
Map: 100%|██████████| 25894/25894 [00:10<00:00, 2362.97 examples/s]
  trainer = Trainer(


Step,Training Loss
500,3.6594
1000,3.3612
1500,3.2795
2000,3.2001
2500,3.2099
3000,3.1529
3500,3.1396
4000,3.1251
4500,3.0748
5000,3.0709


('./t5-shakespeare/tokenizer_config.json',
 './t5-shakespeare/special_tokens_map.json',
 './t5-shakespeare/spiece.model',
 './t5-shakespeare/added_tokens.json')

In [5]:
def translate_to_shakespeare(text: str, max_length: int = 50) -> str:
    """Generate a translation of `text` with the fine-tuned model.

    Uses the notebook-level `model` and `tokenizer`. The model is moved to
    the GPU when CUDA is available, otherwise to the CPU, and switched to
    eval mode before generating.

    Args:
        text: Sentence to translate; the same task prefix used during
            training is prepended.
        max_length: Upper bound passed to `model.generate` as `max_length`.
            Defaults to 50, the value this function previously hard-coded.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Ensure model is on the correct device
    model.eval()

    prompt = "translate English to Shakespeare: " + text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test it: print the model's output for a handful of sample prompts, in order.
for sample in (
    "Where are you going?",
    "Baby dont hurt me dont hurt me no more",
    "I love you brother",
    "I'm writing an essay on how to throw a chick off the mountain",
):
    print(translate_to_shakespeare(sample))

Where are you going?
Is my baby hurt, I am not hurt more
I love you brother
I write a essay on how to throw a chick off the mountain
