### Install & Imports:

In [11]:
!pip install -q transformers datasets accelerate rouge_score

import torch
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üî• Using: {device}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


üî• Using: cuda


### Load & Prepare Data:

In [13]:
# 1. Load Dataset
print("‚è≥ Loading Grammarly CoEdIT dataset...")
dataset = load_dataset("grammarly/coedit")

# 2. Broader Filter (Accept Grammar & Style fixes)
# We want anything that improves the text, not just "formal"
print("‚è≥ Filtering for relevant tasks...")
relevant_dataset = dataset['train'].filter(
    lambda x: any(keyword in x['src'].lower() for keyword in ["formal", "grammar", "improve", "fix", "rewrite"])
)

# 3. Remove "Lazy" Examples (Identity Mappings)
def is_significant_change(example):
    # Skip if input and output are identical or too similar
    if example['src'] == example['tgt']: return False
    if abs(len(example['src']) - len(example['tgt'])) < 2: return False
    return True

print("‚è≥ Cleaning lazy examples...")
filtered_dataset = relevant_dataset.filter(is_significant_change)

count = len(filtered_dataset)
print(f"‚úÖ Examples ready for training: {count}")

# 4. Smart Selection (Never crash again)
# We take 5,000 examples OR the total count, whichever is smaller
select_count = min(count, 5000)
split_data = filtered_dataset.shuffle(seed=42).select(range(select_count)).train_test_split(test_size=0.1)

print("\n--- SAMPLE INPUT ---")
print(split_data['train'][0]['src'])
print("\n--- SAMPLE TARGET ---")
print(split_data['train'][0]['tgt'])

‚è≥ Loading Grammarly CoEdIT dataset...
‚è≥ Filtering for relevant tasks...


Filter:   0%|          | 0/69071 [00:00<?, ? examples/s]

‚è≥ Cleaning lazy examples...


Filter:   0%|          | 0/30565 [00:00<?, ? examples/s]

‚úÖ Examples ready for training: 30451

--- SAMPLE INPUT ---
Fix grammar errors: But if you don't know the Potter very much, you might not understand it.

--- SAMPLE TARGET ---
But if you don't know Harry Potter very much, you might not understand it.


### Tokenization:

In [17]:
from transformers import AutoTokenizer

# We use Flan-T5-Base because it's smarter at following instructions
model_checkpoint = "google/flan-t5-base" 
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    # The 'src' column already has instructions (e.g., "Fix grammar: ...")
    inputs = examples["src"]
    targets = examples["tgt"]
    
    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    
    # Tokenize targets
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print("‚è≥ Tokenizing dataset...")
tokenized_datasets = split_data.map(preprocess_function, batched=True)
print("‚úÖ Data ready.")

‚è≥ Tokenizing dataset...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

‚úÖ Data ready.


### The Training Loop:

In [18]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/professionalizer-flan",
    eval_strategy="epoch",
    learning_rate=3e-4,   # Slightly higher learning rate for Flan models
    per_device_train_batch_size=8, # 8 is safe for T4 GPU
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=8,   # 8 epochs ensures it really learns the style
    fp16=True,            # Faster training
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    tokenizer=tokenizer,
)

print("üöÄ Starting Training (Flan-T5-Base)...")
trainer.train()

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


üöÄ Starting Training (Flan-T5-Base)...




Epoch,Training Loss,Validation Loss
1,No log,0.474482
2,0.484000,0.485988
3,0.484000,0.512646
4,0.257100,0.560032
5,0.257100,0.611605
6,0.154200,0.680361
7,0.154200,0.73001
8,0.099400,0.76842




TrainOutput(global_step=2256, training_loss=0.22940417891698525, metrics={'train_runtime': 1335.7987, 'train_samples_per_second': 26.95, 'train_steps_per_second': 1.689, 'total_flos': 3103528958300160.0, 'train_loss': 0.22940417891698525, 'epoch': 8.0})

In [None]:
from transformers import pipeline
import shutil

# 1. Test the model immediately
print("--- üß™ TESTING YOUR MODEL ---")
my_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=0)

test_inputs = [
    "Make this formal: yo bro where is the report?",
    "Fix grammar: i dont like this code its bad.",
    "Make this formal: i ain't doing that lol."
]

for text in test_inputs:
    result = my_pipeline(text, max_length=60)
    print(f"\nInput:  {text}")
    print(f"Output: {result[0]['generated_text']}")

# 2. Save & Zip for Download
print("\n--- üì¶ ZIPPING FOR DOWNLOAD ---")
final_path = "/kaggle/working/my_professionalizer"
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)

# Create zip file
shutil.make_archive("/kaggle/working/professionalizer_pack", 'zip', final_path)
print("‚úÖ Done! Go to the 'Output' tab on the right sidebar to download 'professionalizer_pack.zip'.")

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


--- üß™ TESTING YOUR MODEL ---


Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Input:  Make this formal: yo bro where is the report?
Output: Where is the report?


Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Input:  Fix grammar: i dont like this code its bad.
Output: I don't like this code because it's bad.

Input:  Make this formal: i ain't doing that lol.
Output: I'm not doing that, lol.

--- üì¶ ZIPPING FOR DOWNLOAD ---
