In [1]:
import torch
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,AutoModelForSeq2SeqLM,T5TokenizerFast,T5ForConditionalGeneration,DataCollatorForSeq2Seq,Seq2SeqTrainer,Seq2SeqTrainingArguments
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback


KeyboardInterrupt: 

In [None]:
# Attach Google Drive to this Colab runtime; the dataset CSV loaded later
# in the notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Location of the headline corpus on the mounted Drive.
DATA_CSV = "/content/drive/MyDrive/Headline Generator Dataset/headline_corpus.csv"

# Fixed seed so the train/validation split is identical on every
# Restart & Run All; without it the split is reshuffled on each run and
# validation numbers from different runs are not comparable.
SPLIT_SEED = 42

# The loaded rows are exposed under the "train" key.
raw_dataset = load_dataset("csv", data_files=DATA_CSV)

# Hold out 10% of the rows for validation.
dataset = raw_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_ds = dataset["train"]
val_ds   = dataset["test"]


In [None]:
# Checkpoint name shared by the tokenizer and the model so the two always match.
# NOTE(review): the export directory used at the end of the notebook is named
# "burmese_headline_model". The stock t5-small vocabulary presumably does not
# cover Burmese script (such text would mostly tokenize to <unk>) - confirm,
# and consider a multilingual checkpoint if so.
MODEL_NAME = "t5-small"

# Tokenizer is loaded first, then the model, both from the same checkpoint.
tokenizer, model = (
    T5TokenizerFast.from_pretrained(MODEL_NAME),
    T5ForConditionalGeneration.from_pretrained(MODEL_NAME),
)


In [None]:
MAX_INPUT  = 256   # token cap for the prefixed "text" column (model input)
MAX_TARGET = 32    # token cap for the "headline" column (labels)

def preprocess(batch):
    """Tokenize one batch of rows into model inputs plus loss-masked labels.

    Args:
        batch: mapping of column name -> list of values, as supplied by
            ``Dataset.map(..., batched=True)``. The "text" and "headline"
            columns are read.

    Returns:
        The tokenizer output for the "headline: "-prefixed "text" column,
        padded/truncated to MAX_INPUT tokens, with an added "labels" entry:
        the "headline" token ids padded/truncated to MAX_TARGET tokens, where
        every padding id has been replaced by -100.
    """
    sources = ["headline: " + text for text in batch["text"]]

    model_inputs = tokenizer(
        sources,
        max_length=MAX_INPUT,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. It is a separate call (not combined with the source call)
    # because the targets use a different max_length than the sources.
    targets = tokenizer(
        text_target=batch["headline"],
        max_length=MAX_TARGET,
        truncation=True,
        padding="max_length",
    )

    # Mask padding positions with -100 so they do not contribute to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in targets["input_ids"]
    ]
    return model_inputs

# Tokenize both splits the same way, dropping every original column so only
# the fields returned by preprocess() remain. Train is processed before validation.
train_tok, val_tok = (
    split.map(preprocess, batched=True, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)


In [None]:
# Collator handed to the trainer to assemble tokenized examples into batches;
# it is given the model in addition to the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
# fp16 mixed precision needs a CUDA device; a hard-coded True makes this cell
# raise on a CPU-only runtime, while the inference cell later in the notebook
# explicitly supports a CPU fallback. Gate it on CUDA being available instead.
# NOTE(review): T5 checkpoints have a reputation for fp16 overflow; if the
# loss ever turns NaN, try bf16 or full precision - confirm on your hardware.
use_fp16 = torch.cuda.is_available()

args = Seq2SeqTrainingArguments(
    output_dir="./headline_model",
    eval_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,   # effective train batch = 4 * 4 per device
    learning_rate=3e-4,
    num_train_epochs=25,             # upper bound; early stopping is configured on the trainer
    fp16=use_fp16,
    logging_steps=100,
    eval_steps=500,
    save_steps=500,                  # kept equal to eval_steps so each evaluation has a checkpoint
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,         # lower eval_loss is better
    remove_unused_columns=False,
)


# Build the trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword is deprecated for Seq2SeqTrainer and emits a
# FutureWarning on the transformers version this notebook runs on.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    processing_class=tokenizer,
    data_collator=data_collator,
    # Stop once the tracked metric has not improved for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Release cached GPU memory left over from earlier cells before training starts.
torch.cuda.empty_cache()
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
500,0.273,0.267971
1000,0.2571,0.261208
1500,0.2532,0.263128
2000,0.2479,0.266399
2500,0.2538,0.258339
3000,0.2431,0.259823
3500,0.2461,0.260427
4000,0.2423,0.260313


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4000, training_loss=0.25858783769607546, metrics={'train_runtime': 1387.1025, 'train_samples_per_second': 274.673, 'train_steps_per_second': 17.176, 'total_flos': 4328772178280448.0, 'train_loss': 0.25858783769607546, 'epoch': 4.197375328083989})

In [None]:
!pip install evaluate rouge-score


In [None]:
import evaluate
import numpy as np
import torch

rouge = evaluate.load("rouge")

# Generation length cap (same value as before, now named instead of inline).
GEN_MAX_LENGTH = 32


def whitespace_tokens(text):
    """Split text on whitespace for ROUGE scoring.

    The default ROUGE tokenizer lowercases and keeps only ASCII letters and
    digits, so text written in any other script tokenizes to nothing and every
    score collapses to 0. Splitting on whitespace keeps such text intact.
    NOTE(review): Burmese is not consistently space-delimited, so a dedicated
    word/syllable segmenter may give more meaningful scores - confirm which
    granularity is wanted.
    """
    return text.split()


model.eval()

preds = []
labels = []

pad_id = tokenizer.pad_token_id

# One example at a time, exactly as stored in val_tok; gradients are never needed here.
with torch.no_grad():
    for example in val_tok:
        # Add a leading batch dimension of 1 and move to the model's device.
        input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(model.device)
        attention_mask = torch.tensor(example["attention_mask"]).unsqueeze(0).to(model.device)

        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=GEN_MAX_LENGTH,
        )
        preds.append(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

        # Labels carry -100 at masked positions; map those back to the pad id
        # so the sequence can be decoded.
        label_ids = [pad_id if token_id == -100 else token_id for token_id in example["labels"]]
        labels.append(tokenizer.decode(label_ids, skip_special_tokens=True))

results = rouge.compute(
    predictions=preds,
    references=labels,
    tokenizer=whitespace_tokens,
)
print(results)


In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Make sure dropout is off even when this cell is run without the evaluation
# cell; generate() does not switch the model to eval mode on its own.
model.eval()

# Placeholder article text (Burmese, roughly "put the full news text here...").
# The literal was stored as mis-decoded UTF-8; it is restored to the intended characters.
text = "သတင်းအပြည့်အစုံကို ဒီမှာထည့်ပါ..."

# Same "headline: " prefix and the same input length cap as in preprocess(),
# so inference sees inputs shaped like the training data.
inputs = tokenizer(
    "headline: " + text,
    return_tensors="pt",
    truncation=True,
    max_length=MAX_INPUT,
).to(device)

out = model.generate(
    **inputs,
    max_length=64,
    num_beams=4,
    early_stopping=True,
)

print("Generated headline:")
print(tokenizer.decode(out[0], skip_special_tokens=True))


In [None]:
# Write the model (via the trainer) and the tokenizer into the same directory.
# NOTE(review): this path is outside the mounted Drive folder, so it presumably
# does not outlive the Colab runtime - confirm whether that is intended.
EXPORT_DIR = "/content/burmese_headline_model"

trainer.save_model(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)
