## 1. Setup and Installation

In [1]:
# Install required packages
!pip install -q transformers datasets sentence-transformers accelerate evaluate rouge-score sacrebleu

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from sentence_transformers import SentenceTransformer
import evaluate
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: Tesla T4


## 2. Prepare Burmese Dataset

You'll need a dataset with Burmese articles and headlines. Here are some options:
- Use existing Burmese news datasets
- Load from CSV/JSON files
- Scrape Burmese news websites (with permission)

Expected format: `{'article': 'article text...', 'headline': 'headline text...'}`

In [5]:
df = pd.read_csv('/content/drive/MyDrive/NLP Project/Headline Generator Dataset/headline_corpus.csv')

In [6]:
# Split data into train/validation/test
from sklearn.model_selection import train_test_split

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Convert to Hugging Face Dataset
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'validation': Dataset.from_pandas(val_df),
    'test': Dataset.from_pandas(test_df)
})

print(dataset)

Train: 13547, Val: 1693, Test: 1694
DatasetDict({
    train: Dataset({
        features: ['id', 'headline', 'text', '__index_level_0__'],
        num_rows: 13547
    })
    validation: Dataset({
        features: ['id', 'headline', 'text', '__index_level_0__'],
        num_rows: 1693
    })
    test: Dataset({
        features: ['id', 'headline', 'text', '__index_level_0__'],
        num_rows: 1694
    })
})


In [7]:
print(df.head(3))

   id                                           headline  \
0   1  မော်လ်တာကမ်းလွန်၌ လှေမှောက်မှု ရွှေ့ပြောင်းနေထ...   
1   2  ၁၀ နှစ်ကြာ လုံခြုံရေးပူးပေါင်းဆောင်ရွက်မှု သဘေ...   
2   3  စစ်ပွဲလွန်ဂါဇာ၏ လုံခြုံရေးနှင့်အရပ်ဘက်ရေးရာမျာ...   

                                                text  
0  မော်လ်တာကမ်းလွန်မှာ တိမ်းမှောက်သွားတဲ့လှေကို ဖ...  
1  ဒိန်းမတ်နိုင်ငံဟာ ယူကရိန်းနဲ့ ၁၀ နှစ်ကြာ လုံခြ...  
2  စစ်ပွဲလွန်ဂါဇာကမ်းမြောင်ဒေသရဲ့ လုံခြုံရေးနဲ့ အ...  


## 3. Load Model and Tokenizer

We'll use mT5 (multilingual T5) which supports Burmese well for seq2seq tasks.

In [8]:
# Use mT5 for better multilingual support including Burmese
model_name = "google/mt5-small"  # Options: mt5-small, mt5-base, mt5-large

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print(f"Model loaded: {model_name}")
print(f"Model parameters: {model.num_parameters():,}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/192 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded: google/mt5-small
Model parameters: 556,291,456


## 4. Preprocess Data

In [9]:
max_input_length = 256
max_target_length = 64

def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["text"]]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        padding=False,
    )

    labels = tokenizer(
        text_target=examples["headline"],
        max_length=max_target_length,
        truncation=True,
        padding=False,
    )

    #FIX: replace pad tokens with -100
    labels_ids = labels["input_ids"]
    labels_ids = [
        [(lid if lid != tokenizer.pad_token_id else -100) for lid in label]
        for label in labels_ids
    ]

    model_inputs["labels"] = labels_ids
    return model_inputs

# Apply preprocessing
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Tokenization complete!")
print(tokenized_dataset)

Map:   0%|          | 0/13547 [00:00<?, ? examples/s]

Map:   0%|          | 0/1693 [00:00<?, ? examples/s]

Map:   0%|          | 0/1694 [00:00<?, ? examples/s]

Tokenization complete!
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 13547
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1693
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1694
    })
})


In [10]:
sample = tokenized_dataset["train"][0]
print(sample["labels"][:20])


[259, 24122, 92894, 98238, 264, 3805, 121528, 97177, 98139, 95933, 259, 161086, 155726, 158580, 136137, 69855, 1]


In [11]:
import evaluate

# Load the ROUGE metric
rouge = evaluate.load("rouge")

# Ensure nltk is ready for the sentence splitter you're using
import nltk
nltk.download('punkt', quiet=True)

Downloading builder script: 0.00B [00:00, ?B/s]

True

## 5. Training Setup

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 1. CHARACTER-LEVEL SPLITTING: Force spaces between characters for ROUGE
    # This ensures ROUGE sees "မောင်မောင်" as ၄ individual units
    preds_for_rouge = [" ".join(list(p.replace(" ", ""))) for p in decoded_preds]
    labels_for_rouge = [" ".join(list(l.replace(" ", ""))) for l in decoded_labels]

    result = rouge.compute(
        predictions=preds_for_rouge,
        references=labels_for_rouge,
        use_stemmer=False # Stemming is for English; disable for Burmese
    )

    result = {key: value * 100 for key, value in result.items()}

    # 2. Meaningful Gen Len: Count actual Burmese characters
    prediction_lens = [len(pred.replace(" ", "")) for pred in decoded_preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 2) for k, v in result.items()}

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./burmese-headline-model",
    eval_strategy="epoch",
    learning_rate=1e-4,           # Lowered from 3e-4 for stability
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=5,
    predict_with_generate=True,
    generation_max_length=64,
    generation_num_beams=4,
    fp16=False,                   # Set to False to prevent NaN loss
    label_smoothing_factor=0.1,   # Helps with small datasets
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    report_to="none",
)

In [15]:
# Data collator with proper padding
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8 if torch.cuda.is_available() else None
)

In [16]:
# Remove the old lines and use this instead:
model.generation_config.max_length = 64
model.generation_config.forced_bos_token_id = tokenizer.pad_token_id

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [17]:
print(tokenized_dataset["train"][0]["labels"][:20])
print(tokenized_dataset["train"].column_names)
batch = next(iter(trainer.get_train_dataloader()))
print(batch["labels"].shape)
print(batch["labels"][0][:20])


[259, 24122, 92894, 98238, 264, 3805, 121528, 97177, 98139, 95933, 259, 161086, 155726, 158580, 136137, 69855, 1]
['input_ids', 'attention_mask', 'labels']
torch.Size([4, 40])
tensor([   259,   1975,  34847,  48849, 189417, 111489,  67859,    261, 113780,
         27063,  17381,  21987,    259, 187505, 127566,  66924,   9410,  23569,
         34979,  34979], device='cuda:0')


## 6. Train the Model

In [18]:
# Start training
print("Starting training...")
trainer.train()
print("Training complete!")

Starting training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.446217,3.111485,5.91,5.67,5.8,5.86,76.78
2,3.373857,2.969419,5.93,5.67,5.83,5.84,78.23
3,3.014203,2.900302,6.11,5.86,6.01,6.04,79.52


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

KeyboardInterrupt: 

## 7. Evaluate the Model

In [19]:
# Evaluate on test set
eval_results = trainer.evaluate(tokenized_dataset["test"])
print("\nEvaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.446217,3.111485,5.91,5.67,5.8,5.86,76.78
2,3.373857,2.969419,5.93,5.67,5.83,5.84,78.23
3,3.192737,2.934046,7.6,7.39,7.54,7.55,78.77



Evaluation Results:
eval_loss: 2.9340
eval_rouge1: 7.6000
eval_rouge2: 7.3900
eval_rougeL: 7.5400
eval_rougeLsum: 7.5500
eval_gen_len: 78.7700


## 8. Test Headline Generation

In [20]:
def generate_headline(article_text, max_length=64, num_beams=4, temperature=0.7):
    """Generate headline with better parameters"""
    input_text = "summarize: " + article_text
    inputs = tokenizer(
        input_text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            no_repeat_ngram_size=2,
            length_penalty=1.0,
            temperature=temperature
        )

    headline = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return headline

In [22]:
# Test on examples
print("="*80)
print("GENERATION EXAMPLES")
print("="*80)

test_indices = [0, 1, 2] if len(dataset["test"]) >= 3 else range(len(dataset["test"]))

for idx in test_indices:
    example = dataset["test"][idx]
    article = example["text"]
    true_headline = example["headline"]
    generated_headline = generate_headline(article)

    print(f"\nExample {idx + 1}:")
    print("-" * 80)
    print(f"Article: {article[:150]}...")
    print(f"\nTrue:      {true_headline}")
    print(f"Generated: {generated_headline}")
    print("-" * 80)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


GENERATION EXAMPLES

Example 1:
--------------------------------------------------------------------------------
Article: အိန္ဒိယနိုင်ငံရဲ့ ၇၅ နှစ်မြောက် သမ္မတနိုင်ငံနေ့ကို နယူးဒေလီမြို့တော်မှာ ဇန်နဝါရီ ၂၆ ရက်က ကျင်းပခဲ့ပါတယ်။ နိုင်ငံရဲ့ စစ်ရေးစွမ်းရည်၊ အစဉ်အလာနဲ့ ယဉ်ကျေး...

True:      ၇၅ နှစ်မြောက် အိန္ဒိယသမ္မတ နိုင်ငံနေ့ နယူးဒေလီတွင် ကျင်းပ
Generated: အိန္ဒိယသမ္မတနိုင်ငံနေ့ နယူးဒေလီမြို့တော်တွင် ကျင်းပ
--------------------------------------------------------------------------------

Example 2:
--------------------------------------------------------------------------------
Article: မန္တလေးတိုင်းဒေသကြီး မြင်းခြံခရိုင် ၄ မြို့နယ်မှာ   ဇူလိုင် ၃၀ ရက်ထိ  ကိုဗစ်-၁၉ အတည်ပြုလူနာ ၂၉၀၆ ဦးနဲ့ သေဆုံး ၁၀၇ ဦးရှိပြီလို့ ဒေသတွင်း ပရဟိတအသင်းတွေဆ...

True:      မြင်းခြံခရိုင်တွင် ကိုဗစ်အတည်ပြုလူနာ ၂၉၀၀ ကျော်နှင့် သေဆုံးသူ ၁၀၇ ဦးရှိလာ
Generated: မြင်းခြံခရိုင် ၄ မြို့နယ်တွင် ကိုဗစ်-၁၉ အတည်ပြုလူနာ ၂၉၀၆ ဦး၊ သေဆုံးသူ ၁၀၇ ဦးရှိလာ
---------------------------------------------------------------------------

In [23]:
# Interactive testing - Try your own Burmese text
custom_article = """
ရန်ကုန်မြို့ရှိ ဈေးကွက်များတွင် ဒီဇင်ဘာလအတွင်း
စားသောက်ကုန်ဈေးနှုန်းများ သိသိသာသာ မြင့်တက်လာခဲ့ပါသည်။
"""

print("Custom Article:")
print(custom_article)
print("\nGenerated Headline:")
print(generate_headline(custom_article.strip()))

Custom Article:

ရန်ကုန်မြို့ရှိ ဈေးကွက်များတွင် ဒီဇင်ဘာလအတွင်း
စားသောက်ကုန်ဈေးနှုန်းများ သိသိသာသာ မြင့်တက်လာခဲ့ပါသည်။


Generated Headline:
ရန်ကုန်မြို့တွင် စားသောက်ကုန်ဈေးနှုန်းများ သိသိသာသာမြင့်တက်


## 9. Save the Model

In [24]:
# Save model locally
output_dir = "./burmese-headline-model-final"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved to ./burmese-headline-model-final


In [25]:
!cp -r {output_dir} /content/drive/MyDrive/
print("Model copied to Google Drive")


Model copied to Google Drive


## 10. Load Saved Model (for future use)

In [None]:
# Load the saved model
# loaded_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir)
# loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
# print("Model loaded successfully!")

## 11. Export for Production (Optional)

In [None]:
# Optional: Convert to ONNX for faster inference
# !pip install -q optimum[exporters]

# from optimum.onnxruntime import ORTModelForSeq2SeqLM

# ort_model = ORTModelForSeq2SeqLM.from_pretrained(
#     output_dir,
#     export=True
# )
# ort_model.save_pretrained("./burmese-headline-onnx")
# print("ONNX model exported")

## Notes and Tips

### Improving Performance:
1. **More Data**: Collect more Burmese news articles with headlines (1000+ examples recommended)
2. **Larger Model**: Try `google/mt5-base` or `google/mt5-large` for better quality
3. **Data Augmentation**: Back-translation or paraphrasing of existing data
4. **Hyperparameter Tuning**: Adjust learning rate, batch size, num_beams
5. **Preprocessing**: Clean and normalize Burmese text properly

### Model Options:
- `google/mt5-small`: Fast, good for prototyping (~300M params)
- `google/mt5-base`: Better quality (~580M params)
- `google/mt5-large`: Best quality (~1.2B params, requires more GPU)

### Dataset Sources:
- Burmese news websites
- Myanmar Wikipedia articles
- Public Burmese NLP datasets

### GPU Memory Tips:
- Reduce `per_device_train_batch_size` if OOM error
- Use gradient accumulation: `gradient_accumulation_steps=2`
- Enable `fp16=True` for mixed precision training
