In [None]:
# Install libraries
!pip install transformers datasets nltk --quiet
import nltk
# Fetch the "punkt" tokenizer data. Nothing else in this cell calls nltk after
# this point -- presumably a later cell needs it; confirm before removing.
nltk.download("punkt")

# Import modules
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset
import torch

# Summarization checkpoint: one name drives both the tokenizer and the seq2seq weights.
summarizer_model = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(summarizer_model)
model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_model)

# Translation pipeline (English → selected target language).
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-en-mul")

# CNN/DailyMail 3.0.0, restricted to the first 3% of the test split.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0", split="test[:3%]")

# Summarization
def summarize(text, max_len=150, min_len=40, num_beams=4):
    """Return a beam-search summary of ``text`` from the module-level ``model``.

    Args:
        text: Article text; it is prefixed with ``"summarize: "`` before
            tokenization and truncated to 512 tokens.
        max_len: Forwarded to ``model.generate`` as ``max_length``.
        min_len: Forwarded to ``model.generate`` as ``min_length``.
        num_beams: Beam width forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    prompt = "summarize: " + text
    encoded = tokenizer(prompt, truncation=True, max_length=512, return_tensors="pt")

    generation_options = {
        "max_length": max_len,
        "min_length": min_len,
        "length_penalty": 2.0,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    sequences = model.generate(encoded["input_ids"], **generation_options)

    best_sequence = sequences[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Hook generator
def generate_hook(summary):
    """Return a short headline-style string generated from ``summary``.

    The summary is wrapped in a "Generate a catchy headline for: ..." prompt,
    truncated to 512 tokens, and run through the same module-level
    ``tokenizer``/``model`` pair used for summarization, with the output
    constrained to 5-20 tokens.

    NOTE(review): the sample output captured in this notebook shows the hook
    repeating a fragment of the summary rather than a new headline -- the
    checkpoint may not understand this prompt; confirm before relying on it.
    """
    headline_prompt = "Generate a catchy headline for: {}".format(summary)
    encoded = tokenizer(headline_prompt, truncation=True, max_length=512, return_tensors="pt")

    generation_options = {
        "max_length": 20,
        "min_length": 5,
        "num_beams": 5,
        "length_penalty": 1.5,
        "early_stopping": True,
    }
    sequences = model.generate(encoded["input_ids"], **generation_options)

    best_sequence = sequences[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Translate summary/hook to selected language
def translate_text(text, target_lang="hi"):
    """Translate English ``text`` into ``target_lang`` using ``translator``.

    The multilingual OPUS-MT (Marian) checkpoint loaded as ``translator``
    selects its output language from a sentence-initial ``>>id<<`` token in
    the source text. The pipeline's ``tgt_lang`` argument is not applied to
    Marian tokenizers, so passing it left the target language unspecified
    (the captured "hi" output in this notebook is not Hindi). This function
    therefore prepends the token itself.

    Args:
        text: English text to translate.
        target_lang: Target language code. Common two-letter codes ("hi",
            "fr", "es", "de", ...) are mapped to the three-letter ids the
            model uses; any other value is passed through unchanged so a
            model-native id (e.g. "hin") can be supplied directly.

    Returns:
        The translated string, or ``text`` unchanged when it is empty or
        whitespace-only.
    """
    # Two-letter codes -> target ids from the model card's language list.
    # Verify against the model card before extending this table.
    model_lang_ids = {
        "hi": "hin",
        "fr": "fra",
        "es": "spa",
        "de": "deu",
        "it": "ita",
        "pt": "por",
        "ru": "rus",
        "nl": "nld",
        "ar": "ara",
        "ja": "jpn",
        "bn": "ben",
        "ur": "urd",
    }

    # Nothing to translate; avoid feeding the model a bare language token.
    if not text or not text.strip():
        return text

    lang_id = model_lang_ids.get(target_lang, target_lang)
    tagged_text = f">>{lang_id}<< {text}"
    return translator(tagged_text)[0]["translation_text"]

# Process
# How many articles go through the pipeline, and how many of those are printed.
NUM_ARTICLES = 5
NUM_SAMPLES_SHOWN = 3

# Slice the rows once and read both columns from that slice, rather than
# selecting each whole column and slicing it afterwards.
sample_rows = dataset[:NUM_ARTICLES]
articles = sample_rows["article"]
actual_summaries = sample_rows["highlights"]
generated_summaries = [summarize(article) for article in articles]
generated_hooks = [generate_hook(summary) for summary in generated_summaries]

# Set target language for translation
target_lang_code = "hi"  # Change to "fr", "es", "de", etc.

# Translate results
translated_summaries = [translate_text(summary, target_lang_code) for summary in generated_summaries]
translated_hooks = [translate_text(hook, target_lang_code) for hook in generated_hooks]

# Show samples
# zip() stops at the shortest list, so this cannot raise IndexError when fewer
# than NUM_SAMPLES_SHOWN articles were processed.
result_columns = (
    articles,
    generated_summaries,
    generated_hooks,
    translated_summaries,
    translated_hooks,
    actual_summaries,
)
sample_records = zip(*(column[:NUM_SAMPLES_SHOWN] for column in result_columns))
for number, record in enumerate(sample_records, start=1):
    article, summary, hook, translated_summary, translated_hook, reference = record
    print(f"\nArticle #{number} (truncated):\n{article[:300]}...")
    print(f"\nGenerated Summary:\n{summary}")
    print(f"Hook Headline:\n {hook}")
    print(f"Translated Summary ({target_lang_code}):\n{translated_summary}")
    print(f"Translated Hook ({target_lang_code}):\n{translated_hook}")
    print(f"\nActual Summary:\n{reference}")
    print("="*100)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/310M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/790k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use cpu


README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]


Article #1 (truncated):
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the cou...

Generated Summary:
the palestinians signed the ICC's founding Rome Statute in January. the ICC also accepted its jurisdiction over alleged crimes committed in the occupied Palestinian territory. the ICC opened a preliminary examination into the situation in the occupied territories.
Hook Headline:
 's founding Rome Statute in January. the ICC accepted its jurisdiction over alleged
Translated Summary (hi):
Palestinians firmant ICC's fondator the ICC Roma Statute in January. ICC acceptaded saw jurisdiction over supositions crimes committed in the ocupate Palestina territory. ICC aberte preliminari examination in the situation in the ocupate territory.
Translated H