<a href="https://colab.research.google.com/github/Harshchoubey55/Multilingual-News-Summarization-and-Headline-Generation-Using-Transformers/blob/main/TextSummarizationUsingTransferLearning(NLP).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install libraries
# The leading `!` hands the line to the shell, so this cell must run under
# IPython/Jupyter/Colab; it is not valid syntax for the plain Python interpreter.
!pip install transformers datasets nltk --quiet
import nltk
# Download the NLTK resource named "punkt".
# NOTE(review): nothing else in the visible code calls nltk — confirm this
# download (and the nltk install) is still needed.
nltk.download("punkt")

# Import modules
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from datasets import load_dataset
import torch

# Summarization checkpoint: one tokenizer/model pair, loaded once at module
# level and shared by every function below that reads `tokenizer` / `model`.
summarizer_model = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(summarizer_model)
model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_model)

# Translation pipeline (English → Selected Language), backed by the
# English-to-multilingual OPUS-MT checkpoint.
translator = pipeline(
    task="translation",
    model="Helsinki-NLP/opus-mt-en-mul",
)

# CNN/DailyMail dataset, config "3.0.0": only the first 3% of the test split
# is requested.
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="test[:3%]",
)

# Summarization
def summarize(text, max_len=150, min_len=40, num_beams=4):
    """Return a generated summary of ``text``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Article text; it is prefixed with ``"summarize: "`` before
            tokenization.
        max_len: ``max_length`` passed to ``model.generate``.
        min_len: ``min_length`` passed to ``model.generate``.
        num_beams: ``num_beams`` passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    prompt = "summarize: " + text
    # Inputs longer than 512 tokens are truncated by the tokenizer.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)

    generation_options = {
        "max_length": max_len,
        "min_length": min_len,
        "length_penalty": 2.0,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    generated = model.generate(encoded["input_ids"], **generation_options)

    # Only the first returned sequence is decoded.
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Hook generator
def generate_hook(summary):
    """Return a short headline-style string generated from ``summary``.

    Uses the module-level ``tokenizer`` and ``model`` with an instruction-style
    prompt, and limits the output to between 5 and 20 tokens.

    NOTE(review): whether the loaded checkpoint actually follows this prompt
    is not established by the code here — verify output quality.

    Args:
        summary: Text to build the headline prompt from.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    headline_prompt = f"Generate a catchy headline for: {summary}"
    encoded = tokenizer(headline_prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = {
        "max_length": 20,
        "min_length": 5,
        "num_beams": 5,
        "length_penalty": 1.5,
        "early_stopping": True,
    }
    generated = model.generate(encoded["input_ids"], **generation_options)

    headline_ids = generated[0]
    return tokenizer.decode(headline_ids, skip_special_tokens=True)

# Translate summary/hook to selected language
def translate_text(text, target_lang="hi"):
    """Translate English ``text`` into ``target_lang`` using the global ``translator``.

    The multilingual Marian checkpoint behind ``translator``
    (``Helsinki-NLP/opus-mt-en-mul``) selects its output language from a
    ``>>id<<`` token placed at the start of the source sentence. The
    pipeline's ``tgt_lang`` keyword does not do this for Marian tokenizers,
    so the token is prepended here explicitly.

    Args:
        text: English text to translate.
        target_lang: Target language. The two-letter codes listed below are
            mapped to the three-letter IDs the model uses; any other value is
            passed through unchanged, so a model-native ID (e.g. ``"hin"``)
            also works.

    Returns:
        The translated string. Blank input is returned unchanged without
        calling the model.
    """
    # Two-letter codes → language IDs used in the model's ``>>id<<`` tokens.
    model_lang_ids = {
        "hi": "hin",  # Hindi
        "fr": "fra",  # French
        "es": "spa",  # Spanish
        "de": "deu",  # German
        "it": "ita",  # Italian
        "pt": "por",  # Portuguese
        "ru": "rus",  # Russian
        "ja": "jpn",  # Japanese
    }

    # Nothing to translate; avoid feeding the model a bare language token.
    if not text or not text.strip():
        return text

    lang_id = model_lang_ids.get(target_lang, target_lang)
    tagged_text = f">>{lang_id}<< {text}"
    translated = translator(tagged_text)[0]['translation_text']
    return translated

# Process
# How many articles to run through the pipeline, and how many of those to print.
NUM_ARTICLES = 5
NUM_SAMPLES_TO_SHOW = 3

# Slice rows first: this reads only the first NUM_ARTICLES records instead of
# materialising each full column and then discarding most of it.
first_rows = dataset[:NUM_ARTICLES]
articles = first_rows["article"]
actual_summaries = first_rows["highlights"]
generated_summaries = [summarize(article) for article in articles]
generated_hooks = [generate_hook(summary) for summary in generated_summaries]

# Set target language for translation
target_lang_code = "hi"  # Change to "fr", "es", "de", etc.

# Translate results
translated_summaries = [translate_text(summary, target_lang_code) for summary in generated_summaries]
translated_hooks = [translate_text(hook, target_lang_code) for hook in generated_hooks]

# Show samples
# zip() walks the parallel result lists together and stops at the shortest one,
# so printing cannot run past the end if fewer samples are available than
# NUM_SAMPLES_TO_SHOW.
samples = list(zip(
    articles,
    generated_summaries,
    generated_hooks,
    translated_summaries,
    translated_hooks,
    actual_summaries,
))[:NUM_SAMPLES_TO_SHOW]

for sample_no, (article, summary, hook, translated_summary, translated_hook, reference) in enumerate(samples, start=1):
    print(f"\nArticle #{sample_no} (truncated):\n{article[:300]}...")
    print(f"\nGenerated Summary:\n{summary}")
    print(f"Hook Headline:\n {hook}")
    print(f"Translated Summary ({target_lang_code}):\n{translated_summary}")
    print(f"Translated Hook ({target_lang_code}):\n{translated_hook}")
    print(f"\nActual Summary:\n{reference}")
    print("="*100)
