In [1]:
!pip install -q transformers torch datasets sentencepiece


In [2]:
import torch
import transformers
import datasets

print("Torch version:", torch.__version__)
print("Transformers version:", transformers.__version__)


Torch version: 2.9.0+cu126
Transformers version: 4.57.3


## Concept Prerequisites

### 1. Extractive vs Abstractive Summarization
Extractive summarization selects important sentences directly from the original text, whereas abstractive summarization generates new sentences that capture the meaning of the text.

### 2. Why Transformers are better than RNNs for long text
Transformers use self-attention to look at all words in parallel, allowing them to capture long-range dependencies better than RNNs, which process text sequentially.

### 3. What is a pretrained model
A pretrained model is a model trained on a large corpus of text so it already understands language structure and semantics before being used for a specific task.

### 4. What happens during fine-tuning
During fine-tuning, the pretrained model’s weights are slightly updated using task-specific data so the model adapts better to the new domain.


In [3]:
sample_email = """
Subject: Delay in Project Alpha Delivery – Immediate Attention Required

Hi Team,

I hope everyone is doing well. I am writing this email to bring attention to the recent delay in the Project Alpha delivery timeline.

Over the past two weeks, we have observed that the development milestones have not been met as originally planned. The backend integration is still pending, and the testing phase has not yet started. This delay may affect our committed delivery date to the client.

The client has already requested an update and expects a clear revised timeline by the end of this week. It is important that we identify the blockers as soon as possible.

Please review the current progress and share the following:
1. The exact reason for the delay
2. An updated completion estimate
3. Any additional support required from management

We understand that unexpected challenges can arise, but timely communication is crucial to maintaining client trust.

Let us plan to discuss this in tomorrow’s stand-up meeting and finalize the next steps.

Thanks and regards,
Rahul Sharma
Project Manager

This email and any attachments are confidential and intended only for the recipient.
"""


In [4]:
print("Email length (characters):", len(sample_email))
print("\nPreview:\n")
print(sample_email[:500])


Email length (characters): 1180

Preview:


Subject: Delay in Project Alpha Delivery – Immediate Attention Required

Hi Team,

I hope everyone is doing well. I am writing this email to bring attention to the recent delay in the Project Alpha delivery timeline.

Over the past two weeks, we have observed that the development milestones have not been met as originally planned. The backend integration is still pending, and the testing phase has not yet started. This delay may affect our committed delivery date to the client.

The client has 


In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

print("Tokenizer loaded successfully")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokenizer loaded successfully


In [6]:
inputs = tokenizer(
    sample_email,
    return_tensors="pt",
    truncation=False
)

token_count = inputs["input_ids"].shape[1]

print("Number of tokens in the email:", token_count)


Number of tokens in the email: 245


In [7]:
decoded_tokens = tokenizer.convert_ids_to_tokens(
    inputs["input_ids"][0][:40]
)

print("First 40 tokens:\n")
print(decoded_tokens)


First 40 tokens:

['<s>', 'Ċ', 'Subject', ':', 'ĠDelay', 'Ġin', 'ĠProject', 'ĠAlpha', 'ĠDelivery', 'ĠâĢĵ', 'ĠIm', 'mediate', 'ĠAttention', 'ĠRequired', 'Ċ', 'Ċ', 'Hi', 'ĠTeam', ',', 'Ċ', 'Ċ', 'I', 'Ġhope', 'Ġeveryone', 'Ġis', 'Ġdoing', 'Ġwell', '.', 'ĠI', 'Ġam', 'Ġwriting', 'Ġthis', 'Ġemail', 'Ġto', 'Ġbring', 'Ġattention', 'Ġto', 'Ġthe', 'Ġrecent', 'Ġdelay']


In [8]:
MAX_TOKENS = 1024  # BART max input length

if token_count > MAX_TOKENS:
    print("⚠️ Email is longer than model limit and needs truncation or chunking")
else:
    print("✅ Email is within the model's input limit")


✅ Email is within the model's input limit


In [9]:
from transformers import AutoModelForSeq2SeqLM

model_name = "facebook/bart-large-cnn"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

print(f"Model loaded on device: {device}")


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Model loaded on device: cuda


In [10]:
inputs = tokenizer(
    sample_email,
    max_length=1024,
    truncation=True,
    return_tensors="pt"
)

inputs = {key: val.to(device) for key, val in inputs.items()}


In [11]:
with torch.no_grad():
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=60,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True
    )

summary_text = tokenizer.decode(
    summary_ids[0],
    skip_special_tokens=True
)

print("📌 GENERATED SUMMARY:\n")
print(summary_text)


📌 GENERATED SUMMARY:

The client has already requested an update and expects a clear revised timeline by the end of this week. We understand that unexpected challenges can arise, but timely communication is crucial to maintaining client trust. Let us plan to discuss this in tomorrow’s stand-up meeting and finalize the next steps.


## Pretrained Model Observation

The pretrained BART model generates a fluent and coherent summary that captures the main issue of the email, which is the delay in project delivery. It correctly identifies the need for an updated timeline and client communication. However, the summary is generic and does not emphasize specific action items clearly. This indicates that while pretrained models are powerful, domain-specific fine-tuning could improve relevance.


In [12]:
!pip install -q scikit-learn


In [13]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [14]:
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(sample_email)

print("Total sentences in email:", len(sentences))
print("\nFirst 3 sentences:\n")
for s in sentences[:3]:
    print("-", s)


Total sentences in email: 13

First 3 sentences:

- 
Subject: Delay in Project Alpha Delivery – Immediate Attention Required

Hi Team,

I hope everyone is doing well.
- I am writing this email to bring attention to the recent delay in the Project Alpha delivery timeline.
- Over the past two weeks, we have observed that the development milestones have not been met as originally planned.


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(sentences)

sentence_scores = tfidf_matrix.sum(axis=1).A1


In [16]:
top_n = 3

top_sentence_indices = np.argsort(sentence_scores)[-top_n:]
top_sentence_indices = sorted(top_sentence_indices)

extractive_summary = " ".join([sentences[i] for i in top_sentence_indices])

print("📌 EXTRACTIVE SUMMARY:\n")
print(extractive_summary)


📌 EXTRACTIVE SUMMARY:


Subject: Delay in Project Alpha Delivery – Immediate Attention Required

Hi Team,

I hope everyone is doing well. Any additional support required from management

We understand that unexpected challenges can arise, but timely communication is crucial to maintaining client trust. Thanks and regards,
Rahul Sharma
Project Manager

This email and any attachments are confidential and intended only for the recipient.


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Use sentences from earlier step
all_text = sentences + [summary_text]

vectorizer = TfidfVectorizer(stop_words="english")
tfidf_vectors = vectorizer.fit_transform(all_text)

sentence_vectors = tfidf_vectors[:-1]
summary_vector = tfidf_vectors[-1]


In [18]:
similarity_scores = cosine_similarity(sentence_vectors, summary_vector)

sentence_importance = similarity_scores.flatten()


In [19]:
top_k = 5

important_indices = np.argsort(sentence_importance)[-top_k:]
important_indices = sorted(important_indices)

print("📌 Sentences that influenced the summary most:\n")

for idx in important_indices:
    print(f"- {sentences[idx]}")


📌 Sentences that influenced the summary most:

- I am writing this email to bring attention to the recent delay in the Project Alpha delivery timeline.
- This delay may affect our committed delivery date to the client.
- The client has already requested an update and expects a clear revised timeline by the end of this week.
- Any additional support required from management

We understand that unexpected challenges can arise, but timely communication is crucial to maintaining client trust.
- Let us plan to discuss this in tomorrow’s stand-up meeting and finalize the next steps.


In [20]:
email_summary_data = [
    {
        "email": sample_email,
        "summary": "Project Alpha delivery has been delayed. The client has requested a revised timeline, and the team needs to identify blockers and finalize next steps in the upcoming meeting."
    },
    {
        "email": """
        Subject: Leave Request Update

        Hi HR Team,

        I would like to inform you that I need to extend my leave by two more days due to medical reasons.
        All pending tasks have been handed over to my teammate.
        Please let me know if any documentation is required.

        Thanks,
        Anil
        """,
        "summary": "An employee has requested an extension of leave due to medical reasons and has completed task handover."
    },
    {
        "email": """
        Subject: Customer Support Escalation

        Hello Support Team,

        The customer is facing repeated login failures despite resetting the password.
        This issue has been escalated as it is impacting business operations.
        Immediate assistance is requested.

        Regards,
        Support Lead
        """,
        "summary": "A customer support issue involving repeated login failures has been escalated and requires immediate action."
    }
]


In [21]:
from datasets import Dataset

dataset = Dataset.from_list(email_summary_data)
dataset


Dataset({
    features: ['email', 'summary'],
    num_rows: 3
})

In [22]:
def preprocess(batch):
    model_inputs = tokenizer(
        batch["email"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch["summary"],
            truncation=True,
            padding="max_length",
            max_length=150
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_dataset = dataset.map(preprocess, batched=False)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]



In [23]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./email_summarizer_finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    learning_rate=2e-5,
    logging_steps=1,
    save_steps=10,
    save_total_limit=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)


  trainer = Trainer(


In [24]:
with torch.no_grad():
    fine_tuned_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=60,
        num_beams=4,
        length_penalty=2.0
    )

fine_tuned_summary = tokenizer.decode(
    fine_tuned_ids[0],
    skip_special_tokens=True
)

print("📌 FINE-TUNED SUMMARY:\n")
print(fine_tuned_summary)


📌 FINE-TUNED SUMMARY:

The client has already requested an update and expects a clear revised timeline by the end of this week. We understand that unexpected challenges can arise, but timely communication is crucial to maintaining client trust. Let us plan to discuss this in tomorrow’s stand-up meeting and finalize the next steps.


In [25]:
!pip install -q streamlit


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/9.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/9.0 MB[0m [31m118.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m9.0/9.0 MB[0m [31m143.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m99.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.9/6.9 MB[0m [31m291.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m153.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [26]:
%%writefile app.py

import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

# ----------------------------
# Load model & tokenizer
# ----------------------------
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.eval()

# ----------------------------
# Streamlit UI
# ----------------------------
st.set_page_config(page_title="Email Summarization System", layout="wide")
st.title("📧 Transformer-Based Email Summarization System")

st.write(
    "This application demonstrates abstractive email summarization using pretrained Transformer models, "
    "along with AI/ML-based analysis such as extractive comparison and sentence importance."
)

# ----------------------------
# Input
# ----------------------------
email_text = st.text_area(
    "Paste a long email here:",
    height=300
)

if st.button("Generate Summary") and email_text.strip():

    # ----------------------------
    # Tokenization
    # ----------------------------
    inputs = tokenizer(
        email_text,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    )

    token_count = inputs["input_ids"].shape[1]

    # ----------------------------
    # Abstractive Summary
    # ----------------------------
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=150,
            min_length=60,
            num_beams=4,
            length_penalty=2.0
        )

    summary_text = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True
    )

    # ----------------------------
    # Extractive Summary
    # ----------------------------
    sentences = sent_tokenize(email_text)

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    sentence_scores = tfidf_matrix.sum(axis=1).A1

    top_indices = np.argsort(sentence_scores)[-3:]
    top_indices = sorted(top_indices)
    extractive_summary = " ".join([sentences[i] for i in top_indices])

    # ----------------------------
    # Attention Approximation
    # ----------------------------
    all_text = sentences + [summary_text]
    tfidf_vectors = vectorizer.fit_transform(all_text)
    sentence_vectors = tfidf_vectors[:-1]
    summary_vector = tfidf_vectors[-1]

    similarity_scores = cosine_similarity(sentence_vectors, summary_vector).flatten()
    important_indices = np.argsort(similarity_scores)[-5:]

    # ----------------------------
    # Output
    # ----------------------------
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("🔹 Abstractive Summary (Transformer)")
        st.write(summary_text)

        st.markdown(f"**Token count:** {token_count}")

    with col2:
        st.subheader("🔹 Extractive Summary (TF-IDF)")
        st.write(extractive_summary)

    st.subheader("⭐ Important Sentences (Model Focus)")
    for idx in sorted(important_indices):
        st.markdown(f"- {sentences[idx]}")

else:
    st.info("Paste an email and click **Generate Summary**.")


Writing app.py


In [None]:
!streamlit run app.py



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.126.145.112:8501[0m
[0m
