Loading the dataset and prerequisite setup

In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from transformers import pipeline
# Fetch the 'punkt_tab' NLTK resource up front; presumably this is the
# sentence-tokenizer data that nltk.sent_tokenize (used further down) needs.
nltk.download('punkt_tab')
# Load dataset
# NOTE(review): hard-coded absolute path -- looks like a Colab-style upload
# location; confirm the CSV is present there before running.
file_path = "/content/Cleaned Smart Email Dataset.csv"
# `df` is module-level state that the later summarization statements read from
# and add columns to.
df = pd.read_csv(file_path)

Summarization Methods and Operations Involved

In [None]:
# The function for extractive summarization using TF-IDF and cosine similarity
def extractive_summary(text, num_sentences=2):
    """Return the most central sentences of ``text`` as a short summary.

    The text is split into sentences with NLTK, every sentence is converted
    to a TF-IDF vector (English stop words removed) and scored by the sum of
    its cosine similarities to all sentences. The top-scoring sentences are
    joined with single spaces, highest score first.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        ``text`` unchanged when it has at most ``num_sentences`` sentences,
        otherwise the selected sentences joined by spaces. When no sentence
        contains a usable term (so nothing can be ranked), the leading
        ``num_sentences`` sentences are returned instead.
    """
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text  # Return full text if it's already short

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        sentence_vectors = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every token
        # is filtered out, e.g. sentences made only of stop words or
        # punctuation. Without this guard a single such email would abort a
        # whole DataFrame.apply run; fall back to the leading sentences.
        return " ".join(sentences[:num_sentences])

    similarity_matrix = cosine_similarity(sentence_vectors)
    # Sorting (score, sentence) tuples in reverse puts the highest score
    # first; equal scores are ordered by the sentence text itself.
    ranked_sentences = sorted(
        ((similarity_matrix[i].sum(), s) for i, s in enumerate(sentences)),
        reverse=True,
    )
    return " ".join(s for _, s in ranked_sentences[:num_sentences])

# We load transformer model for abstractive summarization
# Built once at module level from the "t5-small" checkpoint so that
# abstractive_summary can reuse the same pipeline object for every email.
summarizer = pipeline("summarization", model="t5-small")

Abstractive Summarization Implementation (analyse the text and generate a newly worded summary)

In [None]:
# Function for abstractive summarization
def abstractive_summary(text):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Inputs shorter than 50 whitespace-separated words are returned as-is.
    Longer inputs are passed to the pipeline with deterministic decoding and
    a summary length bounded to 20-50 (``min_length`` / ``max_length``).

    Args:
        text: The document to summarize.

    Returns:
        The original ``text`` for short inputs, otherwise the
        ``'summary_text'`` field of the first pipeline result.
    """
    word_count = len(text.split())
    if word_count >= 50:
        outputs = summarizer(text, max_length=50, min_length=20, do_sample=False)
        first_result = outputs[0]
        return first_result['summary_text']
    # Short text is passed through without invoking the model.
    return text
# Apply summarization to dataset
# Each summarizer receives the email body coerced to str and fills its own
# column; the extractive pass completes before the abstractive pass starts.
for column_name, summarize in (
    ('Extractive_Summary', extractive_summary),
    ('Abstractive_Summary', abstractive_summary),
):
    df[column_name] = df['Email Content'].apply(
        lambda raw, fn=summarize: fn(str(raw))
    )

# We have to Save the summarized dataset
df.to_csv("/mnt/data/Summarized_Smart_Email_Dataset.csv", index=False)

print("Summarization completed. Extractive and Abstractive summaries added.")

Tweaking the previous implementation for summarization

In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from transformers import pipeline
import torch

# Load dataset
# Reads the CSV again and rebinds `df`, so this cell starts from a fresh
# DataFrame rather than one already carrying summary columns.
# NOTE(review): hard-coded absolute path -- looks like a Colab-style upload
# location; confirm the CSV is present there before running.
file_path = "/content/Cleaned Smart Email Dataset.csv"
df = pd.read_csv(file_path)

# Function for extractive summarization using TF-IDF and cosine similarity
def extractive_summary(text, num_sentences=2):
    """Return the most central sentences of ``text`` as a short summary.

    The text is split into sentences with NLTK, every sentence is converted
    to a TF-IDF vector (English stop words removed) and scored by the sum of
    its cosine similarities to all sentences. The top-scoring sentences are
    joined with single spaces, highest score first.

    Args:
        text: The document to summarize.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        ``text`` unchanged when it has at most ``num_sentences`` sentences,
        otherwise the selected sentences joined by spaces. When no sentence
        contains a usable term (so nothing can be ranked), the leading
        ``num_sentences`` sentences are returned instead.
    """
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text  # Return full text if it's already short

    vectorizer = TfidfVectorizer(stop_words='english')
    try:
        sentence_vectors = vectorizer.fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when every token
        # is filtered out, e.g. sentences made only of stop words or
        # punctuation. Without this guard a single such email would abort a
        # whole DataFrame.apply run; fall back to the leading sentences.
        return " ".join(sentences[:num_sentences])

    similarity_matrix = cosine_similarity(sentence_vectors)

    # Sorting (score, sentence) tuples in reverse puts the highest score
    # first; equal scores are ordered by the sentence text itself.
    ranked_sentences = sorted(
        ((similarity_matrix[i].sum(), s) for i, s in enumerate(sentences)),
        reverse=True,
    )
    return " ".join(s for _, s in ranked_sentences[:num_sentences])
# Load transformer model for abstractive summarization
# device=0 is passed when torch reports CUDA is available, otherwise -1;
# per the transformers pipeline API these select the first GPU and the CPU
# respectively.
summarizer = pipeline("summarization", model="t5-small", device=0 if torch.cuda.is_available() else -1)

# Function for abstractive summarization with proper token chunking
def abstractive_summary(text, tokenizer_max_length=512):
    words = text.split()
    if len(words) < 50:
        return text  # Avoid summarization for short text
   # Ensure chunks fit within token limit
    max_tokens = tokenizer_max_length - 10  # Leave margin for safety
    chunks = []
    current_chunk = []
    current_length = 0

    for word in words:
        current_length += len(word) + 1  # Account for spaces
        if current_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1
        else:
            current_chunk.append(word)

    if current_chunk:
        chunks.append(" ".join(current_chunk))
# Summarize each chunk separately