<a href="https://colab.research.google.com/github/JamesMungai254/Text-Summarization/blob/main/TextSummarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 1000)



In [10]:
import pandas as pd
import numpy as np
import PyPDF2
import re
import nltk
from docx import Document

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt

In [22]:

# Download necessary NLTK data
nltk.download('punkt')
# nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
# Step 1: Data Cleaning and Preprocessing
def clean_text(text):
    """
    Preprocess text: remove special characters, numbers, and extra spaces.
    """
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[0-9]', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert text to lowercase
    return text


In [15]:

# Load data
data = {
    "document": [
        "Text summarization is the process of distilling the most important information.",
        "Automatic text summarization uses AI techniques to identify essential parts.",
        "Natural Language Processing plays a key role in text summarization algorithms."
    ]
}
df = pd.DataFrame(data)
df



Unnamed: 0,document
0,Text summarization is the process of distillin...
1,Automatic text summarization uses AI technique...
2,Natural Language Processing plays a key role i...


In [16]:
# Clean and preprocess text
df['cleaned_text'] = df['document'].apply(clean_text)


In [23]:
# Step 2: Sentence Tokenization
def sentence_tokenizer(text):
    """
    Split text into sentences.
    """
    return sent_tokenize(text)

df['sentences'] = df['document'].apply(sentence_tokenizer)

In [24]:
# Step 3: Stopword Removal and Normalization
stop_words = set(stopwords.words('english'))

def normalize_sentence(sentence):
    """
    Remove stopwords and tokenize sentences into meaningful words.
    """
    words = word_tokenize(sentence)
    normalized_words = [word for word in words if word not in stop_words]
    return ' '.join(normalized_words)

df['normalized_sentences'] = df['sentences'].apply(
    lambda sentences: [normalize_sentence(s) for s in sentences]
)



In [25]:
# Step 4: TextRank Summarization
def text_rank_summarizer(sentences, top_n=2):
    """
    Apply TextRank algorithm for sentence extraction.
    """
    # Convert sentences to TF-IDF matrix
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)

    # Compute cosine similarity
    similarity_matrix = cosine_similarity(sentence_vectors)

    # Build a graph and apply TextRank
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # Rank sentences by their TextRank score
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Extract top N sentences for the summary
    summary = [sentence for _, sentence in ranked_sentences[:top_n]]
    return summary

df['summary'] = df['normalized_sentences'].apply(lambda sentences: text_rank_summarizer(sentences))


In [31]:
# Step 5: Hyperparameter Tuning
# Example: Experiment with different TF-IDF parameters and top_n values
vectorizer = TfidfVectorizer(min_df=1.0, max_df=1)  # Fine-tuning TF-IDF parameters
top_n_sentences = 3  # Number of sentences in the summary
summary_results = []
for sentences in df['normalized_sentences']:
    sentence_vectors = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(sentence_vectors)
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    summary = [sentence for _, sentence in ranked_sentences[:top_n_sentences]]
    summary_results.append(summary)

df['optimized_summary'] = summary_results


In [36]:
# Display results
print(df[['document', 'summary', 'optimized_summary']])



                                            document                                            summary                                  optimized_summary
0  Text summarization is the process of distillin...  [Text summarization process distilling importa...  [Text summarization process distilling importa...
1  Automatic text summarization uses AI technique...  [Automatic text summarization uses AI techniqu...  [Automatic text summarization uses AI techniqu...
2  Natural Language Processing plays a key role i...  [Natural Language Processing plays key role te...  [Natural Language Processing plays key role te...


In [35]:
df['optimized_summary']

Unnamed: 0,optimized_summary
0,[Text summarization process distilling importa...
1,[Automatic text summarization uses AI techniqu...
2,[Natural Language Processing plays key role te...


In [39]:

print(df['optimized_summary'].iloc[2])


['Natural Language Processing plays key role text summarization algorithms .']


In [45]:
def text_reader(txtFile):
  with open(txtFile, 'r', encoding='utf-8') as file:
    text_data = file.read()
  return text_data


In [48]:
# After extracting text from the document
text_data = text_reader('/content/note 2.txt')
cleaned_text = clean_text(text_data)  # Apply the cleaning function from earlier
sentences = sentence_tokenizer(cleaned_text)  # Tokenize sentences
normalized_sentences = [normalize_sentence(sentence) for sentence in sentences]
summary = text_rank_summarizer(normalized_sentences, top_n=3)
print("Summary:")
print(summary)

Summary:
['end lecture students able understand limitations classical retrieval models explain compare probabilistic language modeling neural ir models implement simulate basic advanced ir models using example datasets evaluate effectiveness advanced retrieval models using standard metrics classical ir models boolean model vector space tfidf suffered vocabulary mismatch probabilistic interpretation model term dependencies understanding relationships words data points time distance probabilistic retrieval models binary independence model best matching best matching incorporates term frequency inverse document frequency document length normalization intution times query term appears document term frequency relevant document might diminishing returns information retrieval systems diminishing returns occur adding retrieval rounds resources yields progressively smaller improvements results measured metrics like precision recall redundancy length normalization longer documents tend contain w

### Using Word Limit

In [49]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx




In [50]:
# Function to clean text (preserve stop words and punctuation)
def clean_text(text):
    """
    Preprocess text: remove extra spaces and numbers but preserve punctuation.
    """
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[0-9]', '', text)  # Remove numbers
    text = text.strip()  # Strip leading/trailing spaces
    return text



In [51]:
# Step 1: Read input file
def read_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

text_data = read_text_file('/content/note 2.txt')



In [52]:
# Step 2: Clean and tokenize the text
cleaned_text = clean_text(text_data)
sentences = sent_tokenize(cleaned_text)  # Split text into sentences

# Step 3: TextRank Summarization with Word Limit
def text_rank_summarizer(sentences, word_limit=100):
    """
    Apply TextRank algorithm for summarization with a user-defined word limit.
    """
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(sentence_vectors)

    # Build a graph and apply TextRank
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # Rank sentences by their TextRank score
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Construct the summary with a word limit
    summary = []
    word_count = 0
    for _, sentence in ranked_sentences:
        if word_count + len(sentence.split()) <= word_limit:
            summary.append(sentence)
            word_count += len(sentence.split())
        else:
            break
    return '. '.join(summary)  # Combine sentences into a sensible summary



In [54]:
# Step 4: User Input for Word Limit
word_limit = int(input("Enter the word limit for your summary: "))
summary = text_rank_summarizer(sentences, word_limit)



Enter the word limit for your summary: 100


In [55]:
# Display the summary
print("Generated Summary:")
print(summary)

Generated Summary:
Intution: The more times a query term appears in a document (term frequency), the more relevant that document might be Diminishing returns: In information retrieval systems, diminishing returns occur when adding more retrieval rounds or resources yields progressively smaller improvements in results, measured by metrics like precision, recall, and redundancy Length normalization: Longer documents tend to contain more words, so we penalize longer docs unless they use the term more heavily How It works The search score is computed based on statistical properties of the string input and the query itself.


# Code for T5 summarization

In [56]:
from transformers import T5Tokenizer, T5ForConditionalGeneration



In [58]:
# Load the T5 model and tokenizer
model_name = "t5-small"  # You can also try "t5-base" or "t5-large" for better performance
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [59]:
# Input text
input_text = """
Text summarization is the task of condensing a large body of text while retaining the most important information.
Automatic text summarization has gained popularity with the advent of deep learning methods.
"""



In [62]:
# Preprocess input for the model
input_ids = tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate summary
summary_ids = model.generate(
    input_ids,
    max_length=50,  # Max length of summary
    min_length=10,  # Minimum length of summary
    length_penalty=2.0,  # Encourage brevity
    num_beams=4,  # Beam search for better results
    early_stopping=True  # Stop when desired length is reached
)

# Decode the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [63]:

# Print the summary
print("Original Text:")
print(input_text)
print("\nGenerated Summary:")
print(summary)

Original Text:

Text summarization is the task of condensing a large body of text while retaining the most important information. 
Automatic text summarization has gained popularity with the advent of deep learning methods.


Generated Summary:
text summarization is the task of condensing a large body of text while retaining the most important information. automatic text summarization is the task of condensing a large body of text while retaining
