In [2]:
! pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------ ------------------------- 20.5/60.8 kB 330.3 kB/s eta 0:00:01
     ------------ ------------------------- 20.5/60.8 kB 330.3 kB/s eta 0:00:01
     ------------------- ------------------ 30.7/60.8 kB 163.8 kB/s eta 0:00:01
     -------------------------------- ----- 51.2/60.8 kB 239.5 kB/s eta 0:00:01
     -------------------------------------- 60.8/60.8 kB 248.9 kB/s eta 0:00:00
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl (10.9 MB)
Downloading scipy-1.14.0-cp312-cp312-win_amd64.whl (

In [3]:
import nltk
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string

# Download required NLTK data files.
# 'punkt_tab' is the tokenizer resource loaded by sent_tokenize/word_tokenize on
# NLTK >= 3.8.2; 'punkt' is still fetched so older NLTK releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Split ``text`` into sentences and build a cleaned copy of each one.

    Each sentence is lower-cased and word-tokenized; English stop words and
    tokens made up entirely of punctuation characters are removed, and the
    remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text to split into sentences.

    Returns
    -------
    tuple of (list of str, list of str)
        The sentences as returned by ``sent_tokenize`` and, index-aligned
        with them, the cleaned sentences. A cleaned sentence may be an empty
        string when every token was filtered out.
    """
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # Compare character by character against a set: testing the whole token
    # with `in string.punctuation` is a substring check, which lets
    # multi-character tokens such as "``", "''", "--" or "..." slip through.
    punctuation_chars = set(string.punctuation)
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        filtered_words = [
            word for word in words
            if word not in stop_words
            and not all(char in punctuation_chars for char in word)
        ]
        preprocessed_sentences.append(' '.join(filtered_words))
    return sentences, preprocessed_sentences

def build_similarity_matrix(sentences):
    """Return the pairwise cosine similarity of the sentences' TF-IDF vectors.

    Parameters
    ----------
    sentences : iterable of str
        Sentence strings, vectorized with a default ``TfidfVectorizer``.

    Returns
    -------
    The result of ``cosine_similarity`` on the fitted TF-IDF matrix, where
    entry (i, j) compares sentence i with sentence j.
    """
    # Fit and transform in one step, then compare every row with every other row.
    vectors = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(vectors)

def textrank_summary(text, num_sentences=3):
    """Summarize ``text`` by extracting its highest-ranked sentences (TextRank).

    Sentences are nodes of a graph whose edge weights are the pairwise
    similarities from ``build_similarity_matrix``; PageRank scores on that
    graph decide which sentences are kept.

    Parameters
    ----------
    text : str
        Text to summarize.
    num_sentences : int, optional
        Maximum number of sentences in the summary (default 3).

    Returns
    -------
    str
        The selected original sentences joined by single spaces, ordered from
        highest to lowest score (equal scores fall back to comparing the
        sentence text). An empty string is returned when ``text`` contains no
        sentences or ``num_sentences`` is not positive.

    Notes
    -----
    Errors raised while building the similarity matrix are not caught here
    (e.g. the vectorizer presumably rejects input in which no sentence has a
    usable term -- confirm against the scikit-learn version in use).
    """
    original_sentences, preprocessed_sentences = preprocess_text(text)
    # Nothing to rank, or nothing requested. Without this guard a negative
    # count would slice from the end and return all but the last sentences.
    if not original_sentences or num_sentences <= 0:
        return ''

    # Work on a float copy so the caller-visible matrix is never mutated.
    similarity_matrix = np.array(build_similarity_matrix(preprocessed_sentences), dtype=float)
    # A sentence is trivially identical to itself; leaving the diagonal in place
    # turns it into a heavy self-loop on every node and flattens the ranking.
    np.fill_diagonal(similarity_matrix, 0.0)

    similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(similarity_graph)
    ranked_sentences = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(original_sentences)),
        reverse=True,
    )
    return ' '.join(sentence for _, sentence in ranked_sentences[:num_sentences])

# Demo passage: a short description of the TextRank algorithm itself
sample_text = """
TextRank is an unsupervised extractive summarization technique based on graph-based ranking algorithms. 
It is a general-purpose algorithm that can be applied to a variety of tasks, including keyword extraction and sentence extraction. 
The algorithm constructs a graph from the text, where sentences are nodes and edges between them represent sentence similarity. 
The similarity between sentences is determined by their content overlap, usually measured by cosine similarity. 
Once the graph is constructed, the algorithm uses a variant of the PageRank algorithm to rank the sentences. 
The top-ranked sentences are then extracted to form the summary. TextRank is simple, efficient, and language-independent, 
making it a popular choice for extractive summarization.
"""

# Reduce the passage to its two top-ranked sentences and print the result under a heading
summary = textrank_summary(sample_text, num_sentences=2)
print("Summary:", summary, sep="\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...


Summary:
The algorithm constructs a graph from the text, where sentences are nodes and edges between them represent sentence similarity. Once the graph is constructed, the algorithm uses a variant of the PageRank algorithm to rank the sentences.


[nltk_data]   Unzipping corpora\stopwords.zip.
