In [1]:
pip install nltk numpy scikit-learn networkx

Note: you may need to restart the kernel to use updated packages.


In [3]:
import nltk
import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import string

# Fetch the tokenizer models and the stop-word list used by the cells below.
# Recent NLTK releases load sentence-tokenizer data from 'punkt_tab' rather
# than the older pickled 'punkt' package, so both are requested to keep the
# notebook working regardless of which NLTK version pip resolved.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mayanksingh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mayanksingh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def preprocess_sentences(text):
    """Split ``text`` into sentences and build a cleaned copy of each one.

    Each sentence is lower-cased, tokenized with ``word_tokenize``, and
    stripped of English stop words and punctuation-only tokens. The
    remaining tokens are re-joined with single spaces.

    Args:
        text: The raw document to split. Empty, whitespace-only or ``None``
            input yields two empty lists.

    Returns:
        A ``(sentences, clean_sentences)`` tuple of equal-length lists:
        the sentences as returned by ``sent_tokenize`` and their cleaned
        counterparts, in the same order.
    """
    # Nothing to tokenize; skip loading the stop-word corpus entirely.
    if not text or not text.strip():
        return [], []

    stop_words = set(stopwords.words('english'))
    # string.punctuation is a str, so `token in string.punctuation` would be
    # a substring test and would miss multi-character tokens such as "``",
    # "''", "..." or "--". Checking character by character against a set
    # drops every token that consists solely of punctuation.
    punctuation_chars = set(string.punctuation)

    sentences = sent_tokenize(text)
    clean_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence.lower())
        words = [
            word for word in words
            if word not in stop_words
            and not all(char in punctuation_chars for char in word)
        ]
        clean_sentences.append(" ".join(words))

    return sentences, clean_sentences

In [7]:
def sentence_similarity_matrix(sentences, clean_sentences):
    """Return the pairwise cosine similarity of TF-IDF sentence vectors.

    Args:
        sentences: The original sentences. Not used in the computation;
            kept so existing callers keep working.
        clean_sentences: Preprocessed sentences (one string each) that are
            vectorized with ``TfidfVectorizer``.

    Returns:
        A square array with one row and column per entry of
        ``clean_sentences``. If there are no sentences the array has shape
        ``(0, 0)``; if the vectorizer cannot build a vocabulary the array is
        all zeros (no sentence is considered similar to any other).
    """
    n_sentences = len(clean_sentences)
    if n_sentences == 0:
        return np.zeros((0, 0))

    # Imported here because the notebook's top import cell does not bring
    # TfidfVectorizer into scope; this keeps the function independent of
    # the order in which later cells are executed.
    from sklearn.feature_extraction.text import TfidfVectorizer

    try:
        vectors = TfidfVectorizer().fit_transform(clean_sentences)
    except ValueError:
        # fit_transform raises ValueError when the vocabulary is empty,
        # e.g. every sentence was reduced to an empty string by stop-word
        # removal. Treat that as "no measurable similarity".
        return np.zeros((n_sentences, n_sentences))

    return cosine_similarity(vectors)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

def generate_summary(text, top_n=3):
    """Build an extractive summary of ``text`` from its top-ranked sentences.

    The text is split into sentences, a sentence-similarity matrix is turned
    into a graph, and PageRank scores over that graph are used to rank the
    sentences.

    Args:
        text: The document to summarise.
        top_n: Maximum number of sentences to include. Values of zero or
            less produce an empty summary.

    Returns:
        The selected original sentences joined by single spaces, ordered
        from highest to lowest score (not document order). Returns an empty
        string when there is nothing to summarise.
    """
    # Guard before any tokenization: empty input would otherwise fail in the
    # vectorizer, and a negative top_n would slice from the wrong end of the
    # ranking (e.g. ranked[:-1] keeps everything except the last sentence).
    if not text or not text.strip() or top_n <= 0:
        return ""

    original_sentences, cleaned = preprocess_sentences(text)
    if not original_sentences:
        return ""

    sim_matrix = sentence_similarity_matrix(original_sentences, cleaned)
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Sort by (score, sentence) descending; node i corresponds to sentence i.
    ranked = sorted(((scores[i], s) for i, s in enumerate(original_sentences)), reverse=True)
    return " ".join(s for _, s in ranked[:top_n])

In [13]:
# Demo input: a short passage about Indian Railways.
sample_text = """
The Indian Railways is one of the world's largest railway networks, managed by the Ministry of Railways. 
It operates both long-distance and suburban rail systems on a multi-gauge network of broad, meter, and narrow gauges. 
Serving millions of passengers daily, it is the backbone of India's transportation system. 
It also handles a significant portion of the country's freight traffic. 
Indian Railways is a state-owned enterprise and contributes significantly to the national economy. 
Despite challenges like congestion and outdated infrastructure, modernization efforts are underway.
"""

# Keep only the two highest-ranked sentences and show them.
summary = generate_summary(text=sample_text, top_n=2)
print("Generated Summary:\n", summary, sep=" ")

Generated Summary:
 Serving millions of passengers daily, it is the backbone of India's transportation system. It operates both long-distance and suburban rail systems on a multi-gauge network of broad, meter, and narrow gauges.
