In [5]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer
from pypdf import PdfReader

In [6]:
book=PdfReader('book1.pdf')

In [7]:
text=''
for i in range(3,len(book.pages)):
    page=book.pages[i]
    text+=page.extract_text()
print(len(text))

46231


In [8]:
def preprocess_text(text):
    swords=set(nltk.corpus.stopwords.words('english'))
    stemmer=PorterStemmer()
    sentences=sent_tokenize(text)
    preprocess_sent=[]
    for sentence in sentences:
        words=word_tokenize(sentence)
        words=[stemmer.stem(word) for word in words if word.isalnum() and word not in swords]
        preprocess_sent.append(' '.join(words))
    return preprocess_sent

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def compute_tfidf(sentences):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(sentences)
    return tfidf_matrix

def score_sentences(tfidf_matrix):
    sentence_scores = tfidf_matrix.sum(axis=1)
    return sentence_scores

In [10]:
def generate_summary(text, top_n=5):
    preprocessed_sentences = preprocess_text(text)
    tfidf_matrix = compute_tfidf(preprocessed_sentences)
    sentence_scores = score_sentences(tfidf_matrix)
    
    ranked_sentences = [sent for _, sent in sorted(zip(sentence_scores, sent_tokenize(text)), reverse=True)]
    summary = ' '.join(ranked_sentences[:top_n])
    
    return summary

In [11]:
summary=generate_summary(text,5)
print(summary)

['chapter', 't', 'sherlock', 'holm', 'alway', 'woman']
['i', 'seldom', 'heard', 'tion', 'name']
['in', 'eye', 'eclips', 'predomin', 'whole', 'sex']
['it', 'felt', 'emot', 'akin', 'love', 'iren', 'adler']
['all', 'emot', 'one', 'particularli', 'abhorr', 'cold', 'precis', 'admir', 'balanc', 'mind']
['he', 'i', 'take', 'perfect', 'reason', 'observ', 'machin', 'world', 'seen', 'lover', 'would', 'place', 'fals', 'posit']
['he', 'never', 'spoke', 'softer', 'passion', 'save', 'gibe', 'sneer']
['they', 'admir', 'thing', 'draw', 'veil', 'men', 'motiv', 'action']
['but', 'train', 'reason', 'admit', 'intrus', 'delic', 'ﬁneli', 'adjust', 'tempera', 'introduc', 'distract', 'factor', 'might', 'throw', 'doubt', 'upon', 'mental', 'result']
['grit', 'sensit', 'instrument', 'crack', 'one', 'lens', 'would', 'disturb', 'strong', 'emot', 'natur']
['and', 'yet', 'one', 'woman', 'woman', 'late', 'iren', 'adler', 'dubiou', 'question', 'memori']
['i', 'seen', 'littl', 'holm', 'late']
['my', 'marriag', 'drift',