In [1]:
# Import everything the notebook needs up front (standard library first, then third-party)
import heapq
import re
import urllib.request

import bs4 as bs
import nltk

# Fetch the NLTK resources used further down (needs internet access on the first run):
#   - 'stopwords' backs nltk.corpus.stopwords
#   - 'punkt' backs nltk.sent_tokenize / nltk.word_tokenize
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt' -- confirm
# against the installed version and switch the resource name if a LookupError appears.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ananya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Download the raw HTML of the Wikipedia article. The context manager closes the
# HTTP response as soon as the body has been read.
ARTICLE_URL = 'https://en.wikipedia.org/wiki/India_Today'
with urllib.request.urlopen(ARTICLE_URL) as response:
    source = response.read()

In [3]:
# Parse the downloaded markup with the lxml parser to obtain a BeautifulSoup object
soup = bs.BeautifulSoup(markup=source, features='lxml')

In [4]:
# Collect the text of every <p> element and glue the pieces into one string
text = "".join(paragraph.text for paragraph in soup.find_all('p'))

In [5]:
# Preprocessing, step 1: replace bracketed citation markers such as [12] with a
# space, then collapse every run of whitespace into a single space
text = re.sub(r'\s+', ' ', re.sub(r'\[[0-9]*\]', ' ', text))

# Preprocessing, step 2: build a lowercase copy in which non-word characters and
# digits become spaces, and squeeze the resulting whitespace runs again
clean_text = text.lower()
for pattern in (r'\W', r'\d', r'\s+'):
    clean_text = re.sub(pattern, ' ', clean_text)
print(clean_text)

 india today is a fortnightly indian english language news magazine published by living media india limited it is the most widely circulated magazine in india with a readership of over of close to million in india today launched a new online opinion orientated site called the dailyo india today was established in by vidya vilas purie owner of thompson press with his daughter madhu trehan as its editor and his son aroon purie as its publisher at present india today is also published in hindi kannada tamil malayalam and telugu the india today news channel was launched on may in october aroon purie passed control of the india today group to his daughter kallie purie 


In [6]:
# Split the citation-free (but still cased and punctuated) text into sentences
sentences = nltk.tokenize.sent_tokenize(text)

In [13]:
# Load the English stop-word list from the NLTK stopwords corpus and show it
stopword_corpus = nltk.corpus.stopwords
stop_words = stopword_corpus.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
# Count how often each non-stop-word token occurs in the cleaned text
stop_word_set = set(stop_words)  # set lookup instead of scanning the list for every token
word2count = {}
for word in nltk.word_tokenize(clean_text):
    if word not in stop_word_set:
        word2count[word] = word2count.get(word, 0) + 1

In [9]:
# Convert raw counts to weights by dividing by the largest count, so the most
# frequent word gets weight 1.0.
# default=1 keeps this cell from raising ValueError when no words were counted.
max_count = max(word2count.values(), default=1)
for key in word2count:
    word2count[key] = word2count[key] / max_count

In [10]:
# Compute sentence scores: a sentence's score is the sum of the weights of its
# tokens that appear in word2count. Only sentences with fewer than
# MAX_SENTENCE_WORDS space-separated words are scored.
MAX_SENTENCE_WORDS = 25
sent2score = {}
for sentence in sentences:
    # The length test does not depend on the word, so do it once per sentence
    # and skip tokenizing sentences that can never be scored
    if len(sentence.split(' ')) >= MAX_SENTENCE_WORDS:
        continue
    for word in nltk.word_tokenize(sentence.lower()):
        if word in word2count:
            # An entry is created only once a weighted word is found, so sentences
            # without any weighted word stay out of sent2score
            sent2score[sentence] = sent2score.get(sentence, 0) + word2count[word]

In [21]:
# Inspect the intermediate results: sentence scores first, then word weights
print(sent2score, word2count, sep='\n')


{' India Today is a fortnightly Indian English-language news magazine published by Living Media India Limited.': 4.125, 'It is the most widely circulated magazine in India, with a readership of over of close to 8 million.': 1.875, 'In 2014, India Today launched a new online opinion-orientated site called the DailyO.': 2.625, 'At present, India Today is also published in Hindi, Kannada, Tamil, Malayalam and Telugu.': 2.875, 'The India Today news channel was launched on 22 May 2015.': 2.5, 'In October 2017, Aroon Purie passed control of the India Today Group to his daughter, Kallie Purie.': 3.875}
{'india': 1.0, 'today': 0.75, 'fortnightly': 0.125, 'indian': 0.125, 'english': 0.125, 'language': 0.125, 'news': 0.25, 'magazine': 0.25, 'published': 0.25, 'living': 0.125, 'media': 0.125, 'limited': 0.125, 'widely': 0.125, 'circulated': 0.125, 'readership': 0.125, 'close': 0.125, 'million': 0.125, 'launched': 0.25, 'new': 0.125, 'online': 0.125, 'opinion': 0.125, 'orientated': 0.125, 'site': 

In [11]:
# Pick the highest-scoring sentences for the summary (SUMMARY_SENTENCE_COUNT of them,
# or fewer if fewer sentences were scored), ordered from highest to lowest score
SUMMARY_SENTENCE_COUNT = 4
best_sentences = heapq.nlargest(SUMMARY_SENTENCE_COUNT, sent2score, key=sent2score.get)

In [22]:
# Print the summary sentences, one per line, framed by a dashed rule above and below
separator = '----------------------------------------------------------------------------------------------------------------'
print('\n'.join([separator, *best_sentences, separator]))

----------------------------------------------------------------------------------------------------------------
 India Today is a fortnightly Indian English-language news magazine published by Living Media India Limited.
In October 2017, Aroon Purie passed control of the India Today Group to his daughter, Kallie Purie.
At present, India Today is also published in Hindi, Kannada, Tamil, Malayalam and Telugu.
In 2014, India Today launched a new online opinion-orientated site called the DailyO.
----------------------------------------------------------------------------------------------------------------
