In [None]:
'''---------------------------------------------------------------------
--nltk = "natural language processing toolkit"
--sent_tokenize : split the data into sentences
--word_tokenize : split the data into words
--stopworkds.words('english') : to remove/ignore the stopwords (e.g. a, an , the, of ...)
download : nltk.download() - for using NLTK packages
----------------------------------------------------------------------------------------------------
concept of 'defaultdict' : Rather than throwing a KeyError , it will create a default item and add that key-value pair to the dictionary.
defaultdict inherits from ("is-a" ) dictionary.'''

In [107]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

from collections import defaultdict
from string import punctuation
# get n largest elements from the list
from heapq import nlargest



In [175]:
# our class: TextSummarizer
class TextSummarizer:
    def __init__(self, min_cut=0.1, max_cut=0.9):
        #default member initialization
        self._min_cut=min_cut
        self._max_cut=max_cut
        #stopwords + punctuations
        self._stopwords=set(stopwords.words('english')+ list(punctuation))
        #print(len(self._stopwords))
    
    def _Calculate_frequencies(self, word_sent):
        #keys : words, Key_value: frequency of the word
        #if key is not present in the wordlist the assign default value
        #defalultdict is a class inherits from dictionary
        #stopwords1=set(stopwords.words('english')+ list(punctuation))
        freq = defaultdict(int)
        #freq1=freq
        for sentence in word_sent:
            for word in sentence:
                if word not in self._stopwords:
                    freq[word]=freq[word]+1
        #freq1=freq
        #frequency normalization of each word 
        #by dividing with hightest frequency after it frequency will be between(0-1)
        max_freq=float(max(freq.values())) # to get highest frequency
        # copy key values into list
        res=[]
        for key in freq.keys(): 
            res.append(key) 
        print(res)
        # copy the freq dictionary into freq1 to avoid RuntimeError dictinary size change
        freq1=freq.copy()
        for word in freq1.keys():
            freq[word]=freq[word]/max_freq
            if freq[word]>=self._max_cut or freq[word]<=self._min_cut:
                del freq[word]  
                # will delete key value pair with respect to low and hight freq.
        return freq
    
    def summarize(self, text, n):
        sents=sent_tokenize(text)  # converts text into sentences
        assert n<= len(sents)    # to check is summary less than the original text or not 
        word_sent=[word_tokenize(s.lower()) for s in sents]  # converts sentences into lowercase and then into list of words
        self._freq=self._Calculate_frequencies(word_sent)  # returns dictionary with its frequency
        rankings=defaultdict(int)  # used to create empty default dictionary  to hold sentence rankings
        for i, sent in enumerate(word_sent): 
            #enumerate converts list into tuples with its index value.
            for word in sent:
                if word in self._freq:
                    rankings[i]+= self._freq[word]
        sents_index = nlargest(n, rankings, key=rankings.get)
        
        return [sents[j] for j in sents_index]
                
        
        
        

In [176]:
# ger URL  to summarize the data 
#import urllib
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [1]:
def get_text_from_Article_url(url):
    page= urllib.request.urlopen(url).read().decode('utf8')
    soup = BeautifulSoup(page)
    #join all data which is in tag: article 
    text= " ".join(map(lambda p: p.text, soup.find_all('article')))
    
    soup2=BeautifulSoup(text)
    #join all data which is in tag: p - paragraph
    text= " ".join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

In [178]:
# here one can type the url which they want to summarise
someUrl = "https://www.washingtonpost.com/politics/in-face-of-criticism-trump-surges-to-his-biggest-lead-over-the-gop-field/2015/12/14/b9555e30-a29c-11e5-9c4e-be37f66848bb_story.html"
textofURL=get_text_from_Article_url(someUrl)
#textofURL



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [None]:
#someUrl = "https://www.washingtonpost.com/politics/in-face-of-criticism-trump-surges-to-his-biggest-lead-over-the-gop-field/2015/12/14/b9555e30-a29c-11e5-9c4e-be37f66848bb_story.html"
#textofURL=get_text_from_Article_url(someUrl)
#textofURL

In [179]:
# create object of the class and call the function summarize by passing Text and the maximum no. of statements want to see as summary
fs=TextSummarizer()
summary = fs.summarize(textofURL[1],3)

['following', 'proposal', 'temporarily', 'bar', 'muslims', 'entering', 'country', 'donald', 'trump', 'increased', 'lead', 'republican', 'primary', 'largest', 'margin', 'yet', 'according', 'new', 'washington', 'post-abc', 'news', 'poll', 'results', 'latest', 'sign', '’', 'outspoken', 'comments', 'immigration', 'terrorism', 'continue', 'find', 'audience', 'among', 'rank-and-file', 'republicans', 'spite', 'sharp', 'condemnation', 'democrats', 'gop', 'leaders', 'rivals', 'chorus', 'world', 'survey', 'puts', 'support', '38', 'percent', 'registered', 'gop-leaning', 'independents', 'six', 'points', 'higher', 'october', 'november', 'sen.', 'ted', 'cruz', 'texas', 'also', 'running', 'anti-establishment', 'campaign', 'surged', 'second', 'place', '15', 'effectively', 'doubling', 'since', 'last', 'month', 'marco', 'rubio', 'florida', 'ben', 'carson', 'tied', 'third', '12', 'top', 'field', 'earlier', 'fall', 'saw', 'cut', 'roughly', 'half', 'past', 'candidate', 'registers', 'double', 'digits', 'for

In [180]:
# The final result :- Summary of the Article
summary

['Trump leads among every demographic and ideological group of Republicans in the survey, but he has significantly greater support among those with less education and lower incomes and among men.',
 '[Explore the complete results of the poll]      Republicans trust Trump on terrorism View Graphic     Republicans trust Trump on terrorism  At this point in the campaign, even with the first votes fewer than 50 days away, national polls are not always the reliable predictors of where presidential nominating contests are heading.',
 'The results are the latest sign that Trump’s outspoken comments on immigration and terrorism continue to find an audience among rank-and-file Republicans in spite of sharp condemnation from Democrats, GOP leaders, some of Trump’s rivals and a chorus of world leaders.']

In [138]:
summary

['In a hypothetical \xadgeneral-election race, Trump trails Clinton, the Democratic front-runner, by 53 percent to 40 percent among all adults and 50 percent to 44 percent among registered voters.',
 '[Explore the complete results of the poll]      Republicans trust Trump on terrorism View Graphic     Republicans trust Trump on terrorism  At this point in the campaign, even with the first votes fewer than 50 days away, national polls are not always the reliable predictors of where presidential nominating contests are heading.',
 'The results are the latest sign that Trump’s outspoken comments on immigration and terrorism continue to find an audience among rank-and-file Republicans in spite of sharp condemnation from Democrats, GOP leaders, some of Trump’s rivals and a chorus of world leaders.']

In [155]:
someUrl

'https://www.washingtonpost.com/politics/in-face-of-criticism-trump-surges-to-his-biggest-lead-over-the-gop-field/2015/12/14/b9555e30-a29c-11e5-9c4e-be37f66848bb_story.html'