# Extractive
---



In [None]:
# from: https://blog.floydhub.com/gentle-introduction-to-text-summarization-in-machine-learning/

# This doesn't use a model, just math 

# Extractive summarization chooses the key words from the input and copies
# them to the output. This tends to work, but it won't always produce
# correctly structured sentences, because it only selects from the input
# without actually understanding it. Think of it as a highlighter.

# Creating a dictionary for the word frequency table
# Tokenizing the sentences
# Algorithm for scoring a sentence by its words
# Getting the threshold
# Producing the summary

#importing libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request  
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# STEP 1: Get the Data

#fetching the content from the URL
# Download the article HTML. The context manager closes the HTTP response as
# soon as the body has been read, instead of leaving the connection open.
with urllib.request.urlopen('https://www.cbc.ca/news/canada/coronavirus-covid19-canada-world-nov10-1.5796310') as fetched_data:
    article_read = fetched_data.read()

#parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read, 'html.parser')

#returning <p> tags
paragraphs = article_parsed.find_all('p')

# Join the paragraph texts with a space. Concatenating them with no separator
# fuses the last sentence of one paragraph with the first of the next
# ("...at any time.Audience Relations..."), which hides the sentence boundary
# from the sentence tokenizer used later.
article_content = ' '.join(p.text for p in paragraphs)

# STEP 2: Cleaning the data

def _create_dictionary_table(text_string) -> dict:
    """Build a frequency table of the stemmed content words in a text.

    Args:
        text_string: The raw article text.

    Returns:
        A dict mapping each Porter stem to the number of times it occurs in
        ``text_string``. English stop words and pure-punctuation tokens are
        left out.
    """
    stop_words = set(stopwords.words("english"))

    #reducing words to their root form
    stemmer = PorterStemmer()

    #creating dictionary for the word frequency table
    frequency_table = dict()
    for token in word_tokenize(text_string):
        # Punctuation tokens such as "." or "," are not words; counting them
        # would add noise to every sentence score.
        if not any(ch.isalnum() for ch in token):
            continue

        # Check the surface form BEFORE stemming: stemming rewrites some stop
        # words ("this" -> "thi", "was" -> "wa"), which would otherwise slip
        # past the stop-word filter.
        if token.lower() in stop_words:
            continue

        stemmed = stemmer.stem(token)
        # Keep the original post-stemming check too, for words whose stem
        # happens to be a stop word.
        if stemmed in stop_words:
            continue

        frequency_table[stemmed] = frequency_table.get(stemmed, 0) + 1

    return frequency_table

# STEP 3: Weighting each sentence

def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

    #algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words

    return sentence_weight

# STEP 4: Find the average sentence score

def _calculate_average_score(sentence_weight) -> int:
   
    #calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    #getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score

# STEP 5: Run it

def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary

def _run_article_summary(article, threshold_multiplier=1.5):
    """Produce an extractive summary of ``article``.

    Args:
        article: The full article text.
        threshold_multiplier: Factor applied to the average sentence score to
            obtain the cut-off; only sentences scoring at least
            ``threshold_multiplier * average`` are kept. Defaults to 1.5.

    Returns:
        The summary string built from the selected sentences.
    """
    #creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    #tokenizing the sentences
    sentences = sent_tokenize(article)

    #algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    #the average score is the baseline the threshold is derived from
    average_score = _calculate_average_score(sentence_scores)

    #producing the summary
    return _get_article_summary(
        sentences, sentence_scores, threshold_multiplier * average_score
    )

# Script entry point: summarize the article fetched above and print the result.
if __name__ == "__main__":
    summary_results = _run_article_summary(article_content)
    print(summary_results)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
 "If it needs to be done, we'll do it. Provinces and territories listed 221,279 cases as recovered or resolved. We reserve the right to close comments at any time.Audience Relations, CBC P.O.


# BART - Facebook Model. (Sequence to Sequence)
https://colab.research.google.com/github/sshleifer/blog_v2/blob/master/_notebooks/2020-03-12-bart.ipynb#scrollTo=dBwfB2wvY3dR

## Setup

In [None]:
# Bart
!pip install torch
!pip install transformers
import torch
try:
    import transformers
    from transformers import BartTokenizer, BartForConditionalGeneration
except ImportError as err:
    # The handler previously referenced an undefined INSTALL_MSG, turning a
    # missing package into a NameError. Raise a self-contained message and
    # chain the original error instead.
    raise ImportError(
        "The 'transformers' package is required; "
        "install it with `pip install transformers`."
    ) from err
from IPython.display import display, Markdown

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Newspaper
!pip install newspaper3k
from newspaper import Article



## Get Article

In [None]:
# Address of the news story to summarize.
url = 'https://www.cbc.ca/news/canada/coronavirus-covid19-canada-world-nov10-1.5796310'

# Wrap the URL in a newspaper Article object.
article = Article(url)
# Fetch the page, then parse it.
# NOTE(review): presumably parse() is what populates article.text — confirm against the newspaper docs.
article.download()
article.parse()

# Bare expression at the end of the cell so the notebook displays the extracted text.
article.text

'The latest:\n\nJurisdictions across Canada implemented new restrictions while others committed to existing measures to counter a spike in COVID-19 cases on Tuesday.\n\nToronto is moving into the "red" level of Ontario\'s colour-coded coronavirus shutdown system and adding stricter measures on top of that as the country\'s largest city reported 520 new cases on Tuesday, setting a record for new infections for the second day in a row.\n\n"We need more measures now because we\'re seeing spread and risk like we\'ve never seen before," the city\'s Medical Officer of Health Dr. Eileen de Villa told reporters on Tuesday.\n\nToronto will continue to enforce takeout only dining options; require that meeting and event spaces — including bingo halls, casinos and other establishments — remain closed; and prohibit indoor group fitness classes.\n\nWATCH | Toronto\'s top doctor announces stricter pandemic measures:\n\nToronto\'s top doctor announces stricter pandemic measures 2:16 Toronto\'s Medical

## Run Model

In [None]:
#collapse-show
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# Move the model to the same device as the input ids below. Leaving it on the
# CPU while the inputs go to CUDA raises a device-mismatch error on GPU hosts.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(torch_device)

# Encode the article as a batch of one; inputs longer than 1024 tokens are truncated.
article_input_ids = tokenizer.batch_encode_plus([article.text], return_tensors='pt', max_length=1024, truncation=True)['input_ids'].to(torch_device)
# Generate the summary token ids with the decoding settings below.
summary_ids = model.generate(article_input_ids,
                             num_beams=4,
                             length_penalty=2.0,
                             max_length=142,
                             min_length=56,
                             no_repeat_ngram_size=3)

# Drop the batch dimension and decode the ids back to text.
summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
# The closing ** must directly follow the text; with a space before it
# ("**Summary: **") Markdown does not render the label as bold.
display(Markdown('> **Summary:** ' + summary_txt))

> **Summary: **Jurisdictions across Canada implemented new restrictions to counter a spike in COVID-19 cases on Tuesday. Toronto is moving into Ontario's colour-coded coronavirus shutdown system and adding stricter measures on top of that. British Columbia extended its provincial state of emergency for another two weeks to ensure B.C. health and emergency officials have the powers they need.