In [93]:
import pickle
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import numpy
nltk.download('punkt')
import difflib


def cluster_score(cluster):
    """Score one cluster of important-word positions inside a sentence.

    ``cluster`` is a non-empty sequence of token indexes; the first and last
    entries are taken as the edges of the cluster (callers are expected to
    pass it sorted ascending). The score is the number of important words
    squared, divided by the number of tokens the cluster spans, so tightly
    packed important words score higher than widely scattered ones.
    """
    first, last = cluster[0], cluster[-1]
    # Inclusive width of the span, counting the filler tokens in between.
    span = last - first + 1
    hits = len(cluster)
    return hits ** 2 / span
def score_sentences(sentences, important_words, THRESH=5):
    """Score every sentence by its densest cluster of important words.

    Args:
        sentences: iterable of sentence strings; each one is word-tokenized.
        important_words: words whose presence makes a sentence significant.
            Matching is exact (case-sensitive) against the sentence tokens,
            and only the first occurrence of each word is considered.
        THRESH: maximum token distance (exclusive) between two consecutive
            important words for them to belong to the same cluster.

    Returns:
        A list with one score per sentence, in input order. A sentence with
        no important words scores 0; otherwise the score is the best
        ``cluster_score`` among its clusters.
    """
    scores = []
    for tokens in map(word_tokenize, sentences):
        # First position of each important word found in this sentence.
        # Using a set keeps a word that is listed twice in important_words
        # from being counted as two separate hits.
        positions = sorted({tokens.index(word)
                            for word in important_words
                            if word in tokens})
        if not positions:
            # score = 0 if sentence has no important words
            scores.append(0)
            continue

        clusters = []
        current_cluster = [positions[0]]
        for idx in positions[1:]:
            # Measure the gap from the previous word of the cluster being
            # built. Comparing against the last position in the sentence
            # (as this code used to) is never positive, so THRESH could
            # never start a new cluster.
            if idx - current_cluster[-1] < THRESH:
                current_cluster.append(idx)
            else:
                clusters.append(current_cluster)
                current_cluster = [idx]
        clusters.append(current_cluster)
        scores.append(max(map(cluster_score, clusters)))
    return scores
def summarize(sentences, important_words, CTHRESH=5, NUM_SENTENCES=5):
    """Build a summary from the highest scoring sentences.

    Args:
        sentences: list of sentence strings making up the article.
        important_words: words used to score the sentences.
        CTHRESH: cluster distance threshold forwarded to ``score_sentences``
            (it is not a minimum sentence score).
        NUM_SENTENCES: maximum number of sentences in the summary.

    Returns:
        The selected sentences joined by single spaces, in the order they
        appear in the article. Returns '' when ``sentences`` is empty.
    """
    scores = score_sentences(sentences, important_words, CTHRESH)

    # Rank by score, highest first. sorted() is stable, so sentences with
    # equal scores keep their article order and earlier ones win ties.
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Put the winners back into article order so the summary reads naturally.
    top_indexes = sorted(index for index, _ in ranked[:NUM_SENTENCES])

    return ' '.join(sentences[i] for i in top_indexes)

def sentence_cleaner(sentences):
    """Flatten per-paragraph sentence lists and strip '-', '_' and '/'.

    Args:
        sentences: list of lists of sentence strings (one inner list per
            paragraph).

    Returns:
        A single flat list of sentences in the original order. Every '-',
        '_' and '/' character is deleted (not replaced by a space), so a
        hyphenated term such as "22-month" becomes "22month".
    """
    # Raw string: the previous '[-_\/]' literal relied on an invalid escape
    # sequence, which newer Python versions warn about. The pattern matches
    # exactly the same three characters.
    return [re.sub(r'[-_/]', '', sent)
            for paragraph in sentences
            for sent in paragraph]

def word_cleaner(sentences):
    """Return every word of the article as one flat list of tokens.

    ``sentences`` is a list of lists of sentence strings. In each sentence
    every character that is not an ASCII letter (a-z, A-Z) is replaced by a
    space, then the result is word-tokenized and the tokens are collected
    in article order.
    """
    article_words = []
    for paragraph in sentences:
        for sent in paragraph:
            letters_only = re.sub('[^a-zA-Z]', ' ', sent)
            article_words.extend(word_tokenize(letters_only))
    return article_words

def most_common_words(words, num_sents):
    """Return the most frequent words that are not stop words.

    Args:
        words: list of word tokens.
        num_sents: maximum number of words to return (the callers pass the
            same value they use for the summary length).

    Returns:
        Up to ``num_sents`` words ordered from most to least frequent.
        Counting is case-sensitive; the stop-word check is done on the
        lower-cased word. English stop words and bare punctuation tokens
        are skipped.
    """
    stop_words = set(stopwords.words("english"))
    # Punctuation tokens to ignore. The double quote is escaped explicitly:
    # the earlier literal ended in `""`, which is just an empty string
    # concatenated on, so '"' was never actually added.
    stop_words.update("_-,.'?:;\"")

    freq_dist = nltk.FreqDist(words)
    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(freq_dist.items(), key=lambda item: item[1], reverse=True)
    common_words = [word for word, _ in ranked
                    if word.lower() not in stop_words]
    return common_words[:num_sents]
def article_sentences(request):
    """Extract the article's paragraph text as lists of sentences.

    Args:
        request: the response object returned by ``requests.get`` (only its
            ``.text`` attribute, the page HTML, is used).

    Returns:
        A list with one entry per selected ``<p>`` tag; each entry is the
        list of sentences produced by ``sent_tokenize`` for that paragraph.
    """
    soup = BeautifulSoup(request.text, 'html.parser')
    # find_all is the current name of the deprecated findAll alias.
    # string=True restricts the match to <p> tags that have a .string
    # value, so paragraphs made of several child nodes are presumably
    # skipped -- NOTE(review): confirm this filtering is intended.
    paragraphs = [para.text for para in soup.find_all('p', string=True)]
    return [sent_tokenize(paragraph) for paragraph in paragraphs]

def article_title(request):
    """Return the significant words of the page's ``<title>``.

    Args:
        request: the response object returned by ``requests.get`` (only its
            ``.text`` attribute, the page HTML, is used).

    Returns:
        A flat list of the title's words with English stop words removed
        (checked case-insensitively; the words keep their original casing).
        Returns an empty list when the page has no ``<title>`` tag.
    """
    soup = BeautifulSoup(request.text, 'html.parser')
    title_tag = soup.find('title')
    if title_tag is None:
        return []

    # Build the stop-word set once instead of re-reading the corpus list
    # for every word that is tested.
    stop_words = set(stopwords.words('english'))

    title_words = []
    for sent in sent_tokenize(title_tag.text):
        # Keep letters only, then split into words.
        letters_only = re.sub('[^a-zA-Z]', ' ', sent)
        # extend (not append) keeps the result a flat list of strings.
        # Filtering a list of token *lists* against the stop words never
        # removed anything, and nested lists can never equal a sentence
        # token when the result is used as important words.
        title_words.extend(word_tokenize(letters_only))

    return [word for word in title_words if word.lower() not in stop_words]

#paste any article URL into one of these requests to fetch that page; the rest of the
#script extracts its text and automatically generates summaries.
#A timeout is passed so an unresponsive server raises an exception instead of hanging
#the notebook forever (requests applies no timeout by default).
#article One info from web
one = requests.get('https://www.msn.com/en-us/news/world/putin-russia-knows-mueller-probe-gave-birth-to-a-mouse/ar-BBVLtjG?ocid=spartanntp', timeout=30)
#article Two info from web
two = requests.get('https://www.msn.com/en-us/money/companies/worlds-2-biggest-beer-brewers-in-battle-over-keg-technology/ar-BBVZLRj?ocid=spartanntp', timeout=30)

print('Question: Is a title of an article a good representation of the entire article or are titles just there to grab attention?')
print('       ')

#Every intermediate result below is written to a .pkl file in the current working
#directory and immediately read back; the rest of the script works with the
#reloaded (pickled_*) copies.

#calls article_sentences to get the sentences of article one (a list of sentence lists, one per paragraph)
oneSentences = article_sentences(one)

#pickle the sentences of article one, then reload them
with open('sentencesOne.pkl', 'wb') as f:
    pickle.dump(oneSentences, f)
with open('sentencesOne.pkl', 'rb') as f:
    pickled_sentsOne = pickle.load(f)

#calls article_sentences to get the sentences of article two
twoSentences = article_sentences(two)

#pickle the sentences of article two, then reload them
with open('sentencesTwo.pkl', 'wb') as f:
    pickle.dump(twoSentences, f)
with open('sentencesTwo.pkl', 'rb') as f:
    pickled_sentsTwo = pickle.load(f)

#cleans the sentences of article one: sentence_cleaner flattens them into a single list
#and strips the '-', '_' and '/' characters
oneCleanSents = sentence_cleaner(pickled_sentsOne)

#pickle the cleaned sentences of article one, then reload them
with open('cleanSentsOne.pkl', 'wb') as f:
    pickle.dump(oneCleanSents, f)
with open('cleanSentsOne.pkl', 'rb') as f:
    pickled_cleanSentsOne = pickle.load(f)

#prints the full cleaned text of article one as a Python list
print('Article One    ')    
print(pickled_cleanSentsOne)
print('   ')

#gets article one's raw title text to print (the page is parsed again here;
#article_title below returns the title as a word list instead)
soup1 = BeautifulSoup(one.text, 'html.parser')
souptitle1 = soup1.find('title')
oneTitler = souptitle1.text



print('Article One Title: ' + oneTitler)
#calls sentence_cleaner to clean article two's sentences of special characters
twoCleanSents = sentence_cleaner(pickled_sentsTwo)

#pickle the cleaned sentences of article two, then reload them
with open('cleanSentsTwo.pkl', 'wb') as f:
    pickle.dump(twoCleanSents, f)
with open('cleanSentsTwo.pkl', 'rb') as f:
    pickled_cleanSentsTwo = pickle.load(f)

#calls word_cleaner to turn article one's sentences into a flat list of letters-only words
#(note: it is given the raw sentences, not the cleaned ones)
oneClean_words = word_cleaner(pickled_sentsOne)

#pickle the clean words of article one, then reload them
with open('cleanWordsOne.pkl', 'wb') as f:
    pickle.dump(oneClean_words, f)
with open('cleanWordsOne.pkl', 'rb') as f:
    pickled_cleanWordsOne = pickle.load(f)

#calls word_cleaner to turn article two's sentences into a flat list of letters-only words
twoClean_words = word_cleaner(pickled_sentsTwo)

#pickle the clean words of article two, then reload them
with open('cleanWordsTwo.pkl', 'wb') as f:
    pickle.dump(twoClean_words, f)
with open('cleanWordsTwo.pkl', 'rb') as f:
    pickled_cleanWordsTwo = pickle.load(f)

#passed to most_common_words as the number of common words to keep
#(the summary length itself is the literal 5 given to summarize below)
num_sents = 5

#finds the most commonly used words in article one
one_common_words = most_common_words(pickled_cleanWordsOne, num_sents)

#builds a summary of article one from its most commonly used words
#(arguments: cluster threshold 5, at most 5 sentences)
one_common_word_summary = summarize(pickled_cleanSentsOne, one_common_words, 5, 5)


print('    ')

#title of article one as returned by article_title
one_titlePhrase = article_title(one)

#pickle article one's title, then reload it
with open('title_One.pkl', 'wb') as f:
    pickle.dump(one_titlePhrase, f)
with open('title_One.pkl', 'rb') as f:
    pickled_One_title = pickle.load(f)


#summary of article one using the title as the important words
one_title_summary = summarize(pickled_cleanSentsOne, pickled_One_title, 5, 5)
print('Summary of article One using the title to generate summary:')
print(one_title_summary)

print('   ')
print('5 most commonly used words in article One:')
print(one_common_words)
print('    ')
print('Summary of article One using most common words to generate summary:')
print(one_common_word_summary)
print('   ')

#most commonly used words in article two
two_common_words = most_common_words(pickled_cleanWordsTwo, num_sents)

#summary of article two using its most common words (printed in the next cell)
two_common_word_summary = summarize(pickled_cleanSentsTwo, two_common_words, 5, 5)



#title of article two as returned by article_title
two_Title = article_title(two)

#pickle article two's title, then reload it
with open('title_Two.pkl', 'wb') as f:
    pickle.dump(two_Title, f)
with open('title_Two.pkl', 'rb') as f:
    pickled_Two_title = pickle.load(f)




#summary of article two using the title as the important words (printed in the next cell)
two_title_summary = summarize(pickled_cleanSentsTwo, pickled_Two_title, 5, 5)

#uses difflib.SequenceMatcher to measure how similar article one's two summaries are.
#The summaries are plain strings, so the comparison is character by character;
#ratio() is scaled by 100 here to print it as a percentage.
seq = difflib.SequenceMatcher(None, one_common_word_summary, one_title_summary)
similiarOne = seq.ratio()*100
print('     ')
print('These two summaries when tested for similiar sentences had a similarity rating of ')
print(similiarOne)
print(' ')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jzech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jzech\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jzech\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Question: Is a title of an article a good representation of the entire article or are titles just there to grab attention?
       
Article One    
['ST. PETERSBURG, Russia  — Russian President Vladimir Putin on Tuesday mocked U.S. special counsel Robert Mueller\'s investigation of Kremlin interference in the 2016 presidential election, saying "a mountain gave birth to a mouse."', "In his first comments since Mueller finished his probe, Putin sought to cast the 22month investigation as a failure and disregarded the special counsel's exposure of a Russian operation to put Donald Trump in the White House.", '"It was clear for us from the start that it would end like this," the Russian leader said as the Trump administration and Congress sparred over making Mueller\'s stillconfidential investigation report public.', 'Attorney General William Barr wrote in a summary of Mueller\'s report that the special counsel found no evidence the Trump campaign "conspired or coordinated" with the Russian

In [94]:
#report for article two; relies on the variables computed in the previous cell
#(two, pickled_cleanSentsTwo, two_title_summary, two_common_words, two_common_word_summary)
#gets article two's raw title text to print
soup2 = BeautifulSoup(two.text, 'html.parser')
souptitle2 = soup2.find('title')
twoTitler = souptitle2.text

print('Article Two')
#prints the full cleaned text of article two as a Python list
print(pickled_cleanSentsTwo)
print('   ')
#prints title of article two
print('Article Two Title: ' + twoTitler)
print('  ')
print('Summary of article Two using the title to generate summary: ')
#prints the title-based summary of article two
print(two_title_summary)
print('   ')
print('5 most commonly used words in article Two: ')
print(two_common_words)
print(' ')
print('Summary of article Two using most common words to generate summary: ')
#prints the summary of article two built from its most common words
print(two_common_word_summary)
print('  ')
print('These two summaries when tested for similiar sentences had a similarity rating of ')

#character-level similarity of article two's two summaries, scaled by 100 to print as a percentage
seq2 = difflib.SequenceMatcher(None, two_common_word_summary, two_title_summary)
similiarTwo = seq2.ratio()*100
print(similiarTwo)
print(' ')

Article Two
['As American consumers seek quality over quantity in their beer, the world’s two biggest brewers are locked in a legal battle over the quest to serve a freshtapped taste in every mug.', 'HEINY', 'ITC', 'AnheuserBusch InBev SA and Heineken NV are fighting over who invented key parts of a new way to deliver draft beer that allows for smaller batches and eliminates the need for traditional steel kegs, which use compressed air and can degrade flavor over time.', '“The bars used to put in big kegs of Bud Light and Coors Light and they would sell so much of it that freshness was not an issue,” Sanford C. Bernstein analyst Trevor Stirling said.', '“Each individual brand is selling less.', 'Now you need technology to protect the freshness longer even as you’re selling less.”', 'Analysts say the $120 billion beer industry is getting more fragmented.', 'Americans are drinking less beer, but they are willing to pay for better quality.', '“Consumer interest is in the higher end of the

In [95]:
#prints the author's written conclusion for the project; the text is a fixed string
#and is not derived from the similarity ratings computed in the earlier cells
print('Conclusion:')
print('To conclude, I think there is something to be looked at when comparing the title summary to the common word summary. My assumption was that the summary generated from using the common words, and how closely those words were spread out in each sentence, would be a overall better summarization of the article. Although I could not find a way to empirically test these results, there is evidence that these summaries are different by such a margin that one may be better than the other. Both ratings when tested for similiar sentences were really low. Both being below 40 percent. In the second article the summaries were almost completely different. All in all, I think this was a cool project because I can paste any URL of any article on the web and immediately get back a couple summaries of the article. It really comes down to your judgment when deciding which summary is a better summarization of the article.')

Conclusion:
To conclude, I think there is something to be looked at when comparing the title summary to the common word summary. My assumption was that the summary generated from using the common words, and how closely those words were spread out in each sentence, would be a overall better summarization of the article. Although I could not find a way to empirically test these results, there is evidence that these summaries are different by such a margin that one may be better than the other. Both ratings when tested for similiar sentences were really low. Both being below 40 percent. In the second article the summaries were almost completely different. All in all, I think this was a cool project because I can paste any URL of any article on the web and immediately get back a couple summaries of the article. It really comes down to your judgment when deciding which summary is a better summarization of the article.
