In [4]:
!pip install stanza



In [8]:
import stanza, nltk
from collections import Counter
from string import punctuation

In [6]:
# Fetch the default English model package for Stanza
stanza.download(lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 10.3MB/s]                    
2020-11-05 21:57:52 INFO: Downloading default packages for language: en (English)...
2020-11-05 21:57:53 INFO: File exists: /root/stanza_resources/en/default.zip.
2020-11-05 21:57:59 INFO: Finished downloading models and saved to /root/stanza_resources.


In [41]:
# Build the English pipeline with only the tokenizer and the POS tagger
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos')

2020-11-05 22:25:37 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |

2020-11-05 22:25:37 INFO: Use device: cpu
2020-11-05 22:25:37 INFO: Loading: tokenize
2020-11-05 22:25:37 INFO: Loading: pos
2020-11-05 22:25:38 INFO: Done loading processors!


In [42]:
# Get a sample book from Gutenberg
# !wget "http://www.gutenberg.org/files/11/11-0.txt"
# Read through a context manager so the file handle is closed right away.
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark if one is present.
with open("/content/11-0.txt", encoding="utf-8-sig") as book_file:
    alice_str = book_file.read()

In [43]:
# Run the pipeline (tokenize + pos, as configured when `nlp` was created)
# over the full book text and keep the annotated result in `doc`.
doc = nlp(alice_str)

#### From Tutorial

https://medium.com/better-programming/extractive-text-summarization-using-spacy-in-python-88ab96d1fd97


TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

TF: Term Frequency — Measures how frequently a term occurs in a document. Since documents differ in length, a term may appear many more times in a long document than in a short one. Thus, the term frequency is often divided by the document length (i.e., the total number of terms in the document) as a way of normalization.

IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

IDF: Inverse Document Frequency — Measures how important a term is. While computing the term frequency, all terms are considered equally important. However, it is known that certain terms may appear a lot of times but have little importance in the document. We usually term these words stopwords. For example: is, are, they, and so on.

In [None]:
def top_sentence(text, limit, stop_words=None):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    The text is lower-cased and run through the module-level Stanza pipeline
    ``nlp`` (which must include the ``tokenize`` and ``pos`` processors).
    Words tagged PROPN, ADJ, NOUN or VERB are treated as keywords; each
    keyword's count is divided by the count of the most frequent keyword, and
    a sentence's score is the sum of the weights of the keywords it contains.

    Args:
        text: The raw document to summarise.
        limit: Maximum number of sentences to return. Values <= 0 yield an
            empty summary.
        stop_words: Optional iterable of lower-case words that must never be
            counted as keywords. Stanza ships no stop-word list, so with the
            default (``None``) filtering relies on the POS tags alone.

    Returns:
        The selected sentences, best score first (ties keep document order),
        each passed through ``str.capitalize`` and joined with single spaces.
        An empty string is returned when no keyword is found.
    """
    pos_tag = {'PROPN', 'ADJ', 'NOUN', 'VERB'}
    excluded = frozenset(stop_words or ())

    # Stanza exposes document -> .sentences -> .words; every word carries its
    # surface form in .text and its universal POS tag in .upos.
    doc = nlp(text.lower())
    sentences = doc.sentences

    keyword = [
        word.text
        for sent in sentences
        for word in sent.words
        if word.upos in pos_tag
        and word.text not in excluded
        and word.text not in punctuation
    ]
    if not keyword:
        # Nothing to rank (empty text, or only filtered-out words).
        return ''

    # Normalise counts so the most frequent keyword has weight 1.0.
    freq_word = Counter(keyword)
    max_freq = max(freq_word.values())
    weight = {term: count / max_freq for term, count in freq_word.items()}

    # Only sentences containing at least one keyword are candidates.
    scored = []
    for sent in sentences:
        hits = [weight[word.text] for word in sent.words if word.text in weight]
        if hits:
            scored.append((sum(hits), sent.text))

    # list.sort is stable, so equally scored sentences stay in document order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    best = scored[:max(limit, 0)]

    return ' '.join(sentence.capitalize() for _, sentence in best)

def main():
  """Download the English models, build the pipeline and summarise the book.

  Rebinds the module-level ``nlp`` name, because ``top_sentence`` reads the
  pipeline from module scope rather than taking it as an argument.

  Returns:
      The 30-sentence summary produced by ``top_sentence`` for the book at
      /content/11-0.txt.
  """
  # top_sentence looks `nlp` up as a global; without this declaration the
  # pipeline created below would be a local it can never see.
  global nlp
  # TO DO: Need to add step to check for EN lang model
  # Download Stanza LM
  stanza.download('en')
  # Initialize Stanza Pipeline
  nlp = stanza.Pipeline('en', processors='tokenize,pos')
  # Close the file deterministically; "utf-8-sig" decodes UTF-8 and drops a
  # leading byte-order mark if one is present.
  with open("/content/11-0.txt", encoding="utf-8-sig") as book_file:
    alice_str = book_file.read()
  return top_sentence(alice_str, 30)