## Text Summary with tf-ifd

### This exercise serves as an example to one of the usecases of tf-idf, which is summarizing a text document

In [27]:
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

#### import our text and tokenize the sentences
#### Note that in this exercise, we are tokenizing by sentence, and not words. This is to ensure a meaningful output.

In [28]:
text = open('news.txt','r').read()
sentences = sent_tokenize(text)
total_documents = len(sentences)
print(total_documents)
print(sentences)

11
['British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.', 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.', 'Ryanair will cut up to 25% of flights in and out of Italy from 17 March to 8 April.', 'Tourists and business people are cutting back on foreign travel.', 'There could be a "very significant expansion" of the number of cases of coronavirus in the UK, Prime Minister Boris Johnson has warned.', 'Ryanair boss Michael O\'Leary said: "Our focus at this time is on minimising any risk to our people and our passengers.', '"While we are heavily booked over the next two weeks, there has been a notable drop in forward bookings towards the end of March, into early April.', '"It makes sense to selectively prune our schedule to and from those airports where travel has been most affected by the Covid-

### Create the Frequency matrix of the words in each sentence

In [50]:
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        # counts frequencies of words in every sentence
        freq_table = {}
        words = word_tokenize(sent)

        for word in words:
            #stopwords removal
            if word in stopWords or len(word) < 2 or not word.isalpha():
                continue
            # stemming and lowercasing
            word = ps.stem(word.lower())
            if word not in freq_table:
                freq_table[word] = 0
            freq_table[word] += 1
        
        #save frequencies as sent -> word_frequencies
        frequency_matrix[sent] = freq_table

    return frequency_matrix

In [51]:
t = [word for word in word_tokenize(sentences[0]) if word not in stopwords.words("english") and word.isalpha()]
t

['British',
 'Airways',
 'budget',
 'rival',
 'Ryanair',
 'cancelled',
 'hundreds',
 'flights',
 'demand',
 'travel',
 'drops',
 'amid',
 'fears',
 'spread',
 'coronavirus']

In [52]:
frequency_matrix = _create_frequency_matrix(sentences)
frequency_matrix

{'British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.': {'british': 1,
  'airway': 1,
  'budget': 1,
  'rival': 1,
  'ryanair': 1,
  'cancel': 1,
  'hundr': 1,
  'flight': 1,
  'demand': 1,
  'travel': 1,
  'drop': 1,
  'amid': 1,
  'fear': 1,
  'spread': 1,
  'coronaviru': 1},
 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.': {'ba': 1,
  'cancel': 1,
  'flight': 1,
  'march': 1,
  'london': 1,
  'destin': 1,
  'includ': 1,
  'new': 1,
  'york': 1,
  'itali': 1,
  'franc': 1,
  'austria': 1,
  'belgium': 1,
  'germani': 1,
  'ireland': 1},
 'Ryanair will cut up to 25% of flights in and out of Italy from 17 March to 8 April.': {'ryanair': 1,
  'cut': 1,
  'flight': 1,
  'itali': 1,
  'march': 1,
  'april': 1},
 'Tourists and business people are cutting back on foreign travel.': {'tourist': 1,


### Calculate TermFrequency for each word

In [54]:
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [55]:
tf_matrix = _create_tf_matrix(frequency_matrix)
tf_matrix

{'British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.': {'british': 0.06666666666666667,
  'airway': 0.06666666666666667,
  'budget': 0.06666666666666667,
  'rival': 0.06666666666666667,
  'ryanair': 0.06666666666666667,
  'cancel': 0.06666666666666667,
  'hundr': 0.06666666666666667,
  'flight': 0.06666666666666667,
  'demand': 0.06666666666666667,
  'travel': 0.06666666666666667,
  'drop': 0.06666666666666667,
  'amid': 0.06666666666666667,
  'fear': 0.06666666666666667,
  'spread': 0.06666666666666667,
  'coronaviru': 0.06666666666666667},
 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.': {'ba': 0.06666666666666667,
  'cancel': 0.06666666666666667,
  'flight': 0.06666666666666667,
  'march': 0.06666666666666667,
  'london': 0.06666666666666667,
  'destin': 0.06666666666666667,
  'includ': 

### Counting how many sentences (documents) contain a specific word

In [56]:
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

In [57]:
sentences = sent_tokenize(text)
documents_per_words = _create_documents_per_words(frequency_matrix)
documents_per_words

{'british': 2,
 'airway': 2,
 'budget': 1,
 'rival': 1,
 'ryanair': 3,
 'cancel': 3,
 'hundr': 1,
 'flight': 5,
 'demand': 1,
 'travel': 4,
 'drop': 2,
 'amid': 1,
 'fear': 1,
 'spread': 1,
 'coronaviru': 2,
 'ba': 2,
 'march': 4,
 'london': 1,
 'destin': 1,
 'includ': 1,
 'new': 1,
 'york': 1,
 'itali': 2,
 'franc': 1,
 'austria': 1,
 'belgium': 1,
 'germani': 1,
 'ireland': 1,
 'cut': 2,
 'april': 2,
 'tourist': 1,
 'busi': 1,
 'peopl': 2,
 'back': 1,
 'foreign': 1,
 'there': 1,
 'could': 1,
 'signific': 1,
 'expans': 1,
 'number': 1,
 'case': 1,
 'uk': 1,
 'prime': 1,
 'minist': 1,
 'bori': 1,
 'johnson': 1,
 'warn': 1,
 'boss': 1,
 'michael': 1,
 'said': 3,
 'our': 1,
 'focu': 1,
 'time': 1,
 'minimis': 1,
 'risk': 1,
 'passeng': 2,
 'while': 1,
 'heavili': 1,
 'book': 1,
 'next': 1,
 'two': 1,
 'week': 1,
 'notabl': 1,
 'forward': 1,
 'toward': 1,
 'end': 2,
 'earli': 1,
 'it': 1,
 'make': 1,
 'sens': 1,
 'select': 1,
 'prune': 1,
 'schedul': 1,
 'airport': 1,
 'affect': 3,
 'outb

### Calculate IDF

In [58]:
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [59]:
idf_matrix = _create_idf_matrix(frequency_matrix, documents_per_words, total_documents)
idf_matrix

{'British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.': {'british': 0.7403626894942439,
  'airway': 0.7403626894942439,
  'budget': 1.0413926851582251,
  'rival': 1.0413926851582251,
  'ryanair': 0.5642714304385625,
  'cancel': 0.5642714304385625,
  'hundr': 1.0413926851582251,
  'flight': 0.3424226808222063,
  'demand': 1.0413926851582251,
  'travel': 0.43933269383026263,
  'drop': 0.7403626894942439,
  'amid': 1.0413926851582251,
  'fear': 1.0413926851582251,
  'spread': 1.0413926851582251,
  'coronaviru': 0.7403626894942439},
 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.': {'ba': 0.7403626894942439,
  'cancel': 0.5642714304385625,
  'flight': 0.3424226808222063,
  'march': 0.43933269383026263,
  'london': 1.0413926851582251,
  'destin': 1.0413926851582251,
  'includ': 1.0413926851582251,

### Calculate TF-IDF

In [60]:
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [61]:
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
tf_idf_matrix

{'British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.': {'british': 0.049357512632949595,
  'airway': 0.049357512632949595,
  'budget': 0.06942617901054834,
  'rival': 0.06942617901054834,
  'ryanair': 0.03761809536257084,
  'cancel': 0.03761809536257084,
  'hundr': 0.06942617901054834,
  'flight': 0.022828178721480418,
  'demand': 0.06942617901054834,
  'travel': 0.02928884625535084,
  'drop': 0.049357512632949595,
  'amid': 0.06942617901054834,
  'fear': 0.06942617901054834,
  'spread': 0.06942617901054834,
  'coronaviru': 0.049357512632949595},
 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.': {'ba': 0.049357512632949595,
  'cancel': 0.03761809536257084,
  'flight': 0.022828178721480418,
  'march': 0.02928884625535084,
  'london': 0.06942617901054834,
  'destin': 0.06942617901054834,
  'in

In [62]:
vectorizer = TfidfVectorizer(stop_words="english")
transformed = vectorizer.fit_transform(sentences)
tfidf_transformer=TfidfTransformer(use_idf=True)
tfidf_transformer.fit(transformed)

TfidfTransformer()

In [63]:
tfidf_transformer.idf_

array([2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.38629436, 2.79175947,
       2.38629436, 2.79175947, 2.38629436, 2.79175947, 2.38629436,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.38629436, 2.79175947, 2.79175947, 2.38629436, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.38629436, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 1.69314718, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.38629436, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 1.87546874, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 2.79175947,
       2.79175947, 2.79175947, 2.38629436, 2.38629436, 2.79175

### Score the sentences

In [64]:
"""
score a sentence by its word's TF
Basic algorithm: adding the TF frequency of every non-stop word
in a sentence divided by total no of words in a sentence.
"""
###

s_sum={}

for bgn,wf in tf_idf_matrix.items():
    s_sum[bgn] = sum(wf.values())/len(wf)

### Find the threshold 

In [65]:
"""
Find the average score from the sentence value dictionary
"""
###
suum = sum(s_sum.values())
average_score = suum/len(s_sum)
print("averagge score :",average_score)

averagge score : 0.08035694323999669


#### Select a sentence for a summarization if the sentence score is more than the average score

In [66]:
s_sum

{'British Airways and budget rival Ryanair have cancelled hundreds of flights as demand for travel drops amid fears about the spread of coronavirus.': 0.05405110128717399,
 'BA is cancelling 216 flights from 16-28 March from London to destinations including New York, Italy, France, Austria, Belgium, Germany and Ireland.': 0.058847462380719,
 'Ryanair will cut up to 25% of flights in and out of Italy from 17 March to 8 April.': 0.09908652426593785,
 'Tourists and business people are cutting back on foreign travel.': 0.12419650639697245,
 'There could be a "very significant expansion" of the number of cases of coronavirus in the UK, Prime Minister Boris Johnson has warned.': 0.07832588704966238,
 'Ryanair boss Michael O\'Leary said: "Our focus at this time is on minimising any risk to our people and our passengers.': 0.08181005814853874,
 '"While we are heavily booked over the next two weeks, there has been a notable drop in forward bookings towards the end of March, into early April.': 

In [67]:
###
summary=" "
for sentence in sentences:
    if(s_sum[sentence]>=average_score):
        summary +=" "+sentence

summary

'  Ryanair will cut up to 25% of flights in and out of Italy from 17 March to 8 April. Tourists and business people are cutting back on foreign travel. Ryanair boss Michael O\'Leary said: "Our focus at this time is on minimising any risk to our people and our passengers. "It makes sense to selectively prune our schedule to and from those airports where travel has been most affected by the Covid-19 outbreak." The firm declined to say how many flights or passengers would be affected. However, it said the move would not affect its results for the financial year which ends on 31 March.'