<a href="https://colab.research.google.com/github/GianFederico/MD-repo-Natural_Language_Processing/blob/main/NLP_lab3_text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
#utils for cleaning process 
import nltk
import re
import string
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('inaugural')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# split the input on anything other than a word character
def onlywords(text):
    cleaned_tokens = re.split(r'\W+', text)
    return cleaned_tokens

# split on whitespace and then remove punct
def wordsnopunct(text):
    tokens=text.split()
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    return stripped

def wordmatch(text):
    cleaned_tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", text)
    return cleaned_tokens

def NLTKTokenize(text):
    nltk_words = word_tokenize(text)
    return nltk_words

def NLTKregtokenize(text):
    pattern = r'''(?x)
    (?:[A-Z]\.)+       
   | \w+(?:-\w+)*       
   | \$?\d+(?:\.\d+)?%? 
   | \.\.\.             
   | [][.,;"'?():-_`]   
 '''
    tokens=nltk.regexp_tokenize(text, pattern)
    return tokens

def onlypunct(text):
    waste=re.findall(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+", text)
    return waste


#TFIDF SCORE functions
import math 

def create_df_table(dic,corpus_frequencies):
    # create a df table as a dictionary (term, document frequency) 
    df_table = {}
    for word in dic:
        df_table[word]=0
        for doc_id in corpus_frequencies.keys():
            doc_freq=corpus_frequencies[doc_id]
            if doc_freq[word]>0:
                df_table[word] += 1
    return df_table

def create_idf_table(df_table, total_documents):
    # create a df table as a dictionary (term, document frequency) 
    idf_table = {}
    for word in df_table.keys():
        idf_table[word]=math.log(total_documents / float(df_table[word]))
    return idf_table

def create_tfidf_table(f_distr, idf_table):
    # create a tf*idf table (term, tf*idf score) from a doc frequency distribution
    tf_idf_table = {}
    for word in f_distr:
        tf_idf_table[word]= float(f_distr.freq(word) * idf_table[word])
    return tf_idf_table

def compute_avg_tfidfscore(tf_idf_column):
    # computing document relevance as avg. of tf*idf scores
    doc_score=0
    for word in tf_idf_column.keys():
        doc_score+=tf_idf_column[word]
    return doc_score / len(tf_idf_column)

def compute_docvect_length(tf_idf_column):
    # computing length of document vector
    coordsum=0
    for word in tf_idf_column.keys():
        coordsum+=tf_idf_column[word]**2
    return math.sqrt(coordsum)

def compute_lenghtnorm_vectors(tf_idf_matrix):
    # produces a length-normalized tf*idf matrix
    norm_tf_idf_matrix = {}
    for doc_id in tf_idf_matrix.keys():
        column=tf_idf_matrix[doc_id]
        vec_length=compute_docvect_length(column)
        for word in column.keys():
            column[word]=column[word]/vec_length
        norm_tf_idf_matrix[doc_id]=column
    return norm_tf_idf_matrix

def compute_docweight_incorpus_table(scoring_table):
    # computing document relevance in a corpus from a scoring table (doc, score) 
    all_weights=0
    weight_table = {}
    for doc_id in scoring_table.keys():
        all_weights+=scoring_table[doc_id]
    for doc_id in scoring_table.keys():
        weight_table[doc_id]=(scoring_table[doc_id] / all_weights)
        # normalized scores
    return weight_table

def compute_cos_similarity(norm_vec1, norm_vec2):
    # computing cosine similarity between 2 normalized vectors
    common_words=[w for w in norm_vec1 if w in norm_vec2]
    sim_score=0
    for w in common_words:
        sim_score+=norm_vec1[w]*norm_vec2[w]
    return sim_score

def compute_centroid(voc, tf_idf_matrix):
    # produces the centroid of vectors in a tf*idf matrix
    centroid = {}
    length=len(tf_idf_matrix.keys())
    for word in voc:
        centroid[word]=0
    for doc_id in tf_idf_matrix.keys():
        column=tf_idf_matrix[doc_id]
        for word in column.keys():
            centroid[word]+=column[word]/length
    return centroid

def normalize_vector(vector):
    # produces a length-normalized vector
    norm_vect = {}
    vec_length=compute_docvect_length(vector)
    for word in vector.keys():
        norm_vect[word]=vector[word]/vec_length
    return norm_vect


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [12]:
#ATTEMPT 1

from nltk import sent_tokenize
from nltk.corpus import inaugural
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


# load text
#text=inaugural.raw("1817-Monroe.txt")
text=inaugural.raw("2013-Obama.txt")
#text=inaugural.raw("2017-Trump.txt")
sentences = sent_tokenize(text) # NLTK function
corpus={}
# our corpus is the set of sentences in the text
# one doc = one sentence

for sent in sentences:
        corpus[sent]=sent

# apply the pipeline on each doc to build a frequency table


total_documents = len(corpus)
stop_words = stopwords.words('english')
frequencies = {}
vocabulary=set()
vocabularysize=0
for doc_id in corpus.keys():
    tokens= wordmatch(corpus[doc_id])
    waste = onlypunct(corpus[doc_id])
    cleaned_tokens = [t for t in tokens if not t in waste]
    nostop_tokens = [t for t in cleaned_tokens if not t in stop_words]
    norm_tokens = [t.lower() for t in nostop_tokens]
    #porter = PorterStemmer()
    #stemmed_tokens = [porter.stem(t) for t in norm_tokens]
    wnl=nltk.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(t) for t in norm_tokens]
    docvocabulary=set(lemmas)
    vocabulary=vocabulary.union(docvocabulary)
    vocabularysize=vocabularysize+len(docvocabulary)
    docfdist = nltk.FreqDist(t for t in lemmas)      
    frequencies[doc_id]=docfdist
# building a df table (term, #sentences that contain term)
df_table=create_df_table(vocabulary, frequencies)
# building an idf table (term, idf score) based on a df_table
idf_table=create_idf_table(df_table, total_documents)
#show.printdict(idf_table)

# Now we build a tf*idf_table for each sentence
# building a tf*idf table from: 1) a frequency distribution 2) an idf_table
#               doc1    doc2    ...     docN
# term1         w11     w12     ...     w1N
# term2         w21     w22     ...     w2N
# ...           ...     ...     ...     ...
# termM         wM1     wM2     ...     wMN

# A tf_idf_table corresponds to a column of the classic term-doc matrix
# We build the whole matrix indexed by docs

tf_idf_tables={}
for doc_id in frequencies.keys():
      tf_idf_table=create_tfidf_table(frequencies[doc_id], idf_table)
      tf_idf_tables[doc_id]=tf_idf_table

# Now we need to assign a relevance to a sentence

# computing a table containing a score for each doc in the corpus (from each tfidf_table)
tfidf_sentence_score_table={}
for sent in tf_idf_tables.keys():
        tfidf_sentence_score_table[sent]=compute_avg_tfidfscore(tf_idf_tables[sent])


# computing a table containing a relevance score for each sentence (from a scoring table)
relevance_table=compute_docweight_incorpus_table(tfidf_sentence_score_table)

# generating a summary based on some criteria on relevance table
# selection criterion --> relevance score of sentence > avg. relevance 
def generate_summary(relevance_table):
    summary=""
    n_sentences=len(relevance_table.keys())
    sentence_count = 0
    total_rel=0
    for sentence in relevance_table.keys():
        total_rel+=relevance_table[sentence]
    avgrel=total_rel/n_sentences
    for sentence in relevance_table.keys():
        if relevance_table[sentence]>=avgrel:
            #print ("taken!", relevance_table[sentence], avgrel)
            summary += " " + sentence
            sentence_count+=1
    #used=int((sentence_count / n_sentences)*100)    
    summary += "\n Used: "+str(sentence_count)+" out of "+str(n_sentences)+" sentences."
    return summary

summary=generate_summary(relevance_table)

#sentence_relevance_matrix=tfidf.sentence_relevance(tfidfscoring)
#print (sentence_relevance_matrix)
#threshold=tfidf.find_average_score(sentence_relevance_matrix)
#print ("Avg. relevance score per sentence: ", threshold)
#print ("All sentences above the threshold will be included in the summary")
#summary=tfidf.generate_summary(sentences, sentence_relevance_matrix, threshold)
print ("SUMMARY\n")
print (summary)







SUMMARY

 Thank you. Thank you so much. We affirm the promise of our democracy. And for more than 200 years, we have. We made ourselves anew, and vowed to move forward together. This generation of Americans has been tested by crises that steeled our resolve and proved our resilience. A decade of war is now ending. An economic recovery has begun. We understand that outworn programs are inadequate to the needs of our time. That is what this moment requires. That is what will give real meaning to our creed. We do not believe that in this country freedom is reserved for the lucky, or happiness for the few. The path towards sustainable energy sources will be long and sometimes difficult. But America cannot resist this transition, we must lead it. That is how we will preserve our planet, commanded to our care by God. That's what will lend meaning to the creed our fathers once declared. America will remain the anchor of strong alliances in every corner of the globe. It is now our generation's

In [14]:
#ATTEMPT 2

# load text
text=inaugural.raw("2013-Obama.txt")
#text=inaugural.raw("2017-Trump.txt")
sentences = sent_tokenize(text) # NLTK function
corpus={}
# our corpus is the set of sentences in the text
# one doc = one sentence

for sent in sentences:
        corpus[sent]=sent

# apply the pipeline on each doc to build a frequency table


total_documents = len(corpus)
stop_words = stopwords.words('english')
frequencies = {}
vocabulary=set()
vocabularysize=0
for doc_id in corpus.keys():
        tokens=wordmatch(corpus[doc_id])
        waste = onlypunct(corpus[doc_id])
        cleaned_tokens = [t for t in tokens if not t in waste]
        nostop_tokens = [t for t in cleaned_tokens if not t in stop_words]
        norm_tokens = [t.lower() for t in nostop_tokens]
        #porter = PorterStemmer()
        #stemmed_tokens = [porter.stem(t) for t in norm_tokens]
        wnl=nltk.WordNetLemmatizer()
        lemmas = [wnl.lemmatize(t) for t in norm_tokens]
        docvocabulary=set(lemmas)
        vocabulary=vocabulary.union(docvocabulary)
        vocabularysize=vocabularysize+len(docvocabulary)
        docfdist = nltk.FreqDist(t for t in lemmas)      
        frequencies[doc_id]=docfdist
# building a df table (term, #sentences that contain term)
df_table=create_df_table(vocabulary, frequencies)
# building an idf table (term, idf score) based on a df_table
idf_table=create_idf_table(df_table, total_documents)
#show.printdict(idf_table)

# Now we build a tf*idf_table for each sentence
# building a tf*idf table from: 1) a frequency distribution 2) an idf_table
#               doc1    doc2    ...     docN
# term1         w11     w12     ...     w1N
# term2         w21     w22     ...     w2N
# ...           ...     ...     ...     ...
# termM         wM1     wM2     ...     wMN

# A tf_idf_table corresponds to a column of the classic term-doc matrix
# We build the whole matrix indexed by docs

tf_idf_tables={}
for doc_id in frequencies.keys():
        tf_idf_table=create_tfidf_table(frequencies[doc_id], idf_table)
        tf_idf_tables[doc_id]=tf_idf_table

###### ANOTHER SOLUTION BASED ON NORMALIZED VECTORS #######


#### we want to length-normalize the tf-idf vectors ###
tf_idf_tables=compute_lenghtnorm_vectors(tf_idf_tables)

# Now we need to assign a relevance to a sentence

# computing a table containing a score for each doc in the corpus (from each tfidf_table)
tfidf_sentence_score_table={}
for sent in tf_idf_tables.keys():
        tfidf_sentence_score_table[sent]=compute_avg_tfidfscore(tf_idf_tables[sent])


# computing a table containing a relevance score for each sentence (from a scoring table)
relevance_table=compute_docweight_incorpus_table(tfidf_sentence_score_table)
#show.printdict(relevance_table)
# generating a summary based on some criteria on relevance table

# selection criterion --> relevance score of sentence > avg. relevance 
def generate_summary(relevance_table):
    summary=""
    n_sentences=len(relevance_table.keys())
    sentence_count = 0
    total_rel=0
    for sentence in relevance_table.keys():
        total_rel+=relevance_table[sentence]
    avgrel=total_rel/n_sentences
    for sentence in relevance_table.keys():
        if relevance_table[sentence]>=avgrel:
            #print ("taken!", relevance_table[sentence], avgrel)
            summary += " " + sentence
            sentence_count+=1
    #used=int((sentence_count / n_sentences)*100)    
    summary += "\n Used: "+str(sentence_count)+" out of "+str(n_sentences)+" sentences."
    return summary


summary=generate_summary(relevance_table)

#sentence_relevance_matrix=tfidf.sentence_relevance(tfidfscoring)
#print (sentence_relevance_matrix)
#threshold=tfidf.find_average_score(sentence_relevance_matrix)
#print ("Avg. relevance score per sentence: ", threshold)
#print ("All sentences above the threshold will be included in the summary")
#summary=tfidf.generate_summary(sentences, sentence_relevance_matrix, threshold)
print ("SUMMARY\n")
print (summary)

#######################################################






SUMMARY

 Thank you. Thank you so much. We affirm the promise of our democracy. Today we continue a never-ending journey to bridge the meaning of those words with the realities of our time. The patriots of 1776 did not fight to replace the tyranny of a king with the privileges of a few or the rule of a mob. And for more than 200 years, we have. We made ourselves anew, and vowed to move forward together. Together, we discovered that a free market only thrives when there are rules to ensure competition and fair play. Now more than ever, we must do these things together, as one nation and one people. This generation of Americans has been tested by crises that steeled our resolve and proved our resilience. A decade of war is now ending. An economic recovery has begun. We understand that outworn programs are inadequate to the needs of our time. That is what this moment requires. That is what will give real meaning to our creed. We do not believe that in this country freedom is reserved for 

In [16]:
#ATTEMPT 3

import operator
# load text
text=inaugural.raw("2013-Obama.txt")
#text=inaugural.raw("2017-Trump.txt")
sentences = sent_tokenize(text) # NLTK function
corpus={}
# our corpus is the set of sentences in the text
# one doc = one sentence

for sent in sentences:
        corpus[sent]=sent

# apply the pipeline on each doc to build a frequency table


total_documents = len(corpus)
stop_words = stopwords.words('english')
frequencies = {}
vocabulary=set()
vocabularysize=0
for doc_id in corpus.keys():
        tokens=wordmatch(corpus[doc_id])
        waste =onlypunct(corpus[doc_id])
        cleaned_tokens = [t for t in tokens if not t in waste]
        nostop_tokens = [t for t in cleaned_tokens if not t in stop_words]
        norm_tokens = [t.lower() for t in nostop_tokens]
        #porter = PorterStemmer()
        #stemmed_tokens = [porter.stem(t) for t in norm_tokens]
        wnl=nltk.WordNetLemmatizer()
        lemmas = [wnl.lemmatize(t) for t in norm_tokens]
        docvocabulary=set(lemmas)
        vocabulary=vocabulary.union(docvocabulary)
        vocabularysize=vocabularysize+len(docvocabulary)
        docfdist = nltk.FreqDist(t for t in lemmas)      
        frequencies[doc_id]=docfdist
# building a df table (term, #sentences that contain term)
df_table=create_df_table(vocabulary, frequencies)
# building an idf table (term, idf score) based on a df_table
idf_table=create_idf_table(df_table, total_documents)
#show.printdict(idf_table)

# Now we build a tf*idf_table for each sentence
# building a tf*idf table from: 1) a frequency distribution 2) an idf_table
#               doc1    doc2    ...     docN
# term1         w11     w12     ...     w1N
# term2         w21     w22     ...     w2N
# ...           ...     ...     ...     ...
# termM         wM1     wM2     ...     wMN

# A tf_idf_table corresponds to a column of the classic term-doc matrix
# We build the whole matrix indexed by docs

tf_idf_tables={}
for doc_id in frequencies.keys():
        tf_idf_table=create_tfidf_table(frequencies[doc_id], idf_table)
        tf_idf_tables[doc_id]=tf_idf_table

###### ANOTHER SOLUTION BASED ON NORMALIZED VECTORS AND k-NN#######


#### we want to length-normalize the tf-idf vectors ###
tf_idf_tables=compute_lenghtnorm_vectors(tf_idf_tables)

# Instead of assigning a relevance to a sentence we:
# compute the centroid vector of the document and find most similar sentences


# computing the centroid from a vocabulary and a tf*idf matrix
centroid=compute_centroid(vocabulary, tf_idf_tables)
norm_centroid=normalize_vector(centroid)
#print ("centroid length:", tfidf.compute_docvect_length(centroid))
#print ("centroid normalized length:", tfidf.compute_docvect_length(norm_centroid))
# generating a summary by selecting k most similar sentences to the centroid  

k=34


def generate_summary(tf_idf_tables, norm_centroid, k):
    summary=""
    sentence_list={}
    for sentence in tf_idf_tables.keys():
        sim=compute_cos_similarity(centroid,tf_idf_tables[sentence])
        sentence_list[sentence]=sim
    # rank sentence_list
    sorted_sim_table = dict( sorted(sentence_list.items(), key=operator.itemgetter(1),reverse=True))
    sorted_topk = dict(list(sorted_sim_table.items())[0: k])
    for sentence in sorted_topk:
        summary += " " + sentence
    # select top-k sentences
    return summary

summary=generate_summary(tf_idf_tables, norm_centroid, k)

#sentence_relevance_matrix=tfidf.sentence_relevance(tfidfscoring)
#print (sentence_relevance_matrix)
#threshold=tfidf.find_average_score(sentence_relevance_matrix)
#print ("Avg. relevance score per sentence: ", threshold)
#print ("All sentences above the threshold will be included in the summary")
#summary=tfidf.generate_summary(sentences, sentence_relevance_matrix, threshold)

print ("SUMMARY made of: ", str(k), "sentences\n")
print (summary)

#######################################################






SUMMARY made of:  34 sentences

 That is our generation's taskâto make these words, these rights, these values of life and liberty and the pursuit of happiness real for every American. They do not make us a nation of takers; they free us to take the risks that make this country great. We, the people, still believe that enduring security and lasting peace do not require perpetual war. We, the people, still believe that our obligations as Americans are not just to ourselves, but to all posterity. We, the people, still believe that every citizen deserves a basic measure of security and dignity. What makes us exceptionalâwhat makes us Americanâis our allegiance to an idea articulated in a declaration made more than two centuries ago:

We hold these truths to be self-evident, that all men are created equal; that they are endowed by their Creator with certain unalienable rights; that among these are life, liberty, and the pursuit of happiness. Progress does not compel us to settle cent