<a href="https://colab.research.google.com/github/GianFederico/MD-repo-Natural_Language_Processing/blob/main/NLP_lab3_text_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# In general, the similarity between two items is measured according to their features.
# For text documents, we need to define:
# – A document representation strategy, i.e. features that describe the texts
# – A similarity (or distance) function based on features

import nltk
import re
import string
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import sent_tokenize
from nltk.corpus import inaugural
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from itertools import groupby, combinations
import operator
nltk.download('inaugural')
nltk.download('punkt')
nltk.download('stopwords')

def wordmatch(text):
    cleaned_tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", text)
    return cleaned_tokens

def onlypunct(text):
    waste=re.findall(r"[!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+", text)
    return waste

# def NLTKTokenize(text):
#     nltk_words = word_tokenize(text)
#     return nltk_words

# def NLTKregtokenize(text):
#     pattern = r'''(?x)
#     (?:[A-Z]\.)+
#    | \w+(?:-\w+)*
#    | \$?\d+(?:\.\d+)?%?
#    | \.\.\.
#    | [][.,;"'?():-_`]
#  '''
#     tokens=nltk.regexp_tokenize(text, pattern)
#     return tokens

# # split the input on anything other than a word character
# def onlywords(text):
#     cleaned_tokens = re.split(r'\W+', text)
#     return cleaned_tokens

# # split on whitespace and then remove punct
# def wordsnopunct(text):
#     tokens=text.split()
#     table = str.maketrans('', '', string.punctuation)
#     stripped = [w.translate(table) for w in tokens]
#     return stripped


#TFIDF SCORE functions
def create_df_table(dic,corpus_frequencies):
    # create a df table as a dictionary (term, document frequency)
    df_table = {}
    for word in dic:
        df_table[word]=0
        for doc_id in corpus_frequencies.keys():
            doc_freq=corpus_frequencies[doc_id]
            if doc_freq[word]>0:
                df_table[word] += 1
    return df_table

def create_idf_table(df_table, total_documents):
    # create a df table as a dictionary (term, document frequency)
    idf_table = {}
    for word in df_table.keys():
        idf_table[word]=math.log(total_documents / float(df_table[word]))
    return idf_table

def create_tfidf_table(f_distr, idf_table):
    # create a tf*idf table (term, tf*idf score) from a doc frequency distribution
    tf_idf_table = {}
    for word in f_distr:
        tf_idf_table[word]= float(f_distr.freq(word) * idf_table[word])
    return tf_idf_table

def compute_docvect_length(tf_idf_column):
    # computing length of document vector
    coordsum=0
    for word in tf_idf_column.keys():
        coordsum+=tf_idf_column[word]**2
    return math.sqrt(coordsum)

def compute_lenghtnorm_vectors(tf_idf_matrix):
    # produces a length-normalized tf*idf matrix
    norm_tf_idf_matrix = {}
    for doc_id in tf_idf_matrix.keys():
        column=tf_idf_matrix[doc_id]
        vec_length=compute_docvect_length(column)
        for word in column.keys():
            column[word]=column[word]/vec_length
        norm_tf_idf_matrix[doc_id]=column
    return norm_tf_idf_matrix

def compute_cos_similarity(norm_vec1, norm_vec2):
    # computing cosine similarity between 2 normalized vectors
    common_words=[w for w in norm_vec1 if w in norm_vec2]
    sim_score=0
    for w in common_words:
        sim_score+=norm_vec1[w]*norm_vec2[w]
    return sim_score

# def compute_centroid(voc, tf_idf_matrix):
#     # produces the centroid of vectors in a tf*idf matrix
#     centroid = {}
#     length=len(tf_idf_matrix.keys())
#     for word in voc:
#         centroid[word]=0
#     for doc_id in tf_idf_matrix.keys():
#         column=tf_idf_matrix[doc_id]
#         for word in column.keys():
#             centroid[word]+=column[word]/length
#     return centroid

# def normalize_vector(vector):
#     # produces a length-normalized vector
#     norm_vect = {}
#     vec_length=compute_docvect_length(vector)
#     for word in vector.keys():
#         norm_vect[word]=vector[word]/vec_length
#     return norm_vect

# def compute_avg_tfidfscore(tf_idf_column):
#     # computing document relevance as avg. of tf*idf scores
#     doc_score=0
#     for word in tf_idf_column.keys():
#         doc_score+=tf_idf_column[word]
#     return doc_score / len(tf_idf_column)

# def compute_docweight_incorpus_table(scoring_table):
#     # computing document relevance in a corpus from a scoring table (doc, score)
#     all_weights=0
#     weight_table = {}
#     for doc_id in scoring_table.keys():
#         all_weights+=scoring_table[doc_id]
#     for doc_id in scoring_table.keys():
#         weight_table[doc_id]=(scoring_table[doc_id] / all_weights)
#         # normalized scores
#     return weight_table


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# JACCARD SIMILARITY

corpus={}
for doc_id in nltk.corpus.inaugural.fileids():
        corpus[doc_id]=inaugural.raw(doc_id)

# apply the pipeline on each doc to build a frequency table
total_documents = len(corpus)
stop_words = stopwords.words('english')
frequencies = {}
vocabulary=set()
vocabularysize=0
for doc_id in corpus.keys():
        tokens=wordmatch(corpus[doc_id])
        waste = onlypunct(corpus[doc_id])
        cleaned_tokens = [t for t in tokens if not t in waste]
        nostop_tokens = [t for t in cleaned_tokens if not t in stop_words]
        norm_tokens = [t.lower() for t in nostop_tokens]
        #porter = PorterStemmer()
        #stemmed_tokens = [porter.stem(t) for t in norm_tokens]
        wnl=nltk.WordNetLemmatizer()
        lemmas = [wnl.lemmatize(t) for t in norm_tokens]
        docvocabulary=set(lemmas)
        vocabulary=vocabulary.union(docvocabulary)
        vocabularysize=vocabularysize+len(docvocabulary)
        docfdist = nltk.FreqDist(t for t in lemmas)      
        frequencies[doc_id]=docfdist
# building a df table (term, #sentences that contain term)
df_table=create_df_table(vocabulary, frequencies)
# building an idf table (term, idf score) based on a df_table
idf_table=create_idf_table(df_table, total_documents)
#show.printdict(idf_table)

# Now we build a tf*idf_table for each sentence
# building a tf*idf table from: 1) a frequency distribution 2) an idf_table
#               doc1    doc2    ...     docN
# term1         w11     w12     ...     w1N
# term2         w21     w22     ...     w2N
# ...           ...     ...     ...     ...
# termM         wM1     wM2     ...     wMN

# A tf_idf_table corresponds to a column of the classic term-doc matrix
# We build the whole matrix indexed by docs

tf_idf_tables={}
for doc_id in frequencies.keys():
        tf_idf_table=create_tfidf_table(frequencies[doc_id], idf_table)
        tf_idf_tables[doc_id]=tf_idf_table

#### we want to length-normalize the tf-idf vectors ###
tf_idf_tables=compute_lenghtnorm_vectors(tf_idf_tables)

def compute_Jacc_sim(v1,v2):
        # compute Jaccard similarity between two lists of terms
        set1=set(v1)
        set2=set(v2)
        union = set1 | set2
        intersection = set([w for w in set1 if w in set2])
        return len(intersection)/len(union)

pairs=combinations(tf_idf_tables.keys(), 2)
all_pairs=list(pairs)
sim_table={}
for w in all_pairs:
        key=""
        words1=tf_idf_tables[w[0]].keys()
        words2=tf_idf_tables[w[1]].keys()
        sim=compute_Jacc_sim(words1,words2)
        key=w[0][:len(w[0])-4]+"---"+w[1][:len(w[1])-4]
        sim_table[key]=sim

sorted_sim_table = dict(sorted(sim_table.items(), key=operator.itemgetter(1),reverse=True))
# take only top-k similarities
print ("TOP-K JACCARD SIMILARITIES:")
K=3
sorted_topk = dict(list(sorted_sim_table.items())[0: K])
#show.printdict(sorted_topk)
print(sorted_topk)
sorted_lastk = dict(list(sorted_sim_table.items())[len(sorted_sim_table)-K:])
print ("LAST-K JACCARD SIMILARITIES:")
#show.printdict(sorted_lastk)
print(sorted_lastk)
print("____________________________")
print ("SPECIFIC COMPARISONS:")
key="2009-Obama---2013-Obama"
print (key," ", sim_table[key])
key="2013-Obama---2017-Trump"
print (key, " ", sim_table[key])
key="2005-Bush---2013-Obama"
print (key, " ", sim_table[key])
key="2005-Bush---2017-Trump"
print (key, " ", sim_table[key])


TOP-K JACCARD SIMILARITIES:
{'1817-Monroe---1821-Monroe': 0.2804476629361422, '1837-VanBuren---1841-Harrison': 0.25321100917431194, '1837-VanBuren---1845-Polk': 0.25124515771997785}
LAST-K JACCARD SIMILARITIES:
{'1793-Washington---2009-Obama': 0.026570048309178744, '1793-Washington---1841-Harrison': 0.025153374233128835, '1793-Washington---1997-Clinton': 0.021961932650073207}
____________________________
SPECIFIC COMPARISONS:
2009-Obama---2013-Obama   0.22798353909465022
2013-Obama---2017-Trump   0.18674698795180722
2005-Bush---2013-Obama   0.21005385996409337
2005-Bush---2017-Trump   0.1696149843912591


In [18]:
#COSINE SIMILARITY

corpus={}
for doc_id in nltk.corpus.inaugural.fileids():
        corpus[doc_id]=inaugural.raw(doc_id)

# apply the pipeline on each doc to build a frequency table
total_documents = len(corpus)
stop_words = stopwords.words('english')
frequencies = {}
vocabulary=set()
vocabularysize=0
for doc_id in corpus.keys():
        tokens=wordmatch(corpus[doc_id])
        waste = onlypunct(corpus[doc_id])
        cleaned_tokens = [t for t in tokens if not t in waste]
        nostop_tokens = [t for t in cleaned_tokens if not t in stop_words]
        norm_tokens = [t.lower() for t in nostop_tokens]
        #porter = PorterStemmer()
        #stemmed_tokens = [porter.stem(t) for t in norm_tokens]
        wnl=nltk.WordNetLemmatizer()
        lemmas = [wnl.lemmatize(t) for t in norm_tokens]
        docvocabulary=set(lemmas)
        vocabulary=vocabulary.union(docvocabulary)
        vocabularysize=vocabularysize+len(docvocabulary)
        docfdist = nltk.FreqDist(t for t in lemmas)      
        frequencies[doc_id]=docfdist
# building a df table (term, #sentences that contain term)
df_table=create_df_table(vocabulary, frequencies)
# building an idf table (term, idf score) based on a df_table
idf_table=create_idf_table(df_table, total_documents)
#show.printdict(idf_table)

# Now we build a tf*idf_table for each sentence
# building a tf*idf table from: 1) a frequency distribution 2) an idf_table
#               doc1    doc2    ...     docN
# term1         w11     w12     ...     w1N
# term2         w21     w22     ...     w2N
# ...           ...     ...     ...     ...
# termM         wM1     wM2     ...     wMN

# A tf_idf_table corresponds to a column of the classic term-doc matrix
# We build the whole matrix indexed by docs

tf_idf_tables={}
for doc_id in frequencies.keys():
        tf_idf_table=create_tfidf_table(frequencies[doc_id], idf_table)
        tf_idf_tables[doc_id]=tf_idf_table

#### we want to length-normalize the tf-idf vectors ###
tf_idf_tables=compute_lenghtnorm_vectors(tf_idf_tables)

pairs=combinations(tf_idf_tables.keys(), 2)
all_pairs=list(pairs)
sim_table={}
for w in all_pairs:
        key=""
        # take the two vectors
        words1=tf_idf_tables[w[0]]
        words2=tf_idf_tables[w[1]]
        sim=compute_cos_similarity(words1,words2)
        key=w[0][:len(w[0])-4]+"---"+w[1][:len(w[1])-4]
        sim_table[key]=sim

sorted_sim_table = dict( sorted(sim_table.items(), key=operator.itemgetter(1),reverse=True))
# take only top-k similarities
print ("TOP-K COSINE SIMILARITIES:")
K=3
sorted_topk = dict(list(sorted_sim_table.items())[0: K])
#show.printdict(sorted_topk)
print(sorted_topk)
sorted_lastk = dict(list(sorted_sim_table.items())[len(sorted_sim_table)-K:])
print ("LAST-K COSINE SIMILARITIES:")
#show.printdict(sorted_lastk)
print(sorted_lastk)
print("____________________________")
print ("SPECIFIC COMPARISONS:")
key="2009-Obama---2013-Obama"
print (key, " ", sim_table[key])
key="2013-Obama---2017-Trump"
print (key, " ", sim_table[key])
key="2005-Bush---2013-Obama" 
print (key, " ", sim_table[key])
key="2005-Bush---2017-Trump"
print (key, " ", sim_table[key])

TOP-K COSINE SIMILARITIES:
{'2013-Obama---2021-Biden': 0.3351574327645749, '1817-Monroe---1821-Monroe': 0.28641925416692, '1837-VanBuren---1841-Harrison': 0.24784710485922665}
LAST-K COSINE SIMILARITIES:
{'1793-Washington---1913-Wilson': 0.00845784472040958, '1793-Washington---1865-Lincoln': 0.008201545078799308, '1793-Washington---1905-Roosevelt': 0.0041607134804818304}
____________________________
SPECIFIC COMPARISONS:
2009-Obama---2013-Obama   0.18878927738334836
2013-Obama---2017-Trump   0.18118560541687198
2005-Bush---2013-Obama   0.08195075702143417
2005-Bush---2017-Trump   0.08221561078669792
