In [215]:
import nltk 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /home/mg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mg/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [216]:
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import spacy
import string
from nltk.corpus import wordnet
import numpy as np


stemmer = PorterStemmer()
sp = spacy.load('en_core_web_sm')

punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

[nltk_data] Downloading package stopwords to /home/mg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [217]:
def parse_query(query):
    '''
    This function is for Text normalization
    '''
    no_punct_query = ""
    for char in query:
        if char not in punctuations:
            no_punct_query = no_punct_query + char
            
    lower_query = no_punct_query.lower()
    lower_tokens = lower_query.split(' ')
    stemmed_tokens = [stemmer.stem(token) for token in lower_tokens]
    stemmed_tokens = [token for token in stemmed_tokens if token not in stopwords.words('english')]
    doc = sp(lower_query)
    lemm_tokens = [ token.lemma_ for token in doc]
    lemm_tokens = [token for token in lemm_tokens if token not in stopwords.words('english')]
    
    return set(lemm_tokens+stemmed_tokens)

In [218]:
data_list = ["Sheikh Jarrah is a Palestinian neighborhood in occupied East Jerusalem",
"Palestinian Jerusalemite families are facing imminent forced eviction from their homes by illegal Israeli settlers",
"The forced evictions are part of Israel’s systematic policies aimed at erasing the Palestinian identity of Jerusalem",
"A colossal dam is near completion on Ethiopia’s stretch of the Nile",
"It is true that Egypt’s survival has hinged on the flow of the Nile’s waters since time immemorial",
" The filling phase of the colossal dam will, for a time at least, reduce the flow downstream",
"Arabic  is one of the oldest and most widely spoken languages in the world.",
"It’s not surprising that the Arabic language has influenced many other languages",
"Given the extent of the influence of the language, it is still difficult for non-speakers to learn Arabic"
]

In [219]:
normalized_text = [] 
for i in data_list:
    normalized_text.append(parse_query(i)) # make a list of normalized sentences

In [220]:
def path_similarity(sent_1 , sent_2):
    mylist = {}
    similarity_score=0
    for word1 in sent_1:
        mylist[word1] = []
        for word2 in sent_2:
            wordnet_list1 = wordnet.synsets(word1) # get the similiar word from wordnet for word1
            wordnet_list2 = wordnet.synsets(word2) # get the similiar word from wordnet for word2
            if wordnet_list1 and wordnet_list2: 
                s = wordnet_list1[0].path_similarity(wordnet_list2[0]) # calculate similarity between the two word sets by using wup_similarity 
                mylist[word1].append(s)
        try:
            similarity_score += max(list(filter(None, mylist[word1])))
        except:
            pass

    similiarity_score_normalized = similarity_score/len(sent_1)
    
    return similiarity_score_normalized

In [221]:
similarity_matrix =np.zeros((len(normalized_text) , len(normalized_text)))
for sent1_num ,sent_1 in enumerate(normalized_text):
    for sent2_num ,sent_2 in  enumerate(normalized_text):
        if sent1_num == sent2_num:
            similarity_matrix[sent1_num][sent2_num] =1 
        else:
            similarity_matrix[sent1_num][sent2_num]=get_sentence_similarity(sent_1 ,sent_2)

In [222]:
similarity_matrix

array([[1.        , 0.39610423, 0.51644737, 0.33958333, 0.36969507,
        0.2871732 , 0.38735073, 0.40479323, 0.3712793 ],
       [0.22637621, 1.        , 0.34542248, 0.2043674 , 0.20074697,
        0.18475891, 0.23912754, 0.26746913, 0.25984386],
       [0.2839667 , 0.37730075, 1.        , 0.27891361, 0.30105451,
        0.27873601, 0.28059278, 0.3106805 , 0.32595303],
       [0.22364302, 0.24177869, 0.26342371, 1.        , 0.34713388,
        0.34567303, 0.20878288, 0.21440857, 0.20172401],
       [0.19696731, 0.20975131, 0.2561405 , 0.27244126, 1.        ,
        0.28101362, 0.19259676, 0.22142425, 0.19650798],
       [0.17273243, 0.21618469, 0.29858671, 0.31624906, 0.32871205,
        1.        , 0.22269635, 0.21059467, 0.22813853],
       [0.22176871, 0.24159412, 0.27155408, 0.19912281, 0.20213834,
        0.21238811, 1.        , 0.35034014, 0.35272109],
       [0.20421245, 0.25971916, 0.27533688, 0.18624559, 0.22244935,
        0.16869549, 0.33296703, 1.        , 0.36923077],


In [223]:
# CLustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(similarity_matrix)

In [224]:
kmeans.labels_

array([2, 2, 2, 0, 0, 0, 1, 1, 1], dtype=int32)

In [225]:
kmeans.cluster_centers_

array([[0.19778092, 0.22257157, 0.27271697, 0.52956344, 0.55861531,
        0.54222888, 0.20802533, 0.21547583, 0.20879017],
       [0.22390982, 0.27782483, 0.30814995, 0.21457719, 0.23474481,
        0.21295241, 0.58195535, 0.59697779, 0.57398395],
       [0.50344764, 0.59113499, 0.62062328, 0.27428811, 0.29049885,
        0.25022271, 0.30235702, 0.32764762, 0.3190254 ]])