# Contextual and Semantic Similarity

This notebook contains code using to obtain the semantic and contextual similarity scores

In [1]:
# importing necessary libraries
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import json, os
import re
from collections import Counter

from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet as wn
import nltk
import math
from nltk import word_tokenize

from sklearn.metrics import log_loss
import itertools

## Loading the dataset to train on

In [2]:
# loading imdb and synthetic dataset
dir = os.path.dirname("__file__")
datasets = os.path.join(dir, "..", "datasets")
outputs = os.path.join(dir, "..", "outputs")

dataset = []
dblp_dict = []
count = 0
with open(os.path.join(datasets, "dataset.json"), encoding='utf-8') as f:
    for line in f:
        doc = json.loads(line)
        dataset.append(doc)
        count+=1
print(count, "documents loaded.")


print(len(dataset))


200 documents loaded.
200


In [3]:
# finding RTL and NTL paths list
stack = []
final_dict = {}
all_keys = set()

def do_walk(datadict):
    if isinstance(datadict, dict):
        for key, value in datadict.items():
            stack.append(key)
            if isinstance(value, dict) and len(value.keys()) == 0:
                for val in stack:
                    all_keys.add(val)
                final_dict["/".join(stack)] = "EMPTY_DICT"
            if isinstance(value, list) and len(value) == 0:
                for val in stack:
                    all_keys.add(val)
                final_dict["/".join(stack)] = 'EMPTY_LIST'
            if isinstance(value, dict):
                do_walk(value)
            if isinstance(value, list):
                do_walk(value)
            if isinstance(value, str):
                for val in stack:
                    all_keys.add(val)
                final_dict["/".join(stack)] = value
            stack.pop()

    if isinstance(datadict, list):
        n = 0
        for key in datadict:
            n = n + 1
            if isinstance(key, dict):
                do_walk(key)
            if isinstance(key, list):
                do_walk(key)
            if isinstance(key, str):
                for val in stack:
                    all_keys.add(val)
                final_dict["/".join(stack)] = key

keys_list = []
ntl_paths_list_util = []
rtl_paths_list = []
for i in range(0,len(dataset)):
    do_walk(dataset[i])
    keys_list.append(all_keys)
    ntl_paths_list_util.append([x for x in final_dict.keys()])
    rtl_paths_list.append(list(final_dict.keys()))
    final_dict={}
    all_keys=set()

def flatten(t):
    return [item for sublist in t for item in sublist]


final_append_array = []
for document in ntl_paths_list_util:
    for path in document:
        if path is not None:
            result = [path[_.start()+1:] for _ in re.finditer("/", path)]
        for item in result : document.append(item)
    final_append_array.append(list(set(document)))
ntl_paths_list = final_append_array

(ntl_paths_list[-1])

['_id/$oid', 'type', '$oid', '_key', 'author', 'mdate', 'title']

In [26]:
rtl_paths_list[-15]

['_id/$oid',
 'mdate',
 'author',
 'ee',
 'booktitle',
 'title',
 'pages',
 'url',
 'year',
 'type',
 '_key',
 'crossref']

In [5]:
# preparing sentences to be fed into bert
sent_list = []
for path in rtl_paths_list:
    temp_list = []
    i = 0
    while i in range(len(path)):
        if "/" not in path[i]:
            temp_list.append(path[i])
            i += 1
        else:
            left_str = path[i][:path[i].find("/")]
            temp_list2 = []
            temp_list2.append(path[i][path[i].find("/")+1:])
            i += 1
            while i in range(len(path)) and left_str in path[i]:
                temp_list2.append(path[i][path[i].find("/")+1:])
                i += 1
            temp_str = ""
            if len(temp_list2) == 1 and temp_list2[0] not in left_str:
                temp_str = left_str + " " + temp_list2[0]
            elif len(temp_list2) == 1:
                temp_str = left_str
            else:
                temp_str = left_str + " " + ' '.join(temp_list2)
            temp_list.append(temp_str)
    sent_list.append(temp_list)

sent_list[-1]


['_id $oid', 'mdate', 'author', '_key', 'title', 'type']

In [6]:
# getting models ready

bert_model = SentenceTransformer('C:/Users/desik/.cache/torch/sentence_transformers/sentence-transformers_bert-base-nli-mean-tokens')
# roberta_model = SentenceTransformer('C:/Users/desik/.cache/torch/sentence_transformers/sentence-transformers_all-roberta-large-v1')
# electra_model = SentenceTransformer('C:/Users/desik/.cache/torch/sentence_transformers/ddobokki_electra-small-nli-sts')

In [7]:
# to get meaningful words out of the labels

def viterbi_segment(text):
    probs, lasts = [1.0], [0]
    for i in range(1, len(text) + 1):
        prob_k, k = max((probs[j] * word_prob(text[j:i]), j)
                        for j in range(max(0, i - max_word_length), i))
        probs.append(prob_k)
        lasts.append(k)
    words = []
    i = len(text)
    while 0 < i:
        words.append(text[lasts[i]:i])
        i = lasts[i]
    words.reverse()
    return words, probs[-1]

def word_prob(word): return dictionary[word] / total
def words(text): return re.findall('[a-z]+', text.lower()) 
dictionary = Counter(words(open('../datasets/big.txt').read()))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))

## Contextual similarity with just the RTL paths of the JSON documents

In [9]:
# formatting keys to more meaningful words
def string_replace_meaningful(key_list):
    for k in range(len(key_list)):
        str_list = key_list[k].split()
        for i in range(len(str_list)):
            str1 = str_list[i].replace('cinematgrs', 'cinematographers').replace('costdes', 'costumedes').replace('aka', 'alternate')
            str1 = str1.replace('proddesi', 'productiondesi').replace('doi', 'digital identifier').replace('publtype', 'publishingtype').replace('ref', 'reference')
            str1 = str1.replace("_id", '').replace("$oid", '').replace('_key', '').replace('imdb', 'moviedatabase').replace('url', '')
            word_split = viterbi_segment(str1)
            str1 = ' '.join(word_split[0])
            str_list[i] = str1

        key_list[k] = " ".join(str_list)

        if len(str_list) == 1:
            key_list[k] = str_list[0]
        elif len(str_list) == 2:
            key_list[k] = " ".join(str_list)
        elif len(str_list) > 2:
            left = str_list[0]
            str2 = " ".join(str_list[1:len(str_list)-1])  
            key_list[k] = left + " " + str2 + " and " + str_list[-1]    
    return key_list

In [11]:
# converting RTL paths to sentence
key_list_sentences = []
for i in range(len(sent_list)):
    t_list = sent_list[i][:]
    t_list = string_replace_meaningful(t_list)
    # if (i < 3): print(t_list)
    sent = ', '.join(t_list)
    sent = "Document has " + sent
    key_list_sentences.append(sent)

len(key_list_sentences)


200

In [22]:
# to find pairwise contextual similarity scores
import random
def similarity_scores(sentence_list):
    # embeddings = model.encode(sentence_list)
    sent1 = sentence_list[0:50]
    sent2 = sentence_list[150:200]
    sent1.extend(sent2)
    all_100 = bert_model.encode(sent1)

    sent_combo = list(itertools.combinations(sent1, 2))
    embed_combo = list(itertools.combinations(all_100, 2))

    df = pd.DataFrame(columns = ['Document 1', 'Document 2', 'CosineSim'])

    print("Calculating similarity scores...")

    for i in range(len(sent_combo)):
        s1 = embed_combo[i][0]
        s2 = embed_combo[i][1]
        cos_score = cosine_similarity(s1.reshape(1, -1), s2.reshape(1, -1))
        dict_t = {'Document 1': sent_combo[i][0], 'Document 2': sent_combo[i][1], 'CosineSim': cos_score}
        # print(movie_100[i].reshape(1, -1).shape, paper_100[i].shape)
        df = df.append(dict_t, ignore_index = True)

    print("done.")
    return df

In [None]:
# bert_scores = similarity_scores(key_list_sentences)

## Contextual similarity with Content of JSON documents

In [None]:
# converting JSON document into sentences
def convert(doc) -> str:
    s = ''
    i = 0
    for (key, value) in doc.items():
        has = ''
        if key in ['movieid', 'crossref', '_id', '_key', 'ee', 'url', 'pages', 'number', 'pages']: # ignoring key-value pairs that do not bring any contextual meaning
            continue
        if type(value) == type(dict()):
            has = convert(value)
            s += f' {key} has {has} '
        elif type(value) == type(list()):
            has = convert_list(value)
            s += f' {key} has {has} '
        else:
            s += f" {key} has " + str(value)
            continue
        i = i+1
        

    return s

def convert_list(values):
    dict_ret = {}
    if type(values[0]) != dict:
        sent = ' '.join(values)
        return sent

    for key, value in values[0].items():
        dict_ret[key] = []
    
    for item in values:
        for key, value in item.items():
            
            if type(value) == list:
                value = ", ".join(value)
            if key in dict_ret.keys():
                dict_ret[key].append(value)
            else:
                temp_list = []
                temp_list.append(value)
                dict_ret[key] = temp_list

    sent = ' '

    for key, value in dict_ret.items():
        sent = sent + " " + key + " "
        sent_t = " ".join(value)
        sent = sent + " " + sent_t


    return sent

sentences = []

for i in range(200):
    sentences.append(convert(dataset[i]))


In [19]:
# function to clean the sentences for BERT and WordNet
import unicodedata
def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val)
    return sentence


def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])

def string_replace(sentence):
    for k in range(len(sentence)):
        str_list = sentence[k].split()
        for i in range(len(str_list)):
            str1 = str_list[i].replace('cinematgrs', 'cinematographers').replace('costdes', 'costume des').replace('akan', 'alternate n')
            str1 = str1.replace('proddesi', 'production desi').replace('doi', 'digital identifier').replace('publtype', 'publishing type').replace('ref', 'reference')
            str1 = str1.replace("_id", '').replace("$oid", '').replace('_key', '').replace('imdb', 'movie database ').replace('url', '')
            str_list[i] = str1

        sentence[k] = " ".join(str_list)
  
    return sentence

In [20]:
for i in range(len(sentences)):
    sentences[i] = clean_sentence(sentences[i])
    sentences[i] = remove_accents(sentences[i])
    
sentences_f = string_replace(sentences)

with open('sentences_taken.txt', 'w', encoding="utf-8") as f:
    f.write('\n'.join(sentences_f))

In [44]:
# finding similarity scores from BERT
import warnings
warnings.filterwarnings("ignore")
bert_scores = similarity_scores(sentences_f)
bert_scores.describe()

Calculating similarity scores...
done.


Unnamed: 0,Document 1,Document 2,CosineSim
count,4950,4950,4950
unique,99,99,4950
top,title has 1 vs 100 2007 22 year has null misc ...,mdate has 20090609 author has James A Worthey ...,[[0.5969405]]
freq,99,99,1


In [24]:
bert_scores.to_csv('bert_scores_context.csv')


In [None]:
# writing json dict to json file

# with open("dataset.json", 'w') as f:
#         for value in dataset:
#             print(json.dumps(value))
#             f.write(json.dumps(value))
#             f.write('\n')



## Semantic Similarity with Wordnet

In [34]:
class Lesk(object):

    def __init__(self, sentence):
        self.sentence = sentence
        self.meanings = {}
        for word in sentence:
            self.meanings[word] = ''

    def getSenses(self, word):
        # print word
        return wn.synsets(word.lower())

    def getGloss(self, senses):

        gloss = {}

        for sense in senses:
            gloss[sense.name()] = []

        for sense in senses:
            gloss[sense.name()] += word_tokenize(sense.definition())

        return gloss

    def getAll(self, word):
        senses = self.getSenses(word)

        if senses == []:
            return {word.lower(): senses}

        return self.getGloss(senses)

    def Score(self, set1, set2):
        # Base
        overlap = 0

        # Step
        for word in set1:
            if word in set2:
                overlap += 1

        return overlap

    def overlapScore(self, word1, word2):

        gloss_set1 = self.getAll(word1)
        if self.meanings[word2] == '':
            gloss_set2 = self.getAll(word2)
        else:
            # print 'here'
            gloss_set2 = self.getGloss([wn.synset(self.meanings[word2])])

        # print gloss_set2

        score = {}
        for i in gloss_set1.keys():
            score[i] = 0
            for j in gloss_set2.keys():
                score[i] += self.Score(gloss_set1[i], gloss_set2[j])

        bestSense = None
        max_score = 0
        for i in gloss_set1.keys():
            if score[i] > max_score:
                max_score = score[i]
                bestSense = i

        return bestSense, max_score

    def lesk(self, word, sentence):
        maxOverlap = 0
        context = sentence
        word_sense = []
        meaning = {}

        senses = self.getSenses(word)

        for sense in senses:
            meaning[sense.name()] = 0

        for word_context in context:
            if not word == word_context:
                score = self.overlapScore(word, word_context)
                if score[0] == None:
                    continue
                meaning[score[0]] += score[1]

        if senses == []:
            return word, None, None

        self.meanings[word] = max(meaning.keys(), key=lambda x: meaning[x])

        return word, self.meanings[word], wn.synset(self.meanings[word]).definition()

In [35]:
from scipy import spatial
from nltk.metrics import edit_distance

def path(set1, set2):
    return wn.path_similarity(set1, set2)


def wup(set1, set2):
    return wn.wup_similarity(set1, set2)


def edit(word1, word2):
    if float(edit_distance(word1, word2)) == 0.0:
        return 0.0
    return 1.0 / float(edit_distance(word1, word2))

In [36]:
def tokenize(q1, q2):
    return word_tokenize(q1), word_tokenize(q2)


def posTag(q1, q2):
    return nltk.pos_tag(q1), nltk.pos_tag(q2)


def stemmer(tag_q1, tag_q2):
    stem_q1 = []
    stem_q2 = []

    for token in tag_q1:
        stem_q1.append(stem(token))

    for token in tag_q2:
        stem_q2.append(stem(token))

    return stem_q1, stem_q2

In [38]:
def computePath(q1, q2):

    R = np.zeros((len(q1), len(q2)))

    for i in range(len(q1)):
        for j in range(len(q2)):
            if q1[i][1] == None or q2[j][1] == None:
                sim = edit(q1[i][0], q2[j][0]) # using edit distance
            else:
                sim = path(wn.synset(q1[i][1]), wn.synset(q2[j][1]))

            if sim == None:
                sim = edit(q1[i][0], q2[j][0])

            R[i, j] = sim

    # print R

    return R

In [39]:
def computeWup(q1, q2):

    R = np.zeros((len(q1), len(q2)))

    for i in range(len(q1)):
        for j in range(len(q2)):
            if q1[i][1] == None or q2[j][1] == None:
                sim = edit(q1[i][0], q2[j][0])
            else:
                sim = wup(wn.synset(q1[i][1]), wn.synset(q2[j][1]))

            if sim == None:
                sim = edit(q1[i][0], q2[j][0])

            R[i, j] = sim

    # print R

    return R

In [40]:
def overallSim(q1, q2, R):

    sum_X = 0.0
    sum_Y = 0.0

    for i in range(len(q1)):
        max_i = 0.0
        for j in range(len(q2)):
            if R[i, j] > max_i:
                max_i = R[i, j]
        sum_X += max_i

    for i in range(len(q1)):
        max_j = 0.0
        for j in range(len(q2)):
            if R[i, j] > max_j:
                max_j = R[i, j]
        sum_Y += max_j
        
    if (float(len(q1)) + float(len(q2))) == 0.0:
        return 0.0
        
    overall = (sum_X + sum_Y) / (2 * (float(len(q1)) + float(len(q2))))

    return overall

In [51]:
STOP_WORDS = nltk.corpus.stopwords.words()
def clean_sentence_nltk(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = sentence.split(" ")

    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)

    sentence = " ".join(sentence)
    return sentence

In [41]:
def semanticSimilarity(q1, q2):

    tokens_q1, tokens_q2 = tokenize(q1, q2)
    # stem_q1, stem_q2 = stemmer(tokens_q1, tokens_q2)
    tag_q1, tag_q2 = posTag(tokens_q1, tokens_q2)

    sentence = []
    for i, word in enumerate(tag_q1):
        if 'NN' in word[1] or 'JJ' in word[1] or 'VB' in word[1]:
            sentence.append(word[0])

    sense1 = Lesk(sentence)
    sentence1Means = []
    for word in sentence:
        sentence1Means.append(sense1.lesk(word, sentence))

    sentence = []
    for i, word in enumerate(tag_q2):
        if 'NN' in word[1] or 'JJ' in word[1] or 'VB' in word[1]:
            sentence.append(word[0])

    sense2 = Lesk(sentence)
    sentence2Means = []
    for word in sentence:
        sentence2Means.append(sense2.lesk(word, sentence))
    # for i, word in enumerate(sentence1Means):
    #     print sentence1Means[i][0], sentence2Means[i][0]

    R1 = computePath(sentence1Means, sentence2Means)
    R2 = computeWup(sentence1Means, sentence2Means)

    R = (R1 + R2) / 2

    # print R

    return overallSim(sentence1Means, sentence2Means, R)

In [52]:
# cleaning and feeding sentences to WordNet
y_pred = []
count = 0

for i in range(len(sentences_f)):
    sentences_f[i] = clean_sentence_nltk(sentences_f[i])

sentence_50 = sentences_f[0:50]
sentence_l_50 = sentences_f[150:200]
sentence_50.extend(sentence_l_50)


combo = list(itertools.combinations(sentence_50, 2))

df_sem = pd.DataFrame(columns = ['Document 1', 'Document 2', 'SemanticSim'])

print('Calculating similarity for the training data, please wait.')

for i in range(len(combo)):
    # print row

    sim = semanticSimilarity(combo[i][0], combo[i][1])
    dict_t = {'Document 1': combo[i][0], 'Document 2': combo[i][1], 'SemanticSim': sim}
        # print(movie_100[i].reshape(1, -1).shape, paper_100[i].shape)
    df_sem = df_sem.append(dict_t, ignore_index = True)
    
    count += 1
    print(count)
    if count % 100 == 0:
        print(str(count)+", "+str(sim))
    y_pred.append(sim)
    
y_pred

Calculating similarity for the training data, please wait.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
100, 0.3707087010532791
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
200, 0.12176268374346291
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250


[0.049502763318552793,
 0.07279827263379894,
 0.05528973950026581,
 0.12163600288600289,
 0.026511399201768742,
 0.06579650247413404,
 0.049116260801256875,
 0.03428252431236696,
 0.05962171052631579,
 0.10705811001863633,
 0.05448874144049582,
 0.10562724921277554,
 0.13453644589778194,
 0.035519125683060114,
 0.10308084772370485,
 0.01589068825910931,
 0.09784075573549258,
 0.1202083544474849,
 0.09731771204985491,
 0.03594322344322344,
 0.03838011695906432,
 0.05225137548012041,
 0.10902255639097744,
 0.031278855032317636,
 0.08250896725619718,
 0.11608187134502922,
 0.013602847698957539,
 0.09363585245164191,
 0.13094916779127305,
 0.054962031945535485,
 0.024023248304811107,
 0.13453644589778194,
 0.07844220594220594,
 0.12647028897028897,
 0.0935985797827903,
 0.10047230515013023,
 0.05869692259494891,
 0.03659270725834193,
 0.05376000387442035,
 0.08579639171744434,
 0.030992982456140353,
 0.10852366444471709,
 0.12263732247284878,
 0.05929643374818814,
 0.042542253068568854,
 0