# Contextual and Semantic Similarity

This notebook contains the code used to obtain the semantic and contextual similarity scores.

In [None]:
# importing necessary libraries
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import json, os
import re
from collections import Counter

from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet as wn
import nltk
import math
from nltk import word_tokenize

from sklearn.metrics import log_loss
import itertools

## Loading the dataset to train on

In [None]:
# loading imdb and synthetic dataset
# NOTE(review): "__file__" is a string literal here, not the module attribute,
# so os.path.dirname() returns "" and the two paths below resolve relative to
# the current working directory. `dir` also shadows the builtin of that name.
dir = os.path.dirname("__file__")
# Directory the input files are read from.
datasets = os.path.join(dir, "..", "datasets")
# Directory used for output files.
outputs = os.path.join(dir, "..", "outputs")

In [None]:
# `dataset` accumulates every parsed JSON document; the three syn*_dict names
# hold the synthetic documents before they are appended to `dataset`.
dataset = []
syn1_dict = []
syn2_dict = []
syn3_dict = []

# "imdb" is read line by line, one JSON document per line; `count` ends up as
# the number of documents read from this file.
count = 0
with open(os.path.join(datasets, "imdb"), encoding='utf-8') as f:
    for line in f:
        doc = json.loads(line)
        dataset.append(doc)
        count+=1

# Synthetic_2 contains the first 10000-odd documents from the synthetic dataset supplied to us
# (also parsed one JSON document per line).
with open(os.path.join(datasets, "Synthetic_2.json"), encoding='utf-8') as f:
    for line in f:
        doc = json.loads(line)
        syn1_dict.append(doc)

print(count)

# The two Full_Schema files are parsed as a single JSON value each.
# Presumably each is a JSON array of documents, since the result is passed to
# list.extend() below - TODO confirm.
with open(os.path.join(datasets, "Full_Schema_1.json"), encoding='utf-8') as f:
    json_data = f.read()
    syn2_dict = json.loads(json_data)

with open(os.path.join(datasets, "Full_Schema.json"), encoding='utf-8') as f:
    json_data = f.read()
    syn3_dict = json.loads(json_data)

# Final order in `dataset`: imdb, Synthetic_2, Full_Schema_1, Full_Schema.
dataset.extend(syn1_dict)
dataset.extend(syn2_dict)
dataset.extend(syn3_dict)
print(len(dataset))

In [None]:
# finding RTL and NTL paths list
# Module-level accumulators used by do_walk():
#   stack      - keys on the path from the document root to the current node
#   final_dict - "/"-joined key path -> leaf value (or an EMPTY_* marker)
#   all_keys   - every key that lies on at least one recorded path
stack = []
final_dict = {}
all_keys = set()


def _record_leaf(leaf):
    """Store `leaf` under the current key path and remember the keys on it."""
    all_keys.update(stack)
    final_dict["/".join(stack)] = leaf


def do_walk(datadict):
    """Recursively walk a parsed JSON value and record its key paths.

    Results are written to the module-level `final_dict` and `all_keys`.
    String leaves are stored with their value, empty dicts/lists with the
    markers "EMPTY_DICT" / 'EMPTY_LIST'. Leaves of any other type (numbers,
    booleans, None) are not recorded. List elements do not add a path
    component, so strings inside a list overwrite each other under the
    path of the enclosing key.
    """
    if isinstance(datadict, dict):
        for key, value in datadict.items():
            stack.append(key)
            if isinstance(value, str):
                _record_leaf(value)
            elif isinstance(value, dict):
                if not value:
                    _record_leaf("EMPTY_DICT")
                do_walk(value)
            elif isinstance(value, list):
                if not value:
                    _record_leaf('EMPTY_LIST')
                do_walk(value)
            stack.pop()
    elif isinstance(datadict, list):
        for element in datadict:
            if isinstance(element, (dict, list)):
                do_walk(element)
            elif isinstance(element, str):
                _record_leaf(element)

# One entry per document: the set of keys, and the recorded key paths (stored
# twice, once as the working copy for the NTL expansion and once as RTL paths).
keys_list = []
ntl_paths_list_util = []
rtl_paths_list = []
for document in dataset:
    do_walk(document)
    keys_list.append(all_keys)
    ntl_paths_list_util.append(list(final_dict))
    rtl_paths_list.append(list(final_dict))
    # do_walk() writes into these module-level accumulators, so rebind them to
    # fresh objects before walking the next document.
    final_dict, all_keys = {}, set()

def flatten(t):
    """Concatenate the sub-sequences of `t` into one flat list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat


def _path_suffixes(path):
    """Return every suffix of `path` that starts right after a "/" separator."""
    return [path[match.start() + 1:] for match in re.finditer("/", path)]


# For every document, extend its key paths with all of their "/"-suffixes
# (e.g. "book/doi" also contributes "doi") and de-duplicate the result.
final_append_array = []
for document in ntl_paths_list_util:
    # Collect into a separate set instead of appending to `document` while
    # iterating over it: the input list stays untouched and no suffix list from
    # a previous path can be re-used by accident. The resulting set of paths is
    # the same; the order of list(set(...)) is arbitrary either way.
    expanded = set(document)
    for path in document:
        if path is not None:
            expanded.update(_path_suffixes(path))
    final_append_array.append(list(expanded))
ntl_paths_list = final_append_array

ntl_paths_list[-1]

['id/$oid',
 'book/doi',
 'booktitle',
 'author',
 'book/booktitle',
 '$oid',
 'doi',
 'book/author']

In [None]:
# Peek at the key paths of three documents starting at index 13797, the index
# other cells of this notebook use to switch from content-based sentences to
# path-based sentences.
rtl_paths_list[13797:13800]

[['id/$oid',
  'article/author',
  'article/journal',
  'article/title',
  'article/doi'],
 ['id/$oid',
  'article/author',
  'article/journal',
  'article/title',
  'article/doi'],
 ['id/$oid',
  'article/author',
  'article/journal',
  'article/title',
  'article/doi']]

In [None]:
# preparing sentences to be fed into bert
def _paths_to_phrases(paths):
    """Group consecutive key paths that share a first component into phrases.

    A path without "/" becomes a phrase on its own. A run of consecutive
    paths "root/a", "root/b", ... becomes "root a b ...". A single child
    that is a substring of its root is dropped and only the root is kept.
    """
    phrases = []
    i = 0
    while i < len(paths):
        root, separator, remainder = paths[i].partition("/")
        i += 1
        if not separator:
            phrases.append(root)
            continue

        children = [remainder]
        # Match on the "root/" prefix. A plain substring test would also pull
        # in unrelated paths that merely contain the root text somewhere
        # (e.g. "video/title" following "id/$oid").
        prefix = root + "/"
        while i < len(paths) and paths[i].startswith(prefix):
            children.append(paths[i][len(prefix):])
            i += 1

        if len(children) == 1 and children[0] in root:
            phrases.append(root)
        else:
            phrases.append(root + " " + ' '.join(children))
    return phrases


# One list of phrases per document, later turned into a sentence for BERT.
sent_list = [_paths_to_phrases(paths) for paths in rtl_paths_list]

sent_list[13797:13800]


[['id $oid', 'article author journal title doi'],
 ['id $oid', 'article author journal title doi'],
 ['id $oid', 'article author journal title doi']]

In [None]:
# getting models ready

# Location of the sentence-transformers BERT checkpoint. Set the
# BERT_MODEL_PATH environment variable to run the notebook on another machine;
# the default is the path that was previously hard-coded here.
BERT_MODEL_PATH = os.environ.get(
    "BERT_MODEL_PATH",
    'C:/Users/desik/.cache/torch/sentence_transformers/sentence-transformers_bert-base-nli-mean-tokens',
)
bert_model = SentenceTransformer(BERT_MODEL_PATH)
# roberta_model = SentenceTransformer('C:/Users/desik/.cache/torch/sentence_transformers/sentence-transformers_all-roberta-large-v1')
# electra_model = SentenceTransformer('C:/Users/desik/.cache/torch/sentence_transformers/ddobokki_electra-small-nli-sts')

In [None]:
# to get meaningful words out of the labels

def viterbi_segment(text):
    """Split `text` into the most probable sequence of words.

    Dynamic programming over prefixes of `text`, scoring candidate words with
    `word_prob` and considering words of at most `max_word_length` characters.

    Returns a tuple (list of words, probability of that segmentation).
    """
    best_prob = [1.0]   # best_prob[k]: probability of the best split of text[:k]
    back_ptr = [0]      # back_ptr[k]: start index of the last word in that split
    for end in range(1, len(text) + 1):
        first_start = max(0, end - max_word_length)
        candidates = [
            (best_prob[start] * word_prob(text[start:end]), start)
            for start in range(first_start, end)
        ]
        # Tuples compare by probability first, then by start index.
        prob_end, start_end = max(candidates)
        best_prob.append(prob_end)
        back_ptr.append(start_end)

    # Follow the back pointers from the end of the text to recover the words.
    segments = []
    end = len(text)
    while end > 0:
        begin = back_ptr[end]
        segments.append(text[begin:end])
        end = begin
    return segments[::-1], best_prob[-1]

def word_prob(word):
    """Relative frequency of `word` in the corpus counts (0.0 for unseen words)."""
    return dictionary[word] / total


def words(text):
    """Lower-case `text` and return its runs of the letters a-z."""
    return re.findall('[a-z]+', text.lower())


# Word counts from the reference corpus. The file is opened in a `with` block
# so the handle is closed as soon as the text has been read.
with open('../datasets/big.txt') as corpus_file:
    dictionary = Counter(words(corpus_file.read()))
max_word_length = max(map(len, dictionary))
total = float(sum(dictionary.values()))

## Contextual similarity with just the RTL paths of the JSON documents

In [None]:
# formatting keys to more meaningful words
def string_replace_meaningful(key_list):
    """Rewrite each phrase of `key_list` into more readable words, in place.

    Every whitespace-separated token is passed through a fixed sequence of
    substring replacements (abbreviation expansion, removal of id/url
    fragments) and then split into words with `viterbi_segment`. Phrases with
    more than two tokens get " and " before their last token.

    Returns the same list object that was passed in.
    """
    # Applied one after another, in this order.
    substitutions = (
        ('cinematgrs', 'cinematographers'),
        ('costdes', 'costumedes'),
        ('aka', 'alternate'),
        ('proddesi', 'productiondesi'),
        ('doi', 'digitalidentifier'),
        ('publtype', 'publishingtype'),
        ('ref', 'reference'),
        ("_id", ''),
        ("$oid", ''),
        ('_key', ''),
        ('imdb', 'moviedatabase'),
        ('url', ''),
    )

    for k, phrase in enumerate(key_list):
        tokens = []
        for token in phrase.split():
            for old, new in substitutions:
                token = token.replace(old, new)
            tokens.append(' '.join(viterbi_segment(token)[0]))

        if len(tokens) > 2:
            middle = " ".join(tokens[1:-1])
            key_list[k] = tokens[0] + " " + middle + " and " + tokens[-1]
        else:
            # zero, one or two tokens: plain space-joined text
            key_list[k] = " ".join(tokens)
    return key_list

In [None]:
# converting RTL paths to sentence
# One "Document has ..." sentence per document, built from its path phrases.
key_list_sentences = []
for phrases in sent_list:
    # string_replace_meaningful() edits its argument in place, so pass a copy.
    readable = string_replace_meaningful(list(phrases))
    key_list_sentences.append("Document has " + ', '.join(readable))

key_list_sentences[13797]


'Document has id , article author journal title and digital identifier'

In [None]:
# to return bert embeddings
import random
def embeddings(sentence_list):
    """Encode `sentence_list` with `bert_model` and return the result.

    Prints "done." once encoding has finished.
    """
    encoded = bert_model.encode(sentence_list)
    print("done.")
    return encoded

In [None]:
# Quick check of the segmenter on a single word.
viterbi_segment("identifier")

(['identifier'], 9.047317470370035e-07)

In [None]:
# bert_scores = similarity_scores(key_list_sentences)

## Contextual similarity with Content of JSON documents

In [None]:
# converting JSON document into sentences
def convert(doc) -> str:
    """Turn a JSON document (dict) into a flat "<key> has <value>" sentence.

    Nested dicts are converted recursively, lists are delegated to
    `convert_list`, and every other value is rendered with str().
    """
    # key-value pairs that do not bring any contextual meaning are ignored
    skipped_keys = {'movieid', 'crossref', '_id', '_key', 'ee', 'url',
                    'pages', 'number', 'imdburl'}

    parts = []
    for key, value in doc.items():
        if key in skipped_keys:
            continue
        if type(value) is dict:
            parts.append(f' {key} has {convert(value)} ')
        elif type(value) is list:
            parts.append(f' {key} has {convert_list(key, value)} ')
        else:
            parts.append(f" {key} has " + str(value))
    return ''.join(parts)

def convert_list(key_v, values):
    """Render the list stored under key `key_v` as a sentence fragment.

    - An empty list gives ''.
    - A list whose first element is not a dict gives its elements joined by
      single spaces (each element rendered with str()).
    - For the cast/crew keys listed below only the field names that occur in
      the items are emitted, in order of first appearance.
    - For any other list of dicts the values are grouped per field name and
      emitted as "<field> <value> <value> ...".
    """
    if not values:
        # Nothing to describe. Indexing values[0] below would raise IndexError.
        return ''

    if type(values[0]) != dict:
        # str() so that numbers and other non-string items do not break join().
        return ' '.join(str(item) for item in values)

    sent = ' '

    if key_v in ['actors', 'directors', 'producers', 'cinematgrs', 'costdesigners', 'misc', 'editors', 'composers', 'writers']:
        # A dict is used as an insertion-ordered set, so the field order does
        # not depend on string hashing and is the same on every run.
        seen_fields = {}
        for item in values:
            for key in item:
                seen_fields.setdefault(key, None)

        for field in seen_fields:
            sent = sent + field + " "
    else:
        # Seed with the fields of the first item so they lead the output.
        dict_ret = {key: [] for key in values[0]}
        for item in values:
            for key, value in item.items():
                if type(value) == list:
                    value = ", ".join(str(part) for part in value)
                dict_ret.setdefault(key, []).append(str(value))

        for key, collected in dict_ret.items():
            sent = sent + " " + key + " "
            sent = sent + " " + " ".join(collected)

    return sent

# Content-based sentences for the first 13797 documents of `dataset`.
sentences = [convert(dataset[idx]) for idx in range(13797)]

sentences[-10:-1]


[" title has 'Adam 12' (1990) (Real Estate Scam (#1.8)) actors has  sex akaname name   directors has  name   producers has  name   writers has  name   releasedates has   country  USA imdbdate  12 November 1990 releasedate  1990-11-12 addition  null ",
 " title has 'Adam 12' (1990) (Teach the Children (#1.9)) actors has  sex akaname name   directors has  name   producers has  name   writers has  name   releasedates has   country  USA imdbdate  19 November 1990 releasedate  1990-11-19 addition  null ",
 " title has 'Adam 12' (1990) (The Landlord (#1.14)) actors has  sex akaname name   directors has  name   producers has  name   writers has  name   releasedates has   country  USA imdbdate  24 December 1990 releasedate  1990-12-24 addition  null ",
 " title has 'Adam 12' (1990) (The Sniper (#1.1)) actors has  sex akaname name   directors has  name   producers has  name   writers has  name   releasedates has   country  USA imdbdate  24 September 1990 releasedate  1990-09-24 addition  null "

In [None]:
# function to clean the sentences for BERT
import unicodedata
def clean_sentence(val):
    """Strip punctuation and underscores from `val`.

    Removes every run of characters that are neither whitespace nor word
    characters, and every underscore. Case and stop words are left untouched.
    """
    # Raw string: in a normal literal "\s" and "\w" are invalid escape sequences.
    return re.sub(r'([^\s\w]|_)+', '', val)


def remove_accents(input_str):
    """Return `input_str` with combining marks (accents) removed.

    The text is decomposed with NFKD normalisation and every combining
    character is then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

def string_replace(sentence):
    """Expand abbreviations in every string of the list `sentence`, in place.

    Each whitespace-separated token is passed through a fixed sequence of
    substring replacements and the tokens are joined back with single spaces.

    Returns the same list object that was passed in.
    """
    # Applied one after another, in this order.
    substitutions = (
        ('cinematgrs', 'cinematographers'),
        ('costdes', 'costume des'),
        ('akan', 'alternate n'),
        ('proddesi', 'production desi'),
        ('doi', 'digital identifier'),
        ('publtype', 'publishing type'),
        ('ref', 'reference'),
        ("_id", ''),
        ("$oid", ''),
        ('_key', ''),
        ('imdb', 'movie database '),
        ('url', ''),
    )

    for idx, text in enumerate(sentence):
        rewritten = []
        for token in text.split():
            for old, new in substitutions:
                token = token.replace(old, new)
            rewritten.append(token)
        sentence[idx] = " ".join(rewritten)

    return sentence

# Sanity check: number of content-based sentences built so far.
print(len(sentences))

13797


In [None]:
# Strip punctuation first, then accents, from every content sentence.
for idx, raw_sentence in enumerate(sentences):
    sentences[idx] = remove_accents(clean_sentence(raw_sentence))

# string_replace() edits the list in place and returns it, so `sentences_f`
# and `sentences` refer to the same list object.
sentences_f = string_replace(sentences)

len(sentences_f)

# with open('sentences_taken.txt', 'w', encoding="utf-8") as f:
#     f.write('\n'.join(sentences_f))

13797

In [None]:
# Example of a path-based sentence.
key_list_sentences[13799]

'Document has id , article author journal title and digital identifier'

In [None]:
# finding similarity scores from BERT
import warnings
warnings.filterwarnings("ignore")

# Documents from index 13797 up to 30000 are represented by their path-based
# sentences. Slice assignment is used instead of extend() so that re-running
# this cell replaces the tail rather than appending a second copy; on the
# first run (13797 content sentences) the result is the same as extend().
sentences_f[13797:] = key_list_sentences[13797:30000]
# sentences_f[6900:7100]

In [None]:
# Total number of sentences that will be embedded.
len(sentences_f)

30000

In [None]:
# Embed every sentence; despite the name, `bert_scores` holds whatever
# embeddings() returns for the sentences, one entry per sentence.
bert_scores = embeddings(sentences_f)
len(bert_scores)

done.


30000

### Clustering with just the contextual scores

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Two clusters, average linkage, cosine distance between embeddings.
# Newer scikit-learn releases name the distance argument `metric`, and the
# older `affinity` keyword has been removed there; fall back to `affinity`
# for versions that do not know `metric` yet.
try:
    cluster_model = AgglomerativeClustering(n_clusters=2, metric='cosine', linkage='average')
except TypeError:
    cluster_model = AgglomerativeClustering(n_clusters=2, affinity='cosine', linkage='average')
cluster_model.fit(bert_scores)

In [None]:
# Indices of the sentences that were assigned to cluster 0.
np.where(cluster_model.labels_ == 0)

(array([    0,     1,     2, ..., 13794, 13795, 13796], dtype=int64),)

## Semantic Similarity with Wordnet

In [None]:
# using Michael Lesk algorithm for word sense disambiguation
class Lesk(object):
    """Lesk-style word sense disambiguation for the words of one sentence.

    `sentence` is a list of words. `meanings` maps every word to the name of
    the WordNet synset chosen for it, or '' while no sense has been chosen.
    """

    def __init__(self, sentence):
        self.sentence = sentence
        self.meanings = {}
        for word in sentence:
            self.meanings[word] = ''

    def getSenses(self, word):
        """Return the WordNet synsets of the lower-cased `word`."""
        return wn.synsets(word.lower())

    def getGloss(self, senses):
        """Map each synset name in `senses` to the tokens of its definition."""
        gloss = {}
        for sense in senses:
            gloss[sense.name()] = []
        for sense in senses:
            gloss[sense.name()] += word_tokenize(sense.definition())
        return gloss

    def getAll(self, word):
        """Return the glosses of all senses of `word`.

        A word without senses yields {lower-cased word: []}.
        """
        senses = self.getSenses(word)
        if not senses:
            return {word.lower(): senses}
        return self.getGloss(senses)

    def Score(self, set1, set2):
        """Count the items of `set1` that also occur in `set2`."""
        return sum(1 for word in set1 if word in set2)

    def overlapScore(self, word1, word2):
        """Find the sense of `word1` whose gloss overlaps most with `word2`.

        If a sense has already been chosen for `word2`, only the gloss of that
        sense is used; otherwise the glosses of all its senses are used.

        Returns (sense name, overlap), or (None, 0) if no sense overlaps.
        """
        gloss_set1 = self.getAll(word1)
        if self.meanings[word2] == '':
            gloss_set2 = self.getAll(word2)
        else:
            gloss_set2 = self.getGloss([wn.synset(self.meanings[word2])])

        score = {}
        for name1, tokens1 in gloss_set1.items():
            score[name1] = sum(
                self.Score(tokens1, tokens2) for tokens2 in gloss_set2.values()
            )

        bestSense = None
        max_score = 0
        for name1, overlap in score.items():
            if overlap > max_score:
                max_score = overlap
                bestSense = name1

        return bestSense, max_score

    def lesk(self, word, sentence):
        """Choose the sense of `word` given the other words in `sentence`.

        The chosen synset name is stored in `self.meanings[word]`.

        Returns (word, synset name, synset definition), or (word, None, None)
        if the word has no WordNet senses.
        """
        senses = self.getSenses(word)
        if not senses:
            # Without senses nothing can be scored, so return before doing any
            # gloss lookups for the context words.
            return word, None, None

        meaning = {sense.name(): 0 for sense in senses}

        for word_context in sentence:
            if word == word_context:
                continue
            best_sense, overlap = self.overlapScore(word, word_context)
            if best_sense is None:
                continue
            meaning[best_sense] += overlap

        # On ties the first sense returned by WordNet wins.
        self.meanings[word] = max(meaning, key=meaning.get)

        return word, self.meanings[word], wn.synset(self.meanings[word]).definition()

In [None]:
from scipy import spatial
from nltk.metrics import edit_distance

def path(set1, set2):
    """Return `wn.path_similarity(set1, set2)` for the two given synsets."""
    return wn.path_similarity(set1, set2)


def wup(set1, set2):
    """Return `wn.wup_similarity(set1, set2)` for the two given synsets."""
    return wn.wup_similarity(set1, set2)


def edit(word1, word2):
    """String similarity of two words derived from their edit distance.

    Returns 1.0 for identical words and 1 / distance otherwise, so the value
    never increases as the words drift further apart.
    """
    distance = float(edit_distance(word1, word2))
    if distance == 0.0:
        # Identical words are the most similar pair possible; scoring them 0.0
        # would rank them below every pair of different words.
        return 1.0
    return 1.0 / distance

In [None]:
def tokenize(q1):
    """Split the string `q1` into tokens with NLTK's `word_tokenize`."""
    return word_tokenize(q1)


def posTag(q1):
    """Return the result of `nltk.pos_tag` for the token list `q1`."""
    return nltk.pos_tag(q1)


def stemmer(tag_q1):
    """Apply `stem` to every element of `tag_q1` and return the results as a list.

    NOTE(review): no `stem` function is defined or imported in the visible
    part of this notebook, so calling this would raise NameError unless
    `stem` is provided elsewhere - confirm before use.
    """
    stem_q1 = []

    for token in tag_q1:
        stem_q1.append(stem(token))

    return stem_q1

In [None]:
def computePath(q1, q2):
    """Build the pairwise similarity matrix of two disambiguated sentences.

    Each entry of `q1` / `q2` is indexed as entry[0] = word and entry[1] =
    synset name (or None). WordNet path similarity is used when both entries
    have a synset; the edit-distance similarity of the words is used when
    either synset is missing or WordNet returns no similarity.

    Returns an array of shape (len(q1), len(q2)).
    """
    R = np.zeros((len(q1), len(q2)))

    for i, j in np.ndindex(*R.shape):
        left, right = q1[i], q2[j]
        sim = None
        if left[1] is not None and right[1] is not None:
            sim = path(wn.synset(left[1]), wn.synset(right[1]))
        if sim is None:
            sim = edit(left[0], right[0])  # using edit distance
        R[i, j] = sim

    return R

In [None]:
def computeWup(q1, q2):
    """Build the pairwise Wu-Palmer similarity matrix of two sentences.

    Each entry of `q1` / `q2` is indexed as entry[0] = word and entry[1] =
    synset name (or None). `wup` is used when both entries have a synset;
    the edit-distance similarity of the words is used when either synset is
    missing or WordNet returns no similarity.

    Returns an array of shape (len(q1), len(q2)).
    """
    R = np.zeros((len(q1), len(q2)))

    for i, j in np.ndindex(*R.shape):
        left, right = q1[i], q2[j]
        sim = None
        if left[1] is not None and right[1] is not None:
            sim = wup(wn.synset(left[1]), wn.synset(right[1]))
        if sim is None:
            sim = edit(left[0], right[0])
        R[i, j] = sim

    return R

In [None]:
def overallSim(q1, q2, R):
    """Combine a pairwise similarity matrix into one sentence-level score.

    `R[i, j]` is the similarity between item i of `q1` and item j of `q2`.
    The score adds the best match of every `q1` item (row maxima) to the best
    match of every `q2` item (column maxima) and divides by
    2 * (len(q1) + len(q2)). Maxima are floored at 0.0.

    Returns 0.0 when both inputs are empty.
    """
    n_rows, n_cols = len(q1), len(q2)
    if n_rows + n_cols == 0:
        return 0.0

    # Best match in q2 for each item of q1.
    sum_X = 0.0
    for i in range(n_rows):
        sum_X += max([0.0] + [R[i, j] for j in range(n_cols)])

    # Best match in q1 for each item of q2. This has to walk the columns;
    # walking the rows a second time would just add sum_X again.
    sum_Y = 0.0
    for j in range(n_cols):
        sum_Y += max([0.0] + [R[i, j] for i in range(n_rows)])

    # NOTE(review): with the factor 2 in the denominator a perfect match scores
    # 0.5 rather than 1.0 - confirm this scaling is intended.
    return (sum_X + sum_Y) / (2 * float(n_rows + n_cols))

In [None]:
# NOTE(review): words() is called without a language, so this presumably
# returns the stop words of every language bundled with NLTK - confirm that
# this is intended rather than words('english').
STOP_WORDS = nltk.corpus.stopwords.words()
# Set copy for constant-time membership tests; STOP_WORDS itself stays a list.
_STOP_WORD_SET = frozenset(STOP_WORDS)


def clean_sentence_nltk(val):
    """Remove chars that are not letters or numbers, downcase, then remove stop words.

    The text is split on single spaces, so the spacing between the remaining
    words is preserved.
    """
    # Raw string: in a normal literal "\s" and "\w" are invalid escape sequences.
    regex = re.compile(r'([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    kept = [word for word in sentence.split(" ") if word not in _STOP_WORD_SET]
    return " ".join(kept)

In [None]:
def sentenceProcessing(s1):
    """Disambiguate the content words of the sentence `s1`.

    The sentence is tokenised and POS-tagged; tokens whose tag contains
    'NN', 'JJ' or 'VB' are kept and each of them is passed to `Lesk.lesk`.

    Returns the list of `Lesk.lesk` results, one per kept token.
    """
    tagged = posTag(tokenize(s1))
    content_words = [
        pair[0]
        for pair in tagged
        if any(marker in pair[1] for marker in ('NN', 'JJ', 'VB'))
    ]

    disambiguator = Lesk(content_words)
    return [disambiguator.lesk(word, content_words) for word in content_words]

def semanticSimilarity(s1, s2):
    """Score two disambiguated sentences.

    The path-based and Wu-Palmer similarity matrices are averaged element by
    element and the result is reduced to one number by `overallSim`.
    """
    path_matrix = computePath(s1, s2)
    wup_matrix = computeWup(s1, s2)
    return overallSim(s1, s2, (path_matrix + wup_matrix) / 2)

In [None]:
def prepareSent(s1):
    """Return the `sentenceProcessing` result for the sentence `s1`."""
    return sentenceProcessing(s1)

In [None]:
# Word senses for the first 10000 sentences; progress is printed every 100.
inputs = sentences_f[0:10000]
answer = []
for i, sentence in enumerate(inputs):
    answer.append(prepareSent(sentence))
    if i % 100 == 0:
        print(i)


In [None]:
# Word senses for the remaining sentences, appended to `answer`.
# The printed counter restarts at 0 because it indexes into this slice.
inputs = sentences_f[10000:]
for i, sentence in enumerate(inputs):
    answer.append(prepareSent(sentence))
    if i % 100 == 0:
        print(i)

# note: this and the above cell will take a long time to run, given the fact that WordNet goes through individual words and finds the best fit for it. 

In [None]:
# writing all the outputs into files

def _write_json_line(filename, payload):
    """Write `payload` as one line of JSON to `filename` inside `outputs`."""
    with open(os.path.join(outputs, filename), 'w') as f:
        f.write(json.dumps(payload))
        f.write('\n')


# compiling the BERT embeddings, wordnet embeddings and the NTL paths into one:
# each row is [index, embedding as list, word senses, NTL paths]
final_list = [
    [i, bert_scores[i].tolist(), answer[i], ntl_paths_list[i]]
    for i in range(len(bert_scores))
]

# Create the output directory up front so the results of the long-running
# cells are not lost to a missing folder.
os.makedirs(outputs, exist_ok=True)

_write_json_line("scores_list.json", final_list)
_write_json_line("dataset_f.json", dataset[0:30000])
_write_json_line("sentences.json", sentences_f)