# Concept Similarity

In [20]:
import nltk
import math
import scipy
import re

from nltk.corpus import wordnet as wn
from scipy import stats as st

## Wu & Palmer

In [21]:
def depth(s):
    return min([len(path) for path in s.hypernym_paths()]) #min dist root

In [22]:
def lcs(s1, s2): #lowest common synonym
    paths1 = s1.hypernym_paths()
    paths2 = s2.hypernym_paths()
    lcs =  None
    min_path = math.inf
    for i in range(0, len(paths1)):
        for j in range(0, len(paths2)):
            n = min(len(paths1[i]), len(paths2[j])) - 1 #min length path (-1 because the first index is 0)
            current_path = abs(len(paths1[i]) - len(paths2[j])) # initial distance from lcs
            while n >= 0 and current_path < min_path:
                if paths1[i][n] == paths2[j][n]: #bottom up search for lcs
                    lcs = paths1[i][n]
                    min_path = current_path
                n -= 1
                current_path += 2
            return (lcs, min_path)

In [33]:
def wu_palmer(s1, s2):
        lcs_pair = lcs(s1, s2)
        if lcs_pair[0]:
            lcs_depth = depth(lcs_pair[0])
            return 2 * lcs_depth / (2 * lcs_depth + lcs_pair[1]) #depth(s1) + depth(s2) = distance(s1, s2) passing by the lcs
        return 0

## Shortest Path

In [24]:
max_depth = max(max(len(hyp_path) for hyp_path in ss.hypernym_paths()) for ss in wn.all_synsets())
max_depth

20

In [34]:
def shortest_path(s1, s2):
        lcs_pair = lcs(s1, s2)
        if lcs_pair[0]:
            return 2 * max_depth - lcs_pair[1]
        return 0

## Leakcock & Chodorow

In [35]:
def leakcock_chodorow(s1, s2):
        lcs_pair = lcs(s1, s2)
        if lcs_pair[0]:
            return - math.log((lcs_pair[1] + 1)/(2 * max_depth + 1), 10)
        return 0

### Input

In [27]:
lines = []
with open("res\WordSim353.csv", "r") as file:
    for i in file.readlines():
            lines.append(i.strip().split(','))
#print(lines)

In [37]:
res_wp = []
res_sp = []
res_lc = []
human_res = []
for i in range(1, len(lines) - 1):
    human_res.append(float(lines[i][2]))
    set1 = wn.synsets(lines[i][0])
    set2 = wn.synsets(lines[i][1])
    maximum_wp = 0
    maximum_sp = 0
    maximum_lc = 0
    for syn1 in set1:
        for syn2 in set2:
            wp = wu_palmer(syn1, syn2)
            maximum_wp = max(maximum_wp, wp)
            
            sp = shortest_path(syn1, syn2)
            maximum_sp = max(maximum_sp, sp)
            
            lc = leakcock_chodorow(syn1, syn2)
            maximum_lc = max(maximum_lc, lc)
            
    res_wp.append(maximum_wp)
    res_sp.append(maximum_sp)
    res_lc.append(maximum_lc)

print("Pearson WP: ", st.pearsonr(res_wp, human_res)[0])
print("Pearson SP: ", st.pearsonr(res_sp, human_res)[0])
print("Pearson LC: ", st.pearsonr(res_lc, human_res)[0])
print("Spearman WP: ", st.spearmanr(res_wp, human_res)[0])
print("Spearman SP: ", st.spearmanr(res_sp, human_res)[0])
print("Spearman LC: ", st.spearmanr(res_lc, human_res)[0])

Pearson WP:  0.3133778521262506
Pearson SP:  0.18660311573550914
Pearson LC:  0.3274173550208445
Spearman WP:  0.3450816969196533
Spearman SP:  0.300305123168255
Spearman LC:  0.300305123168255


# WSD

## Lesk Algorithm

In [41]:
def lesk(word, sentence): #bag of words approach at WD
    best_sense = []
    max_overlap = -1
    context = set(sentence.split())
    for s in wn.synsets(word):
        signature = set(s.definition().split())
        for e in s.examples():
            signature.update(e.split())
        overlap = len(context & signature)
        if overlap > max_overlap:
            best_sense = s
            max_overlap = overlap
    return best_sense

### Input

In [44]:
words = []
sentences = []
with open("res\sentences.txt", "r") as file:
    for i, line in enumerate(file):
        if i >= 3 and i < 17:
            sentence = line.strip()[2:-1]
            word = re.findall(r"\*\*\w+\*\*", sentence)[0]
            words.append(word[2:-2])
            sentences.append(sentence)
#print(words)
#print(sentences)
#print(lesk(words[1], sentences[1]).lemmas())

In [45]:
for pair in zip(words, sentences):
    print(pair[1].replace("**" + pair[0] + "**", str([l.name() for l in lesk(pair[0], pair[1]).lemmas()])))

['arm'] bend at the elbow
Germany sells ['arm'] to Saudi Arabia
The ['key'] broke in the lock
The ['cardinal', 'central', 'fundamental', 'key', 'primal'] problem was not one of quality but of quantity
Work out the ['solution', 'answer', 'result', 'resolution', 'solvent'] in your head
Heat the ['solution', 'answer', 'result', 'resolution', 'solvent'] to 75Â° Celsius
The house was burnt to ['ash'] while the owner returned
This table is made of ['ash'] wood
The ['lunch', 'luncheon', 'tiffin', 'dejeuner'] with her boss took longer than she expected
She packed her ['lunch'] in her purse
The ['categorization', 'categorisation', 'classification', 'compartmentalization', 'compartmentalisation', 'assortment'] of the genetic data took two years
The journal Science published the ['categorization', 'categorisation', 'classification', 'compartmentalization', 'compartmentalisation', 'assortment'] this month
His cottage is near a small ['forest', 'wood', 'woods']
The statue was made out of a block of

## SemCor

In [46]:
from nltk.corpus import semcor as sc

In [47]:
tagged_sentences = [[str(c) for c in s] for s in sc.tagged_sents(tag='both')[:50]] #get first 50 sentences as sting lists
sentences = sc.sents()
correct_answers = 0
answers = 0
for i, s in enumerate(tagged_sentences):
    sentence = " ".join(sentences[i])
    for w in s:  
        if "Lemma" in w and "NN " in w:
            w = w.split(" (NN ")
            lemma = w[0][1:]
            word = w[1][:-2]
            p_lemma = lesk(word.replace(" ", "_"), sentence)
            if p_lemma and lemma in map(str, p_lemma.lemmas()):
                correct_answers += 1
            answers += 1
print("accuracy is: ", correct_answers/answers * 100, "%")

accuracy is:  50.90909090909091 %
