## Importing Libraries

In [1]:
from nltk.corpus import wordnet as wn
import nltk
import pandas as pd
import numpy as np
from scipy.stats import spearmanr as spc
from scipy.stats import pearsonr as pec

## Support Function

In [4]:
def pathsim(w1,w2):
    """Path-based similarity between two taxonomy concepts.

    Parameters
    ----------
    w1 : object exposing ``shortest_path_distance(other)`` (e.g. an NLTK synset)
    w2 : the concept passed to ``w1.shortest_path_distance``

    Returns
    -------
    float or None
        ``1 / (distance + 1)``, so a distance of 0 gives 1.0 and longer
        paths give smaller values; ``None`` when no path is reported.
    """
    # Shortest path between the two concepts
    distance = w1.shortest_path_distance(w2)

    # No connecting path: identity test, since None is a singleton
    if distance is None:
        return None
    # A path exists; assume the longer the distance, the lower the similarity
    return 1/(distance+1.0)

# Default similarity measure (used as the default `simOp` of wordsimCalc)
basOp = pathsim

# Calculate the similarity between two English words.
# Since no specific sense (lemma) is given, we take the best similarity among all paths
# between the senses of the two words.
# word1, word2 : the two words between which we have to find the similarity.
# simOp : the similarity measure we wish to apply.
### simOp : default : basOp (basic path-based similarity, nothing fancy); alternatives: lch_similarity, wup_similarity
def wordsimCalc(word1,word2,simOp=basOp):
    """Best similarity between two words over all pairs of their WordNet synsets.

    Every noun/verb synset of ``word1`` is compared with every synset of
    ``word2`` that has the same part of speech, and the highest score is kept.

    Parameters
    ----------
    word1, word2 : the two words to compare (each is passed to ``wn.synsets``).
    simOp : callable taking two synsets and returning a score, or ``None``
        when no path exists; defaults to ``basOp``.

    Returns
    -------
    The largest score found, or 0 when no comparable synset pair produced a
    score greater than 0 (including when either word has no synsets).
    """
    wsl1 = wn.synsets(word1)
    wsl2 = wn.synsets(word2)
    maxSim = 0
    for i in wsl1:
        # Only nouns and verbs are compared: per the original author's note,
        # adjectives have no IS-A link structure, so they have to be dropped.
        pos = i.pos()
        if pos != 'n' and pos != 'v':
            continue
        for j in wsl2:
            # Both synsets must belong to the same tree structure
            # (noun to noun, verb to verb); there is no path between noun and verb.
            if j.pos() != pos:
                continue
            sim = simOp(i,j)
            # The measure reports no path in the taxonomy
            if sim is None:
                continue
            # Keep the best score seen so far
            if sim > maxSim:
                maxSim = sim
    return maxSim



## Preparing for wordSim353
Steps :
1. Loading WordSim353.
2. Creating two different lists of words for similarity.
3. Calculate the similarity between words and store it in a list.
4. Correlate the calculated list with the WordSim353 similarity using Spearman rank correlation.
5. Show Correlation

In [5]:
# Load the word-pair table. Columns are read by position:
# 0 -> first word, 1 -> second word, 2 -> reference similarity.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
wordSimData = pd.read_csv('/home/sp/NLP_PA1/path_sim.csv')
w1list = wordSimData.values[:,0]
w2list = wordSimData.values[:,1]
humanSim = wordSimData.values[:,2]

# One slot per loaded pair. Sized from the data rather than a hard-coded 353 so
# that a file with a different number of rows neither overflows the array nor
# leaves unused trailing zeros that would distort the correlation.
calSim = np.zeros(len(w1list))

# Score every pair with Leacock-Chodorow similarity and echo the result
for i,(w1,w2) in enumerate(zip(w1list,w2list)):
    calSim[i] = wordsimCalc(w1,w2,simOp = wn.lch_similarity)
    print( w1, " <> ", w2 ," : ",calSim[i],"\n")
    

love  <>  sex  :  2.9444389791664407 

tiger  <>  cat  :  2.9444389791664407 

tiger  <>  tiger  :  3.6375861597263857 

book  <>  paper  :  2.538973871058276 

computer  <>  keyboard  :  2.2512917986064953 

computer  <>  internet  :  1.55814461804655 

plane  <>  car  :  1.6916760106710724 

train  <>  car  :  1.845826690498331 

telephone  <>  communication  :  1.2396908869280152 

television  <>  radio  :  2.538973871058276 

media  <>  radio  :  2.2512917986064953 

drug  <>  abuse  :  1.6486586255873816 

bread  <>  butter  :  2.2512917986064953 

cucumber  <>  potato  :  2.2512917986064953 

doctor  <>  nurse  :  2.2512917986064953 

professor  <>  doctor  :  1.6916760106710724 

student  <>  professor  :  1.55814461804655 

smart  <>  student  :  0.9985288301111273 

smart  <>  stupid  :  0.9985288301111273 

company  <>  stock  :  1.845826690498331 

stock  <>  market  :  1.6916760106710724 

stock  <>  phone  :  1.6916760106710724 

stock  <>  CD  :  1.845826690498331 

stock

school  <>  center  :  2.538973871058276 

reason  <>  hypertension  :  1.4403615823901665 

reason  <>  criterion  :  1.55814461804655 

hundred  <>  percent  :  1.072636802264849 

Harvard  <>  Yale  :  2.538973871058276 

hospital  <>  infrastructure  :  1.1526795099383855 

death  <>  row  :  1.55814461804655 

death  <>  inmate  :  1.2396908869280152 

lawyer  <>  evidence  :  1.2396908869280152 

life  <>  death  :  2.538973871058276 

life  <>  term  :  2.538973871058276 

word  <>  similarity  :  1.4403615823901665 

board  <>  recommendation  :  1.1526795099383855 

governor  <>  interview  :  0.9985288301111273 

OPEC  <>  country  :  1.6916760106710724 

peace  <>  atmosphere  :  2.0281482472922856 

peace  <>  insurance  :  2.2512917986064953 

territory  <>  kilometer  :  1.072636802264849 

travel  <>  activity  :  1.845826690498331 

competition  <>  price  :  1.55814461804655 

consumer  <>  confidence  :  1.1526795099383855 

consumer  <>  energy  :  1.3350010667323402

In [6]:
# Spearman rank correlation (spc) between computed and reference similarities
cors, pvals = spc(calSim, humanSim)
# Pearson correlation (pec) over the same two series
corp, pvalp = pec(calSim, humanSim)

In [7]:
# Report the Spearman rank correlation coefficient (cors)
print("Correlation is : ", cors, "\n")

Correlation is :  0.30119153189226205 

