In [1]:
import numpy as np
import tensorflow as tf
from IPython import display
from scipy.spatial.distance import cosine
import json

In [2]:
# Suffixes that dissect_word() strips from the end of a word.
# NOTE: every entry needs its own comma. Python concatenates adjacent string
# literals, so the former `'-' 'ը'` silently became the single entry '-ը' and
# neither '-' nor 'ը' was ever matched on its own.
endings = ['ություն', 'ության', 'ությամբ', 'ներ', 'ով', 'եր', 'իկ', '-', 'ը', 'ի', 'ն']
# Longest first: dissect_word() takes the first match, so a long suffix must be
# tried before a shorter suffix that it ends with (e.g. 'ություն' before 'ն').
endings = sorted(endings, key=len)[::-1]

def dissect_word(word, suffixes=None, max_suffixes=5):
    """Split a word into an optional prefix, a stem and trailing suffixes.

    Suffixes are peeled off the end of the word one at a time (at most
    ``max_suffixes`` of them); in each round the first entry of ``suffixes``
    that the word ends with is removed.  Whatever remains is the stem, from
    which a leading 'չ' or 'ան' is split off as a separate part.

    Args:
        word: The word to split.
        suffixes: Candidate suffixes, tried in the given order.  Defaults to
            the module-level ``endings`` list.
        max_suffixes: Upper bound on the number of suffixes removed.

    Returns:
        The parts in reading order: ``[prefix, stem, suffix, ...]``, where the
        prefix and the suffixes are present only if they were found.  A word
        consisting solely of suffixes yields just those suffixes, and an
        empty word yields ``[]``.
    """
    if suffixes is None:
        suffixes = endings

    stripped = []  # suffixes in removal order, i.e. outermost first
    for _ in range(max_suffixes):
        # Empty candidates are skipped: they would "match" without consuming anything.
        match = next((s for s in suffixes if s and word.endswith(s)), None)
        if match is None:
            break
        stripped.append(match)
        word = word[:-len(match)]

    parts = []
    if word:
        for prefix in ('չ', 'ան'):
            # Only split the prefix off when a stem is left behind; otherwise
            # an empty-string part would be emitted next to the prefix.
            if word.startswith(prefix) and len(word) > len(prefix):
                parts.append(prefix)
                word = word[len(prefix):]
                break
        parts.append(word)

    return parts + stripped[::-1]

In [7]:
# Load the embeddings mapping from its JSON file.
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps the read independent of the platform's default encoding.
with open('embeddings_e0_w2600.0K.json', encoding='utf-8') as f:
    embeddings = json.load(f)

In [8]:
def find_related(word, cnt=10):
    """Print the ``cnt`` embedding keys most and least similar to ``word``.

    The word is split with ``dissect_word`` and the embeddings of its parts
    are summed into one query vector, which is then compared (cosine
    similarity) against every entry of ``embeddings``.

    Args:
        word: Word to look up; every part returned by ``dissect_word`` must be
            a key of ``embeddings``.
        cnt: How many entries to print in each list.  Values below 1 print
            only the two headings.

    Raises:
        KeyError: If a part of the word has no embedding.
        IndexError: If ``dissect_word`` returns no parts (e.g. for ``''``).
    """
    parts = dissect_word(word)   # dissect the word, then combine its parts
    # dtype=float: the in-place += below must not fail if a stored vector
    # happens to contain only integers.
    query = np.array(embeddings[parts[0]], dtype=float)
    for part in parts[1:]:
        query += np.array(embeddings[part], dtype=float)

    # cosine() is a distance, so 1 - distance is the similarity.
    similarity = {key: 1 - cosine(query, vec) for key, vec in embeddings.items()}
    ranked = sorted(similarity.items(), key=lambda kv: -kv[1])  # descending order

    # A plain ranked[-cnt:] would return the *whole* list for cnt == 0
    # (because -0 == 0), so the count is clamped and the tail is sliced by
    # an explicit start index instead.
    cnt = max(cnt, 0)
    print('similar vecotrs:')
    for entry in ranked[:cnt]:
        print(entry)

    print('opposite vecotrs:')
    for entry in ranked[max(len(ranked) - cnt, 0):]:
        print(entry)

# here is the testing space

In [22]:
# 'ություն' is itself one of the suffixes in `endings`, so the query is that single suffix token.
find_related('ություն')

similar vecotrs:
('ություն', 1)
('ն', 0.7061836625614021)
('ի', 0.66649692161346)
('կզանգե', 0.6478759779472605)
('ներ', 0.6270914763817074)
('1', 0.6078124695090786)
(',', 0.6073003692493473)
('փոխհամաձայ', 0.6033341005771445)
('կուրորտշ', 0.5804929173678498)
('փոստանջյա', 0.557442885563298)
opposite vecotrs:
('մանկամիտ', -0.5349780946665303)
('նեպոմնյաշչ', -0.5357359503116403)
('թաիլանդակա', -0.5390802696200367)
('սփարքս', -0.5472791847532001)
('բալաստ', -0.5499673313109215)
('էվամակ', -0.5525679942681494)
('տրվող', -0.5643181984889902)
('միջմշակութայ', -0.5690861871475619)
('ինքնաարդարացմա', -0.5809619428808515)
('գրանիտ', -0.6108780407009975)


#### The 'ի' ending should appear more frequently with similar words and less frequently with words it cannot attach to

In [10]:
# Query the single-character suffix 'ի' (also an entry of `endings`) on its own.
find_related('ի')

similar vecotrs:
('ի', 1)
('ն', 0.8908756997487611)
('ներ', 0.7708430504845388)
(',', 0.7631182913702186)
('ության', 0.7115620669801979)
('։', 0.6901648397090749)
('ը', 0.6876572820804548)
('ե', 0.6777774144581552)
('ություն', 0.66649692161346)
('է', 0.6392816525653607)
opposite vecotrs:
('համարձակվ', -0.5438636374757575)
('քաղքակա', -0.5492303847498148)
('դանիեբեկյա', -0.5520040529038437)
('գաղտնագր', -0.5528212663994394)
('հրապակում', -0.5649740731036768)
('թմբլ', -0.5654807391693919)
('զլանում', -0.5733801014674524)
('արտահանվ', -0.5786949064347655)
('ld', -0.5818054363610452)
('կոմեդիա', -0.5947091593047751)


## Calculate the similarity between two words

In [43]:
def _word_vector(word):
    """Sum the embeddings of all parts ``dissect_word`` returns for ``word``."""
    parts = dissect_word(word)
    # dtype=float keeps the in-place += working even for all-integer vectors.
    total = np.array(embeddings[parts[0]], dtype=float)
    for part in parts[1:]:
        total += np.array(embeddings[part], dtype=float)
    return total


def calculate_similarity(word1, word2):
    """Return the cosine similarity between two words.

    Each word is split with ``dissect_word``; the embeddings of its parts are
    summed, and the two resulting vectors are compared.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        ``1 - cosine(v1, v2)``, where ``cosine`` is SciPy's cosine distance
        and ``v1``/``v2`` are the summed part embeddings of the two words.

    Raises:
        KeyError: If a part of either word has no entry in ``embeddings``.
        IndexError: If ``dissect_word`` returns no parts (e.g. for ``''``).
    """
    return 1 - cosine(_word_vector(word1), _word_vector(word2))

In [44]:
# 'պատերազմ' (war) vs. 'կռիվ' (fight)
calculate_similarity('պատերազմ', 'կռիվ')

0.2809797993013441

In [45]:
# 'տղա' (boy) vs. 'աղջիկ' (girl)
calculate_similarity('տղա', 'աղջիկ')

0.04139628725713562

In [46]:
# 'կին' (woman) vs. 'աղջիկ' (girl)
calculate_similarity('կին', 'աղջիկ')

0.16812541500754008

In [53]:
# 'ոչ' (no) vs. 'այո' (yes)
calculate_similarity('ոչ', 'այո')

-0.21690410942902938