In [1]:
import numpy as np
import tensorflow as tf
from IPython import display
from scipy.spatial.distance import cosine
import json

In [2]:
# Suffixes that dissect_word() strips from the end of a word.
# NOTE: '-' and 'ը' are two separate entries. Without the comma between them
# Python silently concatenates the adjacent literals into the single string
# '-ը', so neither '-' nor 'ը' would ever be recognised on its own.
endings = ['ություն', 'ության', 'ությամբ', 'ներ', 'ով', 'եր', 'իկ', '-', 'ը', 'ի', 'ն']
# Longest first: dissect_word() takes the first entry that matches, so this
# ordering guarantees the longest matching suffix wins.
endings = sorted(endings, key=len)[::-1]

def dissect_word(word, suffixes=None, max_suffixes=5):
    """Split a word into ``[prefix, stem, suffix, ...]`` pieces.

    Suffixes are peeled off the end of ``word`` one at a time (at most
    ``max_suffixes`` of them), always preferring the longest candidate that
    matches. What is left is then checked for a leading 'չ' or 'ան' prefix,
    which is split off as its own piece.

    Parameters
    ----------
    word : str
        The word to split.
    suffixes : iterable of str, optional
        Candidate suffixes. Defaults to the module-level ``endings`` list,
        which is already ordered longest-first.
    max_suffixes : int, optional
        Upper bound on how many suffixes are stripped (default 5).

    Returns
    -------
    list of str
        Pieces in reading order: optional prefix, stem (if anything is left
        after stripping), then the suffixes from innermost to outermost.
        An empty ``word`` yields an empty list. No piece is ever an empty
        string.
    """
    if suffixes is None:
        candidates = endings
    else:
        # Longest first, so the first hit is the longest matching suffix.
        candidates = sorted(suffixes, key=len, reverse=True)

    stripped = []  # suffixes in the order they were removed (outermost first)
    for _ in range(max_suffixes):
        # `e and ...` skips empty candidates, which would otherwise match everything.
        match = next((e for e in candidates if e and word.endswith(e)), None)
        if match is None:
            break
        stripped.append(match)
        word = word[:-len(match)]

    pieces = []
    if word:
        for prefix in ('չ', 'ան'):
            # Only split the prefix off when a non-empty stem remains;
            # otherwise an empty-string piece would be produced.
            if word.startswith(prefix) and len(word) > len(prefix):
                pieces = [prefix, word[len(prefix):]]
                break
        else:
            pieces = [word]

    return pieces + stripped[::-1]

In [7]:
# Load the embedding table (token -> vector) from JSON.
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps non-ASCII keys readable regardless of the platform default.
with open('embeddings_e0_w2600.0K.json', encoding='utf-8') as f:
    embeddings = json.load(f)

In [8]:
def find_related(word, cnt=10):
    """
    Print the ``cnt`` most similar and the ``cnt`` most opposite vocabulary entries.

    ``word`` is split with ``dissect_word`` and the embeddings of its pieces
    are summed into one query vector. Every entry of ``embeddings`` is then
    ranked by cosine similarity to that vector.

    Parameters
    ----------
    word : str
        Word to look up.
    cnt : int, optional
        Number of entries to print in each list (default 10). ``0`` prints
        only the two headers.

    Raises
    ------
    ValueError
        If ``cnt`` is negative or ``word`` dissects into no pieces.
    KeyError
        If any piece of ``word`` has no entry in ``embeddings``.
    """
    if cnt < 0:
        raise ValueError('cnt must be non-negative, got {!r}'.format(cnt))

    pieces = dissect_word(word)   # dissect the word; the piece vectors are summed below
    if not pieces:
        raise ValueError('cannot look up an empty word: {!r}'.format(word))

    missing = [p for p in pieces if p not in embeddings]
    if missing:
        raise KeyError('no embedding for piece(s) {!r} of word {!r}'.format(missing, word))

    # np.array() copies, so the in-place additions never touch the stored embeddings.
    vectr = np.array(embeddings[pieces[0]], dtype=float)
    for sub_word in pieces[1:]:
        vectr += np.array(embeddings[sub_word])

    # cosine() returns a distance; 1 - distance is the cosine similarity.
    similarities = [(k, 1 - cosine(vectr, v)) for k, v in embeddings.items()]
    ranked = sorted(similarities, key=lambda kv: -kv[1])  # sort in descending order

    print('similar vecotrs:')
    for entry in ranked[:cnt]:
        print(entry)

    print('opposite vecotrs:')
    # ranked[-0:] is the whole list, so cnt == 0 needs an explicit empty result.
    for entry in (ranked[-cnt:] if cnt else []):
        print(entry)

## Testing space

In [17]:
# Print the 10 most similar and 10 most opposite entries (default cnt) for the token 'ը'.
find_related('ը')

similar vecotrs:
('ը', 1)
('ն', 0.8081993714329113)
('ի', 0.6876572820804548)
('պարոլ', 0.6706986500912953)
('ներ', 0.6545493341352883)
('ու', 0.6436490167073012)
('է', 0.6281370408801262)
(',', 0.6124258532791613)
('վալյուտա', 0.6044189222572811)
('ության', 0.5814067689774851)
opposite vecotrs:
('работу', -0.5566047722022978)
('կիսալուս', -0.5584423828834346)
('տվերսկոյ', -0.5612226697973601)
('կխեղ', -0.5625938215506401)
('թանբայ', -0.5647698929978628)
('հարգեք', -0.5668106807008777)
('գերել', -0.5720208172466053)
('թեյլոր', -0.5790197160512116)
('դեբետ', -0.5913799087776286)
('ժեշտ', -0.6026176210293799)


#### The 'ի' ending should appear more frequently alongside similar words and less frequently alongside words it cannot attach to

In [10]:
# Print the 10 most similar and 10 most opposite entries (default cnt) for the token 'ի'.
find_related('ի')

similar vecotrs:
('ի', 1)
('ն', 0.8908756997487611)
('ներ', 0.7708430504845388)
(',', 0.7631182913702186)
('ության', 0.7115620669801979)
('։', 0.6901648397090749)
('ը', 0.6876572820804548)
('ե', 0.6777774144581552)
('ություն', 0.66649692161346)
('է', 0.6392816525653607)
opposite vecotrs:
('համարձակվ', -0.5438636374757575)
('քաղքակա', -0.5492303847498148)
('դանիեբեկյա', -0.5520040529038437)
('գաղտնագր', -0.5528212663994394)
('հրապակում', -0.5649740731036768)
('թմբլ', -0.5654807391693919)
('զլանում', -0.5733801014674524)
('արտահանվ', -0.5786949064347655)
('ld', -0.5818054363610452)
('կոմեդիա', -0.5947091593047751)
