### NN functions

In [1]:
def _cosine_similarity(word1, word2):
    # word1: [N, sum(Filters)]
    # word2: [voca, sum(filters)]
    word1 = np.array(word1)
    word2 = np.array(word2)
    dot = np.dot(word1, word2.T) # [N, voca]
    word1_size = np.sqrt(np.sum(np.square(word1), axis=-1)) # [N]
    word2_size = np.sqrt(np.sum(np.square(word2), axis=-1)) # [voca]
    size = np.multiply(word1_size.reshape(-1, 1), word2_size) # [N, voca]
    cosim = dot/size # [N, voca]
    return cosim # [N, voca]

def _top_k_cosine_similarity(current_word_embedding, voca_embedding, embedding_name, word, top_k=3, name=None):
    cosim = _cosine_similarity(current_word_embedding, voca_embedding)
#     print(cosim.shape)
    argsort = np.argsort(-cosim)[:, :top_k] # decreasing order
    NN = []
    for row in argsort:
        temp = []
        for col in row:
            temp.append(embedding_name[int(col)])
        NN.append(temp)
        
    top_k_cosim = np.array([cosim[index][i] for index, i in enumerate(argsort)]) # [N, top_k]

    print(name)
    for i in range(len(word)):
        print('input_word:', word[i])
        print(np.array(list(zip(NN[i], top_k_cosim[i]))))
        print()


### FastText

In [2]:
# Load the library
import fasttext
import json
model = fasttext.load_model("../../models/FastText/result/corpus_word_continuous.txt.bin")



In [3]:
import numpy as np
emb_dict ={}
with open('./vec_files/FastText.300.vec', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        word, vec = line.split(' ', 1)
        emb_dict[word] = np.fromstring(vec, sep=' ')

In [8]:
vectors = []
word = ['BAratasya','suKam','satyam','nityam','aham','AByAm','ahham','bahUhuni','sahasra-cakzo']
# word = ['drAvyate','saMgrasate','ayuDyati','pAtaye','dravyAn']
word = []
embed_space = []
for w in emb_dict.keys():
    embed_space.append(emb_dict[w])
for w in word:
    vectors.append(model.get_word_vector(w))
_top_k_cosine_similarity(vectors, embed_space, list(emb_dict.keys()), word, top_k=8, name='after_highway')

after_highway
input_word: tAByAm
[['tAByAm' '1.000000056177758']
 ['tayoH' '0.49548605706870197']
 ['tO' '0.4714334801003086']
 ['dvArO' '0.44562153380335845']
 ['iti-etAByAm' '0.4443622636209815']
 ['cakzuqByAm' '0.4076752226616034']
 ['BUByAm' '0.40690532204182556']
 ['tatas-tAByAm' '0.40588611021396054']]

input_word: ahham
[['siṁham' '0.4546009509466571']
 ['tam' '0.42580659124516135']
 ['tat' '0.4109035211604918']
 ['yaTA-aham' '0.395642486824646']
 ['vaṁSam' '0.3942224525806031']
 ['sa-asram' '0.39024018014114414']
 ['kaṁsam' '0.3888144790848053']
 ['sarvam' '0.3857804238939817']]

input_word: bahUhuni
[['bahuni' '0.6083027695450411']
 ['mandmatyoH' '0.5466141793526074']
 ['suprayogaviSiKa' '0.5306529275065406']
 ['hata-Adara-tayA' '0.5275621022344397']
 ['sa-saNgam-asaNgam-iti' '0.5232818076813854']
 ['bahavaH-tatra' '0.5193559600618802']
 ['deSa-antaram' '0.5177594114726974']
 ['pratiniDyoH' '0.5106806803959424']]

input_word: sahasra-cakzo
[['sahasraxaMRtra' '0.76442225247277'

### word2vec

In [9]:
f =  open('./Data/embedding_space_or_vocab.txt','r')
lines = f.readlines()
voca = set()
for line in lines:
    line = line.replace('\n','')
    voca.add(line)
f.close()
voca.remove('')
voca = list(voca)

In [10]:
import gensim
import sentencepiece as spm
sp_model = spm.SentencePieceProcessor()
sp_model.load('../../models/corpus_variants/Sentencepiece_Model'+'/model_vocab_32000.model')
oov = 0
vectors = []
words = word
# words = ['BAratasya','suKam','satyam']
saved_model = gensim.models.Word2Vec.load('../../models/word2vec/saved_models/word2vec.model')
embed_space = []

for w in voca:
    enc = sp_model.encode_as_pieces(w)
    vector = np.zeros(300)
    piece_size = 0
    for piece in enc:
        if piece in saved_model.wv.vocab:
            vector += saved_model.wv[piece]
            piece_size += 1

    if piece_size != 0:
        vector = vector/piece_size
    else:
        vector =  np.random.rand(300)
        oov += 1
    embed_space.append(vector)
    
for word in words:
    enc = sp_model.encode_as_pieces(word)
    vector = np.zeros(300)
    piece_size = 0
    for piece in enc:
        if piece in saved_model.wv.vocab:
            vector += saved_model.wv[piece]
            piece_size += 1

    if piece_size != 0:
        vector = vector/piece_size
    else:
        vector =  np.random.rand(300)
        oov += 1
    vectors.append(vector)
_top_k_cosine_similarity(vectors, embed_space, voca , words, top_k=5, name='after_highway')

after_highway
input_word: tAByAm
[['tAByAm' '0.9999999999999992']
 ['AvAByAm' '0.3375851360269416']
 ['dvAByAm' '0.32601343729616217']
 ['tayA' '0.3079213692836734']
 ['mayA' '0.3038272780186519']]

input_word: ahham
[['allAh' '0.7193251074620278']
 ['ahne' '0.7128906473693825']
 ['ahrasat' '0.7015612728601706']
 ['ahrasan' '0.698943611739984']
 ['ahrasam' '0.6938093441899283']]

input_word: bahUhuni
[['mAnini' '0.6013870385686777']
 ['sani' '0.5880590737609316']
 ['dahu' '0.5775882315337663']
 ['cUrRitAni' '0.5710922752912875']
 ['nivizwAni' '0.5709430918875563']]

input_word: sahasra-cakzo
[['pracakzva' '0.6249167257292929']
 ['suparRAH-ca' '0.582374904384294']
 ['pracakzmahe' '0.5613083197245341']
 ['vAlaKilyAH-ca' '0.5521646846738717']
 ['antarikzAt-' '0.5503355553623744']]



### Glove

In [11]:
emb_dict = {}
oov = 0
embed_space = []
vectors = []
with open('../../models/GloVe/saved_models/vectors.txt', 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        word, vec = line.split(' ', 1)
        emb_dict[word] = np.fromstring(vec, sep=' ')
for word in voca:
    enc = sp_model.encode_as_pieces(word)
    vector = np.zeros(300)
    piece_size = 0
    for piece in enc:
        if piece in emb_dict.keys():
            vector += emb_dict[piece]
            piece_size += 1

    if piece_size != 0:
        vector = vector/piece_size
    else:
        oov += 1
        vector =  np.random.rand(300)
    embed_space.append(vector)
for word in words:
    enc = sp_model.encode_as_pieces(word)
    vector = np.zeros(300)
    piece_size = 0
    for piece in enc:
        if piece in emb_dict.keys():
            vector += emb_dict[piece]
            piece_size += 1

    if piece_size != 0:
        vector = vector/piece_size
    else:
        oov += 1
        vector =  np.random.rand(300)
    vectors.append(vector)
_top_k_cosine_similarity(vectors, embed_space, voca, words, top_k=5, name='after_highway')

after_highway
input_word: tAByAm
[['tAByAm' '0.9999999999999999']
 ['aDikAriByAm' '0.34620537734488294']
 ['pArzRiByAm' '0.33669497098130713']
 ['jaNGAByAm' '0.3216142465375557']
 ['tO' '0.31422354429472865']]

input_word: ahham
[['allAh' '0.7606563966192078']
 ['ahne' '0.7435124235905444']
 ['ahnuvi' '0.6955578327008431']
 ['ahnUye' '0.6891296525573866']
 ['ahnUyAvahi' '0.6869392062943536']]

input_word: bahUhuni
[['bahU' '0.5814050688005236']
 ['behulA' '0.539564070142775']
 ['behulAyAH' '0.5068954393817564']
 ['dahu' '0.5051067533720685']
 ['venitvA' '0.5035352385266239']]

input_word: sahasra-cakzo
[['tat-ca-eva' '0.7604227190331986']
 ['taTA-ca' '0.7599812090442553']
 ['yat-ca-eva' '0.7582624109981531']
 ['nicayAt-' '0.7579032428331048']
 ['tat-ca-api' '0.7578166516676037']]

