In [1]:
import numpy as np
import sys

In [2]:
def generate(vocab_path=r'E:\Machine Learning\NLP\0 Data\GloVe Arabic\vocab.txt',
             vectors_path=r'E:\Machine Learning\NLP\0 Data\GloVe Arabic\vectors.txt',
             max_vectors=99999):
    """Load a GloVe vocabulary and its vectors, returning unit-length embeddings.

    Parameters
    ----------
    vocab_path : str
        Text file with one entry per line; the first space-separated token of
        each line is taken as the word.
    vectors_path : str
        Text file whose lines are ``word v1 v2 ...`` (space separated floats).
    max_vectors : int or None
        Maximum number of lines read from ``vectors_path``; ``None`` reads the
        whole file. The default keeps the previous hard-coded limit.

    Returns
    -------
    (W_norm, vocab, ivocab)
        ``W_norm`` is a ``(len(vocab), dim)`` float array whose rows are scaled
        to unit L2 norm, ``vocab`` maps word -> row index and ``ivocab`` maps
        row index -> word.

    Raises
    ------
    KeyError
        If the first vocabulary word has no vector among the lines read, or a
        vector line (other than ``<unk>``) names a word missing from the
        vocabulary.
    """
    with open(vocab_path, 'r', encoding='utf-8') as f:
        words = [line.rstrip().split(' ')[0] for line in f]

    vectors = {}
    with open(vectors_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            if max_vectors is not None and line_no >= max_vectors:
                break
            vals = line.rstrip().split(' ')
            vectors[vals[0]] = [float(x) for x in vals[1:]]

    vocab_size = len(words)
    vocab = {w: idx for idx, w in enumerate(words)}
    ivocab = {idx: w for idx, w in enumerate(words)}

    # The dimensionality is taken from the vector of the first vocabulary word.
    vector_dim = len(vectors[ivocab[0]])
    W = np.zeros((vocab_size, vector_dim))
    for word, v in vectors.items():
        if word == '<unk>':
            continue
        W[vocab[word], :] = v

    # Scale every row to unit L2 norm. Words whose vector was not loaded (for
    # example because of max_vectors) keep an all-zero row; their norm is
    # replaced by 1 so the division leaves zeros instead of producing NaN.
    norms = np.sqrt(np.sum(W ** 2, axis=1))
    norms[norms == 0] = 1.0
    W_norm = W / norms[:, np.newaxis]
    return (W_norm, vocab, ivocab)



In [3]:
def distance(W, vocab, ivocab, input_term, top_n=None):
    """Print the vocabulary words whose vectors are closest to the query.

    The query is split on whitespace; the rows of ``W`` for its words are
    summed, the sum is scaled to unit length and scored against every row of
    ``W`` with a dot product. The query words themselves are excluded from the
    ranking. The printed column is labelled "Cosine distance" but holds this
    dot product (a cosine similarity when the rows of ``W`` are unit length --
    assumed, not checked here), so larger means closer.

    Parameters
    ----------
    W : numpy.ndarray
        2-D array with one embedding per row.
    vocab : dict
        Maps word -> row index in ``W``.
    ivocab : dict
        Maps row index -> word.
    input_term : str
        One or more whitespace-separated words.
    top_n : int, optional
        Number of neighbours to print. When omitted, the module-level ``N`` is
        used if it exists (it is assigned in a later notebook cell), otherwise
        100.

    Returns
    -------
    None
        Results are printed. A message is printed instead of a table when the
        query is empty, contains an unknown word, or has no usable vector.
    """
    if top_n is None:
        # Backward compatibility: earlier versions read the global N directly.
        top_n = globals().get('N', 100)

    # split() without an argument ignores repeated/leading/trailing whitespace,
    # which split(' ') would turn into empty "words".
    terms = input_term.split()
    if not terms:
        print('Word: %s  Out of dictionary!\n' % input_term)
        return

    indices = []
    for term in terms:
        if term not in vocab:
            print('Word: %s  Out of dictionary!\n' % term)
            return
        print('Word: %s  Position in vocabulary: %i' % (term, vocab[term]))
        indices.append(vocab[term])

    vec_result = np.copy(W[indices[0], :])
    for index in indices[1:]:
        vec_result += W[index, :]

    norm = np.sqrt(np.sum(vec_result ** 2))
    if not norm > 0:
        # Zero (or NaN) norm: dividing would fill every score with NaN.
        print('Word: %s  No usable vector!\n' % input_term)
        return
    vec_norm = vec_result / norm

    dist = np.dot(W, vec_norm)

    # Never report the query words as their own neighbours.
    for index in indices:
        dist[index] = -np.inf

    a = np.argsort(-dist)[:top_n]

    print("\n                               Word       Cosine distance\n")
    print("---------------------------------------------------------\n")
    for x in a:
        print("%35s\t\t%f\n" % (ivocab[x], dist[x]))


In [4]:
if __name__ == "__main__":
    # How many nearest neighbours distance() prints for each query.
    N = 100
    # Load the vocabulary lookups and the row-normalised embedding matrix.
    W, vocab, ivocab = generate()



In [5]:
# Interactive prompt: look up neighbours for each entered query until the user
# asks to stop.
while True:
    input_term = input("\nEnter word or sentence (EXIT to break): ")
    # The prompt advertises upper-case EXIT, so accept any casing and ignore
    # surrounding whitespace instead of matching only the literal 'exit'.
    if input_term.strip().lower() == 'exit':
        break
    distance(W, vocab, ivocab, input_term)


Enter word or sentence (EXIT to break): مصر
Word: مصر  Position in vocabulary: 314

                               Word       Cosine distance

---------------------------------------------------------

                            المصرية		0.768280

                            القاهرة		0.760673

                               بمصر		0.751293

                         الاسكندرية		0.706401

                               ومصر		0.695225

                             المصري		0.689339

                           المصريين		0.667862

                               لمصر		0.666878

                           بالقاهرة		0.640960

                            العربية		0.632281

                              الشام		0.628092

                               دمشق		0.626274

                             البلاد		0.626227

                               تونس		0.617080

                               بلاد		0.607334

                                دول		0.599329

                               دولة		0.598003

In [6]:
# Fixed list of query words for a non-interactive batch run.
words = [
    'مصر',
    'سوريا',
    'يشرب',
    'انترنت',
    'أزرق',
    'حذاء',
    'شجر',
    'سماء',
    'كذب',
]

# Print the neighbour table for each query, followed by a separator line.
for w in words:
    distance(W, vocab, ivocab, w)
    print('====================================')

Word: مصر  Position in vocabulary: 314

                               Word       Cosine distance

---------------------------------------------------------

                            المصرية		0.768280

                            القاهرة		0.760673

                               بمصر		0.751293

                         الاسكندرية		0.706401

                               ومصر		0.695225

                             المصري		0.689339

                           المصريين		0.667862

                               لمصر		0.666878

                           بالقاهرة		0.640960

                            العربية		0.632281

                              الشام		0.628092

                               دمشق		0.626274

                             البلاد		0.626227

                               تونس		0.617080

                               بلاد		0.607334

                                دول		0.599329

                               دولة		0.598003

                              محمود		0.5949


                               Word       Cosine distance

---------------------------------------------------------

                                شرب		0.770257

                              ويشرب		0.721231

                             الشراب		0.673112

                               تشرب		0.648669

                              الخمر		0.646797

                               اشرب		0.634731

                                ماء		0.619947

                               شربه		0.617046

                               شراب		0.616536

                               ياكل		0.612804

                              يشربه		0.603619

                              الشرب		0.600785

                             يشربون		0.597530

                               وشرب		0.582121

                             النبيذ		0.570394

                             يشربها		0.566640

                              الماء		0.562293

                               شربت		0.558535

                               فشرب


                               Word       Cosine distance

---------------------------------------------------------

                             منكبيه		0.504488

                              بحذاء		0.481487

                              قدميه		0.444147

                                حذو		0.436107

                                نعل		0.434458

                              اذنيه		0.413564

                            الاذنين		0.412340

                               رجله		0.407044

                              يحاذي		0.404137

                               يديه		0.402086

                             اليدين		0.396327

                           المنكبين		0.393876

                              النعل		0.390373

                             ركبتيه		0.385935

                             اليسرى		0.383583

                              رجليه		0.379818

                               فردة		0.377705

                               كتفه		0.375063

                             اليمنى


                               Word       Cosine distance

---------------------------------------------------------

                             السماء		0.736551

                                افق		0.590886

                              السما		0.555224

                              الافق		0.542382

                                فوق		0.541710

                              القمر		0.517581

                              الشمس		0.511025

                               كوكب		0.510815

                            والسماء		0.508836

                              وسماء		0.499365

                                قمر		0.491687

                            السموات		0.488005

                              اشرقت		0.483634

                               نجوم		0.481575

                                نور		0.480610

                             الثريا		0.478405

                              الارض		0.474817

                             والارض		0.474789

                              انوار