In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
import io

flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [None]:
print(torch.__version__)
print(nltk.__version__)

1.11.0+cu113
3.2.5


In [None]:
USE_CUDA = torch.cuda.is_available()
gpus = [0]
#torch.cuda.set_device(gpus[0])

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [None]:
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch

    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch

In [None]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))

def prepare_word(word, word2index):
    return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))


# Data Loading and Preprocessing

In [None]:
with open('/content/almizan_processed.txt', 'r', encoding='utf-8') as file:
  corpus = file.readlines()

corpus = ''.join(corpus).split('\n')
corpus = [[word for word in sent.split()] for sent in corpus]
corpus[:2]

[['بسم', 'الله', 'الرحمن', 'الرحيم'],
 ['پايگاه', 'قرآن', 'شناسي', 'حوزه', 'علميه', 'ميبد']]

## Remove Sparse words

In [None]:
word_count = Counter(flatten(corpus))

In [None]:
MIN_COUNT = 3
exclude = []

In [None]:
for w, c in word_count.items():
    if c < MIN_COUNT:
        exclude.append(w)
exclude = set(exclude)

## Prepare Training data

In [None]:
vocab = list(set(flatten(corpus)) - set(exclude))

In [None]:
word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

In [None]:
WINDOW_SIZE = 5
windows =  flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])
train_data = []

for window in windows:
    for i in range(WINDOW_SIZE * 2 + 1):
        if window[i] in exclude or window[WINDOW_SIZE] in exclude: 
            continue # min_count
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))

X_p = []
y_p = []

for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))
    
train_data = list(zip(X_p, y_p))



In [None]:
len(train_data)

1148374

### Build Unigram Distribution

                                                    P(w)=U(w)3/4/Z

In [None]:
Z = 0.001

In [None]:
word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items() if w not in exclude])

In [None]:
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [None]:
print(len(vocab), len(unigram_table))


6200 3314


### Negative Sampling 

In [None]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].data.cpu().tolist()[0] if USE_CUDA else targets[i].data.tolist()[0]
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

### Modelling

In [None]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, projection_dim) # out embedding
        self.logsigmoid = nn.LogSigmoid()
                
        initrange = (2.0 / (vocab_size + projection_dim))**0.5 # Xavier init
        self.embedding_v.weight.data.uniform_(-initrange, initrange) # init
        self.embedding_u.weight.data.uniform_(-0.0, 0.0) # init
        
    def forward(self, center_words, target_words, negative_words):
        center_embeds = self.embedding_v(center_words) # B x 1 x D
        target_embeds = self.embedding_u(target_words) # B x 1 x D
        
        neg_embeds = -self.embedding_u(negative_words) # B x K x D
        
        positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # Bx1
        negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(negs.size(0), -1) # BxK -> Bx1
        
        loss = self.logsigmoid(positive_score) + self.logsigmoid(negative_score)
        
        return -torch.mean(loss)
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds

## Train

In [None]:
EMBEDDING_SIZE = 50
BATCH_SIZE = 256
EPOCH = 100
NEG = 20 # Num of Negative Sampling

In [None]:
losses = []
model = SkipgramNegSampling(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
for epoch in range(EPOCH):
    for i,batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # B x 1
        targets = torch.cat(targets) # B x 1
        negs = negative_sampling(targets, unigram_table, NEG)
        model.zero_grad()

        loss = model(inputs, targets, negs)
        
        loss.backward()
        optimizer.step()
    
        losses.append(loss.data.tolist())
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []


Epoch : 0, mean_loss : 0.70
Epoch : 10, mean_loss : 0.52
Epoch : 20, mean_loss : 0.45
Epoch : 30, mean_loss : 0.43
Epoch : 40, mean_loss : 0.42
Epoch : 50, mean_loss : 0.41
Epoch : 60, mean_loss : 0.41
Epoch : 70, mean_loss : 0.40
Epoch : 80, mean_loss : 0.40
Epoch : 90, mean_loss : 0.40


## Test

In [None]:
vectors_v = []
vectors_u = []
vectors_m = []

v_weight = lambda word: model.embedding_v(prepare_word(word, word2index))[0].data.tolist()
u_weight = lambda word: model.embedding_u(prepare_word(word, word2index))[0].data.tolist()
mean_weight = lambda word: list((np.asarray(v_weight(word)) + np.asarray(u_weight(word)))/2.)

for word in vocab:
  vectors_v.append(v_weight(word))
  vectors_u.append(u_weight(word))
  vectors_m.append(mean_weight(word))

In [None]:
embedding_dict_v = {}
embedding_dict_u = {}
embedding_dict_m = {}

for i, word in enumerate(vocab):
  embedding_dict_v[word] = vectors_v[i]
  embedding_dict_u[word] = vectors_u[i]
  embedding_dict_m[word] = vectors_m[i]

In [None]:
import pickle

In [None]:
model_data_v = {'embeddings': embedding_dict_v, 'dim':50, 'neg':20, 'batch_size': 256}
model_data_u = {'embeddings': embedding_dict_u, 'dim':50, 'neg':20, 'batch_size': 256}
model_data_m = {'embeddings': embedding_dict_m, 'dim':50, 'neg':20, 'batch_size': 256}

with open("saved_weights_v_pytorch.pkl", 'wb') as file:
  pickle.dump(model_data_v, file)
  
with open("saved_weights_u_pytorch.pkl", 'wb') as file:
  pickle.dump(model_data_u, file)

with open("saved_weights_m_pytorch.pkl", 'wb') as file:
  pickle.dump(model_data_m, file)

In [None]:
out_v = io.open('vectors_v_pytorch.tsv', 'w', encoding='utf-8')
out_u = io.open('vectors_u_pytorch.tsv', 'w', encoding='utf-8')
out_m = io.open('vectors_m_pytorch.tsv', 'w', encoding='utf-8')
out_meta = io.open('metadata_pytorch.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = vectors_v[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")

  vec = vectors_u[index]
  out_u.write('\t'.join([str(x) for x in vec]) + "\n")

  vec = vectors_m[index]
  out_m.write('\t'.join([str(x) for x in vec]) + "\n")

  out_meta.write(word + "\n")
out_v.close()
out_meta.close()

In [78]:
try:
  from google.colab import files
  files.download('vectors_v_pytorch.tsv')
  files.download('vectors_u_pytorch.tsv')
  files.download('vectors_m_pytorch.tsv')
  files.download('saved_weights_v_pytorch.pkl')
  files.download('saved_weights_u_pytorch.pkl')
  files.download('saved_weights_m_pytorch.pkl')
  files.download('metadata_pytorch.tsv')
except Exception:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [76]:
def word_similarity(target, vocab):
    if USE_CUDA:
        target_V = model.prediction(prepare_word(target, word2index))
    else:
        target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: 
            continue
        
        if USE_CUDA:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        else:
            vector = model.prediction(prepare_word(list(vocab)[i], word2index))
        
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:20]

In [74]:
test = random.choice(list(vocab))
test

'حقائق'

In [79]:
word_similarity('الصراط', vocab)

[['ينتهي', 0.5445147752761841],
 ['الحق.', 0.5320985913276672],
 ['حكيم', 0.5094931125640869],
 ['يقوله', 0.5041706562042236],
 ['فرد', 0.4950423538684845],
 ['ربهم', 0.4870917797088623],
 ['سا', 0.464834988117218],
 ['البخل', 0.4620210528373718],
 ['ديني', 0.4564393162727356],
 ['يد', 0.452975332736969]]

In [1]:
import pickle

def load_model(path):
    with open(path, 'rb') as file:
        data = pickle.load(file)
    return data

data_v = load_model('saved_weights_v_pytorch.pkl')
embeddings = data_v['embeddings']

In [15]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import random
import numpy as np

In [11]:
def find_closest_word(word, k, dist_type, embeddings):
    most_closest_words = []
    word_emb = embeddings[word]
    similar_word = ''
    
    for w in embeddings.keys():
        if w != word:
            # get the word embedding
            w_emb = embeddings[w]
            # calculating distance
            if dist_type == "Cosine":
                distance = cosine_similarity(np.asarray(word_emb).reshape(1,-1), np.asarray(w_emb).reshape(1,-1))
            elif dist_type == "Euclidean":
                distance = euclidean_distances(np.asarray(word_emb).reshape(1,-1), np.asarray(w_emb).reshape(1,-1))

            # store the similar_word as a tuple, which contains the word and the similarity
            similar_word = (w, distance)
            # append each tuple to list
            most_closest_words.append(similar_word)
    # sort based on more similarity
    most_closest_words.sort(key=lambda y: -y[1])
    return most_closest_words[:k]

In [31]:
def test_neighbours(distance_type='Cosine', embeddings=embeddings):
  for i in range(10):
    vocab = embeddings.keys()
    word = random.choice(list(vocab))
    neighbours = find_closest_word(word, 20, 'Cosine', embeddings)
    print(f'{word}:')
    for neighbour in neighbours:
      print(f'{neighbour[0]} : {neighbour[1][0][0]:.4f}')
    print()

In [33]:
test_neighbours()

تهتدي:
قواه : 0.7246
أساس : 0.6755
الإنسان : 0.6427
أفراد : 0.6398
الشعور : 0.6372
باعتقاد : 0.6195
القوى : 0.6191
الفطرة : 0.6013
بالفعل : 0.6001
الإنساني : 0.5962
الاجتماع : 0.5904
يتوجه : 0.5901
إلغا : 0.5894
تشريك : 0.5890
الاجتماعات : 0.5824
الاجتماعي : 0.5782
المدني : 0.5776
وجودا : 0.5749
مرتبط : 0.5746
الإنسانية : 0.5722

أشار:
الخالي : 0.5774
مقابلته : 0.5652
آتيه : 0.5592
يجوزه : 0.5571
بالكلام : 0.5525
ذكره : 0.5471
توضيح : 0.5409
العطف : 0.5255
خصوص : 0.5240
مر. : 0.5191
الوجه : 0.5106
بحق : 0.5074
إطلاق : 0.5013
فسر : 0.5006
يؤيده : 0.4986
يسلم : 0.4984
الفضل : 0.4977
مسوق : 0.4974
الوالد : 0.4970
الغرض : 0.4957

اكتسبت:
السموت : 0.6005
الجميل : 0.5411
نكتب : 0.5232
جعلنا : 0.5216
كسبتم : 0.5184
باليوم : 0.5178
ننزله : 0.5175
خلفهم : 0.5146
المولى : 0.5135
مصيبة : 0.5130
حسن : 0.5102
نسوا : 0.5012
الإنس : 0.4988
فأطلق : 0.4981
غنى : 0.4909
قائما : 0.4810
تؤاخذنا : 0.4708
شئت : 0.4697
يتعدى : 0.4688
يستر : 0.4668

عني:
لهو : 0.5735
بي : 0.5643
بيدي : 0.5628
الغالبون : 0.554