In [1]:
import numpy as np
import tensorflow as tf
from IPython import display

### We need to keep track of how many times each word appeared

In [2]:
vocab = dict()    # { word:count }
total_word_cnt = 0
line_num = 0

# Stream the corpus one line at a time so the whole file never has to sit in
# memory; every line is lowercased and split on whitespace before counting.
with open('corpus_100k_pass2', 'r', encoding="utf8") as file:
    for line_num, line in enumerate(file, start=1):
        if line_num%10000==0:
            # clear_output(wait=True) defers the clear until the next print,
            # which keeps a single, flicker-free progress line in the cell.
            print('line_num =', line_num)
            display.clear_output(wait=True)

        tokens = line.lower().split()
        total_word_cnt += len(tokens)

        for token in tokens:
            vocab[token] = vocab.get(token, 0) + 1

len(vocab), total_word_cnt

(242675, 37389589)

In [14]:
# Share of vocabulary entries whose raw count does not exceed `num_apear`.
num_apear = 3
cnt_rare = sum(count <= num_apear for count in vocab.values())

print(cnt_rare/len(vocab)*100, f'% appear <={num_apear} times')

63.177500772638304 % appear <=3 times


### Let's load the data, remove the infrequent tokens, and subsample the frequent ones

In [35]:
# Read the corpus into a list of token lists (one inner list per line).
# - `with` guarantees the file is closed even if reading fails.
# - `split()` with no arguments already discards the trailing newline, so no
#   manual `[:-1]` slice is needed (that slice would cut a real character off
#   a final line that has no newline terminator).
# - Tokens are lowercased because the vocabulary keys are built from
#   lowercased text; without it a cased token would miss its vocab entry.
with open('corpus_100k_pass2', 'r', encoding="utf8") as f:
    data = [line.lower().split() for line in f]

In [36]:
# threshold
T = 1e-5
line_num = 0

# Filter every line in place: drop tokens seen at most 3 times, and drop the
# remaining tokens at random with probability 1 - sqrt(T / frequency) whenever
# their relative frequency exceeds T.
for line_num, tokens in enumerate(data, start=1):
    if line_num%10000==0:
        print('line_num =', line_num)
        display.clear_output(wait=True)

    kept = []
    for token in tokens:
        count = vocab[token]
        if count <= 3:
            continue      # infrequent words will be removed regardless
        word_frequency = count/total_word_cnt
        # A random number is only drawn for tokens above the threshold.
        if word_frequency > T and np.random.rand(1) > np.sqrt(T/word_frequency):
            continue      # removed based on chance
        kept.append(token)

    tokens[:] = kept      # overwrite the contents of the existing list object
print('done')

done


In [52]:
# Compare how often one very frequent token survives in `data` with the count
# stored for it in `vocab`.
leter_removed = 'ն'
cnt_n = sum(line.count(leter_removed) for line in data)

print(cnt_n, vocab[leter_removed])

39481 4241325


#### Updating the vocabulary

In [107]:
# Recount the vocabulary from the current contents of `data`.

vocab = {}

# Per-line token counts, stored as floats.
line_len = np.array([len(line) for line in data], dtype='float64')

for line in data:
    for token in line:
        vocab[token] = vocab.get(token, 0) + 1

total_word_cnt = sum(len(line) for line in data)

len(vocab), total_word_cnt

(89359, 7817651)

In [128]:
# Turn every stored vocab value into value ** 0.75 (the negative-sampling
# weight) and accumulate the sum of the new values in `total_val`.
# NOTE: this overwrites `vocab` in place, so running the cell a second time
# applies the exponent again to the already transformed values.

total_val = 0
for word, count in list(vocab.items()):
    weight = count ** 0.75
    vocab[word] = weight
    total_val += weight

In [129]:
# Sanity check: smallest and largest value in `vocab`, plus `total_val` next to
# a freshly computed sum of the values (the last two should be equal).
min(vocab.values()), max(vocab.values()), total_val, sum(vocab.values())

(2.8284271247461903, 2800.8581094726187, 1883338.191322189, 1883338.191322189)

### initializing the model weights

In [219]:
emb_dim = 40

# One trainable vector per vocabulary word in each table, drawn from
# N(0, 0.1): `word_embeddings` is indexed by the centre word and
# `output_weights` by the context / negative word.
word_embeddings = {
    word: tf.Variable(np.random.normal(0, 0.1, emb_dim)) for word in vocab
}
output_weights = {
    word: tf.Variable(np.random.normal(0, 0.1, emb_dim)) for word in vocab
}

### Training loop

In [187]:
def sample_negatives(set_context, cnt=10):
    """Draw `cnt` distinct negative-sample words that are outside the context.

    Words are drawn without replacement from the keys of the global `vocab`,
    with probability proportional to the value stored for each word.

    Parameters
    ----------
    set_context : iterable of str
        Words that must not be returned. Entries that are not in `vocab`
        are simply ignored.
    cnt : int, default 10
        Number of words to draw.

    Returns
    -------
    numpy.ndarray
        Array of `cnt` words.

    Raises
    ------
    ValueError
        If fewer than `cnt` candidate words remain after the exclusion.
    """
    excluded = set(set_context)

    # Walk the dict itself instead of a set difference: dict order is the
    # insertion order, so the candidate list (and therefore the draw for a
    # given NumPy seed) is the same in every kernel session, whereas the
    # iteration order of a set of strings depends on the hash seed.
    usable = [word for word in vocab if word not in excluded]
    if cnt > len(usable):
        raise ValueError(
            f'cannot draw {cnt} negatives from {len(usable)} candidate words')

    # Normalise by the candidates' own total so the probabilities always sum
    # to one, whatever the context contained.
    weights = np.array([vocab[word] for word in usable], dtype='float64')
    indxs = np.random.choice(len(usable), cnt, replace=False, p=weights/weights.sum())

    return np.array(usable)[indxs]

In [193]:
# Quick smoke test: draw the default number of negatives for a two-word context.
sample_negatives({'բարև', 'մարդ'})

array(['բացահայտումը', 'թյունում', 'պայմանները', 'ձեռքբերմանը', 'կունենա',
       'վերնագիր', 'կմտածե', 'ինքնազգացող', 'ուղղված', 'հարավայ'],
      dtype='<U50')

In [198]:
def sample_word_window(line_indx, word_indx, half_width=None):
    """Return the inclusive bounds of the context window around one token.

    The window nominally spans `half_width` tokens on each side of
    `word_indx`. When it would stick out past one end of the line it is
    shifted towards the other end by the same amount, then clamped to the
    line, so tokens near either edge get the same window width whenever the
    line is long enough.

    Parameters
    ----------
    line_indx : int
        Index of the line; its length is read from the global `line_len`.
    word_indx : int
        Position of the centre token inside that line.
    half_width : int, optional
        Tokens taken on each side. Defaults to the global `window_size`.

    Returns
    -------
    tuple
        `(window_start, window_end, line_indx, word_indx)`, with both window
        bounds inclusive.
    """
    if half_width is None:
        half_width = window_size

    last_indx = int(line_len[line_indx]) - 1
    window_start = word_indx - half_width
    window_end = word_indx + half_width

    if window_start < 0:
        # Overflow on the left: move the surplus to the right-hand side.
        window_end = min(last_indx, window_end - window_start)
        window_start = 0
    elif window_end > last_indx:
        # Overflow on the right, measured against the last valid index (not
        # the line length) so that the full surplus moves to the left.
        window_start = max(0, window_start - (window_end - last_indx))
        window_end = last_indx

    return window_start, window_end, line_indx, word_indx

In [None]:
# --- hyper-parameters ---
window_size = 3        # tokens on each side of the centre word (read by sample_word_window)
epochs = 5
lr = 1e-3 # learning rate
loss_print_rate = 100  # print the running loss every this many centre words
negatives_cnt = 4      # negative samples drawn per (centre, context) pair

line_probas = line_len/total_word_cnt

# Flat (line, position) index over every token so that centre words can be
# visited in a random order.
line_indxs = []
word_indxs = []
for i in range(len(line_len)):
    for j in range(int(line_len[i])):
        line_indxs.append(i)
        word_indxs.append(j)
line_indxs = np.array(line_indxs)
word_indxs = np.array(word_indxs)
num_positions = len(line_indxs)

# Initialised here so the cell also runs on a fresh kernel.
sum_loss = 0.0

for epoch in range(epochs):
    print('epoch -', epoch)
    shuffled_indxs = np.arange(num_positions)
    np.random.shuffle(shuffled_indxs)
    line_indxs = line_indxs[shuffled_indxs]
    word_indxs = word_indxs[shuffled_indxs]

    for word_indx1 in range(0, num_positions):
        if word_indx1%loss_print_rate == 0 and word_indx1 > 0:
            # Loss summed over all pairs, averaged per centre word.
            print(float(sum_loss)/loss_print_rate)
            sum_loss = 0.0

        window_start, window_end, line_indx, word_indx = sample_word_window(line_indxs[word_indx1],
                                                                            word_indxs[word_indx1])
        target_word = data[line_indx][word_indx]
        window_words = data[line_indx][window_start:window_end+1] # this will be used for negative sampling

        for context_indx in range(window_start, window_end+1):
            if context_indx == word_indx:
                continue

            # The context word is the neighbour at `context_indx`, not the
            # centre word itself.
            context_word = data[line_indx][context_indx]
            negative_words = sample_negatives(set(window_words), negatives_cnt)

            target_vec = word_embeddings[target_word]
            context_vec = output_weights[context_word]
            negative_vecs = [output_weights[n_w] for n_w in negative_words]

            # Order matters: negatives first, then centre and context, so
            # grads[-2] / grads[-1] below belong to the last two entries.
            trainable_params = negative_vecs + [target_vec, context_vec]

            with tf.GradientTape() as tape:
                score_true = tf.math.reduce_sum(target_vec * context_vec)
                scores_false = [tf.math.reduce_sum(target_vec * n_v) for n_v in negative_vecs]

                # log_sigmoid avoids the -inf that log(sigmoid(x)) yields
                # once the sigmoid underflows to zero.
                log_true = tf.math.log_sigmoid(score_true)
                log_false = [tf.math.log_sigmoid(-s_f) for s_f in scores_false]

                loss = -log_true - tf.math.reduce_sum(log_false)

            # Accumulate as a plain float, outside the tape.
            sum_loss += float(loss)

            grads = tape.gradient(loss, trainable_params)

            # The centre vector is updated once per context word, so its step
            # is scaled down by the number of context words in the window.
            target_vec.assign_sub(grads[-2]*lr/(window_end-window_start))
            context_vec.assign_sub(grads[-1]*lr)

            for neg_vec, neg_grad in zip(negative_vecs, grads):
                neg_vec.assign_sub(neg_grad*lr)
                

epoch - 0
18.942756911764373
16.685508687315245
17.34127083112054
16.90375207320818
17.372580795409295
