In [1]:
import json

import numpy as np
import tensorflow as tf
from IPython import display

### We need to keep track of how many times each word appeared

In [2]:
vocab = dict()    # { word:count }
total_word_cnt = 0
line_num = 0

# Count every lower-cased, whitespace-separated token of the corpus.
# Iterating over the file object reads one line at a time.
with open('corpus_100k_pass2', 'r', encoding="utf8") as file:
    for line_num, raw_line in enumerate(file, start=1):
        if line_num%10000==0:
            # progress indicator; the previous message is cleared when the next one arrives
            print('line_num =', line_num)
            display.clear_output(wait=True)

        tokens = raw_line.lower().split()
        total_word_cnt += len(tokens)

        for token in tokens:
            vocab[token] = vocab.get(token, 0) + 1

len(vocab), total_word_cnt

(199143, 40535543)

In [3]:
# Share of the vocabulary made up of rare words,
# i.e. words that were counted at most `num_apear` times.
num_apear = 3
cnt_rare = sum(count <= num_apear for count in vocab.values())

print(cnt_rare/len(vocab)*100, f'% appear <={num_apear} times')

63.528218415912185 % appear <=3 times


### Let's load the data, remove the infrequent tokens and subsample the frequent ones
### Also remove short sentences

In [13]:
# Load the corpus as a list of token lists (one inner list per line).
# - `split()` already discards the trailing newline, so no manual slicing is
#   needed (slicing off the last character would damage a final line that
#   has no newline).
# - tokens are lower-cased because the `vocab` keys were built from
#   lower-cased tokens; the later `vocab[token]` lookups rely on that.
with open('corpus_100k_pass2', 'r', encoding="utf8") as f:
    data = [line.lower().split() for line in f]

In [14]:
# Sub-sampling of the corpus, done in place on `data`.
# NOTE: this cell is not idempotent - running it again sub-samples the
# already reduced `data` a second time.

# threshold
T = 1e-5
min_count = 3       # words counted at most this many times are always removed
min_line_len = 3    # lines left with fewer tokens than this are removed
line_num = 0

for ind in range(len(data)):
    line_num+=1
    if line_num%10000==0:
        print('line_num =', line_num)
        display.clear_output(wait=True)

    kept_tokens = []
    for token in data[ind]:
        if vocab[token] <= min_count:
            continue      # infrequent words will be removed regardless

        word_frequency = vocab[token]/total_word_cnt
        # a frequent word (frequency above T) is removed by chance,
        # with probability 1 - sqrt(T / word_frequency)
        if word_frequency > T and np.random.rand() > np.sqrt(T/word_frequency):
            continue

        kept_tokens.append(token)

    data[ind] = kept_tokens

# removing the short lines in a single pass instead of popping them one by one
data = [line for line in data if len(line) >= min_line_len]

print('done')

done


In [15]:
# just testing: count how many occurrences of one token are still in `data`

leter_removed = 'ն'
cnt_n = sum(line.count(leter_removed) for line in data)

# new count vs original count
print(cnt_n, vocab[leter_removed])

42545 4675336


#### updating vocab

In [16]:
# let's fix our vocabulary: recount the tokens that are still in `data`

vocab = dict()
for line in data:
    for token in line:
        vocab[token] = vocab.get(token, 0) + 1

# adding line sizes
line_len = np.array([len(line) for line in data], dtype='float32')
total_word_cnt = sum(len(line) for line in data)
len(vocab), total_word_cnt

(72566, 7075674)

In [17]:
# changing the negative sampling distribution:
# every count stored in `vocab` is replaced by count ** 0.75,
# and `total_val` accumulates the sum of the new values.
# NOTE: running this cell again applies the power a second time.

total_val = 0
for word, count in list(vocab.items()):
    smoothed = count ** 0.75
    vocab[word] = smoothed
    total_val += smoothed

In [18]:
# smallest and largest value in vocab, plus total_val next to a fresh sum of the values
min(vocab.values()), max(vocab.values()), total_val, sum(vocab.values())

(1.0, 2962.350185036202, 1629349.650183193, 1629349.650183193)

In [19]:
# for quickly choosing negatives:
# every word is repeated round(value) times, so a uniform random index into
# `data1` picks a word proportionally to its (rounded) value in `vocab`.
# The list is built directly instead of joining everything into one big
# string and splitting it again.
data1 = [word for word, weight in vocab.items() for _ in range(round(weight))]

### initializing the model weights

In [34]:
emb_dim = 40

# define new weights or load old ones
LOAD_PRETRAINED = True   # False -> start from new random weights of size emb_dim


def _load_weights(path):
    """Load a JSON object from `path` and wrap every value in a tf.Variable.

    Returns a dict with the same keys as the JSON object.
    """
    with open(path, encoding="utf8") as weights_file:
        return {key: tf.Variable(value) for key, value in json.load(weights_file).items()}


if LOAD_PRETRAINED:
    word_embeddings = _load_weights('embeddings_e1_w6000.0K.json')
    output_weights = _load_weights('output_weights_e1_w6000K.json')
else:
    # one normally distributed vector (mean 0, std 0.1) per vocabulary word
    word_embeddings = dict((k, tf.Variable(np.random.normal(0, 0.1, emb_dim))) for k in vocab.keys())
    output_weights = dict((k, tf.Variable(np.random.normal(0, 0.1, emb_dim))) for k in vocab.keys())

### Training loop

In [21]:
import json

def save_model(iter_indx='', weights=None, prefix='embeddings'):
    """Write a table of weight vectors to the file ``{prefix}{iter_indx}.json``.

    Args:
        iter_indx: text placed between the prefix and ``.json`` in the file name.
        weights: dict whose values expose ``.numpy()``; when None, the global
            ``word_embeddings`` is saved (the original behaviour).
        prefix: first part of the file name; the default keeps the original
            ``embeddings...`` file names.

    The file contains one JSON object mapping every key to its vector as a list.
    """
    if weights is None:
        weights = word_embeddings

    dump_dict = dict()
    for ind123, (k, v) in enumerate(weights.items(), start=1):
        if ind123%1000==0:
            print('saving model...', ind123)
            display.clear_output(wait=True)
        dump_dict[k]=v.numpy().tolist()

    # json.dump streams to the file, so the whole JSON text is never held as one string
    with open(f"{prefix}{iter_indx}.json", "w", encoding="utf8") as outfile:
        json.dump(dump_dict, outfile, indent=4)
    print('saved!')

In [22]:
from operator import itemgetter

def sample_negatives(set_context, cnt=10):
    """Draw `cnt` words from the global `data1` that are not in `set_context`.

    Each draw takes a uniformly random position of `data1`; a draw that hits
    a word of `set_context` is thrown away and repeated. The same word may be
    returned more than once.

    Returns a numpy array holding the drawn words.
    """
    pool_size = len(data1)
    negatives = []
    while len(negatives) < cnt:
        candidate = data1[np.random.randint(0, pool_size)]
        if candidate not in set_context:
            negatives.append(candidate)

    return np.array(negatives)

In [23]:
# quick try-out with a two-word context and the default cnt of 10
sample_negatives(set(['բարև','մարդ']))

array(['բացառապես', 'եզակ', 'ցանկացել', 'առաքելյա', 'մարզաձեւում',
       'հեկտար', 'հուշագր', 'ներգրավվմա', 'վերեւ', 'ագ'], dtype='<U11')

In [24]:
def sample_word_window(line_indx, word_indx):
    """
    returns window starting/ending indexes (both inclusive)

    The window covers `window_size` (global) positions on each side of
    `word_indx`. When it sticks out of the line on one side, it is moved
    towards the other side by the amount that stuck out and clipped to the
    line, whose length is read from the global `line_len[line_indx]`.
    """
    last_indx = int(line_len[line_indx]) - 1
    window_start = word_indx - window_size
    window_end = word_indx + window_size

    # implementing dynamic window
    if window_start < 0:
        # -window_start positions fell off the left edge: extend to the right
        window_end = min(last_indx, window_end-window_start)
        window_start = 0
    elif window_end > last_indx:
        # the overshoot is measured from the last valid index, so a window
        # clipped on the right is as wide as one clipped on the left
        overshoot = window_end - last_indx
        window_start = max(0, window_start-overshoot)
        window_end = last_indx

    return window_start, window_end

In [1]:
# Training with negative sampling: one (target, context) pair plus
# `negatives_cnt` negative words per step, updated with plain gradient descent.

window_size = 4 # window from both sides
epochs = 5
start_epoch = 2 # number given to the first epoch of this run (used in prints and checkpoint names)
lr = 1e-3 # learning rate
negatives_cnt = 20
sum_loss = 0.0
loss_print_rate = 10000
save_indx_rate = 1000000


# NOTE(review): line_probas is not read anywhere else in the visible notebook
line_probas = line_len/total_word_cnt    # this is done for quick shuffling and sampling
line_indxs = []
word_indxs = []
for i in range(len(line_len)):
    for j in range(int(line_len[i])):
        line_indxs.append(i)
        word_indxs.append(j)
line_indxs = np.array(line_indxs)    # will show the line of sampled word
word_indxs = np.array(word_indxs)    # will show the index in line of sampled word


# `epoch` instead of `_`: in IPython `_` is the name of the last cell output
for epoch in range(start_epoch, start_epoch+epochs):
    print('epoch -', epoch)
    # the same permutation is applied to both arrays so the pairs stay aligned
    shuffled_indxs = np.arange(total_word_cnt)
    np.random.shuffle(shuffled_indxs)
    line_indxs = line_indxs[shuffled_indxs]
    word_indxs = word_indxs[shuffled_indxs]

    for word_loop_ind in range(0, total_word_cnt):
        if word_loop_ind%loss_print_rate == 0 and word_loop_ind > 0:
            # mean loss over the last `loss_print_rate` steps
            print(sum_loss/loss_print_rate)
            sum_loss = 0.0
        if word_loop_ind%save_indx_rate == 0 and word_loop_ind > 0:
            save_model(f'_e{epoch}_w{word_loop_ind//save_indx_rate}M')
            print(f'epoch - {epoch}, word - {word_loop_ind//save_indx_rate}M')

        word_indx = word_indxs[word_loop_ind]
        line_indx = line_indxs[word_loop_ind]

        window_start, window_end = sample_word_window(line_indx,
                                                      word_indx)

        target_word = data[line_indx][word_indx]
        window_words = data[line_indx][window_start:window_end+1] # this is for negative sampling

        # sampling a single positive (any window position except the target itself)
        context_indx = np.random.randint(window_start, window_end+1)
        while context_indx == word_indx:
            context_indx = np.random.randint(window_start, window_end+1)

        context_word = data[line_indx][context_indx]
        negative_words = sample_negatives(set(window_words), negatives_cnt).tolist()
        # The same negative can be drawn more than once. Each variable is
        # listed once so that its gradient (which already sums the
        # contributions of all its occurrences in the loss) is applied once.
        unique_negatives = list(dict.fromkeys(negative_words))

        target_vec = word_embeddings[target_word]
        context_vec = output_weights[context_word]
        trainable_params = [target_vec, context_vec]
        trainable_params.extend(output_weights[n_w] for n_w in unique_negatives)

        with tf.GradientTape() as tape:
            score_true = tf.math.reduce_sum(target_vec * context_vec)
            scores_false = [tf.math.reduce_sum(target_vec * output_weights[n_w])
                            for n_w in negative_words]

            # log_sigmoid(x) is the numerically stable form of log(sigmoid(x))
            log_true = tf.math.log_sigmoid(score_true)
            log_false = [tf.math.log_sigmoid(-s_f) for s_f in scores_false]

            loss = -log_true - tf.math.reduce_sum(log_false)

        # bookkeeping happens outside the tape and on a plain Python float
        sum_loss += float(loss)

        grads = tape.gradient(loss, trainable_params)
        for param, grad in zip(trainable_params, grads):
            param.assign_sub(grad*lr)

print('done')

done


In [None]:
# saves the embeddings (with the default empty suffix the file is "embeddings.json")

save_model()

In [36]:
# saves the output layer weights:
# every value of output_weights is converted to a plain list and the whole
# table is written as one indented JSON object

serialisable_weights = {}
for position, word in enumerate(output_weights, start=1):
    if position % 1000 == 0:
        print('saving model...', position)
        display.clear_output(wait=True)
    serialisable_weights[word] = output_weights[word].numpy().tolist()

payload = json.dumps(serialisable_weights, indent=4)

with open("output_weights_e2_w3M.json", "w") as outfile:
    outfile.write(payload)
print('saved!')

saved!
