In [43]:

def load_data(data_path: str, encoding: str = "utf-8"):
    """Read an entire text file and return its contents as a single string.

    Args:
        data_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the locale of the machine running
            the notebook (the built-in default for ``open`` is locale-specific).

    Returns:
        The full contents of the file as one ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(data_path, "r", encoding=encoding) as fp:
        return fp.read()

In [1]:
import torch

In [2]:
# Pick the compute device name once: "cuda" when torch reports a usable GPU,
# otherwise fall back to "cpu".
device="cuda" if torch.cuda.is_available() else "cpu"

In [44]:
from collections import Counter

def preprocess_data(text: str, min_count: int = 10):
    """Tokenize raw text and discard rare words.

    The text is lower-cased and split on whitespace. A token is kept only if
    its word occurs strictly more than ``min_count`` times in the whole text;
    kept tokens stay in their original order.

    Args:
        text: Raw corpus text.
        min_count: Occurrence count a word must exceed to be kept
            (default 10, i.e. words seen 10 times or fewer are removed).

    Returns:
        List of the surviving tokens, in corpus order.
    """
    words = text.lower().split()
    word_counts = Counter(words)
    return [word for word in words if word_counts[word] > min_count]
    

In [45]:
# Load the raw corpus text into memory.
# NOTE(review): this is an absolute path on one specific machine; the cell
# will fail elsewhere unless the path is changed to point at a local copy.
data=load_data("/home/local/ZOHOCORP/muikumar-pt6527/Desktop/Deep_learning/rnn/word_embeddings/word_to_vec/data/datasets/text8")

In [46]:
# Tokenize the corpus and drop rare words; train_data is a list of word strings.
train_data=preprocess_data(data)

In [47]:
def build_vocab(words):
    """Create word<->id lookup tables ordered by descending frequency.

    Id 0 goes to the most frequent word, id 1 to the next, and so on. Words
    with equal counts keep the order in which they first appear in ``words``.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        Tuple ``(word_to_int, int_to_word)`` of two dicts that are inverse
        mappings of each other.
    """
    # most_common() sorts (word, count) pairs by count, highest first.
    ranked = [token for token, _ in Counter(words).most_common()]
    int_to_word = dict(enumerate(ranked))
    word_to_int = {token: index for index, token in int_to_word.items()}
    return word_to_int, int_to_word
    

In [48]:
import numpy as np
import random
def sub_sampling(int_words,threshold=1e-5):
    """Randomly discard frequent words (word2vec-style subsampling).

    Each word ``w`` with relative frequency ``f(w)`` gets a drop probability
    ``1 - sqrt(threshold / f(w))``. Every occurrence is then kept or dropped
    independently, so frequent words are thinned out rather than removed
    wholesale.

    Args:
        int_words: Sequence of hashable tokens (word ids).
        threshold: Frequency threshold; words rarer than this have a
            non-positive drop probability and are always kept.

    Returns:
        List of the surviving tokens, in their original order.
    """
    word_counts=Counter(int_words)
    total_words=len(int_words)
    word_freq_ratios={word:freq/total_words for word,freq in word_counts.items()}
    p_drop={word:1-np.sqrt(threshold/word_freq_ratios[word]) for word in word_counts}
    # Draw a fresh random number for every token. A single draw shared by the
    # whole corpus would keep or drop all occurrences of a word together.
    return [word for word in int_words if random.random()>p_drop[word]]

In [49]:
def get_context(int_words,idx,max_window_size):
    """Return the words surrounding position ``idx`` within a random window.

    A window radius is drawn uniformly from ``1..max_window_size``; the result
    is the up-to-``radius`` words before ``idx`` followed by the
    up-to-``radius`` words after it, clipped at both ends of the sequence.
    The word at ``idx`` itself is not included.

    Args:
        int_words: Sequence of tokens supporting slicing and ``+``.
        idx: Position of the centre word.
        max_window_size: Largest window radius that may be drawn (>= 1).

    Returns:
        The context tokens, left neighbours first.
    """
    window_size=random.randint(1,max_window_size)
    start=max(0,idx-window_size)
    # Slice ends are exclusive, so the upper bound is len(int_words); capping
    # at len-1 would make the final word unreachable as a context word.
    end=min(idx+window_size+1,len(int_words))
    context_words=int_words[start:idx]+int_words[idx+1:end]
    return context_words

In [50]:
def get_batches(int_words,batch_size,max_window_size):
    """Yield skip-gram training pairs, one batch of centre words at a time.

    The sequence is cut down to a whole number of ``batch_size``-long chunks
    (any remainder at the end is discarded). For each chunk, every word is
    paired with its context words as returned by ``get_context`` applied to
    that chunk only, so contexts do not reach into neighbouring chunks.

    Args:
        int_words: Sliceable sequence of tokens.
        batch_size: Number of centre words per chunk.
        max_window_size: Passed through to ``get_context``.

    Yields:
        ``(batch_x, batch_y)``: two equal-length lists where ``batch_x[k]`` is
        a centre word and ``batch_y[k]`` one of its context words.
    """
    usable_length=(len(int_words)//batch_size)*batch_size
    int_words=int_words[:usable_length]

    for offset in range(0,usable_length,batch_size):
        chunk=int_words[offset:offset+batch_size]
        centres,neighbours=[],[]
        for position,centre in enumerate(chunk):
            surrounding=get_context(chunk,position,max_window_size)
            # Repeat the centre word once per context word so the lists align.
            centres+=[centre]*len(surrounding)
            neighbours+=surrounding
        yield centres,neighbours

            

In [None]:
import torch.nn as nn
import torch
class SkipGramNegSampling(nn.Module):
    """Skip-gram word2vec model with negative sampling.

    Holds two embedding tables of shape ``(n_vocab, n_embed)``: one for
    centre ("target") words and one for context words. Negative ("noise")
    words are sampled from ``noise_dist`` and embedded with the context table,
    because they play the role of context words in the loss.

    Args:
        n_vocab: Vocabulary size.
        n_embed: Embedding dimension.
        noise_dist: Optional 1-D tensor of ``n_vocab`` non-negative sampling
            weights for negative words. ``None`` means uniform sampling.
    """
    def __init__(self,n_vocab,n_embed,noise_dist=None):
        super(SkipGramNegSampling,self).__init__()
        self.n_vocab=n_vocab
        self.n_embed=n_embed
        self.noise_dist=noise_dist

        self.context_embed=nn.Embedding(n_vocab,n_embed)
        self.target_embed=nn.Embedding(n_vocab,n_embed)

        # Start both tables from uniform values in [-1, 1).
        self.context_embed.weight.data.uniform_(-1,1)
        self.target_embed.weight.data.uniform_(-1,1)

    def forward_context(self,contexts):
        """Look up context-word embeddings for a tensor of word ids."""
        embed_contexts=self.context_embed(contexts)
        return embed_contexts

    def forward_target(self,target):
        """Look up centre-word embeddings for a tensor of word ids."""
        embed_target=self.target_embed(target)
        return embed_target

    def forward_noise(self,batch_size,n_samples):
        """Sample negative words and return their embeddings.

        Args:
            batch_size: Number of training pairs in the batch.
            n_samples: Number of negative words per training pair.

        Returns:
            Tensor of shape ``(batch_size, n_samples, n_embed)``.
        """
        # `if tensor:` raises for tensors with more than one element, so the
        # "no distribution supplied" case must be tested with `is None`.
        if self.noise_dist is None:
            noise=torch.ones(self.n_vocab)
        else:
            noise=self.noise_dist

        # One draw per (pair, negative) slot; the view below needs exactly
        # batch_size * n_samples ids.
        noise_words=torch.multinomial(noise,
                                      num_samples=batch_size*n_samples,
                                      replacement=True)
        # .to() is not in-place: keep the result, and follow the embedding's
        # own device instead of relying on a notebook-level global.
        noise_words=noise_words.to(self.context_embed.weight.device)
        # Negatives stand in for context words, so they use the context table.
        noise_vector=self.context_embed(noise_words).view(batch_size,n_samples,self.n_embed)
        return noise_vector
        

In [None]:
class NegativeSamplingLoss(nn.Module):
    """Negative-sampling loss for skip-gram embeddings.

    For each training pair the loss is
    ``-(log sigmoid(context . target) + sum_k log sigmoid(-noise_k . target))``
    and the result is averaged over the batch.

    Args:
        args: Unused; accepted (and optional) only so existing callers that
            pass a positional argument keep working.
    """
    def __init__(self, args=None):
        super(NegativeSamplingLoss, self).__init__()

    def forward(self,
            target_vector,
            context_vector,
            noise_vector):
        """Compute the mean negative-sampling loss.

        Args:
            target_vector: Centre-word embeddings, shape ``(batch, embed)``.
            context_vector: Context-word embeddings, ``batch * embed`` elements.
            noise_vector: Negative-word embeddings, shape
                ``(batch, n_samples, embed)``.

        Returns:
            Scalar tensor holding the batch-averaged loss.
        """
        batch_size,embedding_size=target_vector.shape
        target_vector=target_vector.view(batch_size,embedding_size,1)
        context_vector=context_vector.view(batch_size,1,embedding_size)

        # logsigmoid is the numerically stable form of sigmoid().log(), which
        # underflows to -inf for strongly negative scores.
        # (batch,1,1) -> (batch,): reshape explicitly rather than squeeze(),
        # which would also drop the batch axis when batch_size == 1.
        out_loss=nn.functional.logsigmoid(torch.bmm(context_vector,target_vector)).view(batch_size)

        noise_loss=nn.functional.logsigmoid(torch.bmm(noise_vector.neg(),target_vector))
        # (batch,n_samples,1) -> (batch,n_samples): drop only the last axis so
        # the sum over negatives still works when batch or n_samples is 1.
        noise_loss=noise_loss.squeeze(2).sum(1)
        return -(out_loss+noise_loss).mean()
       

In [None]:
# woi: word -> integer id, iow: integer id -> word, both built from the
# filtered training tokens.
woi,iow=build_vocab(train_data)

In [52]:
# Encode the training tokens as their integer ids, preserving corpus order.
int_words=[woi[word] for word in train_data]

In [59]:
# Thin out frequent words. This rebinds int_words to the subsampled list, so
# re-running this cell subsamples the already-subsampled data again.
int_words=sub_sampling(int_words)

In [60]:
# Create the batch generator: chunks of 8 centre words, window radius up to 5.
# Nothing is computed until the generator is advanced.
y=get_batches(int_words,8,5)

In [83]:
# Advance the generator by one batch: a holds the repeated centre words,
# b the matching context words.
a,b=next(y)

In [84]:
# Number of (centre, context) pairs in this batch; it varies from batch to
# batch because the window radius is drawn at random.
len(a)

34

In [85]:
import torch

In [96]:
# Build the negative-sampling distribution over the vocabulary.
# Occurrence count of every word in the filtered training tokens.
freq = Counter(train_data)
# Scale each count by len(woi) (the number of vocabulary entries). The exact
# divisor is immaterial: the values are renormalized to sum to 1 just below.
freq_ratio = {word:cnt/len(woi) for word, cnt in freq.items()}     
# Descending order lines index i up with word id i, since build_vocab also
# assigns ids by descending count (tied words have equal values either way).
freq_ratio = np.array(sorted(freq_ratio.values(), reverse=True))
# Unigram distribution: relative frequencies that sum to 1.
unigram_dist = freq_ratio / freq_ratio.sum() 
# Raise to the 3/4 power and renormalize; the result is a float64 tensor with
# one sampling weight per vocabulary id.
noise_dist = torch.from_numpy(unigram_dist**0.75 / np.sum(unigram_dist**0.75))


tensor([1.5537e-02, 1.0049e-02, 7.7052e-03,  ..., 2.8380e-06, 2.8380e-06,
        2.8380e-06], dtype=torch.float64)
