In [1]:
import numpy as np
import torch 
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn.functional as F
torch.manual_seed(1)

<torch._C.Generator at 0x2b3e01b0170>

In [2]:
class TransformerToTensor():
    def __init__(self):
        pass
    def __call__(self,x):
        return torch.tensor(x,dtype=torch.long)

In [39]:
class CorpusDataset(Dataset):
    def __init__(self,corpus,context_size = 2,transform = None):
        self.transform = transform
        self.context_size = context_size
        self.tokens = self.tokenize_corpus(corpus)
        self.vocabulary = self.get_vocabulary(self.tokens)
        self.word2idx = self.word2idx(self.vocabulary)
        self.idx2word = self.idx2word(self.vocabulary)     
        self.data = self.context_vector(self.tokens)
        self.dataset = self.idx_pair(self.data)

            
    def get_vocabulary(self,tokens):
        vocabulary = []
        for sentence in tokens:
            for token in sentence:
                if token not in vocabulary:
                    vocabulary.append(token)
        return vocabulary
    
    def word2idx(self,vocabulary):
        return {w: idx for (idx, w) in enumerate(vocabulary)}
    
    def idx2word(self,vocabulary):
        return {idx: w for (idx, w) in enumerate(vocabulary)}
    
    def tokenize_corpus(self,corpus):
        tokens = [x.split() for x in corpus]
        return tokens
    
    def context_vector(self,tokens):
        data = []
        for sentence in tokens:
            for i in range(0,len(sentence)): #min len(sentence) = 2
                context = []
                for j in reversed(range(1,self.context_size+1)):
                    if((i-j) >= 0): #if sentence[i-j] != null
                        context.append(sentence[i-j])
                for j in range(1,self.context_size+1):
                    if((i+j) < len(sentence)): #if sentence[i+j] != null
                        context.append(sentence[i+j])
                target = sentence[i]
                data.append((context, target))
        return data
    
    def idx_pair(self,data):
        dataset = []
        for input_sent,target_word in data: 
            dataset.append([[self.word2idx[input_word] for input_word in input_sent],self.word2idx[target_word]])
        return dataset
    
    def show_dataset(self):
        for input_word,output_word in self.dataset:
            print(self.idx2word[input_word],self.idx2word[output_word])
            
    def __getitem__(self,idx):
        x,y = self.dataset[idx]
        if(self.transform):
           #x = torch.tensor(x,dtype=torch.long)
            x = self.transform(x)
            y = self.transform([y])
        return x,y 
        #it must be returned like to to be casting as set for dataLoader
    
    def __len__(self):
        return len(self.dataset)

In [50]:
'''
X = list of indices input from vocab [0 ,2 ]
V = sum(W1[X])/len(X) = sum(W1[0],W1[3],W1[2])/3
out = dot(V,W2)
out = softmax(out)
W1.shape = (VxD)
W2.shape = (DxV)
V = (dx1)

'''
class word2vec(nn.Module):
    def __init__(self,vocab_size,word_dims): #VxD
        super(word2vec,self).__init__()
        self.embed = nn.Embedding(vocab_size,word_dims)
        self.linear = nn.Linear(word_dims,vocab_size)
        
    def forward(self,x):
        W1 = torch.relu(self.embed(x))
        V = W1.sum(dim=0).view(1,-1)/len(x)
        print(V)
        W2 = self.linear(V)
        out = torch.log_softmax(W2,dim=1)
        return out

In [40]:
corpus2 = ["""We are about to study the idea of a computational process.
She me are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""]
data_set = CorpusDataset(corpus2,context_size = 2,transform = TransformerToTensor())

In [52]:
#testing my code 
vocab_size = len(data_set.vocabulary)
word_embed_dim = 10
model = word2vec(vocab_size,word_embed_dim)
yhat = model(data_set[0][0])
print("input = ",data_set[0][0], " \noutput = ",yhat)
print("y = ",data_set[0][1])

tensor([[0.0000, 0.0000, 0.7041, 0.4893, 0.0271, 0.8223, 0.0000, 0.6789, 1.8353,
         0.7729]], grad_fn=<DivBackward0>)
input =  tensor([1, 2])  
output =  tensor([[-2.3820, -1.9742, -1.8417, -1.6153, -1.5184, -2.2069, -2.4976]],
       grad_fn=<LogSoftmaxBackward>)
y =  tensor([0])


In [43]:
def train_model(model,criterion,optimizer,train_loader,iter=100):
    for epoch in range(iter):
        for x,y in train_loader:
            yhat = model(x)
            loss = criterion(yhat,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if(epoch%100 == 0):
            print("Epoch #",epoch , " Loss = ",loss.item())

In [44]:
corpus1 = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]
corpus2 = ["""We are about to study the idea of a computational process.
She me are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""]

corpus3 = ["The earth revolves around the sun. The moon revolves around the earth"]

In [45]:
data_set = CorpusDataset(corpus3,context_size = 2,transform = TransformerToTensor())
#train_loader = DataLoader(dataset = data_set,batch_size=32)
#hint: i cannot use DataLoader because input not the same shape
#if you want to use DataLoader you have to make input fixed size it's the only options and it's not hard 
vocab_size = len(data_set.vocabulary)
word_embed_dim = 10
model = word2vec(vocab_size,word_embed_dim)
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(),lr = learning_rate)
criterion = nn.NLLLoss()
train_model(model,criterion,optimizer,data_set,iter=10000)

Epoch # 0  Loss =  2.2688112258911133
Epoch # 100  Loss =  2.074704170227051
Epoch # 200  Loss =  1.9486188888549805
Epoch # 300  Loss =  1.8596223592758179
Epoch # 400  Loss =  1.7910146713256836
Epoch # 500  Loss =  1.73349130153656
Epoch # 600  Loss =  1.681817650794983
Epoch # 700  Loss =  1.6330299377441406
Epoch # 800  Loss =  1.5854477882385254
Epoch # 900  Loss =  1.538126826286316
Epoch # 1000  Loss =  1.4905660152435303
Epoch # 1100  Loss =  1.4425263404846191
Epoch # 1200  Loss =  1.3937621116638184
Epoch # 1300  Loss =  1.3443530797958374
Epoch # 1400  Loss =  1.2945390939712524
Epoch # 1500  Loss =  1.2446238994598389
Epoch # 1600  Loss =  1.194955587387085
Epoch # 1700  Loss =  1.145906686782837
Epoch # 1800  Loss =  1.0978474617004395
Epoch # 1900  Loss =  1.0511244535446167
Epoch # 2000  Loss =  1.0060362815856934
Epoch # 2100  Loss =  0.9628233909606934
Epoch # 2200  Loss =  0.9216533303260803
Epoch # 2300  Loss =  0.8832783699035645
Epoch # 2400  Loss =  0.84831279516

In [46]:
#print(model.state_dict()['embed.weight'].size())
W1 = model.state_dict()['embed.weight'].clone().detach()
W2 = model.state_dict()['linear.weight'].clone().detach()
print(W1.shape)
print(W2.shape)

torch.Size([7, 10])
torch.Size([7, 10])


In [47]:
def get_similar_words(word,top_k,Weights):
    idx = torch.tensor(data_set.word2idx[word])
    #W1(VxD) dot word_vec(Dx1) = Vx1 
    word_vec = torch.tensor(Weights[idx]).view(-1,1)
    yhat = torch.mm(Weights,word_vec)
    top_k = 4
    values , most_similar_idx =  torch.topk(yhat, k=top_k, dim=0)
    for i in range(top_k):
        most_similar_word = data_set.idx2word[most_similar_idx[i].item()]
        if(most_similar_word == word):
            continue
        print(most_similar_word,values[i].item())

In [49]:
word = "around"
print("With weights W1 (nn.Embedded)\n")
get_similar_words(word,top_k=3,Weights=W1)
print("\nWith Weights W2 (nn.Linear)\n")
get_similar_words(word,top_k=3,Weights=W2)

#W1 gives best results 

With weights W1 (nn.Embedded)

The 10.775361061096191
revolves 7.231320381164551
sun. 4.813801288604736

With Weights W2 (nn.Linear)

The 8.588621139526367
revolves 3.4847989082336426
sun. -1.274202585220337


  after removing the cwd from sys.path.
