In [1]:
import numpy as np
import torch 
from torch import nn,optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn.functional as F
torch.manual_seed(1)

<torch._C.Generator at 0x1979e730170>

In [2]:
class TransformerToTensor:
    """Callable transform that turns an index (or nested indices) into a flat int64 tensor."""

    def __call__(self, x):
        """Return ``x`` as a 1-D ``torch.long`` tensor."""
        as_long = torch.tensor(x, dtype=torch.long)
        return as_long.reshape(-1)

In [3]:
class CorpusDataset(Dataset):
    """Dataset of ``[center_idx, context_idx]`` word-index pairs built from a corpus.

    Each string in ``corpus`` is split on whitespace only, so punctuation stays
    attached to the word and case is preserved. For every position in a token
    list, the words up to ``context_size`` positions to the left and right are
    paired with the word at that position.

    Attributes set by ``__init__``:
        tokens:     list of token lists, one per corpus string.
        vocabulary: unique tokens in order of first appearance.
        word2idx:   dict token -> index into ``vocabulary``.
        idx2word:   dict index -> token.
        data:       list of ``(context_words, center_word)`` tuples.
        dataset:    list of ``[center_idx, context_idx]`` pairs.

    Note: after construction the ``word2idx`` / ``idx2word`` *attributes*
    (dicts) shadow the same-named builder methods on the instance.
    """

    def __init__(self,corpus,context_size = 2,transform = None):
        """Build tokens, vocabulary, lookup tables and index pairs.

        Args:
            corpus: iterable of strings.
            context_size: number of neighbours taken on each side of a word.
            transform: optional callable applied to both indices in
                ``__getitem__``.
        """
        self.transform = transform
        self.context_size = context_size
        self.tokens = self.tokenize_corpus(corpus)
        self.vocabulary = self.get_vocabulary(self.tokens)
        # These calls resolve to the methods (no instance attribute exists
        # yet); the resulting dicts then shadow the methods on this instance.
        self.word2idx = self.word2idx(self.vocabulary)
        self.idx2word = self.idx2word(self.vocabulary)
        self.data = self.context_vector(self.tokens)
        self.dataset = self.idx_pair(self.data)

    def get_vocabulary(self,tokens):
        """Return the unique tokens in first-seen order."""
        vocabulary = []
        seen = set()  # O(1) membership test instead of scanning the list
        for sentence in tokens:
            for token in sentence:
                if token not in seen:
                    seen.add(token)
                    vocabulary.append(token)
        return vocabulary

    def word2idx(self,vocabulary):
        """Map each word of ``vocabulary`` to its position."""
        return {w: idx for (idx, w) in enumerate(vocabulary)}

    def idx2word(self,vocabulary):
        """Map each position of ``vocabulary`` to its word."""
        return {idx: w for (idx, w) in enumerate(vocabulary)}

    def tokenize_corpus(self,corpus):
        """Split every corpus string on whitespace."""
        return [x.split() for x in corpus]

    def context_vector(self,tokens):
        """Return ``(context_words, center_word)`` for every token position.

        ``context_words`` lists the left neighbours (in sentence order)
        followed by the right neighbours, truncated at sentence boundaries.
        """
        # A non-positive context size yields empty contexts; clamping keeps
        # the slice bounds below from going negative.
        window = max(self.context_size, 0)
        data = []
        for sentence in tokens:
            for i in range(len(sentence)):
                left = sentence[max(i - window, 0):i]
                right = sentence[i + 1:i + 1 + window]
                data.append((list(left) + list(right), sentence[i]))
        return data

    def idx_pair(self,data):
        """Expand ``(context_words, center_word)`` into ``[center_idx, context_idx]`` pairs."""
        dataset = []
        for context_words, center_word in data:
            center_idx = self.word2idx[center_word]
            for context_word in context_words:
                dataset.append([center_idx, self.word2idx[context_word]])
        return dataset

    def show_dataset(self):
        """Print every pair as words: center word, then context word."""
        for input_word,output_word in self.dataset:
            print(self.idx2word[input_word],self.idx2word[output_word])

    def __getitem__(self,idx):
        """Return the ``(x, y)`` pair at ``idx``, each passed through ``transform`` if set."""
        x,y = self.dataset[idx]
        if self.transform:
            x = self.transform(x)
            y = self.transform(y)
        # Returned as a 2-tuple so it unpacks as ``for x, y in loader``.
        return x,y

    def __len__(self):
        """Number of index pairs."""
        return len(self.dataset)

In [28]:
class word2vec(nn.Module):
    """Embedding + linear model producing log-probabilities over the vocabulary.

    Computation for an input word index ``x`` in ``0 .. vocab_size - 1``:

        h   = relu(embed.weight[x])      # (D,)  row of the (V x D) embedding table
        z   = linear(h)                  # (V,)  h @ linear.weight.T + linear.bias
        out = log_softmax(z)             # (V,)  log-probabilities

    ``embed.weight`` has shape (V, D); ``linear.weight`` is stored by
    ``nn.Linear`` as (V, D) as well (out_features x in_features).

    Args:
        vocab_size: number of distinct words (V).
        word_dims: embedding dimension (D).
    """

    def __init__(self,vocab_size,word_dims): #VxD
        super(word2vec,self).__init__()
        self.embed = nn.Embedding(vocab_size,word_dims)
        self.linear = nn.Linear(word_dims,vocab_size)

    def forward(self,x):
        """Return log-probabilities of shape ``x.shape + (vocab_size,)``.

        Args:
            x: integer tensor of word indices, any shape.
        """
        hidden = torch.relu(self.embed(x))
        logits = self.linear(hidden)
        # Normalise over the vocabulary axis, which is always the last one.
        # dim=1 is only the vocabulary axis for (N,) input: for (B, 1) index
        # batches it is a singleton axis, and for 0-dim input it does not exist.
        return torch.log_softmax(logits, dim=-1)

In [8]:
# corpus2 is a one-element list: the whole passage is a single string, so
# CorpusDataset tokenizes it as one token sequence (split on whitespace;
# punctuation stays attached to the words).
corpus2 = ["""We are about to study the idea of a computational process.
She me are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""]
# Build index pairs with a window of 2 words on each side; every index is
# returned as a 1-element long tensor via TransformerToTensor.
data_set = CorpusDataset(corpus2,context_size = 2,transform = TransformerToTensor())

In [29]:
# Smoke test: push the first sample through a freshly initialised model.
vocab_size = len(data_set.vocabulary)
word_embed_dim = 10
model = word2vec(vocab_size, word_embed_dim)

print("input = ", data_set[0][0], " \ny = ", data_set[0][1])

# yhat holds one row of log-probabilities over the vocabulary.
yhat = model(data_set[0][0])
print("yhat = ", yhat)

input =  tensor([0])  
y =  tensor([1])
tensor([[0.0000, 0.5319, 0.0000, 0.8346, 0.0000, 1.7922, 0.0000, 2.2565, 0.8924,
         0.0146]], grad_fn=<ReluBackward0>)
yhat =  tensor([[-4.8989, -4.1912, -3.0938, -4.7501, -3.3991, -4.0072, -4.5861, -4.0094,
         -4.0515, -3.3863, -4.1836, -3.6497, -4.4531, -4.3826, -3.7989, -4.7417,
         -3.4342, -4.8165, -4.5119, -4.3071, -4.8730, -4.4159, -3.1762, -2.9388,
         -4.0777, -3.5504, -4.1216, -3.3930, -4.1421, -4.2582, -4.1913, -3.4914,
         -3.8932, -2.7442, -4.9879, -3.8065, -4.2943, -4.3899, -4.5250, -3.8943,
         -3.9443, -3.8186, -3.9188, -5.2185, -2.9947, -4.1476, -4.4780, -4.0423,
         -5.2974, -4.8115]], grad_fn=<LogSoftmaxBackward>)


In [10]:
def train_model(model,criterion,optimizer,train_loader,iter=100,log_every=100):
    """Run a plain training loop and periodically print the loss.

    Args:
        model: callable mapping an input batch ``x`` to predictions.
        criterion: loss callable taking ``(yhat, y)``.
        optimizer: optimizer exposing ``zero_grad()`` and ``step()``.
        train_loader: iterable yielding ``(x, y)`` pairs; it is iterated once
            per epoch.
        iter: number of epochs (the name shadows the builtin but is kept
            because callers pass it by keyword).
        log_every: print on epochs divisible by this value; ``0`` or ``None``
            disables printing.

    The printed value is the loss of the last ``(x, y)`` pair of the epoch,
    not an epoch average. Nothing is printed for an epoch in which
    ``train_loader`` yielded no data.
    """
    for epoch in range(iter):
        last_loss = None  # stays None when the loader yields nothing
        for x,y in train_loader:
            yhat = model(x)
            loss = criterion(yhat,y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            last_loss = loss
        if log_every and last_loss is not None and epoch % log_every == 0:
            print("Epoch #",epoch , " Loss = ",last_loss.item())

In [37]:
# corpus1: seven short sentences, one string per list element, so each
# sentence is tokenized separately.
corpus1 = [
    'he is a king',
    'she is a queen',
    'he is a man',
    'she is a woman',
    'warsaw is poland capital',
    'berlin is germany capital',
    'paris is france capital',
]
# corpus2: a single multi-line string in a one-element list, so the whole
# passage is treated as one token sequence.
corpus2 = ["""We are about to study the idea of a computational process.
She me are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells."""]

# corpus3: two short sentences stored as one string (one list element).
corpus3 = ["The earth revolves around the sun. The moon revolves around the earth"]

In [38]:

# Wrap the corpus3 pairs in a DataLoader and look at the first sample of the
# first batch only.
data_set = CorpusDataset(corpus3, context_size=2, transform=TransformerToTensor())
train_loader = DataLoader(dataset=data_set, batch_size=32)
for x, y in train_loader:
    print("x = ", x[0], " \ny= ", y[0])
    break

x =  tensor([0])  
y=  tensor([1])


In [12]:
# Train on corpus2. The dataset itself is passed as the "loader", so samples
# are fed one pair at a time; the original note here reports that a DataLoader
# with batch_size > 1 raised an error with the embedding model.
data_set = CorpusDataset(corpus2, context_size=2, transform=TransformerToTensor())

vocab_size = len(data_set.vocabulary)
word_embed_dim = 10
model = word2vec(vocab_size, word_embed_dim)

learning_rate = 0.001
criterion = nn.NLLLoss()  # pairs with the model's log_softmax output
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

train_model(model, criterion, optimizer, data_set, iter=10000)

Epoch # 0  Loss =  3.576582908630371
Epoch # 100  Loss =  3.644570827484131
Epoch # 200  Loss =  3.5677151679992676
Epoch # 300  Loss =  3.435227870941162
Epoch # 400  Loss =  3.2720112800598145
Epoch # 500  Loss =  3.0820553302764893
Epoch # 600  Loss =  2.8649721145629883
Epoch # 700  Loss =  2.6252787113189697
Epoch # 800  Loss =  2.377237558364868
Epoch # 900  Loss =  2.139230728149414
Epoch # 1000  Loss =  1.924010157585144
Epoch # 1100  Loss =  1.7381389141082764
Epoch # 1200  Loss =  1.5817519426345825
Epoch # 1300  Loss =  1.451515555381775
Epoch # 1400  Loss =  1.3430540561676025
Epoch # 1500  Loss =  1.2523380517959595
Epoch # 1600  Loss =  1.176454782485962
Epoch # 1700  Loss =  1.1130121946334839
Epoch # 1800  Loss =  1.0600882768630981
Epoch # 1900  Loss =  1.0161319971084595
Epoch # 2000  Loss =  0.9798581004142761
Epoch # 2100  Loss =  0.9500794410705566
Epoch # 2200  Loss =  0.9257381558418274
Epoch # 2300  Loss =  0.9060798287391663
Epoch # 2400  Loss =  0.890168607234

In [13]:
# Detached copies of the two learned weight matrices.
W1 = model.embed.weight.detach().clone()
W2 = model.linear.weight.detach().clone()
print(W1.shape)
print(W2.shape)

torch.Size([50, 10])
torch.Size([50, 10])


In [24]:
def get_similar_words(word,top_k,Weights,dataset=None):
    """Print the words whose weight rows have the largest dot product with ``word``'s row.

    Args:
        word: query word; must be a key of ``dataset.word2idx`` (``KeyError``
            otherwise).
        top_k: number of highest-scoring rows to consider. It is clamped to
            the number of rows in ``Weights``. The query word is skipped if it
            is among them, so fewer than ``top_k`` lines may be printed.
        Weights: 2-D tensor of shape (V, D), one row per vocabulary index.
        dataset: object exposing ``word2idx`` and ``idx2word``; defaults to
            the notebook-level ``data_set``.

    Scores are raw dot products (not cosine similarity), so rows with large
    norms rank higher.
    """
    if dataset is None:
        dataset = data_set  # fall back to the notebook-level dataset
    idx = dataset.word2idx[word]
    # Index the row directly: wrapping an existing tensor in torch.tensor()
    # copy-constructs it and emits a UserWarning.
    word_vec = Weights[idx].detach().reshape(-1,1)
    # (V x D) @ (D x 1) -> (V x 1) column of scores
    yhat = torch.mm(Weights,word_vec)
    k = min(top_k, yhat.size(0))  # torch.topk raises when k exceeds the size
    values , most_similar_idx =  torch.topk(yhat, k=k, dim=0)
    for i in range(k):
        most_similar_word = dataset.idx2word[most_similar_idx[i].item()]
        if(most_similar_word == word):
            continue
        print(most_similar_word,values[i].item())

In [27]:
# Compare nearest words for one query under each of the two weight matrices.
print(corpus2)
word = "manipulate"

print("\nWith weights W1 (nn.Embedded)\n")
get_similar_words(word, top_k=4, Weights=W1)

print("\nWith Weights W2 (nn.Linear)\n")
get_similar_words(word, top_k=4, Weights=W2)

['We are about to study the idea of a computational process.\nShe me are abstract beings that inhabit computers.\nAs they evolve, processes manipulate other abstract things called data.\nThe evolution of a process is directed by a pattern of rules\ncalled a program. People create programs to direct processes. In effect,\nwe conjure the spirits of the computer with our spells.']

With weights W1 (nn.Embedded)

processes 22.991008758544922
a 19.313079833984375
are 14.851171493530273

With Weights W2 (nn.Linear)

other 23.78009033203125
processes 23.505077362060547
abstract 19.477222442626953


  after removing the cwd from sys.path.
