**Word embeddings are a representation of the *semantics* of a word, efficiently encoding semantic information that might be relevant to the task at hand. You can embed other things too: part of speech tags, parse trees, anything! The idea of feature embeddings is central to the field.**

* [Pytorch Tutorial on Word Embedding](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html)
* [Distributional Semantics](https://en.wikipedia.org/wiki/Distributional_semantics)

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy

In [10]:
torch.manual_seed(42)

<torch._C.Generator at 0x1081aabf0>

In [11]:
anna = "anna.txt"
lines = []
trigrams = []
with open(anna,'r') as data:
    for k,line in enumerate(data.readlines()):
        if k > 5000:
            break
        if line.endswith("\n"):
            line = line.strip("\n")
            if len(line) > 1:
                lines.append(line)
            test_sentence = line.split()
            if len(test_sentence) >= 3:
                trigrams += [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2]) for i in range(len(test_sentence) - 2)]

In [12]:
words_list = [wrd for sent in lines for wrd in sent.split()]

In [13]:
word_lst = set(words_list)

In [14]:
wrd_to_idx = {wrd: i for i,wrd in enumerate(word_lst)}

In [15]:
len(trigrams)

34412

In [8]:
#Ngram Language Model
CONTEXT_SIZE = 2
EMBEDDING_DIM = 100

class NGramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModel,self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)
        
    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1,-1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
    
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModel(len(word_lst), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr = 0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:
        
        context_ids = torch.tensor([wrd_to_idx[w] for w in context],dtype = torch.long)
        model.zero_grad()
        log_probs = model(context_ids)
        loss = loss_function(log_probs,torch.tensor([wrd_to_idx[target]],
                                                   dtype=torch.long))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        print("Epoch "+str(epoch)+" Total Loss "+str(total_loss))
    losses.append(total_loss)
print(losses)

NameError: name 'word_lst' is not defined

In [198]:
y_pred = model(torch.tensor([1729,276],dtype=torch.long))

In [145]:
trigrams

[(['Happy', 'families'], 'are'),
 (['families', 'are'], 'all'),
 (['are', 'all'], 'alike;'),
 (['all', 'alike;'], 'every'),
 (['alike;', 'every'], 'unhappy'),
 (['every', 'unhappy'], 'family'),
 (['unhappy', 'family'], 'is'),
 (['family', 'is'], 'unhappy'),
 (['is', 'unhappy'], 'in'),
 (['unhappy', 'in'], 'its'),
 (['in', 'its'], 'own'),
 (['Everything', 'was'], 'in'),
 (['was', 'in'], 'confusion'),
 (['in', 'confusion'], 'in'),
 (['confusion', 'in'], 'the'),
 (['in', 'the'], "Oblonskys'"),
 (['the', "Oblonskys'"], 'house.'),
 (["Oblonskys'", 'house.'], 'The'),
 (['house.', 'The'], 'wife'),
 (['The', 'wife'], 'had'),
 (['discovered', 'that'], 'the'),
 (['that', 'the'], 'husband'),
 (['the', 'husband'], 'was'),
 (['husband', 'was'], 'carrying'),
 (['was', 'carrying'], 'on'),
 (['carrying', 'on'], 'an'),
 (['on', 'an'], 'intrigue'),
 (['an', 'intrigue'], 'with'),
 (['intrigue', 'with'], 'a'),
 (['with', 'a'], 'French'),
 (['girl,', 'who'], 'had'),
 (['who', 'had'], 'been'),
 (['had', 'be

In [35]:
wrd_to_idx["Stepan"]

1686

In [36]:
print(wrd_to_idx["Arkadyevitch"])

3357


In [199]:
#numpy.argsort(-y_pred.detach().squeeze(-1))

tensor([[7463, 5744, 1174,  ..., 5947, 3537, 4323]])

In [25]:
# for k,v in wrd_to_idx.items():
#     if v == 1174:
#         print(k)

In [None]:
# Save Model
torch.save(model.state_dict(), "word2vec_pytorch")

In [17]:
# Loading Model
model = NGramLanguageModel(len(word_lst), EMBEDDING_DIM, CONTEXT_SIZE)
model.load_state_dict(torch.load('word2vec_model'), strict=False)

<All keys matched successfully>

In [37]:
y_pred = model(torch.tensor([1686,3357],dtype=torch.long))

In [38]:
top_n, top_i = y_pred.topk(1)

In [39]:
for k,v in wrd_to_idx.items():
    if v == top_i.item():
        print(k)

impatiently
