Relevant import statements

In [14]:
import os
import numpy as np
import torch
import random
import math
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

Defining the data set:

In [15]:
#from utils import generate_training_set()

def read_in_data(text_file):
    data_output = []
    with open(text_file, "r") as file1:
        data_list = file1.readlines()
    for line in data_list:
        token_list = line.split()
        data_output.append(token_list)
    return data_output

Creating the vocabulary:

In [16]:
training_data = read_in_data("training.en")

In [21]:
def create_vocabulary(training_set):
    vocabulary = []
    for sentence in training_set:
        for word in sentence:
            if word not in vocabulary:
                vocabulary.append(word)
                
    w2i = dict()
    i2w = dict()
    for idx, word in enumerate(vocabulary):
        i2w[idx] = word
        w2i[word] = idx
    
    return w2i, i2w

In [22]:
w2i, i2w = create_vocabulary(training_data)

In [23]:
print(len(w2i.keys()))

36635


Generating the skipgram data for a single sentence:

In [24]:
def generate_skipgram(sentence, context_window_size):
    skipgram_array = []
    for idx, word in enumerate(sentence):
        window_size = random.randint(1, context_window_size)
        for index in range(max(idx - window_size, 0), min(len(sentence), idx + window_size + 1)):
            if index == idx:
                pass
            else:
                skipgram_array.append([w2i[word], w2i[sentence[index]]])
    return skipgram_array
        
        

Generating the data for the whole training corpus:

In [25]:
def generate_corpus_skipgrams(training_set, window_size):
    skipgrams = []
    for sentence in training_set:
        for skipgram in generate_skipgram(sentence, window_size):
            skipgrams.append(skipgram)
    random.shuffle(skipgrams)
    return np.array(skipgrams)

Defining the network which will generate the embeddings:

In [26]:
class Skipgram_Net(nn.Module):
    def __init__(self, embedding_dimension, vocabulary_size):
        super(Skipgram_Net, self).__init__()
        self.fc1 = nn.Linear(vocabulary_size, embedding_dimension)
        self.fc2 = nn.Linear(embedding_dimension, vocabulary_size)
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return F.log_softmax(x)

In [27]:
def make_batches(skipgram_training_data, batch_size):
    new_data = []
    num_samples = skipgram_training_data.shape[0]
    for idx in range(num_samples // batch_size):
        batch = skipgram_training_data[(idx)*batch_size : (idx+1)*batch_size]
        new_data.append(batch)
    return np.array(new_data)

In [39]:
BATCH_SIZE = 100
skipgram_training_data = generate_corpus_skipgrams(training_data, 5)
skipgram_training_data = make_batches(skipgram_training_data, BATCH_SIZE)

Training the network:

In [44]:
EMBEDDING_DIMENSION = 100
vocab_size = len(w2i.keys())
model = Skipgram_Net(EMBEDDING_DIMENSION, vocab_size)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
loss_fn = torch.nn.NLLLoss()
print(len(skipgram_training_data))

for epoch in range(3):
    print("EPOCH NUMBER:", epoch)
    i = 0
    for data_point in skipgram_training_data:
        x_values = data_point[:,0]
        y_values = data_point[:,1]
        input_to_network = torch.zeros(BATCH_SIZE, vocab_size)
        for idx in range(BATCH_SIZE):
            input_to_network[idx, x_values[idx]] = 1.0
        target = torch.tensor(y_values, dtype=torch.long)
        output_of_network = model(input_to_network)
        
        loss = loss_fn(output_of_network, target)
        if loss.item() < 0.01:
            break
        
        if i % 100 == 0:
            print("LOSS at step {} was {}".format(i, loss.item()))
        i +=1
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


204138
EPOCH NUMBER: 0
LOSS at step 0 was 10.529213905334473


  # Remove the CWD from sys.path while we load stuff.


LOSS at step 100 was 10.497106552124023
LOSS at step 200 was 10.426531791687012
LOSS at step 300 was 10.264270782470703
LOSS at step 400 was 10.031286239624023
LOSS at step 500 was 9.665971755981445
LOSS at step 600 was 9.226372718811035
LOSS at step 700 was 8.732582092285156
LOSS at step 800 was 8.236682891845703
LOSS at step 900 was 7.920852184295654
LOSS at step 1000 was 7.865729808807373
LOSS at step 1100 was 7.284582614898682
LOSS at step 1200 was 7.542508602142334
LOSS at step 1300 was 6.8805742263793945
LOSS at step 1400 was 6.900568962097168
LOSS at step 1500 was 6.9012556076049805
LOSS at step 1600 was 6.728001594543457
LOSS at step 1700 was 6.968002796173096
LOSS at step 1800 was 6.4681243896484375
LOSS at step 1900 was 6.379810810089111
LOSS at step 2000 was 6.505972385406494
LOSS at step 2100 was 6.445388317108154
LOSS at step 2200 was 6.7486796379089355
LOSS at step 2300 was 7.121939659118652
LOSS at step 2400 was 6.540307521820068
LOSS at step 2500 was 7.535333156585693
L

KeyboardInterrupt: 

Getting the word embeddings from the trained model:

In [45]:
def create_embeddings(trained_model):
    embeddings = dict()
    params = list(trained_model.parameters())
    learned_weights = trained_model.fc1.weight.data
    for word in w2i.keys():
        word_idx = w2i[word]
        embeddings[word] = learned_weights[:, word_idx].numpy()
    return embeddings

embeddings_dict = create_embeddings(model)

In [46]:
print(embeddings_dict["the"])
print(embeddings_dict["as"])

[-0.00928869 -0.0232761   0.02100292 -0.00405158 -0.0309203   0.02540967
 -0.00217791  0.02410931  0.05206803 -0.01831422 -0.05881453  0.01429875
 -0.00822011  0.01345269  0.009621    0.00642681  0.03871971 -0.00523754
  0.04309646  0.03745118 -0.00880113  0.05257478 -0.04627841  0.04463107
  0.00845229 -0.02368473 -0.00888878 -0.00955483  0.03948393 -0.02397909
  0.02089704  0.01606143  0.02256639  0.01473693  0.05186039 -0.04609038
  0.01349395 -0.02449494 -0.00146349 -0.0455589  -0.06575637  0.00218241
 -0.00816757  0.02345874  0.00545223 -0.03933325 -0.02481946  0.00852639
 -0.01204512  0.0238484  -0.00201765  0.05104383  0.00889562  0.0027562
 -0.02173823  0.0430742  -0.02392623  0.00160262  0.02351004 -0.03623751
 -0.05046012  0.0224936   0.01142139 -0.00485048 -0.042598    0.04487596
 -0.00352428 -0.00651073 -0.01516988 -0.04971393  0.00345578  0.01938071
  0.00084703 -0.00105783  0.02925267  0.01949517 -0.05746623  0.05075869
 -0.00186497 -0.02196809 -0.01765833  0.00055218 -0.