#### Relevant import statements

In [None]:
import datetime
import math
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

#### Defining the data set:

In [None]:
# from utils import generate_training_set()

def read_in_data(text_file):
    """Read a text file and split every line into whitespace-separated tokens.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order. Each
        entry is the list of tokens produced by ``str.split()`` on that
        line, so a blank line yields an empty list.
    """
    with open(text_file, "r") as corpus:
        return [raw_line.split() for raw_line in corpus]

#### Creating the vocabulary:

In [None]:
# Load the training corpus as a list of lines, each line being a list of tokens.
training_data = read_in_data("../data/training.en")

In [None]:
def create_vocabulary(training_set):
    """Build word <-> index lookup tables for every distinct word.

    Words are numbered consecutively from 0 in order of first appearance.

    Args:
        training_set: Iterable of sentences, each an iterable of word tokens.

    Returns:
        A tuple ``(w2i, i2w)`` where ``w2i`` maps each word to its index and
        ``i2w`` maps each index back to its word.
    """
    w2i = {}
    i2w = {}
    for sentence in training_set:
        for word in sentence:
            # Dict membership is O(1), unlike scanning a growing list.
            if word not in w2i:
                # The next free index equals the number of words seen so far.
                idx = len(w2i)
                w2i[word] = idx
                i2w[idx] = word

    return w2i, i2w

In [None]:
# Build the word -> index and index -> word tables from the training corpus.
w2i, i2w = create_vocabulary(training_data)

In [None]:
# Vocabulary size: the number of distinct tokens found in the training data.
print(len(w2i.keys()))

#### Generating the skipgram data for a single sentence:

In [1]:
def generate_skipgram(sentence, context_window_size):
    """Produce (centre, context) index pairs for one sentence.

    For every position a window radius is drawn uniformly from
    ``1..context_window_size``; the word at that position is then paired
    with each other word lying within that radius, clipped to the sentence
    bounds. Words are converted to indices through the module-level ``w2i``.

    Args:
        sentence: Sequence of word tokens, all present in ``w2i``.
        context_window_size: Largest window radius that may be drawn.

    Returns:
        A list of two-element lists ``[centre_index, context_index]``.
    """
    pairs = []
    sentence_length = len(sentence)
    for centre_pos, centre_word in enumerate(sentence):
        # One random draw per word, so the window varies across the sentence.
        radius = random.randint(1, context_window_size)
        first = max(centre_pos - radius, 0)
        last = min(sentence_length, centre_pos + radius + 1)
        pairs.extend(
            [w2i[centre_word], w2i[sentence[pos]]]
            for pos in range(first, last)
            if pos != centre_pos
        )

    return pairs

#### Generating the data for the whole training corpus:

In [None]:
def generate_corpus_skipgrams(training_set, window_size):
    """Collect and shuffle the skip-gram pairs of every sentence in a corpus.

    Args:
        training_set: Iterable of sentences (sequences of word tokens).
        window_size: Maximum context window radius, forwarded to
            ``generate_skipgram``.

    Returns:
        A NumPy array built from the shuffled list of
        ``[centre_index, context_index]`` pairs.
    """
    pairs = []
    for sentence in training_set:
        pairs.extend(generate_skipgram(sentence, window_size))

    # Shuffle in place so that consecutive pairs do not come from one sentence.
    random.shuffle(pairs)
    return np.array(pairs)

#### Defining the network which will generate the embeddings:

In [None]:
class Skipgram_Net(nn.Module):
    """Two-layer linear network that learns skip-gram word embeddings.

    The first layer projects a vocabulary-sized input down to the embedding
    dimension; the second projects back up to vocabulary size. The output is
    a log-probability distribution over the vocabulary.
    """

    def __init__(self, embedding_dimension, vocabulary_size):
        """Create the two linear layers.

        Args:
            embedding_dimension: Size of the hidden (embedding) layer.
            vocabulary_size: Size of both the input and the output layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(vocabulary_size, embedding_dimension)
        self.fc2 = nn.Linear(embedding_dimension, vocabulary_size)

    def forward(self, x):
        """Return log-probabilities over the vocabulary for each input row.

        Args:
            x: Tensor whose last dimension has size ``vocabulary_size``.

        Returns:
            Tensor of the same leading shape as ``x`` with a last dimension
            of size ``vocabulary_size`` holding log-softmax scores.
        """
        x = self.fc1(x)
        x = self.fc2(x)
        # Normalise explicitly over the vocabulary axis (the last one).
        # Omitting `dim` is deprecated and triggers a warning; for the 2-D
        # batches used in training the result is identical.
        return F.log_softmax(x, dim=-1)

In [None]:
def make_batches(skipgram_training_data, batch_size):
    """Cut the training data into consecutive, equally sized batches.

    Only full batches are kept: trailing rows that do not fill a complete
    batch are discarded.

    Args:
        skipgram_training_data: NumPy array whose first axis indexes samples.
        batch_size: Number of samples per batch.

    Returns:
        A NumPy array built from the list of batches, each batch being a
        slice of ``batch_size`` consecutive rows of the input.
    """
    full_batches = skipgram_training_data.shape[0] // batch_size
    stop = full_batches * batch_size
    chunks = [
        skipgram_training_data[start:start + batch_size]
        for start in range(0, stop, batch_size)
    ]

    return np.array(chunks)

In [None]:
# Number of (centre, context) pairs per training batch.
BATCH_SIZE = 100
# Shuffled skip-gram index pairs for the whole corpus, maximum window radius 5.
skipgram_training_data = generate_corpus_skipgrams(training_data, 5)
# Group the pairs into full batches; pairs that do not fill a batch are dropped.
skipgram_training_data = make_batches(skipgram_training_data, BATCH_SIZE)

#### Training the network:

In [None]:
# Hyperparameters: size of the learned embeddings and number of passes
# over the batched training data.
EMBEDDING_DIMENSION = 100
epochs = 3

vocab_size = len(w2i.keys())
model = Skipgram_Net(EMBEDDING_DIMENSION, vocab_size)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)
# NLLLoss expects log-probabilities, which Skipgram_Net.forward returns.
loss_fn = torch.nn.NLLLoss()

# Number of batches processed per epoch.
print(len(skipgram_training_data))

for epoch in range(epochs):
    print("EPOCH NUMBER:", epoch)
    # Step counter within the current epoch (reset every epoch).
    i = 0

    for data_point in skipgram_training_data:
        # Each batch row is [centre_index, context_index].
        x_values = data_point[:, 0]
        y_values = data_point[:, 1]
        # One-hot encode the centre words: row idx gets a 1 in the column
        # of that row's centre-word index.
        input_to_network = torch.zeros(BATCH_SIZE, vocab_size)

        for idx in range(BATCH_SIZE):
            input_to_network[idx, x_values[idx]] = 1.0
        # Context-word indices are the class targets.
        target = torch.tensor(y_values, dtype=torch.long)
        output_of_network = model(input_to_network)

        loss = loss_fn(output_of_network, target)
        # NOTE(review): this `break` only leaves the inner batch loop, so the
        # rest of the current epoch is skipped but later epochs still run --
        # confirm this is the intended early-stopping behaviour.
        if loss.item() < 0.01:
            break

        # Report the loss every 100 steps.
        if i % 100 == 0:
            print("LOSS at step {} was {}".format(i, loss.item()))
        i +=1

        # Standard update: clear old gradients, backpropagate, take a step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Save the trained model.

# Create the output directory if it is missing. `exist_ok=True` replaces the
# separate existence check, which could race with another process creating
# the directory between the check and the call.
os.makedirs('../models', exist_ok=True)

# The file name encodes the number of epochs and the embedding dimension.
# NOTE: the whole model object is pickled, so loading the file later requires
# the Skipgram_Net class definition to be importable.
with open('../models/skipgram_{}-{}.model'.format(epochs, EMBEDDING_DIMENSION), 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

#### Getting the word embeddings from the trained model:

In [None]:
def create_embeddings(trained_model):
    """Extract one embedding vector per vocabulary word from a trained model.

    The embedding of the word with index ``i`` is column ``i`` of the first
    layer's weight matrix, i.e. the weights applied to position ``i`` of the
    one-hot input.

    Args:
        trained_model: Model exposing an ``fc1`` linear layer (such as
            ``Skipgram_Net``) whose input size covers every index in the
            module-level ``w2i``.

    Returns:
        A dict mapping every word in ``w2i`` to a 1-D NumPy array taken from
        the corresponding column of ``fc1``'s weight matrix.
    """
    # nn.Linear stores its weight as (out_features, in_features), so columns
    # correspond to input (vocabulary) positions. Convert to NumPy once
    # instead of once per word; `.cpu()` is a no-op for CPU models and makes
    # the conversion work when the model lives on another device.
    weight_matrix = trained_model.fc1.weight.data.cpu().numpy()

    return {word: weight_matrix[:, word_idx] for word, word_idx in w2i.items()}

In [None]:
embeddings_filepath = '../models/embeddings.pickle'

# Embeddings already cached on disk take precedence over the model in memory;
# only when no cache file exists are they derived from the model and written
# out for later runs.
if not os.path.exists(embeddings_filepath):
    embeddings_dict = create_embeddings(model)
    with open(embeddings_filepath, 'wb') as file:
        pickle.dump(embeddings_dict, file, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open(embeddings_filepath, 'rb') as file:
        embeddings_dict = pickle.load(file)

In [None]:
# Spot-check the embeddings of two words; a KeyError here means the word is
# not a key of the embeddings dictionary.
print(embeddings_dict["the"])
print(embeddings_dict["as"])