In [1]:
import datetime
import math
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

In [2]:
def read_in_data(text_file):

    data_output = []
    with open(text_file, "r") as file1:
        data_list = file1.readlines()
    for line in data_list:
        token_list = line.split()
        data_output.append(token_list)
    
    return data_output

In [3]:
training_data = read_in_data("../data/english-french_small/dev.en")

training_data = training_data[:5]

In [4]:
# TODO: Preprocess data

MAX_VOCABULARY_SIZE = 1000


In [5]:
def create_vocabulary(training_set):

    vocabulary = []
    for sentence in training_set:
        for word in sentence:
            if word not in vocabulary:
                vocabulary.append(word)
                
    w2i = dict()
    i2w = dict()
    for idx, word in enumerate(vocabulary):
        i2w[idx] = word
        w2i[word] = idx
    
    return w2i, i2w

In [6]:
w2i, i2w = create_vocabulary(training_data)
V = len(w2i.keys())

In [7]:
def generate_skipgram(sentence, context_window_size):

    skipgram_array = []
    for idx, word in enumerate(sentence):
        context_set = []
        window_size = context_window_size
        for index in range(max(idx - window_size, 0), min(len(sentence), idx + window_size + 1)):
            if index != idx:
                context_set.append([word, sentence[index]])
        skipgram_array.append(context_set)

    return skipgram_array

In [8]:
WINDOW_SIZE = 2
# Training data :
context_data = [generate_skipgram(sentence, WINDOW_SIZE) for sentence in training_data]

In [9]:
def make_input_batches(training_set, batch_size):

    sentences = training_set
    random.shuffle(sentences)
    
    new_data = []
    num_samples = len(sentences)
    for idx in range(num_samples // batch_size):
        batch = sentences[(idx)*batch_size : (idx+1)*batch_size]
        new_data.append(batch)
    
    return np.array(new_data)

In [10]:
def onehot(word, vocab_size=V):
    one_hot = torch.zeros(vocab_size)
    one_hot[w2i[word]] = 1.0

    return one_hot

In [11]:
BATCH_SIZE = 10
input_data = make_input_batches(training_data, BATCH_SIZE)

In [12]:
def divergence_closed_form(mu, variance):
    '''
    Closed form of the KL divergence
    '''
    return -0.5 * torch.sum(1 + variance - torch.pow(mu, 2) - torch.exp(variance))

In [13]:
def ELBO():
    '''
    Evidence Lower BOund
    '''
    pass

In [14]:
class linearity(nn.Module):
    def __init__(self, embedding_dimension, vocabulary_size):
        super(linearity, self).__init__()
        self.fc1 = nn.Linear(vocabulary_size, embedding_dimension)
    
    def forward(self, x):
        return self.fc1(x)

In [15]:
class BSG_Net(nn.Module):

    def __init__(self, vocabulary_size, embedding_dimension=20):

        super(BSG_Net, self).__init__()

        self.embedding_dimension = embedding_dimension

        self.fc1 = nn.Linear(vocabulary_size, embedding_dimension)
        self.fc2 = nn.Linear(embedding_dimension * 2, embedding_dimension * 2)
        self.fc3 = nn.Linear(embedding_dimension * 2, embedding_dimension)
        self.fc4 = nn.Linear(embedding_dimension * 2, embedding_dimension)
        
        # for reparameterization
        self.re1 = nn.Linear(embedding_dimension, vocabulary_size)

    def forward(self, x):

        context_representation = torch.zeros(self.embedding_dimension * 2)

        for pair in x:
            center_word = self.fc1(onehot(pair[0]))
            context_word = self.fc1(onehot(pair[1]))

            concatenated = torch.cat([center_word, context_word], dim=0)
            concatenated = F.relu(self.fc2(concatenated))
            context_representation += concatenated

        mu = self.fc3(context_representation)
        sigma = F.softplus(self.fc4(context_representation))

        epsilon = torch.distributions.multivariate_normal.MultivariateNormal(torch.zeros(self.embedding_dimension), torch.eye(self.embedding_dimension)).sample()

        z = mu + epsilon * sigma

        output = F.softmax(self.re1(z), dim=0)
    
        return output, mu, sigma


In [51]:
class prior_Net(nn.Module):

    def __init__(self, vocabulary_size, embedding_dimension=20):

        super(prior_Net, self).__init__()

        self.embedding_dimension = embedding_dimension

        self.L = nn.Linear(vocabulary_size, embedding_dimension)
        self.S = nn.Linear(vocabulary_size, embedding_dimension)

        self.fc1 = nn.Linear(embedding_dimension, vocabulary_size)

    def forward(self, x):
        one_hot_x = onehot(x)
        print(one_hot_x)

        mean = self.L(one_hot_x)
        print(mean)
        std = F.softplus(self.S(one_hot_x))
        print(std)
        
        variance = torch.diag(std ** 2)
        print(variance.size())
        
        z = torch.distributions.multivariate_normal.MultivariateNormal(mean,variance).sample()

        return F.softmax(self.fc1(z))


In [52]:
model = prior_Net(V, 20)

X_data = []

for sentence in training_data:
    for context_set in sentence:
        print(context_set)
        print(model(context_set))
        break
    break


they
tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.])
tensor([ 0.1105,  0.0659,  0.0550,  0.0430, -0.0277,  0.0152,  0.0858,
        -0.0045, -0.1165, -0.0768, -0.0380,  0.1864,  0.0444,  0.0791,
        -0.1116, -0.0209,  0.0007, -0.0714,  0.1654, -0.0790])
tensor([ 0.6355,  0.7289,  0.6768,  0.7013,  0.6903,  0.7614,  0.6969,
         0.6809,  0.6958,  0.6241,  0.7587,  0.7732,  0.6859,  0.6803,
         0.7058,  0.7029,  0.7312,  0.6305,  0.6773,  0.7072])
torch.Size([20, 20])
tensor(1.00000e-02 *
       [ 1.5288,  1.5307,  1.7900,  2.0366,  1.0883,  2.0240,  1.4070,
         0.9914,  0.6503,  1.0951,  1.6566,  



In [54]:
EMBEDDING_DIMENSION = 20

model = BSG_Net(V, EMBEDDING_DIMENSION)
X_data = []

for sentence in context_data:
    for context_set in sentence:
        centre_word = context_set[0][0]
        context_words = []
        for pair in context_set:
            context_words.append(pair[1])
        
        print(centre_word, context_words)
        print(model(context_set))
        break
    break
    
    
    
#     for context_set in sentence:
#         to_add = []
#         for pair in context_set:
#             center_word = model_1(onehot(pair[0]))
#             context_word = model_1(onehot(pair[1]))

#             concatenated = torch.cat([center_word, context_word], dim=0)
#             concatenated = F.relu(model_2(concatenated))
#             to_add.append(concatenated)
            
#         X_data.append(to_add)

each ['of', 'them']
(tensor(1.00000e-02 *
       [ 1.1333,  0.3253,  0.7358,  0.8131,  0.4344,  0.9877,  1.7538,
         1.2249,  1.7645,  1.5386,  1.7679,  0.4074,  0.9355,  1.2049,
         0.4938,  1.5430,  4.2881,  0.9575,  1.0323,  1.3115,  1.0283,
         0.9543,  1.1669,  0.8652,  0.4009,  0.8702,  0.4023,  1.8159,
         1.4200,  1.2375,  0.6987,  0.8913,  0.7348,  1.2793,  0.7638,
         0.9496,  1.1159,  0.6792,  1.3865,  1.9624,  2.3016,  3.2830,
         1.5433,  0.3706,  1.5974,  1.5622,  1.7638,  1.1937,  3.3319,
         0.8813,  1.1449,  3.3667,  1.0649,  1.8419,  2.5309,  1.6430,
         2.1641,  0.9611,  2.0965,  1.2293,  2.9615,  0.8344,  1.1256,
         0.6994,  2.7630,  0.7990,  0.8209,  1.9170,  1.4329,  0.5622,
         1.0094,  0.5617,  0.9119,  2.4526]), tensor([ 0.2179, -0.1650,  0.0678,  0.0549,  0.2392, -0.1790,  0.0595,
         0.3243,  0.0840,  0.0212,  0.1459, -0.0770,  0.0488, -0.0572,
         0.1568,  0.0681,  0.0598, -0.3085,  0.0341, -0.3099