In [1]:
import numpy as np
import random

import torch

In [2]:
# Build the {word: float32 coefficient vector} lookup from the GloVe text file.
glove = {}
# `with` closes the handle even when a malformed line raises part-way through parsing.
with open('data/mini_glove.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        # First whitespace-separated token is the word, the rest are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove[word] = coefs

In [2]:
import torch.nn as nn
import torch.nn.functional as F

class Glove2Sparse(nn.Module):
    """Fully connected autoencoder with widths 100 -> 500 -> 5000 -> 500 -> 100.

    ``encode`` returns the 5000-wide activation after the second layer;
    ``forward`` continues through the remaining two layers. Every layer,
    including the last one, is followed by ReLU, so all outputs are >= 0.

    NOTE(review): because the final layer is also rectified, negative input
    components cannot be reconstructed -- confirm this is intended.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are the state_dict key prefixes; keep them (and their
        # creation order) stable so existing checkpoints and seeds still match.
        self.layer1 = nn.Linear(100, 500)
        self.layer2 = nn.Linear(500, 5000)
        self.layer3 = nn.Linear(5000, 500)
        self.layer4 = nn.Linear(500, 100)

    def encode(self, x):
        """Return the rectified activations of the 5000-unit second layer."""
        hidden = F.relu(self.layer1(x))
        return F.relu(self.layer2(hidden))

    def forward(self, x):
        """Encode ``x`` and decode the code through layers 3 and 4 (both ReLU)."""
        code = self.encode(x)
        hidden = F.relu(self.layer3(code))
        return F.relu(self.layer4(hidden))


net = Glove2Sparse()

In [8]:
import torch.optim as optim

def _flatten_layer(layer):
    """Concatenate all parameters of ``layer`` (weights and bias) into one 1-D tensor."""
    return torch.cat([p.view(-1) for p in layer.parameters()])


# NOTE(review): torch.cat allocates a new tensor, so these hold the parameter
# values as of the moment this cell ran and are not refreshed by later in-place
# optimizer updates. A regulariser that must follow the live weights should be
# recomputed from the parameters on every step instead of reusing these.
lin1_params = _flatten_layer(net.layer1)
lin2_params = _flatten_layer(net.layer2)
lin3_params = _flatten_layer(net.layer3)
lin4_params = _flatten_layer(net.layer4)
# Only the outermost layers are combined here; lin2_params and lin3_params are left out.
net_params = torch.cat([lin1_params, lin4_params])

# Reconstruction objective and the optimizer over every parameter of the network.
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [9]:
# Training configuration.
batch_size = 20
num_words = len(glove)  # number of entries in the GloVe dict
num_epochs = 10
learning_rate = 0.01
# L1 penalty weight; appeared to work best for the 100 -> 500 -> 5000 -> 500 -> 100 network.
lambda1 = .1125

In [10]:
def sample(sample_size):
    """Draw ``sample_size`` GloVe vectors uniformly at random, with replacement.

    Vectors are picked from the module-level ``glove`` dict with
    ``random.choice`` and stacked along a new leading dimension, giving a
    tensor with ``sample_size`` rows.
    """
    # Materialise the values once; the original rebuilt this list for every draw.
    vectors = list(glove.values())
    return torch.stack([torch.tensor(random.choice(vectors)) for _ in range(sample_size)], dim=0)

In [None]:
from torch.autograd import Variable

# Stack every embedding once. Rebuilding list(glove.values()) for each batch made
# an epoch quadratic in the vocabulary size.
train_vectors = torch.from_numpy(np.stack(list(glove.values())))
num_batches = len(train_vectors) // batch_size  # a trailing partial batch is skipped

# NOTE(review): the config cell defines num_epochs = 10, but this loop has always
# run 15 epochs; the literal is kept so behaviour does not silently change.
for epoch in range(15):
    epoch_loss = 0
    num_pts = 0
    # Iterate over all full batches (the original stopped one batch early).
    for j in range(num_batches):
        x_var = train_vectors[batch_size * j: batch_size * (j + 1)]
        optimizer.zero_grad()
        xpred_var = net(x_var)
        # The L1 term must be rebuilt from the live layer-1 parameters each step.
        # A tensor concatenated once before training is a copy, so its norm (and
        # the gradient it sends back) would never reflect the updated weights.
        l1_penalty = sum(p.abs().sum() for p in net.layer1.parameters())
        loss = criterion(xpred_var, x_var) + lambda1 * l1_penalty
        loss.backward()
        optimizer.step()
        # running loss
        epoch_loss += loss.item()
        num_pts += batch_size
    # Reported as (sum of per-batch losses) / (number of points seen), as before.
    print(f"Epoch {epoch} Average Loss:  {epoch_loss / num_pts}")

In [11]:
# Restore previously saved weights into `net`; the save call that writes this
# checkpoint is kept commented out on the next line.
# torch.save(net.state_dict(), "data/sparse_glove.pt")
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
net.load_state_dict(torch.load("data/sparse_glove.pt"))

<All keys matched successfully>

In [12]:
def word2vec(word):
    """Return the encoded representation of ``word`` as a detached tensor.

    Looks ``word`` up in the module-level ``glove`` dict (raises ``KeyError``
    when it is missing), adds a batch dimension, runs ``net.encode`` and
    squeezes the batch dimension away again.
    """
    # `encode` is a method of the model. The original free-function call
    # encode(net, ...) raises NameError on a fresh kernel.
    with torch.no_grad():  # inference only, no autograd graph needed
        return net.encode(torch.tensor(glove[word]).unsqueeze(0)).squeeze().detach()

In [13]:
def vec_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between two 1-D vectors.

    Both arguments are converted with ``np.asarray``, so NumPy arrays, lists
    and detached CPU tensors are accepted.
    """
    diff = np.asarray(vec1) - np.asarray(vec2)
    # Distance is the norm of the difference. The original took
    # sqrt(dot(vec1, vec2)) and never used `diff`.
    return np.sqrt(np.dot(diff, diff))

def word_dist(word1, word2):
    """Return ``vec_dist`` applied to the ``word2vec`` encodings of the two words."""
    first_code = word2vec(word1)
    second_code = word2vec(word2)
    return vec_dist(first_code, second_code)

In [14]:
def get_sparsity():
    """Print the mean and standard deviation of non-zero code entries per word.

    Every word in the module-level ``glove`` dict is encoded with ``word2vec``.
    The number of words processed so far is printed every 1000 words as a
    progress indicator, followed by the two summary statistics.
    """
    activation_counts = []
    for word in glove:
        # One vectorised comparison replaces a Python-level loop over every element.
        activation_counts.append(int((word2vec(word) != 0).sum()))
        if len(activation_counts) % 1000 == 0:
            print(len(activation_counts))
    print(np.average(activation_counts))
    print(np.std(activation_counts))

In [15]:
def get_quick_sparsity(max_words=1000):
    """Print mean and std of non-zero code entries over the first ``max_words`` words.

    Cheaper estimate of what ``get_sparsity`` reports: iteration over the
    module-level ``glove`` dict stops once ``max_words`` words have been
    encoded (default 1000, the previously hard-coded limit).
    """
    activation_counts = []
    for word in glove:
        if len(activation_counts) >= max_words:
            break
        # One vectorised comparison replaces a Python-level loop over every element.
        activation_counts.append(int((word2vec(word) != 0).sum()))
    print(np.average(activation_counts))
    print(np.std(activation_counts))

In [18]:
def vanilla_loss():
    """Print the reconstruction loss, without the L1 term, on the first batch.

    Only the first ``batch_size`` vectors of the module-level ``glove`` dict
    are evaluated.

    NOTE(review): the printed figure is ``criterion``'s value divided by
    ``batch_size``. If ``criterion`` already averages over elements (the
    nn.MSELoss default), this is smaller than the true mean loss by that
    factor. The scaling is kept so numbers stay comparable with earlier runs.
    """
    first_batch = list(glove.values())[:batch_size]
    x_var = torch.from_numpy(np.stack(first_batch))
    # Evaluation only: skip building an autograd graph.
    with torch.no_grad():
        loss = criterion(net(x_var), x_var)
    print(f"Average Loss:  {loss.item() / batch_size}")

In [19]:
# Report the reconstruction loss (no L1 term) for the currently loaded weights.
vanilla_loss()

Average Loss:  0.015339434146881104


In [20]:
# Quick sparsity estimate: mean and std of non-zero code entries over the first 1000 words.
get_quick_sparsity()

128.841
55.14616685681789


In [21]:
# Full sparsity statistics over every word in the vocabulary (prints progress every 1000 words).
get_sparsity()

1000
2000
3000
4000
5000
6000
7000
8000
9000
271.94321734745336
117.14839972178171


In [None]:
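# Experiment log. Each entry lists the L1 weight (lambda1), the value printed by
# vanilla_loss() where it was recorded, and then the mean and standard deviation
# of active (non-zero) code units printed by get_sparsity().
# lambda = 0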
# loss 0.012708248198032379
# 3664.67776096823
# 97.67200633555687

# lambda1 = .05
# 611.2133131618759
# 177.31054319599318

# lambda1 = .1
# loss 0.015839888155460356
# 316.0678769541099
# 118.53467815898497

# lambda = .1125
# loss 0.015339434146881104
# 271.94321734745336
# 117.14839972178171

# lambda = .125
# loss 0.01642419397830963
# 277.6256177508825
# 118.43740749871542

# lambda = .1375
# loss 0.017420050501823426
# 449.6806858295512
# 155.40801569268137

# lambda = .15
# loss 0.018738116323947906
# 1436.8970247100353
# 138.1195882626668

# lambda1 = .2
# loss 0.018738116323947906
# 2120.891477559254
# 50.82771591344319

# lambda= .3
# loss 0.018738116323947906
# 2271.539283913263
# 39.27734433492949

In [None]:
# Scaling down to ~ 500 can get around 70 neurons active