# GloVe : getting relevant word embeddings

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import random
sns.set()

## 1. Setup

In [2]:
data = pd.read_csv('all_items_improved.csv', sep=',')

In [3]:
data = data[data.item_price > 0]
data = data[data.item_price < 200]
data['rest_location'] = data['rest_location'].apply(lambda x: x.split("\n")[1].strip())
data['rest_location'] = data['rest_location'].replace('', '0').replace('Bercy Village', '0').astype(int)

For speed, we're only using the 250 most common words.

In [4]:
context_window = 4
top_k = 250

def tokenize(string):
    return string.lower().split()

word_counter = collections.Counter()
for example in data.item:
    word_counter.update(tokenize(example))
    
vocabulary = [pair[0] for pair in word_counter.most_common(top_k)]
idx_to_word = dict(enumerate(vocabulary))
word_to_idx = dict(zip(idx_to_word.values(), idx_to_word.keys()))

### Extract coorcurrences

In [5]:
def extract_cooccurrences(dataset, word_map, amount_of_context=context_window):
    num_words = len(vocabulary)
    cooccurrences = np.zeros((num_words, num_words))
    nonzero_pairs = set()
    for example in dataset:
        words = tokenize(example)
        for target_index in range(len(words)):
            target_word = words[target_index]
            if target_word not in word_to_idx:
                continue
            target_word_index = word_to_idx[target_word]
            min_context_index = max(0, target_index - amount_of_context)
            max_word = min(len(words), target_index + amount_of_context + 1)
            for context_index in list(range(min_context_index, target_index)) + \
            list(range(target_index + 1, max_word)):
                context_word = words[context_index]
                if context_word not in word_to_idx:
                    continue
                context_word_index = word_to_idx[context_word]
                cooccurrences[target_word_index][context_word_index] += 1.0
                nonzero_pairs.add((target_word_index, context_word_index))
    return cooccurrences, list(nonzero_pairs)
                
cooccurrences, nonzero_pairs = extract_cooccurrences(data.item, vocabulary)

### Batchify data

In [6]:
def batch_iter(nonzero_pairs, cooccurrences, batch_size):
    start = -1 * batch_size
    dataset_size = len(nonzero_pairs)
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        word_i = []
        word_j = []
        counts = []
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)
        batch_indices = order[start:start + batch_size]
        batch = [nonzero_pairs[index] for index in batch_indices]
        for k in batch:
            counts.append(cooccurrences[k])
            word_i.append(k[0])
            word_j.append(k[1])
        yield [counts, word_i, word_j]
        

To be frank, a GloVe model trained on such a small dataset and vocabulary won't be spectacular, so we won't bother with a full-fledged similarity or analogy evaluation. Instead, we'll use the simple scoring function below, which grades the model on how well it captures ten easy/simple similarity comparisons. The function returns a score between 0 and 10. Random embeddings can be expected to get a score of 5.

## 2. Modeling

### Metric

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def similarity(model, word_one, word_two):
    vec_one = model.get_embeddings(word_to_idx[word_one]).reshape(1, -1)
    vec_two = model.get_embeddings(word_to_idx[word_two]).reshape(1, -1)
    return float(cosine_similarity(vec_one, vec_two))

def score(model):
    m = model
    score = 0
    score += similarity(m, 'cabillaud', 'poisson') > similarity(m, 'cabillaud', 'magret')
    score += similarity(m, 'de', 'au') > similarity(m, 'de', 'ou')
    score += similarity(m, 'poulet', 'boeuf') >  similarity(m, 'poulet', 'légumes')
    score += similarity(m, 'pomme', 'fruits') > similarity(m, 'pomme', 'fromage')
    score += similarity(m, 'chocolat', 'vanille') > similarity(m, 'chocolat', 'crème')
    score += similarity(m, 'mozzarella', 'fromage') > similarity(m, 'mozzarella', 'miel')
    score += similarity(m, 'café', 'thé') > similarity(m, 'café', 'huile')
    score += similarity(m, 'entrecôte', 'viande') > similarity(m, 'entrecôte', 'poisson')
    score += similarity(m, 'vin', 'champagne') > similarity(m, 'vin', 'soupe')
    score += similarity(m, 'confiture', 'nutella') > similarity(m, 'confiture', 'beignets')
    return score

### Model

In [8]:
training_set = data.item

In [9]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

class Glove(nn.Module):
    def __init__(self, embedding_dim, vocab_size, batch_size):
        super(Glove, self).__init__()
        self.word_embeddings = None
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.co_word_embeds = nn.Embedding(vocab_size, embedding_dim)
        
        self.bias_1 = Variable(torch.FloatTensor(batch_size))
        self.bias_2 = Variable(torch.FloatTensor(batch_size))

    
    def forward(self, counts, words, co_words, xmax, alpha):
        
        embedded_words = self.word_embeds(words)
        embedded_co_words = self.co_word_embeds(co_words)

        embed_prod = torch.sum(embedded_words * embedded_co_words, dim=1)
        weights = torch.FloatTensor([pow(count/xmax, alpha) if count <= xmax else 1 for count in counts])
        
        counts = Variable(torch.FloatTensor(counts))
        squared_error = torch.pow(embed_prod + self.bias_1 + self.bias_2 - torch.log(counts), 2)
        cost = torch.dot(Variable(weights), squared_error)
        
        if len(weights[weights<0]) > 0:
            print('weights negative !')
        elif len(squared_error.data[squared_error.data<0]) > 0:
            print('squared diff negative !')
        #print('cost = ' + str(cost.data[0]))
        return cost
        
        
    def init_weights(self, i_range):
        self.word_embeds.weight.data.uniform_(-i_range, i_range)
        self.co_word_embeds.weight.data.uniform_(-i_range, i_range)
        
        nn.init.uniform(self.bias_1, -i_range/2, i_range/2)
        nn.init.uniform(self.bias_2, -i_range/2, i_range/2)
    
    def add_embeddings(self):

        self.word_embeddings = (self.word_embeds.weight + self.co_word_embeds.weight).data.numpy()
        return self.word_embeddings
    
    def get_embeddings(self, index):
        if self.word_embeddings is None:
            add_embeddings()
        return self.word_embeddings[index, :]

### Training Loop

In [10]:
def training_loop(batch_size, num_epochs, model, optim, data_iter, xmax, alpha):
    step = 0
    epoch = 0
    losses = []
    total_batches = int(len(training_set) / batch_size)
    while epoch <= num_epochs:
        model.train()
        counts, words, co_words = next(data_iter)        
        words_var = Variable(torch.LongTensor(words))
        co_words_var = Variable(torch.LongTensor(co_words))
        
        model.zero_grad()

        loss = model(counts, words_var, co_words_var, xmax, alpha)

        losses.append(loss.data[0])
        loss.backward()
        optimizer.step()
        
        if step % total_batches == 0:
            epoch += 1
            if epoch % 25 == 0:
                word_embeddings = model.add_embeddings()
                print( "Epoch:", (epoch), "Avg Loss:", np.mean(losses)/(total_batches*epoch), "Score:", score(model) )
        
        step += 1

## 3. Testing

In [11]:
embedding_dim = 20
vocab_size = len(vocabulary)
batch_size = 1024
learning_rate = 1.
num_epochs = 2000
alpha = 0.75
xmax = 100

glove = Glove(embedding_dim, vocab_size, batch_size)
glove.init_weights(0.1)
optimizer = torch.optim.Adadelta(glove.parameters(), lr=learning_rate)
data_iter = batch_iter(nonzero_pairs, cooccurrences, batch_size)

training_loop(batch_size, num_epochs, glove, optimizer, data_iter, xmax, alpha)

Epoch: 25 Avg Loss: -1.61620822309e+14 Score: 7
Epoch: 50 Avg Loss: -6.43360156028e+13 Score: 7
Epoch: 75 Avg Loss: -1.09242667079e+12 Score: 6
Epoch: 100 Avg Loss: 5.51203022153e+12 Score: 6
Epoch: 125 Avg Loss: 1.13444099624e+13 Score: 6
Epoch: 150 Avg Loss: 1.11231930651e+13 Score: 6
Epoch: 175 Avg Loss: 7.36788674917e+12 Score: 7
Epoch: 200 Avg Loss: 4.87527862179e+12 Score: 8
Epoch: 225 Avg Loss: 4.33119384456e+12 Score: 8
Epoch: 250 Avg Loss: 3.01967699561e+12 Score: 8
Epoch: 275 Avg Loss: 2.17280474009e+12 Score: 8
Epoch: 300 Avg Loss: 2.16321371024e+12 Score: 8
Epoch: 325 Avg Loss: 1.43964351676e+12 Score: 8
Epoch: 350 Avg Loss: 1.58855064879e+12 Score: 8
Epoch: 375 Avg Loss: 994421697607.0 Score: 8
Epoch: 400 Avg Loss: 75987764320.2 Score: 8
Epoch: 425 Avg Loss: -168252915667.0 Score: 7
Epoch: 450 Avg Loss: 570220541927.0 Score: 7
Epoch: 475 Avg Loss: -26932509087.7 Score: 8
Epoch: 500 Avg Loss: 243040586690.0 Score: 8
Epoch: 525 Avg Loss: 396763581509.0 Score: 8
Epoch: 550 Av