# GloVe : getting relevant word embeddings

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import random
sns.set()

## 1. Setup

In [2]:
data = pd.read_csv('all_items_improved.csv', sep=',')

In [3]:
data = data[data.item_price > 0]
data = data[data.item_price < 200]
data['rest_location'] = data['rest_location'].apply(lambda x: x.split("\n")[1].strip())
data['rest_location'] = data['rest_location'].replace('', '0').replace('Bercy Village', '0').astype(int)

data.index = np.arange(len(data.item))

For speed, we're only using the 250 most common words.

In [4]:
context_window = 4
top_k = 250

def tokenize(string):
    return string.lower().split()

word_counter = collections.Counter()
for example in data.item:
    word_counter.update(tokenize(example))
    
vocabulary = [pair[0] for pair in word_counter.most_common(top_k)]
idx_to_word = dict(enumerate(vocabulary))
word_to_idx = dict(zip(idx_to_word.values(), idx_to_word.keys()))

### Extract coorcurrences

In [5]:
def extract_cooccurrences(dataset, word_map, amount_of_context=context_window):
    num_words = len(vocabulary)
    cooccurrences = np.zeros((num_words, num_words))
    nonzero_pairs = set()
    for example in dataset:
        words = tokenize(example)
        for target_index in range(len(words)):
            target_word = words[target_index]
            if target_word not in word_to_idx:
                continue
            target_word_index = word_to_idx[target_word]
            min_context_index = max(0, target_index - amount_of_context)
            max_word = min(len(words), target_index + amount_of_context + 1)
            for context_index in list(range(min_context_index, target_index)) + \
            list(range(target_index + 1, max_word)):
                context_word = words[context_index]
                if context_word not in word_to_idx:
                    continue
                context_word_index = word_to_idx[context_word]
                cooccurrences[target_word_index][context_word_index] += 1.0
                nonzero_pairs.add((target_word_index, context_word_index))
    return cooccurrences, list(nonzero_pairs)
                
cooccurrences, nonzero_pairs = extract_cooccurrences(data.item, vocabulary)

### Batchify data

In [6]:
# We will train on batches of (count, word_i, word_j) combinations.

def batch_iter(nonzero_pairs, cooccurrences, batch_size):
    start = -1 * batch_size
    dataset_size = len(nonzero_pairs)
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        word_i = []
        word_j = []
        counts = []
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)
        batch_indices = order[start:start + batch_size]
        batch = [nonzero_pairs[index] for index in batch_indices]
        for k in batch:
            counts.append(cooccurrences[k])
            word_i.append(k[0])
            word_j.append(k[1])
        yield [counts, word_i, word_j]
        

To be frank, a GloVe model trained on such a small dataset and vocabulary won't be spectacular, so we won't bother with a full-fledged similarity or analogy evaluation. Instead, we'll use the simple scoring function below, which grades the model on how well it captures ten easy/simple similarity comparisons. The function returns a score between 0 and 10. Random embeddings can be expected to get a score of 5.

## 2. Modeling

### Metric

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def similarity(model, word_one, word_two):
    vec_one = model.get_embeddings(word_to_idx[word_one]).reshape(1, -1)
    vec_two = model.get_embeddings(word_to_idx[word_two]).reshape(1, -1)
    return float(cosine_similarity(vec_one, vec_two))

def score(model):
    m = model
    score = 0
    score += similarity(m, 'cabillaud', 'poisson') > similarity(m, 'cabillaud', 'magret')
    score += similarity(m, 'de', 'au') > similarity(m, 'de', 'ou')
    score += similarity(m, 'poulet', 'boeuf') >  similarity(m, 'poulet', 'légumes')
    score += similarity(m, 'pomme', 'fruits') > similarity(m, 'pomme', 'fromage')
    score += similarity(m, 'chocolat', 'vanille') > similarity(m, 'chocolat', 'crème')
    score += similarity(m, 'mozzarella', 'fromage') > similarity(m, 'mozzarella', 'miel')
    score += similarity(m, 'café', 'thé') > similarity(m, 'café', 'huile')
    score += similarity(m, 'entrecôte', 'viande') > similarity(m, 'entrecôte', 'poisson')
    score += similarity(m, 'vin', 'champagne') > similarity(m, 'vin', 'soupe')
    score += similarity(m, 'confiture', 'nutella') > similarity(m, 'confiture', 'beignets')
    return score

### Model

In [8]:
training_set = data.item

In [9]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

class Glove(nn.Module):
    def __init__(self, embedding_dim, vocab_size, batch_size):
        super(Glove, self).__init__()
        self.word_embeddings = None
        
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.co_word_embeds = nn.Embedding(vocab_size, embedding_dim)
        
        self.bias_1 = Variable(torch.FloatTensor(batch_size))
        self.bias_2 = Variable(torch.FloatTensor(batch_size))

    
    def forward(self, counts, words, co_words, xmax, alpha):
        
        embedded_words = self.word_embeds(words)
        embedded_co_words = self.co_word_embeds(co_words)

        embed_prod = torch.sum(embedded_words * embedded_co_words, dim=1)
        weights = torch.FloatTensor([pow(count/xmax, alpha) if count <= xmax else 1 for count in counts])
        
        counts = Variable(torch.FloatTensor(counts))
        squared_error = torch.pow(embed_prod + self.bias_1 + self.bias_2 - torch.log(counts), 2)
        cost = torch.dot(Variable(weights), squared_error)
        
        return cost
        
        
    def init_weights(self, i_range):
        self.word_embeds.weight.data.uniform_(-i_range, i_range)
        self.co_word_embeds.weight.data.uniform_(-i_range, i_range)
        
        nn.init.uniform(self.bias_1, -i_range/2, i_range/2)
        nn.init.uniform(self.bias_2, -i_range/2, i_range/2)
    
    def add_embeddings(self):

        self.word_embeddings = (self.word_embeds.weight + self.co_word_embeds.weight).data.numpy()
        return self.word_embeddings
    
    def get_embeddings(self, index):
        if self.word_embeddings is None:
            add_embeddings()
        return self.word_embeddings[index, :]

### Training Loop

In [10]:
def training_loop(batch_size, num_epochs, model, optim, data_iter, xmax, alpha):
    step = 0
    epoch = 0
    losses = []
    total_batches = int(len(training_set) / batch_size)
    while epoch <= num_epochs:
        model.train()
        counts, words, co_words = next(data_iter)        
        words_var = Variable(torch.LongTensor(words))
        co_words_var = Variable(torch.LongTensor(co_words))
        
        model.zero_grad()

        loss = model(counts, words_var, co_words_var, xmax, alpha)

        losses.append(loss.data[0])
        loss.backward()
        optimizer.step()
        
        if step % total_batches == 0:
            epoch += 1
            if epoch % 25 == 0:
                word_embeddings = model.add_embeddings()
                print( "Epoch:", (epoch), "Avg Loss:", np.mean(losses)/(total_batches*epoch), "Score:", score(model) )
        
        step += 1

## 3. Testing

In [11]:
embedding_dim = 20
vocab_size = len(vocabulary)
batch_size = 1024
learning_rate = 1.
num_epochs = 2000
alpha = 0.75
xmax = 50

glove = Glove(embedding_dim, vocab_size, batch_size)
glove.init_weights(0.1)
optimizer = torch.optim.Adadelta(glove.parameters(), lr=learning_rate)
data_iter = batch_iter(nonzero_pairs, cooccurrences, batch_size)

training_loop(batch_size, num_epochs, glove, optimizer, data_iter, xmax, alpha)

Epoch: 25 Avg Loss: 0.237080909196 Score: 7
Epoch: 50 Avg Loss: 0.073311305709 Score: 8
Epoch: 75 Avg Loss: 0.0368637826011 Score: 7
Epoch: 100 Avg Loss: 0.0229283825998 Score: 7
Epoch: 125 Avg Loss: 0.0160282313203 Score: 7
Epoch: 150 Avg Loss: 0.0120515361653 Score: 7
Epoch: 175 Avg Loss: 0.00952154728788 Score: 7
Epoch: 200 Avg Loss: 0.00779610271279 Score: 7
Epoch: 225 Avg Loss: 0.00655690088338 Score: 7
Epoch: 250 Avg Loss: 0.00563092352251 Score: 7
Epoch: 275 Avg Loss: 0.00491689236152 Score: 7
Epoch: 300 Avg Loss: 0.0043521144968 Score: 7
Epoch: 325 Avg Loss: 0.00389575636429 Score: 7
Epoch: 350 Avg Loss: 0.00352031519006 Score: 7
Epoch: 375 Avg Loss: 0.00320684789023 Score: 7
Epoch: 400 Avg Loss: 0.00294161731788 Score: 7
Epoch: 425 Avg Loss: 0.00271466598811 Score: 7
Epoch: 450 Avg Loss: 0.00251849631068 Score: 7
Epoch: 475 Avg Loss: 0.00234736968637 Score: 7
Epoch: 500 Avg Loss: 0.00219705853159 Score: 7
Epoch: 525 Avg Loss: 0.00206393391676 Score: 7
Epoch: 550 Avg Loss: 0.00

In [29]:
word_embeddings = pd.DataFrame(glove.get_embeddings(range(top_k)))
word_embeddings.to_csv(path_or_buf='word_embeddings.csv', sep=',', index=False)

### Get menu items embeddings : mean of the embeddings of the word it contains

In [33]:
len(word_to_idx)

250

In [66]:
embed_features = pd.DataFrame(0., index=np.arange(len(data.item)), columns = ['embed_'+str(i) for i in range(embedding_dim)])

for i, example in enumerate(data.item):
    words = tokenize(example)
    embedding, counter = np.zeros(embedding_dim), 0
    for word in words:
        if word in vocabulary:
            embedding += np.array(word_embeddings.iloc[word_to_idx[word],:])
            counter +=1
    if counter != 0:
        embedding = embedding/counter
        
    embed_features.iloc[i,:] = embedding

In [69]:
embed_features

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,embed_10,embed_11,embed_12,embed_13,embed_14,embed_15,embed_16,embed_17,embed_18,embed_19
0,-1.874162,-0.514289,0.154482,0.924165,-0.673052,-2.115821,1.702737,-4.593929,-0.564549,0.173635,-2.300689,2.310296,3.617619,0.495811,0.014132,1.554574,-1.303211,0.243914,-2.443150,1.018097
1,-1.058594,0.468069,0.227778,0.520214,0.045407,-2.331490,1.768863,-2.797484,-0.172932,0.766593,-0.857880,2.104427,2.456851,0.387224,-0.071313,1.729251,-1.895630,1.147684,-2.283334,1.955687
2,-2.207431,-0.700030,0.232034,0.708987,-1.012587,-2.084536,1.686783,-4.490981,-0.523922,0.039748,-2.252694,2.448577,3.814959,0.462065,0.466589,1.327219,-1.418127,0.126875,-2.660127,1.008788
3,-0.560825,1.917226,-0.858227,2.181453,-1.857214,-0.313267,0.698386,-1.189683,-1.423026,0.997223,-1.054851,1.238168,1.309938,-0.111402,1.405877,2.931664,-1.250099,-0.027170,-0.558327,1.979148
4,-2.435404,-0.553675,0.637538,0.568536,-0.708323,-1.756932,1.326432,-3.763396,-0.692488,1.028051,-0.671153,1.262292,2.576393,-0.019247,0.722834,1.613377,-2.805046,0.391301,-2.973859,1.476662
5,-2.264134,0.329805,-0.441389,1.195848,-1.080354,-1.242548,1.563784,-3.409066,-1.377088,-0.430742,-2.344264,1.363507,2.448313,0.085914,0.524374,1.583412,-1.246872,0.170606,-1.386775,0.706923
6,-0.351700,0.237830,-0.306684,1.541778,1.148915,-1.250233,1.835066,-2.664384,0.818470,-0.444443,-0.312803,1.487834,2.906317,-0.353389,-0.059683,1.772423,-1.038495,-0.681221,-0.669538,0.771317
7,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,-1.781802,-1.613058,2.302900,2.264477,1.053507,-0.529145,2.364003,-0.500005,1.591717,-0.122060,-2.394910,-0.325162,0.549209,-0.672685,-0.512108,-3.080264,-0.940870,1.756414,-4.225825,0.393838
