In [1]:
import itertools
import random
from collections import Counter
from math import log
from random import shuffle

import numpy as np
from scipy import sparse
from scipy.spatial import distance

In [2]:
# Toy training text. The literal is split on newlines, so `corpus` is a list
# with one string per physical line of the text below; the blank lines that
# separate paragraphs become empty strings.
corpus = ('''Lionel Messi born 24 June 1987 is an Argentine professional footballer who plays as a forward for 
Spanish club Barcelona and the Argentine national team. Often considered the best player in the world and regarded
by many as the greatest of all time, Messi has a record-tying five Ballon d'Or awards,four of which he won 
consecutively, and a record five European Golden Shoes. He has spent his entire professional career with Barcelona, 
where he has won 32 trophies, including nine La Liga titles, four UEFA Champions League titles, and six Copas 
del Rey. Both a prolific goalscorer and a creative playmaker, Messi holds the records for most official goals 
scored in La Liga (383), a La Liga season (50), a club football season in Europe (73), a calendar year (91), 
El Clásico (26), as well as those for most assists in La Liga (149) and the Copa América (11). He has scored over 
600 senior career goals for club and country.

Born and raised in central Argentina, Messi was diagnosed with a growth hormone deficiency as a child. At age 13, 
he relocated to Spain to join Barcelona, who agreed to pay for his medical treatment. After a fast progression 
through Barcelona's youth academy, Messi made his competitive debut aged 17 in October 2004. Despite being 
injury-prone during his early career, he established himself as an integral player for the club within the next 
three years, finishing 2007 as a finalist for both the Ballon d'Or and FIFA World Player of the Year award, a feat 
he repeated the following year. His first uninterrupted campaign came in the 2008–09 season, during which he helped 
Barcelona achieve the first treble in Spanish football. At 22 years old, Messi won the Ballon d'Or and FIFA World 
Player of the Year award by record voting margins.

Three successful seasons followed, with Messi winning three consecutive FIFA Ballons d'Or, including an 
unprecedented fourth. His best campaign statistically to date was the 2011–12 season, in which he set the La Liga 
and European records for most goals scored in a single season, while establishing himself as Barcelona's all-time 
top scorer in official competitions in March 2012. The following two seasons, Messi finished twice second for the 
Ballon d'Or behind Cristiano Ronaldo, his perceived career rival. Messi regained his best form during the 2014–15 
campaign, breaking the all-time goalscoring records in both La Liga and the Champions League in November 2014,
[note 3] and led Barcelona to a historic second treble.

An Argentine international, Messi is his country's all-time leading goalscorer. At youth level, he won the 2005 
FIFA World Youth Championship, finishing the tournament with both the Golden Ball and Golden Shoe, and an Olympic 
gold medal at the 2008 Summer Olympics. His style of play as a diminutive, left-footed dribbler drew comparisons 
with compatriot Diego Maradona, who declared the teenager his successor. After making his senior debut in August 
2005, Messi became the youngest Argentine to play and score in a FIFA World Cup during the 2006 edition, and reached
the final of the 2007 Copa América, where he was named young player of the tournament. As the squad's captain from 
August 2011, he led Argentina to three consecutive finals: the 2014 World Cup, for which he won the Golden Ball, 
and the 2015 and 2016 Copas América. After announcing his international retirement in 2016, he reversed his 
decision and led his country to qualification for the 2018 World Cup.''').split("\n")

In [3]:
def build_vocab(corpus):
    """Map every whitespace-delimited token in `corpus` to `(id, frequency)`.

    `corpus` is an iterable of text lines. Each line is stripped and split on
    whitespace; no case folding or punctuation handling is applied, so e.g.
    "Barcelona" and "Barcelona," end up as separate entries.

    Returns a dict `{token: (token_id, count)}` where the ids come from
    enumerating the counter's items.
    """
    token_counts = Counter(
        token for line in corpus for token in line.strip().split()
    )

    entries = {}
    for token_id, (token, count) in enumerate(token_counts.items()):
        entries[token] = (token_id, count)
    return entries

In [4]:
# Build the token -> (id, frequency) table for the whole corpus and display it.
vocab = build_vocab(corpus)
vocab

{'(11).': (49, 1),
 '(149)': (0, 1),
 '(26),': (284, 1),
 '(383),': (197, 1),
 '(50),': (235, 1),
 '(73),': (133, 1),
 '(91),': (70, 1),
 '13,': (151, 1),
 '17': (247, 1),
 '1987': (191, 1),
 '2004.': (23, 1),
 '2005': (165, 1),
 '2005,': (140, 1),
 '2006': (183, 1),
 '2007': (134, 2),
 '2008': (258, 1),
 '2008–09': (68, 1),
 '2011,': (282, 1),
 '2011–12': (167, 1),
 '2012.': (27, 1),
 '2014': (119, 1),
 '2014,': (85, 1),
 '2014–15': (269, 1),
 '2015': (238, 1),
 '2016': (279, 1),
 '2016,': (254, 1),
 '2018': (287, 1),
 '22': (137, 1),
 '24': (84, 1),
 '32': (100, 1),
 '3]': (169, 1),
 '600': (276, 1),
 'After': (123, 3),
 'América': (47, 1),
 'América,': (250, 1),
 'América.': (83, 1),
 'An': (97, 1),
 'Argentina': (257, 1),
 'Argentina,': (189, 1),
 'Argentine': (201, 4),
 'As': (30, 1),
 'At': (125, 3),
 'August': (207, 2),
 'Ball': (299, 1),
 'Ball,': (256, 1),
 'Ballon': (224, 4),
 'Ballons': (86, 1),
 'Barcelona': (112, 3),
 "Barcelona's": (79, 2),
 'Barcelona,': (154, 2),
 'Born

In [5]:
def build_cooccur(vocab, corpus, window_size=10, min_count=None):
    """Generate distance-weighted co-occurrence triples `(i, j, value)`.

    For every token in every line of `corpus`, each of the up to
    `window_size` tokens to its left contributes `1 / distance` (distance in
    token positions) to both the `[center, left]` and `[left, center]`
    cells, so the accumulated matrix is symmetric.

    Parameters:
        vocab: dict `{token: (token_id, frequency)}`; every token in
            `corpus` must be a key, otherwise the lookup raises `KeyError`.
        corpus: iterable of text lines, tokenised with `strip().split()`.
        window_size: number of left-hand neighbours considered per token.
        min_count: when not None, pairs in which either word's frequency
            (as recorded in `vocab`) is below this value are skipped.

    Yields:
        `(i, j, value)` for each stored entry of the matrix, row by row.

    This is a generator function: nothing is computed until it is iterated.
    """
    n_words = len(vocab)
    word_of = {word_id: word for word, (word_id, _) in vocab.items()}

    matrix = sparse.lil_matrix((n_words, n_words), dtype=np.float64)

    for line in corpus:
        ids = [vocab[token][0] for token in line.strip().split()]

        for pos, center_id in enumerate(ids):
            window_start = max(0, pos - window_size)

            for left_pos in range(window_start, pos):
                left_id = ids[left_pos]

                # Closer neighbours count more: weight is 1 / token distance.
                increment = 1.0 / float(pos - left_pos)

                # Only left contexts are scanned; writing both cells stands
                # in for the right-context pass.
                matrix[center_id, left_id] += increment
                matrix[left_id, center_id] += increment

    def below_min_count(word_id):
        # True when frequency filtering is on and this word is too rare.
        return min_count is not None and vocab[word_of[word_id]][1] < min_count

    for i, (cols, values) in enumerate(zip(matrix.rows, matrix.data)):
        if below_min_count(i):
            continue
        for j, value in zip(cols, values):
            if below_min_count(j):
                continue

            yield i, j, value

In [6]:
# build_cooccur is a generator function, so this line only creates the
# generator; the co-occurrence matrix is filled when it is first iterated.
cooccurrences = build_cooccur(vocab, corpus, window_size=10)

In [7]:
# Exhaust the generator once and keep the (i, j, value) triples in a list.
cooccurrences = list(cooccurrences)

In [8]:
def run_iter(vocab, data, learning_rate=0.05, x_max=100, alpha=0.75):
    """Run one shuffled pass of AdaGrad-style updates and return its cost.

    Parameters:
        vocab: accepted for interface symmetry; not read by this function.
        data: list of 9-tuples `(v_main, v_context, b_main, b_context,
            gradsq_W_main, gradsq_W_context, gradsq_b_main,
            gradsq_b_context, cooccurrence)`. The first eight items are
            NumPy arrays that are updated in place (the biases are indexed
            with `[0]`); `cooccurrence` must be positive because its
            logarithm is taken. The list itself is shuffled in place.
        learning_rate: step size applied before the per-coordinate scaling.
        x_max, alpha: weighting parameters; a pair's weight is
            `(cooccurrence / x_max) ** alpha` below `x_max` and 1 otherwise.

    Returns:
        The sum over all pairs of `0.5 * weight * residual ** 2`, computed
        with the parameter values seen just before each pair's update
        (0 when `data` is empty).
    """
    total_cost = 0

    shuffle(data)

    for entry in data:
        (main_vec, ctx_vec, main_bias, ctx_bias,
         main_vec_gsq, ctx_vec_gsq,
         main_bias_gsq, ctx_bias_gsq, x_ij) = entry

        if x_ij < x_max:
            weight = (x_ij / x_max) ** alpha
        else:
            weight = 1

        # Difference between the model's prediction and log co-occurrence.
        residual = (main_vec.dot(ctx_vec)
                    + main_bias[0] + ctx_bias[0]
                    - log(x_ij))

        total_cost += 0.5 * (weight * (residual ** 2))

        # All gradients share this factor. Both vector gradients are taken
        # before either vector is modified below.
        scaled_residual = weight * residual
        main_grad = scaled_residual * ctx_vec
        ctx_grad = scaled_residual * main_vec

        # Step each parameter, scaled by the root of its accumulated
        # squared gradients (accumulators do not yet include this step).
        main_vec -= (learning_rate * main_grad / np.sqrt(main_vec_gsq))
        ctx_vec -= (learning_rate * ctx_grad / np.sqrt(ctx_vec_gsq))

        main_bias -= (learning_rate * scaled_residual / np.sqrt(main_bias_gsq))
        ctx_bias -= (learning_rate * scaled_residual / np.sqrt(ctx_bias_gsq))

        # Fold this step's squared gradients into the accumulators.
        main_vec_gsq += np.square(main_grad)
        ctx_vec_gsq += np.square(ctx_grad)
        main_bias_gsq += scaled_residual ** 2
        ctx_bias_gsq += scaled_residual ** 2

    return total_cost

In [9]:
def train_glove(vocab, cooccurrences, iter_callback=None, vector_size=100,
                iterations=25, **kwargs):
    """Train word vectors from co-occurrence triples and return the matrix.

    Parameters:
        vocab: vocabulary dict; only its length is used here (it is also
            forwarded to `run_iter`).
        cooccurrences: iterable of `(i_main, i_context, cooccurrence)`
            triples; it is traversed once.
        iter_callback: optional callable invoked as `iter_callback(W)`
            after every pass.
        vector_size: dimensionality of each vector.
        iterations: number of passes over the data.
        **kwargs: forwarded unchanged to `run_iter`.

    Returns:
        `W`, an array of shape `(2 * len(vocab), vector_size)`. Row `i` is
        the main vector of word `i`; row `i + len(vocab)` is its context
        vector.
    """
    n_words = len(vocab)
    n_rows = n_words * 2
    init_scale = float(vector_size + 1)

    # Uniform initial values in [-0.5, 0.5) / (vector_size + 1).
    W = (np.random.rand(n_rows, vector_size) - 0.5) / init_scale
    biases = (np.random.rand(n_rows) - 0.5) / init_scale

    # Squared-gradient accumulators start at one.
    gradient_squared = np.ones((n_rows, vector_size), dtype=np.float64)
    gradient_squared_biases = np.ones(n_rows, dtype=np.float64)

    # Each training example bundles NumPy views into the arrays above, so
    # the in-place updates made by run_iter land directly in W and friends.
    # Biases are taken as length-1 slices so that they are views as well.
    data = []
    for i_main, i_context, cooccurrence in cooccurrences:
        ctx_row = i_context + n_words
        data.append((
            W[i_main],
            W[ctx_row],
            biases[i_main : i_main + 1],
            biases[ctx_row : ctx_row + 1],
            gradient_squared[i_main],
            gradient_squared[ctx_row],
            gradient_squared_biases[i_main : i_main + 1],
            gradient_squared_biases[ctx_row : ctx_row + 1],
            cooccurrence,
        ))

    for _ in range(iterations):
        run_iter(vocab, data, **kwargs)

        if iter_callback is not None:
            iter_callback(W)

    return W

In [10]:
# Training is stochastic in two places: train_glove draws the initial weights
# with np.random.rand, and run_iter reorders the examples with random.shuffle
# on every pass. Seed both generators so the vectors shown below can be
# reproduced after a kernel restart.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

W = train_glove(vocab, cooccurrences, iter_callback=None, vector_size=100, iterations=25)

In [11]:
# Show the trained matrix. It has 2 * len(vocab) rows: the first len(vocab)
# rows are main vectors, the remaining rows are the context vectors.
W

array([[-0.00174331, -0.00202208,  0.00224116, ..., -0.00014126,
         0.0017908 , -0.00048473],
       [ 0.00346717,  0.00432422,  0.00170005, ..., -0.00129644,
        -0.00034047, -0.00533263],
       [-0.00454626, -0.00425541, -0.00093582, ..., -0.0002238 ,
         0.00168117, -0.00041037],
       ..., 
       [ 0.0004233 , -0.00056123,  0.00184646, ..., -0.00323622,
         0.00332861, -0.00436428],
       [ 0.00419255, -0.00313062,  0.00486614, ...,  0.00175442,
        -0.00172273, -0.0031359 ],
       [ 0.00152512,  0.00387947, -0.00474032, ...,  0.00484817,
         0.00138962, -0.00189429]])

In [12]:
# vocab maps token -> (id, frequency); element 0 is the token's row id.
word_id = vocab['Messi'][0]
word_id

291

In [13]:
# Row `word_id` is in the first half of W, i.e. the main (not context)
# vector for 'Messi'.
Messi = W[word_id]
Messi

array([  1.53143867e-03,  -4.05531465e-03,   2.62668359e-03,
         1.40540565e-03,   2.98461283e-03,  -9.14340022e-04,
        -1.45702668e-03,   3.01349874e-03,  -5.84766685e-04,
         2.45906174e-03,  -4.25753364e-03,  -2.56073435e-03,
         3.70762524e-03,  -8.82605012e-05,   4.89934416e-03,
        -1.49701597e-03,   3.41278977e-03,  -2.14633085e-03,
        -5.41918823e-03,  -4.22329071e-03,  -3.12668031e-03,
        -1.99793460e-03,   3.17743544e-03,  -4.19791786e-03,
         2.99864179e-03,  -1.38905189e-03,   9.86775894e-04,
         3.23955676e-03,  -2.06914702e-03,  -2.40360225e-03,
         4.13759959e-03,  -4.44583605e-03,   1.68440985e-03,
        -2.08161314e-03,  -4.23370040e-03,  -4.90794437e-03,
        -5.10610606e-03,   4.05988820e-03,  -4.33352128e-03,
        -2.41028063e-03,   4.24122072e-03,   7.29785589e-04,
         1.37778987e-03,  -3.48164208e-03,   1.09220795e-03,
        -5.36955411e-03,  -2.71217744e-03,   1.76386189e-03,
         2.71609147e-03,

In [14]:
# Main vector for the token 'Argentina' (no trailing punctuation).
Argentina = W[vocab['Argentina'][0]]

In [15]:
# Cosine distance (1 - cosine similarity) between the two main vectors.
# A value close to 1 means the vectors are nearly orthogonal.
distance.cosine(Messi, Argentina)

1.0119882272834955

In [16]:
# Cosine distance between the main vectors of 'Argentina' and 'Spain'.
distance.cosine(W[vocab['Argentina'][0]], W[vocab['Spain'][0]])

1.0047551191138608

In [17]:
# Cosine distance between the main vectors of 'Lionel' and 'FIFA'.
distance.cosine(W[vocab['Lionel'][0]], W[vocab['FIFA'][0]])

0.99939439877061775