In [1]:
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from scipy.spatial.distance import cosine

from collections import deque, defaultdict
import string
import math
from random import shuffle

import glob

In [2]:
nltk_corpus = nltk.corpus.gutenberg.fileids()
corpus = "../data/small_wiki_en/*"
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

In [3]:
def loop_through_nltk_corpus(corpus):
    # Loop on each file
    for doc in corpus:
        for w in nltk.corpus.gutenberg.words(doc):
            # Set to lower case
            w = w.lower()
            # Do not process stop words and punctuation
            if w in stop_words or w in string.punctuation:
                continue
            yield w
        yield '<ENDOFDOC>'

In [20]:
def loop_through_corpus(corpus):
    # Loop on each file
    for file in glob.glob(corpus):
        print("Processing file %s..." % file)
        lines =  open(file, encoding='ISO-8859-1').readlines()
        for l in lines:
            if l[0] == '<':
                continue
            elif 'ENDOFARTICLE' in l:
                yield '<ENDOFDOC>'
            for w in tokenizer.tokenize(l):
                # Set to lower case
                w = w.lower()
                yield w

In [21]:
def process_corpus(corpus, stop_words):
    word2id = {}
    id2word = []
    n_words = 0
    word_freq = defaultdict(int)
    
    for w in loop_through_corpus(corpus):
        word_freq[w] += 1

    for w in loop_through_corpus(corpus):
        if w not in word2id and w != '<ENDOFDOC>' and word_freq[w] > 10 and w not in stop_words:
            word2id[w] = n_words
            id2word.append(w)
            n_words += 1
    
    del word_freq
    return word2id, id2word, n_words

In [22]:
def gen_cooccurences(corpus, word2id, window_size):
    cur_context = deque(maxlen=window_size)
    for w in loop_through_corpus(corpus):
        if w not in word2id or w in stop_words:
            continue
        if w == '<ENDOFDOC>':
            cur_context.clear()
            continue
        for i in range(len(cur_context)):
            c = cur_context[i]
            d = len(cur_context) - i
            yield word2id[w], word2id[c], d
        cur_context.append(w)

In [23]:
def get_hash_key(w, c):
    bits_min = min(w, c) << 16
    bits_max = max(w, c)
    return bits_min + bits_max
    
def build_cooccur_hashtable(corpus, word2id, window_size=5):
    coocur_hashtable = defaultdict(float)
    for (w, c, d) in gen_cooccurences(corpus, word2id, window_size):
        hash_key = get_hash_key(w, c)
        coocur_hashtable[hash_key] += 1 / d
    return coocur_hashtable

def get_cooccurence_value(table, w, c):
    hash_key = get_hash_key(w, c)
    return table[hash_key]

In [24]:
def list_data(coocur_table):
    all_data = []
    for key, value in coocur_table.items():
        if value < 2:
            continue
        w = key >> 16
        c = key - (w << 16)
        all_data.append((w, c, value))
        all_data.append((c, w, value))
    return all_data

In [25]:
def weight_func(x, xmax=100, alpha=3/4):
    return min(1, (x/xmax) ** alpha)

In [89]:
#TODO check how AdaGrad works
#TODO Check initialization vectors
#TODO Check training procedure (how to sample cooccurences? What about zeros in cooc?)
def train_glove(cooccur_list, voc_size, embed_size, learning_rate, n_epoch,
                init_W_word=None, init_b_word=None, init_W_context=None, init_b_context=None):
    
    W_word    = 0.1 * np.random.rand(voc_size, embed_size) - 0.05 if init_W_word    is None else init_W_word
    b_word    = 0.1 * np.random.rand(voc_size)             - 0.05 if init_b_word    is None else init_b_word
    W_context = 0.1 * np.random.rand(voc_size, embed_size) - 0.05 if init_W_context is None else init_W_context
    b_context = 0.1 * np.random.rand(voc_size)             - 0.05 if init_b_context is None else init_b_context
    
    #sum_gradsq_W_word    = np.zeros(embed_size)
    #sum_gradsq_b_word    = 0.
    #sum_gradsq_W_context = np.zeros(embed_size)
    #sum_gradsq_b_context = 0.
    #epsilon = 1e-6
    
    common_grad = np.float64()  # Overflow can occur with 32 bits
    
    for ep in range(1, n_epoch + 1):
        print("Start epoch %i" % ep)
        total_cost = 0
        shuffle(cooccur_list)
        for (w, c, cooc_value) in cooccur_list:
            weighted_cooc_value = weight_func(cooc_value)
            
            common_grad = np.dot(W_word[w], W_context[c]) + b_word[w] + b_context[c] - math.log(cooc_value)
            
            grad_W_word    = weighted_cooc_value * W_context[c] * common_grad  # factor 2 in learning rate
            grad_b_word    = weighted_cooc_value * common_grad
            grad_W_context = weighted_cooc_value * W_word[w] * common_grad
            grad_b_context = weighted_cooc_value * common_grad

            W_word[w]    -= learning_rate * grad_W_word #/ np.sqrt(sum_gradsq_W_word + epsilon)
            b_word[w]    -= learning_rate * grad_b_word #/ math.sqrt(sum_gradsq_b_word + epsilon)
            W_context[c] -= learning_rate * grad_W_context #/ np.sqrt(sum_gradsq_W_context + epsilon)
            b_context[c] -= learning_rate * grad_b_context #/ math.sqrt(sum_gradsq_b_context + epsilon)
            
            #sum_gradsq_W_word    += np.square(grad_W_word)
            #sum_gradsq_b_word    += grad_b_word ** 2
            #sum_gradsq_W_context += np.square(grad_W_context)
            #sum_gradsq_b_context += grad_b_context ** 2
            
            total_cost += weighted_cooc_value * (common_grad ** 2)
            
        # Test embeddings
        embeddings = W_word + W_context

        print("Total cost for epoch: %f" % total_cost)
        
        print("Distance between queen and king: %f" % cosine(embeddings[word2id['queen']], embeddings[word2id['king']]))
        print("Distance between blue and green: %f" % cosine(embeddings[word2id['blue']], embeddings[word2id['green']]))
        print("Distance between good and imagine: %f" % cosine(embeddings[word2id['good']], embeddings[word2id['imagine']]))
        
    return embeddings, W_word, b_word, W_context, b_context

In [27]:
# Read corpus
word2id, id2word, voc_size = process_corpus(corpus, stop_words)
print(voc_size)

Processing file ../data/small_wiki_en/englishText_10000_20000...
Processing file ../data/small_wiki_en/englishText_0_10000...
Processing file ../data/small_wiki_en/englishText_60000_70000...
Processing file ../data/small_wiki_en/englishText_50000_60000...
Processing file ../data/small_wiki_en/englishText_30000_40000...
Processing file ../data/small_wiki_en/englishText_20000_30000...
Processing file ../data/small_wiki_en/englishText_40000_50000...
Processing file ../data/small_wiki_en/englishText_70000_80000...
Processing file ../data/small_wiki_en/englishText_10000_20000...
Processing file ../data/small_wiki_en/englishText_0_10000...
Processing file ../data/small_wiki_en/englishText_60000_70000...
Processing file ../data/small_wiki_en/englishText_50000_60000...
Processing file ../data/small_wiki_en/englishText_30000_40000...
Processing file ../data/small_wiki_en/englishText_20000_30000...
Processing file ../data/small_wiki_en/englishText_40000_50000...
Processing file ../data/small_wik

In [29]:
# Build cooccurence matrix
cooccur_hashtable = build_cooccur_hashtable(corpus, word2id, window_size=3)

Processing file ../data/small_wiki_en/englishText_10000_20000...
Processing file ../data/small_wiki_en/englishText_0_10000...
Processing file ../data/small_wiki_en/englishText_60000_70000...
Processing file ../data/small_wiki_en/englishText_50000_60000...
Processing file ../data/small_wiki_en/englishText_30000_40000...
Processing file ../data/small_wiki_en/englishText_20000_30000...
Processing file ../data/small_wiki_en/englishText_40000_50000...
Processing file ../data/small_wiki_en/englishText_70000_80000...


In [30]:
cooccur_list = list_data(cooccur_hashtable)

In [91]:
# Train GloVe
embeddings, W_word, b_word, W_context, b_context = train_glove(cooccur_list, voc_size,
                                                               embed_size=128,
                                                               learning_rate=0.01,
                                                               n_epoch=6,
                                                               init_W_word=W_word,
                                                               init_b_word=b_word,
                                                               init_W_context=W_context,
                                                               init_b_context=b_context)

Start epoch 1
Total cost for epoch: 570327.541914
Similartity between queen and king: 0.811120
Similartity between blue and green: 0.438657
Similartity between good and imagine: 1.024532
Start epoch 2
Total cost for epoch: 534188.566462
Similartity between queen and king: 0.722311
Similartity between blue and green: 0.350025
Similartity between good and imagine: 0.994354
Start epoch 3
Total cost for epoch: 502735.532637
Similartity between queen and king: 0.624445
Similartity between blue and green: 0.279142
Similartity between good and imagine: 0.951835
Start epoch 4
Total cost for epoch: 474695.909722
Similartity between queen and king: 0.515200
Similartity between blue and green: 0.221121
Similartity between good and imagine: 0.901978
Start epoch 5
Total cost for epoch: 449253.340990
Similartity between queen and king: 0.436076
Similartity between blue and green: 0.184151
Similartity between good and imagine: 0.851536
Start epoch 6
Total cost for epoch: 425876.572075
Similartity bet

In [112]:
cosine(embeddings[word2id['point']], embeddings[word2id['comma']])

0.7591438679300859

In [85]:
cosine([2, 2, 2], [-2, -2, -2])

2.0

In [None]:
np.save('embeddings.npy', embeddings)