In [None]:
# original code: https://github.com/lazyprogrammer/machine_learning_examples

In [1]:
# Course URL:
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input
import numpy as np
import matplotlib.pyplot as plt
import random
from datetime import datetime

import os
import sys
sys.path.append(os.path.abspath('..'))
from rnn_class.util import get_wikipedia_data
from rnn_class.brown import get_sentences_with_word2idx_limit_vocab, get_sentences_with_word2idx

from markov import get_bigram_probs

In [37]:
# Load the corpus as `sentences` plus the `word2idx` mapping.
# NOTE(review): judging by the helper's name, the argument (2000) limits the
# vocabulary size; the helper is defined outside this notebook - confirm there.
sentences, word2idx = get_sentences_with_word2idx_limit_vocab(2000)

# Number of entries in the word -> index mapping.
# NOTE(review): `V` is rebound to a (D, vocab_size) weight matrix in a later
# cell; `vocab_size` holds this same value for later use.
V = len(word2idx)
print("Vocab size:", V)

START inf
END inf
man inf
paris inf
britain inf
england inf
king inf
woman inf
rome inf
london inf
queen inf
italy inf
france inf
the 69971
, 58334
. 49346
of 36412
and 28853
to 26158
a 23195
in 21337
that 10594
is 10109
was 9815
he 9548
for 9489
`` 8837
'' 8789
it 8760
with 7289
as 7253
his 6996
on 6741
be 6377
; 5566
at 5372
by 5306
i 5164
this 5145
had 5133
? 4693
not 4610
are 4394
but 4381
from 4370
or 4206
have 3942
an 3740
they 3620
which 3561
-- 3432
one 3292
you 3286
were 3284
her 3036
all 3001
she 2860
there 2728
would 2714
their 2669
we 2652
him 2619
been 2472
) 2466
has 2437
( 2435
when 2331
who 2252
will 2245
more 2215
if 2198
no 2139
out 2097
so 1985
said 1961
what 1908
up 1890
its 1858
about 1815
: 1795
into 1791
than 1790
them 1788
can 1772
only 1748
other 1702
new 1635
some 1618
could 1601
time 1598
! 1596
these 1573
two 1412
may 1402
then 1380
do 1363
first 1361
any 1344
my 1318
now 1314
such 1303
like 1292
our 1252
over 1236
me 1181
even 1170
most 1159
made 1125
also 

chief 119
reported 119
served 119
based 119
main 119
determined 119
image 119
decision 119
window 119
religion 119
aj 118
gun 118
responsibility 118
middle 118
europe 118
british 118
character 118
learned 117
horse 117
writing 117
appear 117
s. 117
account 117
ones 116
serious 116
activity 116
types 116
green 116
length 116
lived 115
audience 115
letters 115
returned 115
obtained 115
nuclear 115
specific 115
corner 115
forward 115
activities 115
slowly 115
doubt 114
6 114
justice 114
moving 114
latter 114
gives 114
straight 114
hit 114
plane 114
quality 114
design 114
obviously 114
operation 113
plans 113
shot 113
seven 113
a. 113
choice 113
poor 113
staff 113
function 113
figures 113
parts 113
stay 113
saying 113
include 113
15 113
born 113
pattern 113
30 112
cars 112
whatever 112
sun 112
faith 111
pool 111
hospital 110
corps 110
wish 110
lack 110
completely 110
heavy 110
waiting 110
speak 110
ball 110
standard 110
extent 110
visit 109
democratic 109
firm 109
income 109
ahead 109
deep

headed 59
sensitive 59
conclusion 59
roof 59
solution 59
bible 59
lie 59
ultimate 59
songs 59
struck 59
negroes 59
snow 59
tree 59
plants 59
finds 59
stories 59
mine 59
painting 59
exist 59
thirty 59
sexual 59
tuesday 58
roads 58
commerce 58
p. 58
dallas 58
establish 58
previously 58
causes 58
talked 58
railroad 58
critical 58
remove 58
emphasis 58
grounds 58
neighborhood 58
surprised 58
minor 58
india 58
understood 58
perfect 58
avoid 58
somebody 58
hole 58
hence 58
leg 58
busy 58
occasion 58
smile 58
stone 58
roman 58
unique 58
animals 58
sky 58
safe 58
etc. 58
orders 58
fairly 58
liked 58
useful 58
exercise 58
lose 58
culture 58
pale 58
wondered 58
charged 57
details 57
informed 57
permitted 57
professor 57
replied 57
completion 57
processes 57
apart 57
apparent 57
bay 57
truck 57
majority 57
afraid 57
artist 57
goods 57
birds 57
appearance 57
baseball 57
spot 57
flowers 57
lewis 57
notes 57
enjoyed 57
entrance 57
uncle 57
alive 57
beneath 57
combination 57
truly 57
congo 57
becomin

In [38]:
# ---- model size ----
D = 50  # word embedding size
vocab_size = len(word2idx)

# ---- sampling ----
window_size = 5  # handed to get_context() when collecting context words
num_negatives = 5  # number of negative samples to draw per input word

# ---- optimisation schedule ----
epochs = 20
learning_rate = 0.025
final_learning_rate = 1e-4
# Step size that brings learning_rate down to final_learning_rate
# in `epochs` equal decrements.
learning_rate_delta = (learning_rate - final_learning_rate) / epochs

In [43]:
def get_negative_sampling_distribution(sentences, vocab_size, power=0.75, floor=1e-5):
    """Build the probability distribution used to draw negative samples.

    Every word index starts from a small pseudo-count (`floor`), gains 1 for
    each occurrence in `sentences`, is raised to `power` to flatten the
    distribution, and the result is normalized to sum to 1.

    Args:
        sentences: iterable of sentences, each an iterable of integer word
            indices valid for an array of length `vocab_size`. It is
            traversed exactly once, so a one-shot iterator/generator works.
        vocab_size: number of words; length of the returned array.
        power: exponent applied to the counts before normalizing
            (default 0.75, the value previously hard-coded).
        floor: pseudo-count given to every word so that words that never
            occur still receive a non-zero probability (default 1e-5, the
            value previously hard-coded).

    Returns:
        1-D float ndarray of length `vocab_size` whose entries sum to 1.

    Raises:
        IndexError: if a word index is out of range for `vocab_size`.
        AssertionError: if any probability is not strictly positive
            (possible only when `floor` is 0 and some word never occurs).
    """
    word_freq = np.full(vocab_size, floor, dtype=float)

    # Single pass over the data. The previous version first computed an unused
    # total word count, which exhausted one-shot iterators before this loop
    # ran and silently produced a uniform distribution.
    for sentence in sentences:
        for word in sentence:
            word_freq[word] += 1

    # smooth it: a power below 1 boosts rare words relative to frequent ones
    p_neg = word_freq ** power

    # normalize it
    p_neg = p_neg / p_neg.sum()

    assert np.all(p_neg > 0)
    return p_neg

In [44]:
# Both weight matrices are drawn from a standard normal distribution.
# W is drawn before V so the global random stream is consumed in the
# same order as before.
W = np.random.standard_normal((vocab_size, D))  # input-to-hidden
V = np.random.standard_normal((D, vocab_size))  # second matrix, shape (D, vocab_size)

# Bookkeeping containers / corpus statistics.
costs = []
total_words = sum(map(len, sentences))

# Distribution over word indices used when drawing negative samples.
p_neg = get_negative_sampling_distribution(sentences, vocab_size)

In [45]:
# probability of each word being sampled; display the array's shape
p_neg.shape

(2001,)

In [None]:
# Subsampling threshold: a word whose probability in p_neg exceeds this value
# gets a positive drop probability below.
threshold = 0.00001

# Per-word drop probability, 1 - sqrt(threshold / p).
# Entries are negative wherever p_neg < threshold.
p_drop = 1.0 - np.sqrt(threshold / p_neg)

# Train the embeddings with SGD (one positive and one negative update per word position)

In [None]:
# Training loop: every epoch visits all sentences in a fresh random order and
# performs two SGD updates for each surviving word position.
# NOTE(review): `get_context` and `sgd` are not defined in the cells shown
# above; they must already exist in the kernel for this cell to run.
for epoch in range(epochs):
    # shuffles the sentence list in place
    np.random.shuffle(sentences)
    cost = 0  # sum of the values returned by sgd() during this epoch
    counter = 0
    for sentence in sentences:
        # subsample: keep word w with probability (1 - p_drop[w]),
        # where p_drop was derived from p_neg
        sentence = [w for w in sentence
                    if np.random.random() < (1 - p_drop[w])
                    ]
        # skip sentences with fewer than two words left after subsampling
        if len(sentence) < 2:
            continue
        # randomly order words so we don't always see
        # samples in the same order
        randomly_ordered_positions = np.random.choice(
            len(sentence),
            size=len(sentence),  # np.random.randint(1, len(sentence) + 1),
            replace=False,
        )
        
        for pos in randomly_ordered_positions:
            # the middle word
            word = sentence[pos]

            # get the positive context words/negative samples
            context_words = get_context(pos, sentence, window_size)
            # a single negative word index drawn from p_neg
            # NOTE(review): `num_negatives` is not used here - confirm intent
            neg_word = np.random.choice(vocab_size, p=p_neg)
            targets = np.array(context_words)

            # do one iteration of stochastic gradient descent
            # first call: the real middle word with label 1
            c = sgd(word, targets, 1, learning_rate, W, V)
            cost += c
            # second call: the sampled negative word with label 0, same targets
            c = sgd(neg_word, targets, 0, learning_rate, W, V)
            cost += c

            # NOTE(review): counter is incremented once per word position, but
            # the progress message reports it against len(sentences) - confirm
            # whether this should count sentences instead.
            counter += 1
            if counter % 100 == 0:
                # "\r" returns to the start of the line so the progress text overwrites itself
                sys.stdout.write("processed %s / %s\r" % (counter, len(sentences)))
                sys.stdout.flush()