## Vanilla Generative Adversarial <img src="http://creationkinetica.com/wp-content/uploads/2015/03/new-twitter-logo-vector-400x400.jpg" height="30" width="30" style="display: inline-block"> Networks

In [362]:
import tensorflow as tf
import numpy as np
import os
from collections import Counter
from nltk.tokenize import TweetTokenizer
import codecs
from random import randint

In [6]:
tf.__version__

'1.4.0'

In [222]:
with codecs.open(os.path.join('../data', 'sent.csv'), 'r', encoding='utf-8') as f:
    corpus_line_by_line = f.read().lower().split("\n")

In [223]:
corpus_line_by_line = corpus_line_by_line[:1000]

In [224]:
corpus_line_by_line = [line.rstrip('\r').split(',') for line in corpus_line_by_line]

In [225]:
corpus_line_by_line[0]

['0', '                     is so sad for my apl friend.............']

In [226]:
tw = TweetTokenizer()

In [227]:
corpus_tokenized = list(map(lambda line: (line[0], tw.tokenize(line[1])), corpus_line_by_line))

In [301]:
corpus_tokenized[0]

('0', ['is', 'so', 'sad', 'for', 'my', 'apl', 'friend', '...'])

In [228]:
corpus = []
for sent in corpus_tokenized:
    corpus.extend([word for word in sent[1]])

In [229]:
def build_vocab(words, vocab_size):
    """ Build vocabulary of VOCAB_SIZE most frequent words """
    dictionary = dict()
    count = [('<UNK>', -1)]
    count.extend(Counter(words).most_common(vocab_size - 1))
    index = 0
    for word, _ in count:
        dictionary[word] = index
        index += 1
    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return dictionary, index_dictionary

In [230]:
vocabulary_size = len(set(corpus))

In [231]:
vocabulary_size

3198

In [232]:
vocabulary, reverse_vocabulary = build_vocab(corpus, vocabulary_size)

In [233]:
for word, index in zip(reverse_vocabulary.values(), vocabulary.values()):
    print(word, index)

<UNK> 953
. 151
! 1968
i 954
" 955
to 956
... 957
the 958
- 412
my 959
a 960
is 413
? 335
and 961
in 962
you 963
of 564
on 414
.. 415
it 964
me 965
for 186
that 967
so 968
this 276
just 969
have 970
* 565
( 971
: 972
i'm 973
get 974
now 975
not 976
) 977
at 978
with 417
all 979
day 980
no 240
be 981
are 664
but 982
was 566
out 983
' 538
up 2839
am 985
go 215
want 170
& 103
what 337
night 216
today 986
2 567
well 241
like 987
do 418
love 568
one 887
will 989
know 990
good 125
twitter 569
can 991
about 992
work 993
u 994
if 995
think 2841
got 998
can't 999
why 1000
much 1001
your 1002
going 1003
sad 1004
really 1005
lol 21
im 1006
new 1007
see 1008
miss 1009
has 9
more 338
it's 1010
back 1011
how 1012
from 1013
last 1014
<3 1015
an 1016
its 1017
don't 419
/ 1018
@ 1019
some 339
too 1020
when 1021
him 1022
oh 152
feel 1046
feeling 153
we 570
home 920
tonight 1025
off 571
days 572
here 1026
time 1027
as 1028
better 1029
3 1030
that's 573
happy 1492
getting 420
tomorrow 1032
ur 48
never 284

@jerrybattiste 498
giving 500
dias 801
gb 3068
bhai 802
fell 2328
works 2329
mosquito 1451
express 2330
spilt 2331
frown 2168
usual 2332
http://tumblr.com/xr0238tmq 2333
@311 1954
infection 2334
bread 2335
hospitol 2336
suckss 499
popularity 2337
moi 804
120gb 51
page 3088
#5am 2338
dentists 2339
handle 2340
desperate 805
stone 806
swim 510
series 808
spaz 2341
site 2342
strangely 2343
although 2344
rb 809
2moro 2345
coroner 2346
@jonswerens 147
@xxkirahxx 207
cuzin 311
beyond 2347
barkley 2348
taylorrhicks 2349
longer 381
integrity 811
sleepies 812
thanx 2350
#p1wimax 813
standing 2351
naked 2352
@kittyireland 2353
puurfect 382
loviie 2354
fucccckkkkkkkkkk 2355
asking 228
created 2356
mouth 2357
mmmyeahh 2358
@bananarchy_atx 2359
confirmed 2361
thinking 148
shift 2362
j2 2363
opposite 2364
appetite 3170
http://tinyurl.com/cbl6tm 814
abc 2366
thru 2367
affair 2368
owning 2369
talent 2370
@angeloys 2371
nursery 815
#beautiful 2372
pants 2373
depressing 2374
iowa 2375
@ivegotzooms 681
ar

In [234]:
def index_words_in_corpus(corpus):
    return [vocabulary[token] if token in vocabulary else 0 for token in corpus]

In [235]:
corpus_indexed = [(line[0], index_words_in_corpus(line[1])) for line in corpus_tokenized]

In [236]:
len(corpus_indexed)

1000

In [237]:
corpus_indexed[0]

('0', [11, 23, 76, 21, 9, 2773, 460, 6])

In [238]:
vocabulary_size

3198

In [239]:
def one_hot_encode(review):
    temp = np.zeros(vocabulary_size)
    for indx in review:
        temp[indx] = 1
    return temp

In [240]:
def one_hot_encode_class(sentiment):
    temp = np.zeros(2)
    if sentiment == 1:
        temp[1] = 1
    else:
        temp[0] = 1
    return temp

In [241]:
# data = np.array([(one_hot_encode(word), vocabulary.get(word)) for word in corpus])
data = np.array([one_hot_encode(word[1]) for word in corpus_indexed])

In [242]:
print("TRAIN: (Total number of words, Vocabulary size):", data.shape)

TRAIN: (Total number of words, Vocabulary size): (1000, 3198)


In [243]:
sample = data[0]

In [245]:
np.where(sample == 1)[0]

array([   6,    9,   11,   21,   23,   76,  460, 2773], dtype=int64)

In [246]:
def decode_sentence(sample):
    sentence = []
    for i in range(sample.shape[0]):
        if sample[i] == 1:
            sentence.append(reverse_vocabulary[i])
    return ' '.join([i for i in sentence])

In [247]:
print(sample, decode_sentence(sample))

[ 0.  0.  0. ...,  0.  0.  0.] ... my is for so sad friend apl


In [None]:
def get_batches(batch_size):
    idx = randint(data.shape[0]-3)
    return data[idx:idx+batch_size]

### State of art weight Initialization strategy

In [190]:
def xavier_init(n_inputs, n_outputs, uniform=True):
  """Set the parameter initialization using the method described.
  This method is designed to keep the scale of the gradients roughly the same
  in all layers.
  Xavier Glorot and Yoshua Bengio (2010):
           Understanding the difficulty of training deep feedforward neural
           networks. International conference on artificial intelligence and
           statistics.
  Args:
    n_inputs: The number of input nodes into each output.
    n_outputs: The number of output nodes for each input.
    uniform: If true use a uniform distribution, otherwise use a normal.
  Returns:
    An initializer.
  """
  if uniform:
    # 6 was used in the paper.
    init_range = tf.sqrt(6.0 / (n_inputs + n_outputs))
    return tf.random_uniform_initializer(-init_range, init_range)
  else:
    # 3 gives us approximately the same limits as above since this repicks
    # values greater than 2 standard deviations from the mean.
    stddev = tf.sqrt(3.0 / (n_inputs + n_outputs))
    return tf.truncated_normal_initializer(stddev=stddev)

In [191]:
'''A recent paper by He, Rang, Zhen and Sun they build on Glorot & Bengio and suggest using 2/size_of_input_neuron
''' 
def xavier_init(size):
    in_dim = size[0]
#     xavier_stddev = 1. / in_dim
#     xavier_stddev = 2. / in_dim
    xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
    return tf.random_normal(shape=size, stddev=xavier_stddev)

### Discriminator

In [363]:
with tf.name_scope('Input'):
    X = tf.placeholder(tf.float32, shape=[None, vocabulary_size])

with tf.name_scope('Discriminator_weights_biases'):    
    D_W1 = tf.Variable(xavier_init([vocabulary_size, 128]))
    D_b1 = tf.Variable(tf.zeros(shape=[128]))

    D_W2 = tf.Variable(xavier_init([128, 1]))
    D_b2 = tf.Variable(tf.zeros(shape=[1]))

    theta_D = [D_W1, D_W2, D_b1, D_b2]

In [364]:
def discriminator(x):
    D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1)
    D_logit = tf.matmul(D_h1, D_W2) + D_b2
    D_prob = tf.nn.sigmoid(D_logit)

    return D_prob, D_logit

### Generator

In [365]:
with tf.name_scope('Latent_space'):
    Z = tf.placeholder(tf.float32, shape=[None, 100])

with tf.name_scope('Generator_weights_biases'):
    G_W1 = tf.Variable(xavier_init([100, 128]))
    G_b1 = tf.Variable(tf.zeros(shape=[128]))

    G_W2 = tf.Variable(xavier_init([128, vocabulary_size]))
    G_b2 = tf.Variable(tf.zeros(shape=[vocabulary_size]))

    theta_G = [G_W1, G_W2, G_b1, G_b2]

In [195]:
def sample_Z(m, n):
    return np.random.uniform(0., 1., size=[m, n])

In [196]:
def generator(z):
    G_h1 = tf.nn.relu(tf.matmul(z, G_W1) + G_b1)
    G_log_prob = tf.matmul(G_h1, G_W2) + G_b2
    G_prob = tf.nn.sigmoid(G_log_prob)

    return G_prob

In [197]:
G_sample = generator(Z)
D_real, D_logit_real = discriminator(X)
D_fake, D_logit_fake = discriminator(G_sample)

In [198]:
with tf.name_scope('cost'):
    D_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_real, labels=tf.ones_like(D_logit_real)))
    D_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.zeros_like(D_logit_fake)))
    D_loss = D_loss_real + D_loss_fake
    G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=D_logit_fake, labels=tf.ones_like(D_logit_fake)))

In [199]:
with tf.name_scope('train'):
    D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
    G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)

In [288]:
minibatch_size = 128
Z_dim = 100

saver = tf.train.Saver(max_to_keep=1)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
i = 0

for it in range(100000):
    if it % 1000 == 0:
        samples = sess.run(G_sample, feed_dict={Z: sample_Z(1, Z_dim)})
        for sample in samples:
            sample[sample > 0.95] = 1
            print(samples.shape, decode_sentence(sample))
                  
    X_mb = get_batches(minibatch_size)

    _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict={X: X_mb, Z: sample_Z(minibatch_size, Z_dim)})
    _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict={Z: sample_Z(minibatch_size, Z_dim)})
    

    if it % 1000 == 0:
        print('Iter: {}'.format(it))
        print('D loss: {:.4}'. format(D_loss_curr))
        print('G_loss: {:.4}'.format(G_loss_curr))
        print()

saver.save(sess, './tf_model/generator', global_step=1000000)


(1, 3198) 
Iter: 0
D loss: 1.432
G_loss: 6.979

(1, 3198) . ! i " to ... the - my a is and in you of on it me just ( i'm at all out am want night today one sad im see miss has its @ him feel home off 3 only any right gonna great come thanks friends cant :'( stupid pretty < mean fun you're yay sorry <--- tweet sick were rest sitting else whats let mi since five rain ): @ymptweet live girls far @spiral_galaxy status husband against =( :-/ report following hair walk talkin rats become wee explains david hospitalized guna ima nor jeans overdosed bothered strangely slope http://tumblr.com/xv31s2pi8 fetal suposed intercept :\ omgaga human magic wardrobe daily downloads level sannesias whatï blog writing meat hub uploading virus
Iter: 1000
D loss: 0.00104
G_loss: 10.05

(1, 3198) . ! i to ... the - my a is ? of me for have : i'm now ) with day no be ' what 2 like love will can think got much really new miss more from him oh we home days that's need over they gonna soon weekend damn tell year 

(1, 3198) . i " ... - ? and for that so have ( i'm now not ) at all no be are was ' up go today like love one think much going sad really new more it's from last him feeling home off only win over who gonna soon weekend thanks morning he been friends didn't had those hate already she ok yay cause were game mi congrats growing cut type talkin cuz david whyy bball spaz strangely rx showing isnt 112-102 golf bf's meantime horsie tix
Iter: 13000
D loss: 2.634e-07
G_loss: 18.48

(1, 3198) . i " ... - ? and for that so have ( i'm now not ) at all no be are was ' up go today like love one think much going sad really new more it's from last him feeling home off days only win should over who gonna soon weekend thanks morning he been friends didn't had those hate already she yay cause were game mi congrats growing cut type talkin cuz david whyy bball spaz strangely rx showing isnt 112-102 golf bf's meantime horsie tix
Iter: 14000
D loss: 1.105e-07
G_loss: 19.68

(1, 3198) . i " ... - ? and for t

(1, 3198) . ! i to ... the ? and for that this have i'm now not ) at all no be but was go today like love one work if think much going sad really new more it's from last him feel feeling home tonight off days better 3 tomorrow only make should over who they gonna soon again head come weekend wanna thanks morning > he didn't cry b away those wrong first way makes already she pretty could won't give yay sick were start else whats mi forever da live far wonder brother crown wide cough cut bought msn deep talkin says cab fucked special t4 poll david whyy baithing ex motivation immediately bball shall jeans mother spaz strangely rx intercept isnt 112-102 beard 11 xmas apl stout bf's waiit meantime tix
Iter: 28000
D loss: 1.51e-10
G_loss: 25.34

(1, 3198) . ! i to ... the ? and for that this just have i'm now not ) all no be but was go what today like love one can work if think much going sad really new more from last him feel feeling home tonight off days better 3 tomorrow only make should 

(1, 3198) . ! i to ... the - ? and .. for that this just i'm now not ) all no be but was ' go & what today like love one can work if think much going sad really miss more too oh feeling home tonight off better tomorrow over who they by gonna soon again come weekend wanna thanks > he b $ away tell year first way makes pretty i've could won't thank give something wanted sick almost else whats put funny let times forever da trust far kid brother million harry celebrate goodbye boom moving wide cough bought msn pictures luv talkin australia headed stick tourny special t4 grrr http://bit.ly/zoxzc baithing 30mins ex wat straighten immediately jeans mother strangely intercept isnt jin muslims wompp momacita sannesias haters saturday 11 stout half waiit tix sequel
Iter: 38000
D loss: 1.597e-09
G_loss: 41.19

(1, 3198) . i " to ... - ? and for that so just i'm now no be ' up & like love one work think going really miss more / too tonight off better tomorrow any over by gonna guys b away year o 

(1, 3198) . i " ... - ? and for that so just i'm now no be ' up like love one work think going really miss more off better tomorrow over by fuck guys away year o thank something wanted smile omg whats boring times fine pleased thats pull australia special momacita haters saturday
Iter: 62000
D loss: 7.219e-12
G_loss: 27.64

(1, 3198) . i " ... - ? and for that so just i'm now no be ' up like love one work think going really miss more off better tomorrow over by fuck guys away year o thank something wanted smile omg whats boring times fine pleased thats pull australia special momacita haters saturday
Iter: 63000
D loss: 6.908e-12
G_loss: 27.69

(1, 3198) . i " ... - ? and for that so just i'm now no be ' up like love one work think going really miss more off better tomorrow over by fuck guys away year o thank something wanted smile omg whats boring times fine pleased thats pull australia special momacita haters saturday
Iter: 64000
D loss: 6.622e-12
G_loss: 27.73

(1, 3198) . i " ... - 

(1, 3198) . i ... - and that so ( i'm all be are up love one more it's off 3 that's head down away sucks went summer face watch start omg sooo position re-ripped special squirrels 3rd it'll daily sannesias neighbor uniform
Iter: 95000
D loss: 8.465e-12
G_loss: 26.99

(1, 3198) . i ... - and that so ( i'm all be are up love one more it's off 3 that's head down away sucks went summer face watch start omg sooo position re-ripped special squirrels 3rd it'll daily sannesias neighbor uniform
Iter: 96000
D loss: 7.778e-12
G_loss: 27.12

(1, 3198) . i ... - and that so ( i'm all be are up love one more it's off 3 that's head down away sucks went summer face watch start omg sooo position re-ripped special squirrels 3rd it'll daily sannesias neighbor uniform
Iter: 97000
D loss: 7.203e-12
G_loss: 27.22

(1, 3198) . i ... - and that so have ( i'm all be are up love one more it's off 3 that's head down away sucks went summer face watch start omg sooo position re-ripped special squirrels 3rd standin

'./tf_model/generator-1000000'

**Check if the generator is saved properly**

In [297]:
saver.restore(sess,  './tf_model/generator-1000000')

INFO:tensorflow:Restoring parameters from ./tf_model/generator-1000000


In [300]:
samples = sess.run(G_sample, feed_dict={Z: sample_Z(1, Z_dim)})
for sample in samples:
    print(sample)
    sample[sample == 1.] = 1
    sample[sample < 1.] = 0
    print(np.where(sample == 0)[0])
    print(decode_sentence(sample))

[  8.32440076e-21   1.00000000e+00   2.41504855e-34 ...,   6.45371080e-21
   1.25916747e-13   1.05300227e-26]
[   0    2    4 ..., 3195 3196 3197]
. i ... that so have i'm all be are up love more it's we off 3 that's head down away sucks went summer face watch start omg boring sooo position re-ripped special squirrels 3rd standing it'll daily sannesias neighbor place uniform


### Reasons why sentences seem nonsensical:
- We are representing our words in discreet 1 or 0, *hint*: Use pretrained word embeddings which we can do via our **Word2Vec** notebook
- Notice how we were passing in the sentences data *without sequence* so `A B C` would be represented similiar to `C A B`
- We are storing any information over time in our network (*hint*: LSTM) which we should do if we are to generate sequential data

### Training word2vec using skipgram on twitter dataset

<img src="https://i.imgur.com/5tOMur7.png">

In [306]:
with codecs.open("../data/sentiment.vec", 'r', encoding='utf-8') as f:
    next(f)
    word_vectors_twitter = f.read().split('\n')

In [307]:
word_vectors_twitter[0]

'to 0.28784 -0.16593 0.21775 -0.03476 0.17177 -0.24388 -0.36813 -0.17193 -0.15732 -0.091941 -0.31362 0.16376 0.19695 -0.054758 -0.048513 0.25853 -0.082019 -0.22646 0.26641 -0.17032 0.052413 0.091387 0.037523 0.046991 0.16638 -0.17341 0.031912 0.039461 -0.16813 0.049594 -0.055904 0.17197 0.063157 0.011592 -0.10973 -0.16039 0.14447 0.37026 0.055234 0.12431 -0.32994 0.16109 0.24207 -0.3082 0.10437 -0.031148 0.30501 0.17281 0.074012 -0.21251 0.31639 0.12154 0.27019 0.035283 0.010634 0.14596 -0.11356 0.22376 -0.005687 0.2354 0.18467 0.30154 -0.15345 -0.12211 -0.31571 0.16496 0.081898 0.23165 -0.19046 0.24053 0.33129 0.0056417 0.12159 0.10587 0.11307 -0.11543 0.26196 0.23725 -0.1914 0.044061 0.027817 -0.23344 0.1431 -0.29592 0.09374 -0.079591 0.014051 0.090615 0.29193 0.23053 0.19635 0.61701 0.1951 -0.09174 0.17461 0.27697 0.13351 -0.23349 0.41543 0.35445 '

In [336]:
word_vectors = {}
for lin in word_vectors_twitter:
    line = lin.split()
#     print(line)
    try:
        word_vectors[line[0]] = np.array(line[1:])
    except:
        print(line, lin)

[] 


In [337]:
len(list(word_vectors.keys()))

113975

Missed out on 5 words 

In [341]:
word_vectors['to']

array([['0.28784'],
       ['-0.16593'],
       ['0.21775'],
       ['-0.03476'],
       ['0.17177'],
       ['-0.24388'],
       ['-0.36813'],
       ['-0.17193'],
       ['-0.15732'],
       ['-0.091941'],
       ['-0.31362'],
       ['0.16376'],
       ['0.19695'],
       ['-0.054758'],
       ['-0.048513'],
       ['0.25853'],
       ['-0.082019'],
       ['-0.22646'],
       ['0.26641'],
       ['-0.17032'],
       ['0.052413'],
       ['0.091387'],
       ['0.037523'],
       ['0.046991'],
       ['0.16638'],
       ['-0.17341'],
       ['0.031912'],
       ['0.039461'],
       ['-0.16813'],
       ['0.049594'],
       ['-0.055904'],
       ['0.17197'],
       ['0.063157'],
       ['0.011592'],
       ['-0.10973'],
       ['-0.16039'],
       ['0.14447'],
       ['0.37026'],
       ['0.055234'],
       ['0.12431'],
       ['-0.32994'],
       ['0.16109'],
       ['0.24207'],
       ['-0.3082'],
       ['0.10437'],
       ['-0.031148'],
       ['0.30501'],
       ['0.17281'],
    

Function to find nearest neighbours

In [343]:
from sklearn.metrics.pairwise import cosine_similarity

In [344]:
cosine_similarity([[1, 0, -1]], [[-1,-1, 0]])

array([[-0.5]])

In [355]:
def nn(word):
    nearest = 0
    max = (0, 'max')
    for w in word_vectors.keys():
        try:
            if cosine_similarity([word_vectors[word]], [word_vectors[w]])[0][0] > max[0] and w != word:
                max = (cosine_similarity([word_vectors[word]], [word_vectors[w]])[0][0], w)
        except:
            pass
#             print(w, word_vectors[word], word_vectors[w])
    print(max[1])
    

In [356]:
nn('hate')

ihate


Currently our problem is we have to use a threhold to keep some words and remove others. This threhold is something we came up with assuming we should drop lower probability words but that is not the right approach.
### Our next goal would be getting the generator to generator embeddings for each word and then we'll help generator by finding closest neighbour to it 