In [1]:
import dynet as dy
import json
from glove import Glove
import numpy as np
import csv

In [2]:
vectorDim = 50 # 50 100 200 300
ngram = 3 # 3->trigram   2->bigram  1->unigram



In [9]:
# bos # begin of sentence
# eos # end of sentence
# eol # end of line
# bol # begin of line

def read_dataset(fpath):
    print("Reading dataset...")
    poems=[]
    with open(fpath) as json_file:
        data = json.load(json_file)
        for d in data:
            poem = d["poem"]
            #poem = poem.replace("\n"," eol bol ")
            #poem = poem.lower() -- dataset already lowercased
            poem = poem.replace("\n"," eol ")
            poem = poem.replace("."," ")
            poem = poem.replace(":"," ")
            poem = poem.replace("?"," ")
            #poem = "bos bol "+poem+" eol eos"
            poem = "bos "+poem+" eos"
            poem = poem.split()
            poems.append([])
            for i in range(len(poem)-ngram+1):
                poems[-1].append(poem[i:i+ngram])
    print("Just Finished Reading dataset...")
    return poems

poems2 = read_dataset('unim_poem.json')
print("Poem Count:",len(poems2))
print(poems2[-1])  

# replace(".:?"," ")
# , ; ! " | yok
# ' - 're 's n't 've 'll ellemedim

Reading dataset...
Just Finished Reading dataset...
Poem Count: 93265
[['bos', 'abandoned', 'drive-in'], ['abandoned', 'drive-in', 'eol'], ['drive-in', 'eol', 'lit'], ['eol', 'lit', 'by'], ['lit', 'by', 'the'], ['by', 'the', 'glow'], ['the', 'glow', 'of'], ['glow', 'of', 'pink'], ['of', 'pink', 'light'], ['pink', 'light', 'eol'], ['light', 'eol', 'from'], ['eol', 'from', 'a'], ['from', 'a', 'waning'], ['a', 'waning', 'day'], ['waning', 'day', 'eos']]


In [4]:
def read_glove(fpath):
    print('Loading word vectors...')
    word2vec = {}
    embeds = []
    word2idx = {}
    with open(fpath, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            word2idx[word] = len(embeds)
            vec = np.asarray(values[1:], dtype='float32')
            word2vec[word] = vec
            embeds.append(vec)
            
    mean = np.array(embeds).mean(axis=0,dtype='float32')
    word2vec["mmeann"]=mean
    embeds.append(mean)
    
    print("Finished Loading word vectors...")
    return np.array(embeds),word2idx,word2vec

embedding,w2i, w2v= read_glove('glovo/glove.6B.'+str(vectorDim)+'d.txt')
#w2v["mmeann"]=embedding.mean(axis=0) # if the word doesn't occur in vocab, it will take mean value
print(embedding.shape)
print(embedding.size)

Loading word vectors...
Finished Loading word vectors...
(400001, 50)
20000050


In [15]:
# print(w2v['the'])
# print(w2v['mmeann'])
# print(w2v['the']+w2v['mmeann'])
# print(np.zeros(50))

In [17]:
def sumvec(w2v,words):
    _temp = np.zeros(50)
    for w in words:
        try:
            _temp+=w2v[w]
        except:
            _temp+=w2v["mmeann"]
    return _temp

h = 100 # HiddenUnit
m = vectorDim
OutUnit = 100

EPOCH = 10

_model = dy.Model()
_pW1 = _model.add_parameters((h, m))
_pb1 = _model.add_parameters(h)
_pW2 = _model.add_parameters((m, h))
_pb2 = _model.add_parameters(m)
_trainer = dy.SimpleSGDTrainer(_model)


poems = poems2[:250]
for epoch in range(1, EPOCH + 1):
    epoch_loss = 0.0
    for p in poems:
        for gram in p:
            x=sumvec(w2v,gram[:ngram-1])
            y=max(sumvec(w2v,gram[-1:]))
            dy.renew_cg()
            x = dy.inputVector(x)
            input_layer = dy.tanh(_pW1 * x + _pb1)
            hidden_layer = _pW2 * input_layer + _pb2
            output_layer = dy.softmax(hidden_layer)
            loss = dy.pickneglogsoftmax(output_layer, y)
            epoch_loss += loss.scalar_value()
            loss.backward()
            _trainer.update()
    print("Epoch %d. loss = %f" % (epoch, epoch_loss/len(poems)))

Epoch 1. loss = 126.885050
Epoch 2. loss = 126.756240
Epoch 3. loss = 126.750117
Epoch 4. loss = 126.726632
Epoch 5. loss = 126.747889
Epoch 6. loss = 126.990369
Epoch 7. loss = 126.819649
Epoch 8. loss = 126.515956
Epoch 9. loss = 126.054736
Epoch 10. loss = 126.437603


In [56]:
def predict_labels(doc,b,U,d,H):
    x = encode_doc(doc)
    _h= layer1(x,H,d)
    y = b+U*_h
    
    return dy.softmax(y)


def layer1(x,H,d):
    _H = dy.parameter(H)
    _d = dy.parameter(d)
    
    return dy.tanh(_H * x + _d)

def encode_doc(doc):
    doc2=[]
    embs=[]
    for w in doc:
        try:
            doc2.append(w2i[w])
        except:
            doc2.append(w2i["mmeann"])
    for idx in doc2:
        embs.append(E[idx])
    
    return dy.esum(embs)


def do_loss(probs, next_word, w2v):
    try:
        next_word_vector = w2v[next_word]
    except:
        next_word_vector = w2v["mmeann"]

    return -dy.log(dy.pick_batch_elem(probs,next_word_vector))


In [58]:
model = dy.Model()

trainer = dy.SimpleSGDTrainer(model)

h = 100 # HiddenUnit
m = vectorDim
OutUnit = 100

H = model.add_parameters((h, m))
d = model.add_parameters(h)
U = model.add_parameters((m, h))
b = model.add_parameters(m)

E = model.add_lookup_parameters((len(w2i), HiddenDim),init =  embedding )

for p in poems:
    for gram in p:
        dy.renew_cg()
        probs = predict_labels(gram[:ngram-1],b,U,d,H)
        
        loss = do_loss(probs,gram[-1],w2v)
        loss.forward()
        loss.backward()
        trainer.update()

TypeError: only size-1 arrays can be converted to Python scalars

In [32]:
print(len(w2i))
print(E)
print(E.shape())
print(E[0])
print(E[1])

400000
LookupParameter /_4
(400000, 50)
expression 9/0
expression 10/0
