In [1]:
import sys
import numpy as np
import json
import os, inspect
import math
# Add the parent directory to the import search path (presumably where the
# `dhira` package imported in the next cell lives - confirm against the repo layout).
sys.path.append("../")
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs so edits to imported source are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from dhira.data.data_manager import DataManager
from dhira.data.embedding_manager import EmbeddingManager
from dhira.data.features.glove_feature import GloveFeature
from dhira.tf.models.word2vec.glove import Glove
from dhira.data.dataset.glove import GloveDataset
import logging
# Logger named after the current module; none of the cells shown here use it.
logger = logging.getLogger(__name__)

In [3]:
# List the GloVe data directory; the training/validation files loaded in the
# next cell are read from here.
!ls ../data/offline/glove/

README.md  training.txt  validation.txt


In [4]:
# Point the dataset at the offline GloVe training and validation text files.
glove_dataset = GloveDataset(
    train_files='../data/offline/glove/training.txt',
    val_files='../data/offline/glove/validation.txt',
)

In [5]:
# Wrap the dataset in a DataManager; the train/validation generators used
# below are obtained from this object.
data_manager = DataManager(glove_dataset)

In [6]:
# Returns a pair: a zero-argument callable that yields training features when
# called (see the sample-batch cell near the end) and the training-set size
# used to derive the number of steps per epoch.
get_train_data_gen, train_data_size = data_manager.get_train_data()

100%|██████████| 219800/219800 [00:00<00:00, 1781241.04it/s]
100%|██████████| 219800/219800 [00:01<00:00, 124291.21it/s]
100%|██████████| 219800/219800 [00:16<00:00, 13490.48it/s]
100%|██████████| 861615/861615 [00:06<00:00, 140462.48it/s]


[<dhira.data.features.glove_feature.GloveFeature object at 0x7fba8d108fd0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fbb046b2be0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fbb046b29e8>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9f5e2898>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9f5e28d0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba8d10b1d0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba8d10b128>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba8d10b0f0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba8d10b240>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba8d10b198>]


In [7]:
# Report how many training items the data manager produced.
print('Training data size: ', train_data_size)

Training data size:  861615


In [8]:
# Validation counterpart of the training fetch: a generator factory (handed to
# model.train as get_val_feature_generator) and the validation-set size.
get_val_data_gen, val_data_size = data_manager.get_validation_data()

100%|██████████| 51755/51755 [00:00<00:00, 1429873.22it/s]
100%|██████████| 51755/51755 [00:00<00:00, 161737.30it/s]
100%|██████████| 51755/51755 [00:03<00:00, 13927.03it/s]
100%|██████████| 403970/403970 [00:02<00:00, 147751.08it/s]


[<dhira.data.features.glove_feature.GloveFeature object at 0x7fba9c7cc748>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9d878240>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9da7a550>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9d76a2b0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9e2869e8>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9d6fbd30>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba970be668>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba5a938710>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba5a9387f0>, <dhira.data.features.glove_feature.GloveFeature object at 0x7fba9ddf7940>]


In [9]:
# Report how many validation items the data manager produced.
print('Validation data size: ', val_data_size)

Validation data size:  403970


In [10]:
# --- Batch / epoch settings ---------------------------------------------
batch_size = 128
num_epochs = 10

# --- Periods and patience, forwarded unchanged to model.train(...) ------
log_period = 500
val_period = 1000
save_period = 1000
patience = 0

# --- Derived step counts --------------------------------------------------
# Number of batches needed to cover each split once; math.ceil rounds up so a
# partial final batch still counts as a step.
num_train_steps_per_epoch = int(math.ceil(train_data_size / batch_size))
num_val_steps = int(math.ceil(val_data_size / batch_size))

In [11]:
# Echo the configured batch size as the cell output.
batch_size

128

In [12]:
# Echo the derived number of training steps per epoch as the cell output.
num_train_steps_per_epoch

6732

In [13]:
# Build the GloVe model.
# batch_size is taken from the hyper-parameter cell instead of repeating the
# literal 128, so the value given to the model and the value later passed to
# model.train(...) cannot drift apart when one of them is edited.
model = Glove(
    name='glove',
    save_dir='../models/',
    log_dir='../logs/',
    run_id='0',
    embedding_size=30,
    cooccurrence_cap=100,
    vocabulary_size=5000,
    batch_size=batch_size,
    learning_rate=0.01,
)


In [14]:
# Compile the model, then train it with the settings from the hyper-parameter
# cell. The value returned by train() is this cell's displayed output.
model.compile()
model.train(
    # feature generator factories
    get_train_feature_generator=get_train_data_gen,
    get_val_feature_generator=get_val_data_gen,
    # batch / epoch settings and derived step counts
    batch_size=batch_size,
    num_epochs=num_epochs,
    num_train_steps_per_epoch=num_train_steps_per_epoch,
    num_val_steps=num_val_steps,
    # periods and patience
    log_period=log_period,
    val_period=val_period,
    save_period=save_period,
    patience=patience,
)

Writing to /opt/dhira/logs/glove/001503422559

--------------------------------------------------

tensorboard --logdir  /opt/dhira/logs/glove/001503422559

tensorboard --logdir  /opt/dhira/models/glove/00/checkpoints --port 6007
--------------------------------------------------





'/opt/dhira/models/glove/00/glove-67320'

In [15]:
# Look up one embedding from the trained model's embedding table.
# NOTE(review): the first argument (2) is presumably a vocabulary index - confirm
# against GloveDataset.embedding_for.
glove_dataset.embedding_for(2, model.embeddings)

array([-0.41689247,  0.26725182, -0.06535763,  1.27277553,  1.13024259,
       -0.61554569, -0.1600647 , -0.1432018 ,  0.67844713,  1.04623902,
       -0.01356751, -1.26085901,  0.64571834, -0.59177035,  0.66578978,
       -0.34551579,  0.259642  , -0.46364811,  0.38411933,  0.79517853,
       -0.28920794, -0.05337651,  0.66002786,  0.86776823,  0.54742253,
       -0.13828447, -0.04022217,  0.38150218, -0.87994802, -0.08639389], dtype=float32)

In [None]:
# List the run's model directory (presumably the same location as the path
# returned by the training cell - confirm the relative path resolves there).
!ls ../models/glove/00


In [None]:
from itertools import islice
# Call the factory returned by data_manager.get_train_data() to obtain a
# training-feature generator.
feature_generator = get_train_data_gen()
# Take at most batch_size items from the generator as one sample batch;
# islice stops early if the generator runs out first.
batched_features = list(islice(feature_generator, batch_size))

In [None]:
 # Number of features actually pulled for the sample batch (at most batch_size).
 print(len(batched_features))

In [None]:
# Split every batched feature into its first and second component, keeping the
# two components in parallel lists (unpacked as inputs/targets in the next cell).
flattened = (
    [feature[0] for feature in batched_features],
    [feature[1] for feature in batched_features],
)

In [None]:
# Name the two halves of the pair built above: element 0 of each feature
# becomes an input, element 1 a target.
flattened_inputs, flattened_targets = flattened

In [None]:
# Display the per-feature inputs of the sample batch.
flattened_inputs

In [None]:
# Transpose the per-feature inputs: zip(*...) groups the i-th element of every
# input together, and each such group is converted to one NumPy array.
batch_inputs = tuple(
    np.asarray(column) for column in zip(*flattened_inputs)
)

In [None]:
# Display the tuple of per-position NumPy arrays built in the previous cell.
batch_inputs