In [1]:
import argparse
import json
import math
import os, inspect
import sys
from itertools import islice

import numpy as np
# Add the parent directory to the import path (presumably so the `dhira`
# package imported in the next cell resolves — confirm repo layout).
sys.path.append("../")
# autoreload mode 2: re-import all modules before executing each cell, so edits
# to library code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from dhira.data.data_manager import DataManager
from dhira.data.embedding_manager import EmbeddingManager
from dhira.data.features.glove_feature import GloveFeature
from dhira.tf.models.word2vec.glove import Glove
from dhira.data.dataset.glove import GloveDataset
import logging
# Logger named after the current module (`__name__`).
logger = logging.getLogger(__name__)



In [3]:
# List the contents of the GloVe data directory used by the dataset below.
!ls ../data/offline/glove/


README.md  training.txt  validation.txt


In [4]:
# Describe the GloVe dataset: the training/validation text files, the directory
# for pickled artefacts, and the vocabulary/context settings.
# NOTE(review): the dataset name was misspelled 'golve'; if cached pickles are
# keyed on the name they will be regenerated under the corrected one — confirm.
# NOTE(review): vocabulary_size should agree with the value given to the Glove
# model later in this notebook.
glove_dataset = GloveDataset(
    name='glove',
    feature_type=GloveFeature,
    train_files='../data/offline/glove/training.txt',
    val_files='../data/offline/glove/validation.txt',
    pickle_dir='../logs/pickle',
    vocabulary_size=5000,
    min_occurrences=2,
    # presumably the number of context words taken on each side — TODO confirm
    left_size=2,
    right_size=2,
)


In [29]:
# Wrap the dataset in a DataManager, which is used below to load train/validation data.
data_manager = DataManager(glove_dataset)

In [30]:
# Load the training split. Returns a callable that yields training instances
# when invoked (see `get_train_data_gen()` further down) and the instance count.
get_train_data_gen, train_data_size = data_manager.get_train_data_from_file()

100%|██████████| 219800/219800 [00:00<00:00, 1837070.32it/s]
100%|██████████| 219800/219800 [00:01<00:00, 146598.51it/s]
100%|██████████| 219800/219800 [00:17<00:00, 12707.96it/s]
100%|██████████| 861615/861615 [00:07<00:00, 116793.17it/s]


In [31]:
# Show the number of training instances reported by the data manager.
print('Training data size: ', train_data_size)

Training data size:  861615


In [32]:
# Load the validation split; same (generator factory, size) pair as for training.
get_val_data_gen, val_data_size = data_manager.get_validation_data_from_file()

100%|██████████| 51755/51755 [00:00<00:00, 1518068.49it/s]
100%|██████████| 51755/51755 [00:00<00:00, 161639.87it/s]
100%|██████████| 51755/51755 [00:03<00:00, 14064.13it/s]
100%|██████████| 403970/403970 [00:02<00:00, 136544.93it/s]


In [33]:
# Show the number of validation instances reported by the data manager.
print('Validation data size: ', val_data_size)

Validation data size:  403970


In [43]:
# --- Batch / epoch schedule ---
batch_size = 128
num_epochs = 10

# --- Reporting / checkpoint settings forwarded to model.train ---
log_period = 500
val_period = 1000
save_period = 1000
patience = 0

# Number of batches needed to cover each split once; rounding up keeps the
# final, partially filled batch. math.ceil already returns an int on Python 3.
num_train_steps_per_epoch = math.ceil(train_data_size / batch_size)
num_val_steps = math.ceil(val_data_size / batch_size)

In [44]:
# Display the configured batch size.
batch_size

128

In [45]:
# Display the derived number of training steps per epoch.
num_train_steps_per_epoch

6732

In [46]:
# Instantiate the GloVe model and build its graph.
# batch_size reuses the notebook-level setting (instead of a second hard-coded
# 128) so the model and the later model.train(batch_size=batch_size, ...) call
# cannot drift apart when the setting is changed.
# NOTE(review): vocabulary_size=5000 mirrors the GloveDataset argument — keep in sync.
model = Glove(
    name='glove',
    mode='train',
    save_dir='../models/',
    log_dir='../logs/',
    run_id='0',
    embedding_size=30,
    cooccurrence_cap=100,
    vocabulary_size=5000,
    batch_size=batch_size,
    learning_rate=0.01,
)
model.build_graph()

SyntaxError: invalid syntax (<ipython-input-46-ea5f3ecd6e4f>, line 2)

In [47]:
# Run training, wiring in the data generators and the schedule values
# defined in the hyperparameter cell.
model.train(
    # data sources
    get_train_instance_generator=get_train_data_gen,
    get_val_instance_generator=get_val_data_gen,
    # schedule
    batch_size=batch_size,
    num_train_steps_per_epoch=num_train_steps_per_epoch,
    num_epochs=num_epochs,
    num_val_steps=num_val_steps,
    # reporting / checkpoint settings
    log_period=log_period,
    val_period=val_period,
    save_period=save_period,
    patience=patience,
)

Writing to /opt/dhira/logs/glove/00

--------------------------------------------------

tensorboard --logdir  /opt/dhira/logs/glove/00


tensorboard --logdir  /opt/dhira/logs/glove/00/checkpoints --port 6007

--------------------------------------------------


In [None]:
# Query the model's embedding for argument 2 (presumably a vocabulary index — TODO confirm).
model.embedding_for(2)

In [None]:
# List the contents of the model directory for this run.
!ls ../models/glove/00


In [15]:
# Take the first `batch_size` instances from a fresh training generator so a
# single batch can be inspected by hand.
# `islice` is imported in the notebook's import cell rather than mid-notebook.
feature_generator = get_train_data_gen()
batched_features = list(islice(feature_generator, batch_size))

In [16]:
 print(len(batched_features))

128


In [17]:
# Split the batch into two parallel lists: element 0 of every instance,
# then element 1 of every instance.
flattened = tuple(
    [instance[position] for instance in batched_features]
    for position in (0, 1)
)

In [18]:
# Unpack the pair built above: first list -> inputs, second list -> targets.
flattened_inputs, flattened_targets = flattened

In [19]:
# Display the per-instance inputs; each entry is a 3-tuple of 0-d arrays (see output).
# NOTE(review): this dumps the whole batch — consider slicing (e.g. [:5]) to keep output short.
flattened_inputs

[(array(1), array(257), array(318.5)),
 (array(1), array(2), array(35926.0)),
 (array(257), array(1), array(318.5)),
 (array(257), array(2), array(476.5)),
 (array(257), array(200), array(3.0)),
 (array(2), array(257), array(476.5)),
 (array(2), array(1), array(35926.0)),
 (array(2), array(200), array(64.5)),
 (array(2), array(244), array(169.0)),
 (array(200), array(2), array(64.5)),
 (array(200), array(257), array(3.0)),
 (array(200), array(244), array(149.0)),
 (array(244), array(200), array(149.0)),
 (array(244), array(2), array(169.0)),
 (array(27), array(77), array(208.0)),
 (array(77), array(27), array(208.0)),
 (array(3719), array(4), array(19.5)),
 (array(3719), array(649), array(1.5)),
 (array(4), array(3719), array(19.5)),
 (array(4), array(649), array(58.0)),
 (array(4), array(1366), array(6.0)),
 (array(649), array(4), array(58.0)),
 (array(649), array(3719), array(1.5)),
 (array(649), array(1366), array(3.0)),
 (array(649), array(0), array(67.5)),
 (array(1366), array(649

In [27]:
# Transpose the list of per-instance input tuples into one array per field.
batch_inputs = tuple(np.asarray(column) for column in zip(*flattened_inputs))

In [28]:
# Display the column-wise arrays built from the batch inputs.
batch_inputs

(array([   1,    1,  257,  257,  257,    2,    2,    2,    2,  200,  200,
         200,  244,  244,   27,   77, 3719, 3719,    4,    4,    4,  649,
         649,  649,  649, 1366, 1366, 1366, 1366,    0,    0,    0,    0,
        2239, 2239,    3,   10,    2,    2,   15,   15,   15,  114,  114,
         114,  114,    0,    0,    0,  402,  402,  402,    0,    3,    3,
           3,    5, 1809, 1809,  108,  108,    5,    5,    0,    3, 3240,
        3240, 3240,    5, 3193, 3193, 2637, 2637,    0,    0,    0,    5,
           1,    1,  108,  108, 2575, 2575, 2575,    5,   21,   21,  121,
         121,  121,   47,   47,   47,   47, 4048, 4048, 4048,   41,   41,
          41, 1181, 1181, 1181,  154,  154,  154,  514,  514,  514,  514,
           7,    7,    7, 2810, 2810, 2810, 2810,  154,  154,   15,   15,
        3194, 3194,    3,    3,    3,  307,  307]),
 array([ 257,    2,    1,    2,  200,  257,    1,  200,  244,    2,  257,
         244,  200,    2,   77,   27,    4,  649, 3719,  649