In [1]:
from preprocess import TextReader
from train import CNNText
import pandas as pd
from pyemd import emd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from model import Model

  from ._conv import register_converters as _register_converters


## 0. Some helper functions

In [2]:
from tensorflow.python.client import device_lib
def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

In [3]:
get_available_gpus()

['/device:GPU:0']

## 1. Loading the pretrained word vector

In [None]:
model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin',
                                         binary=True)

## 2. Process the data and convert words into vectors

In [4]:
def process_word_vectors(base_path, suffix, pretrained=False, **kwargs):
    tr = TextReader(data_dir=base_path, 
                    suffix_labels=suffix)
    print(tr.data_files)
    if tr.prepare_data(clean=True):
        X, y = tr.get_ranked_features()
    word_vectors_df = None
    if pretrained:
        model = kwargs.get('model')
        if model is None:
            raise ValueError('Model can not be None')
        wv = tr.get_embedding_vector(model)
        word_vectors = {}
        for word, vector in wv:
            word_vectors[tr.get_rank(word)] = vector
        word_vectors_df = pd.DataFrame.from_dict(word_vectors, orient='index')
    return X, y, word_vectors_df

In [5]:
!ls

CNN_4_RNN.ipynb  __pycache__  data	preprocess.py  train.py
README.md	 batch.py     model.py	train	       valid


In [6]:
X, y, _ = process_word_vectors('./data/', 
                                suffix={'mr.pos': 1, 
                                        'mr.neg': 0})

{'./data/mr.pos': 1, './data/mr.neg': 0}


100%|██████████| 5331/5331 [00:16<00:00, 329.91it/s]
100%|██████████| 5331/5331 [00:16<00:00, 331.68it/s]


In [7]:
X.shape

(10662, 51)

In [8]:
y.shape

(10662,)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
%%sh

mkdir train valid

mkdir: cannot create directory 'train': File exists
mkdir: cannot create directory 'valid': File exists


In [10]:
np.save('./train/X_train', X_train)
np.save('./train/y_train', y_train)

np.save('./valid/X_valid', X_test)
np.save('./valid/y_train', y_test)

In [11]:
!ls train/

X_train.npy  y_train.npy


In [12]:
!ls valid/

X_valid.npy  y_train.npy


## Testing the batch Iterators

In [13]:
cnnText = CNNText(
    train_path='./train/',
    valid_path='./valid/',
    epochs=50,
    batch_size=50
)

In [14]:
cnnText.train()

Epoch 1: 100%|##########| 170/170 [00:00<00:00, 744.64it/s, train_loss=17, valid_loss=17]    
Epoch 2: 100%|##########| 170/170 [00:00<00:00, 900.58it/s, train_loss=17.1, valid_loss=17.1]
Epoch 3: 100%|##########| 170/170 [00:00<00:00, 1092.54it/s, train_loss=17.2, valid_loss=17.2]
Epoch 4: 100%|##########| 170/170 [00:00<00:00, 1073.89it/s, train_loss=17.3, valid_loss=17.3]
Epoch 5: 100%|##########| 170/170 [00:00<00:00, 1019.17it/s, train_loss=17.4, valid_loss=17.4]
Epoch 6: 100%|##########| 170/170 [00:00<00:00, 1186.86it/s, train_loss=17.5, valid_loss=17.5]
Epoch 7: 100%|##########| 170/170 [00:00<00:00, 1267.70it/s, train_loss=17.6, valid_loss=17.6]
Epoch 8: 100%|##########| 170/170 [00:00<00:00, 1161.85it/s, train_loss=17.7, valid_loss=17.7]
Epoch 9: 100%|##########| 170/170 [00:00<00:00, 1195.04it/s, train_loss=17.8, valid_loss=17.8]
Epoch 10: 100%|##########| 170/170 [00:00<00:00, 1245.00it/s, train_loss=17.9, valid_loss=17.9]
Epoch 11: 100%|##########| 170/170 [00:00<00:00, 10

In [15]:
m = Model(
        nkernels=100,
        min_filter=3,
        max_filter=5,
        vocab_size=15000,
        num_class=2,
        max_len=51,
        l2_reg=1,)
print(m.train_op)

name: "Adam"
op: "NoOp"
input: "^Adam/update_embeddings/embed/group_deps"
input: "^Adam/update_conv/kernel_3/ApplyAdam"
input: "^Adam/update_conv/bias_3/ApplyAdam"
input: "^Adam/update_conv/kernel_4/ApplyAdam"
input: "^Adam/update_conv/bias_4/ApplyAdam"
input: "^Adam/update_conv/kernel_5/ApplyAdam"
input: "^Adam/update_conv/bias_5/ApplyAdam"
input: "^Adam/update_dense/dense/ApplyAdam"
input: "^Adam/update_dense/fc_bias/ApplyAdam"
input: "^Adam/Assign"
input: "^Adam/Assign_1"
device: "/device:GPU:0"



In [116]:
batch_size = 3
epochs = 5

In [117]:
# N // batch_size 

In [None]:
def train(epochs, batch_size, X, y):
    N = X.shape[0]
    pointer = 0
    for e in range(epochs):
        x_out = X[pointer: pointer+batch_size, :]
        y_out = y[pointer: pointer+batch_size]
#         for bs in range(batch_size):
        x_out.append(X[(pointer + bs) % N])
        print(x_out)
        pointer += batch_size
        

In [129]:
pointer = 0

In [131]:
y[0:3]

array([0, 1, 1])

In [130]:
.shape

(3, 51)

In [133]:
import numpy as np

In [None]:
np.loa

In [80]:
[*range(0, 65, 3)]

[0,
 3,
 6,
 9,
 12,
 15,
 18,
 21,
 24,
 27,
 30,
 33,
 36,
 39,
 42,
 45,
 48,
 51,
 54,
 57,
 60,
 63]