In [1]:
from preprocess import TextReader
from train import CNNText
import pandas as pd
from pyemd import emd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from model import Model

## 0. Some helper functions

In [2]:
from tensorflow.python.client import device_lib
def get_available_dev(device):
    """Return the names of local devices whose ``device_type`` equals ``device``.

    Devices are enumerated with ``device_lib.list_local_devices()``. The
    result is an empty list when no enumerated device has the requested type.
    """
    matching_names = []
    for proto in device_lib.list_local_devices():
        if proto.device_type == device:
            matching_names.append(proto.name)
    return matching_names

In [3]:
# List local devices of type 'GPU'; an empty list means none was reported.
get_available_dev('GPU')

[]

In [4]:
# List local devices of type 'CPU'.
get_available_dev('CPU')

['/device:CPU:0']

## 1. Loading the pretrained word vectors

In [None]:
# Load word2vec-format vectors from a binary file one directory up.
# NOTE(review): this cell has no execution count, and the later call to
# process_word_vectors does not pass `model` (pretrained stays False), so the
# rest of the notebook does not depend on this load.
model = KeyedVectors.load_word2vec_format(
    '../GoogleNews-vectors-negative300.bin',
    binary=True,
)

## 2. Process the data and convert words into vectors

In [5]:
def process_word_vectors(base_path, suffix, pretrained=False,
                         clean=True, max_vocab=15000, **kwargs):
    """Read labelled text files and return ranked features plus optional embeddings.

    Parameters
    ----------
    base_path : str
        Directory handed to ``TextReader`` as ``data_dir``.
    suffix : dict
        Handed to ``TextReader`` as ``suffix_labels`` (in this notebook it
        maps a file name to its class label).
    pretrained : bool, default False
        When true, also build a DataFrame of embedding vectors using the
        ``model`` keyword argument.
    clean : bool, default True
        Forwarded to ``TextReader.prepare_data``.
    max_vocab : int, default 15000
        Forwarded to ``TextReader.prepare_data``.
    **kwargs
        ``model`` -- required when ``pretrained`` is true; it is passed to
        ``TextReader.get_embedding_vector``. Other keys are ignored.

    Returns
    -------
    tuple
        ``(X, y, word_vectors_df)``. ``X`` and ``y`` are whatever
        ``TextReader.get_ranked_features()`` returns. ``word_vectors_df`` is
        ``None`` unless ``pretrained`` is true, in which case it is a
        DataFrame with one row per vector, indexed by ``tr.get_rank(word)``.

    Raises
    ------
    ValueError
        If ``pretrained`` is true and no ``model`` was supplied.
    RuntimeError
        If ``TextReader.prepare_data`` reports failure (returns a falsy value).
    """
    model = kwargs.get('model')
    if pretrained and model is None:
        # Validate up front so a missing model fails before any reader work.
        raise ValueError('Model can not be None')

    tr = TextReader(data_dir=base_path,
                    suffix_labels=suffix)
    print(tr.data_files)

    # Previously a falsy result left X and y unbound and the final return
    # raised an opaque UnboundLocalError; report the real cause instead.
    if not tr.prepare_data(clean=clean, max_vocab=max_vocab):
        raise RuntimeError(
            'TextReader.prepare_data() failed for data_dir={!r}'.format(base_path))
    X, y = tr.get_ranked_features()

    word_vectors_df = None
    if pretrained:
        # Key every vector by the rank of its word.
        word_vectors = {tr.get_rank(word): vector
                        for word, vector in tr.get_embedding_vector(model)}
        word_vectors_df = pd.DataFrame.from_dict(word_vectors, orient='index')
    return X, y, word_vectors_df

In [6]:
# Show the working directory; the rt-polarity.pos / rt-polarity.neg files that
# the next cell reads from './' should appear in the listing.
!ls

batch.py	 preprocess.py	README.md	 train
CNN_4_RNN.ipynb  __pycache__	rt-polarity.neg  train.py
model.py	 ranks		rt-polarity.pos  valid
nohup.out	 ranks.npy	TFLOGS		 word_vectors.csv


In [7]:
# Build the ranked features and labels from the two polarity files.
# The third return value is None here (pretrained defaults to False). It is
# bound to a named throwaway rather than `_`: assigning `_` yourself makes
# IPython stop updating its `_` / `__` / `___` output-history variables.
X, y, _word_vectors_df = process_word_vectors(
    './',
    suffix={'rt-polarity.pos': 1,
            'rt-polarity.neg': 0},
)

  2%|▏         | 126/5331 [00:00<00:04, 1258.95it/s]

{'./rt-polarity.pos': 1, './rt-polarity.neg': 0}


100%|██████████| 5331/5331 [00:02<00:00, 2086.12it/s]
100%|██████████| 5331/5331 [00:02<00:00, 1905.22it/s]


In [8]:
# Shape of the feature matrix -- presumably (n_samples, tokens per sample);
# TODO confirm against TextReader.get_ranked_features.
X.shape

(10662, 56)

In [9]:
# Shape of the label vector; its length should equal X.shape[0].
y.shape

(10662,)

In [10]:
# Hold out 10% of the samples; the fixed random_state keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [11]:
# Shape of the training split produced by train_test_split.
X_train.shape

(9595, 56)

In [None]:
%%sh

# -p keeps this cell idempotent: if train/ and valid/ already exist the
# command is a no-op instead of failing with "File exists".
mkdir -p train valid

In [12]:
# Persist each split with np.save. The held-out "test" arrays are stored
# under valid/ with *_valid names.
for target, array in (
    ('./train/X_train', X_train),
    ('./train/y_train', y_train),
    ('./valid/X_valid', X_test),
    ('./valid/y_valid', y_test),
):
    np.save(target, array)

In [13]:
# Check that the training arrays were written to train/.
!ls train/

X_train.npy  y_train.npy


In [14]:
# Check that the validation arrays were written to valid/.
# NOTE(review): the captured listing also shows a y_train.npy in valid/, which
# the save cell in this notebook never writes -- presumably a stale file from
# an earlier run; confirm and delete it so it is not picked up by mistake.
!ls valid/

X_valid.npy  y_train.npy  y_valid.npy


In [15]:
# Long listing of train/ to check the sizes and timestamps of the saved arrays.
!ls -l train/

total 4276
-rw-rw-r-- 1 paperspace paperspace 4298688 Mar  1 16:04 X_train.npy
-rw-rw-r-- 1 paperspace paperspace   76888 Mar  1 16:04 y_train.npy


## 3. Testing the batch iterators

In [15]:
!mkdir TFLOGS

mkdir: cannot create directory ‘TFLOGS’: File exists


In [20]:
# Configure the trainer with the train/ and valid/ directories, 5 epochs and
# batches of 50 samples.
# NOTE(review): presumably CNNText loads the .npy files saved earlier in this
# notebook from these directories -- confirm in train.py.
cnnText = CNNText(train_path='./train/',
                  valid_path='./valid/',
                  epochs=5,
                  batch_size=50)

In [21]:
# Run training, passing ./TFLOGS/ as the log directory.
# NOTE(review): in the captured log the loss settles around 0.67-0.69, close
# to ln 2 (~0.693), which would be chance level if this is a binary
# cross-entropy loss, and lr stays at 0.01 throughout -- confirm that the
# model is actually learning.
cnnText.train(tolerance=0.01, logdir='./TFLOGS/')

step 10/955 (epoch 1/5), loss = 8.862795 (956.4 examples/sec; 0.052 sec/batch), lr: 0.010000
step 20/955 (epoch 1/5), loss = 5.055367 (981.5 examples/sec; 0.051 sec/batch), lr: 0.010000
step 30/955 (epoch 1/5), loss = 3.008179 (968.6 examples/sec; 0.052 sec/batch), lr: 0.010000
step 40/955 (epoch 1/5), loss = 1.922556 (1025.2 examples/sec; 0.049 sec/batch), lr: 0.010000
step 50/955 (epoch 1/5), loss = 1.347301 (1006.6 examples/sec; 0.050 sec/batch), lr: 0.010000
step 60/955 (epoch 1/5), loss = 1.052337 (942.3 examples/sec; 0.053 sec/batch), lr: 0.010000
step 70/955 (epoch 1/5), loss = 0.895418 (922.5 examples/sec; 0.054 sec/batch), lr: 0.010000
step 80/955 (epoch 1/5), loss = 0.787219 (1009.9 examples/sec; 0.050 sec/batch), lr: 0.010000
step 90/955 (epoch 1/5), loss = 0.755024 (988.7 examples/sec; 0.051 sec/batch), lr: 0.010000
step 100/955 (epoch 1/5), loss = 0.723026 (960.9 examples/sec; 0.052 sec/batch), lr: 0.010000
step 110/955 (epoch 1/5), loss = 0.681034 (981.9 examples/sec; 0.0

step 840/955 (epoch 5/5), loss = 0.677697 (956.6 examples/sec; 0.052 sec/batch), lr: 0.010000
step 850/955 (epoch 5/5), loss = 0.691320 (952.5 examples/sec; 0.052 sec/batch), lr: 0.010000
step 860/955 (epoch 5/5), loss = 0.679952 (905.3 examples/sec; 0.055 sec/batch), lr: 0.010000
step 870/955 (epoch 5/5), loss = 0.675361 (888.5 examples/sec; 0.056 sec/batch), lr: 0.010000
step 880/955 (epoch 5/5), loss = 0.690552 (973.1 examples/sec; 0.051 sec/batch), lr: 0.010000
step 890/955 (epoch 5/5), loss = 0.668033 (975.9 examples/sec; 0.051 sec/batch), lr: 0.010000
step 900/955 (epoch 5/5), loss = 0.668115 (879.4 examples/sec; 0.057 sec/batch), lr: 0.010000
step 910/955 (epoch 5/5), loss = 0.674631 (834.0 examples/sec; 0.060 sec/batch), lr: 0.010000
step 920/955 (epoch 5/5), loss = 0.681170 (899.9 examples/sec; 0.056 sec/batch), lr: 0.010000
step 930/955 (epoch 5/5), loss = 0.724435 (954.6 examples/sec; 0.052 sec/batch), lr: 0.010000
step 940/955 (epoch 5/5), loss = 0.674666 (1015.8 examples/s