In [1]:
from preprocess import TextReader
from model import CNNText

import pandas as pd
from pyemd import emd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

## 1. Loading the pretrained word vectors

In [2]:
# Load the pretrained word2vec embeddings from the binary GoogleNews file that
# sits one directory above the notebook. `model` is later handed to
# process_word_vectors.
model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin',
                                         binary=True)

## 2. Process the data and convert words into vectors

In [3]:
def process_word_vectors(model, base_path, suffix):
    """Read the labelled text files and return features, labels and word vectors.

    Parameters
    ----------
    model
        Embedding model forwarded to ``TextReader.get_embedding_vector``
        (the notebook passes gensim ``KeyedVectors``).
    base_path
        Directory passed to ``TextReader`` as ``data_dir``.
    suffix
        Mapping passed to ``TextReader`` as ``suffix_labels`` (the notebook
        maps data file names to integer labels).

    Returns
    -------
    tuple
        ``(X, y, word_vectors_df)``: ``X`` and ``y`` are whatever
        ``TextReader.get_ranked_features`` returns, and ``word_vectors_df`` is
        a DataFrame with one row per word, indexed by ``tr.get_rank(word)``.

    Raises
    ------
    RuntimeError
        If ``TextReader.prepare_data`` returns a falsy value. Previously this
        case fell through and failed later with ``UnboundLocalError`` because
        ``X`` and ``y`` were never assigned.
    """
    tr = TextReader(data_dir=base_path, suffix_labels=suffix)
    print(tr.data_files)

    # Fail fast: without prepared data there are no features to return.
    if not tr.prepare_data(clean=True):
        raise RuntimeError(
            "TextReader.prepare_data() failed for data_dir=%r" % (base_path,)
        )

    X, y = tr.get_ranked_features()

    # Key each embedding by the word's rank so rows line up with the ranked
    # feature ids produced by the reader.
    word_vectors = {
        tr.get_rank(word): vector
        for word, vector in tr.get_embedding_vector(model)
    }
    word_vectors_df = pd.DataFrame.from_dict(word_vectors, orient='index')
    return X, y, word_vectors_df

In [4]:
# Build features from the rt-polarity files in the current directory:
# the .pos file is labelled 1 and the .neg file is labelled 0.
X, y, wv = process_word_vectors(model, './', 
                                suffix={'rt-polarity.pos': 1, 
                                        'rt-polarity.neg': 0})

{'./rt-polarity.pos': 1, './rt-polarity.neg': 0}


100%|██████████| 5331/5331 [00:17<00:00, 313.34it/s]
100%|██████████| 5331/5331 [00:17<00:00, 310.30it/s]


In [5]:
# Sanity check: shape of the feature array returned by process_word_vectors.
X.shape

(10662, 51)

In [6]:
# Sanity check: shape of the label array returned by process_word_vectors.
y.shape

(10662,)

In [7]:
# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [149]:
%%sh
# -p keeps this cell re-runnable: no error when the directories already exist.
mkdir -p train valid

In [8]:
# Persist the training split under ./train/ (np.save appends the .npy extension).
np.save('./train/X_train', X_train)
np.save('./train/y_train', y_train)

# Persist the held-out split under ./valid/.
# NOTE(review): the labels are written as 'y_train' while the features use the
# 'X_valid' name — this looks like a copy-paste slip. Confirm which file name
# CNNText reads from its valid_path before renaming.
np.save('./valid/X_valid', X_test)
np.save('./valid/y_train', y_test)

In [9]:
# List the files now present in train/.
!ls train/

X_train.npy  y_train.npy


In [10]:
# List the files now present in valid/.
!ls valid/

X_valid.npy  y_train.npy


## Testing the batch iterators

In [13]:
# Configure the classifier with the two data directories, 50 epochs and a
# batch size of 50.
# NOTE(review): presumably CNNText loads the .npy files written to these
# directories — confirm the expected file names in model.py.
cnnText = CNNText(
    train_path='./train/',
    valid_path='./valid/',
    epochs=50,
    batch_size=50
)

In [14]:
# Run training.
# NOTE(review): in the recorded output train_loss and valid_loss are identical
# and rise every epoch (17 -> 17.9), so the model does not appear to be
# learning — check the loss computation and update step inside CNNText.
cnnText.train()

Epoch 1: 100%|##########| 170/170 [00:00<00:00, 613.77it/s, train_loss=17, valid_loss=17]    
Epoch 2: 100%|##########| 170/170 [00:00<00:00, 660.08it/s, train_loss=17.1, valid_loss=17.1]
Epoch 3: 100%|##########| 170/170 [00:00<00:00, 653.58it/s, train_loss=17.2, valid_loss=17.2]
Epoch 4: 100%|##########| 170/170 [00:00<00:00, 774.41it/s, train_loss=17.3, valid_loss=17.3]
Epoch 5: 100%|##########| 170/170 [00:00<00:00, 582.93it/s, train_loss=17.4, valid_loss=17.4]
Epoch 6: 100%|##########| 170/170 [00:00<00:00, 622.91it/s, train_loss=17.5, valid_loss=17.5]
Epoch 7: 100%|##########| 170/170 [00:00<00:00, 616.63it/s, train_loss=17.6, valid_loss=17.6]
Epoch 8: 100%|##########| 170/170 [00:00<00:00, 581.04it/s, train_loss=17.7, valid_loss=17.7]
Epoch 9: 100%|##########| 170/170 [00:00<00:00, 679.38it/s, train_loss=17.8, valid_loss=17.8]
Epoch 10: 100%|##########| 170/170 [00:00<00:00, 614.12it/s, train_loss=17.9, valid_loss=17.9]
Epoch 11: 100%|##########| 170/170 [00:00<00:00, 1034.00it/

In [116]:
# Scratch settings, presumably for experimenting with the batching sketch
# in the following cells.
batch_size = 3
epochs = 5

In [117]:
# N // batch_size 

In [None]:
def train(epochs, batch_size, X, y):
    """Print successive mini-batches of ``X``, wrapping around at the end.

    One batch of ``batch_size`` rows is taken per iteration. Row indices are
    computed modulo the number of rows, so a batch that runs past the last
    row continues from the first row instead of coming back short.

    Parameters
    ----------
    epochs : int
        Number of batches to emit (one per loop iteration).
    batch_size : int
        Number of rows per batch.
    X : numpy.ndarray
        Feature array; batches are taken along axis 0.
    y : numpy.ndarray
        Label array indexed with the same row indices as ``X``.

    Returns
    -------
    None
        Each feature batch is printed.

    Raises
    ------
    ValueError
        If ``X`` has no rows (wrap-around indexing would divide by zero).
    """
    N = X.shape[0]
    if N == 0:
        raise ValueError("X must contain at least one row")

    pointer = 0
    for _ in range(epochs):
        # Modulo wraps indices past the last row back to the start.
        batch_idx = (pointer + np.arange(batch_size)) % N
        x_out = X[batch_idx]
        y_out = y[batch_idx]  # kept aligned with x_out; not consumed yet
        print(x_out)
        pointer = (pointer + batch_size) % N
        

In [129]:
# Scratch: start offset used while trying out batch slicing by hand.
pointer = 0

In [131]:
# Peek at the first three labels.
y[0:3]

array([0, 1, 1])

In [130]:
# Shape of the first three feature rows. The receiver was missing from this
# cell; X[0:3] is reconstructed from the recorded (3, 51) output — confirm.
X[0:3].shape

(3, 51)

In [133]:
import numpy as np

In [None]:
# NOTE(review): incomplete expression — numpy has no attribute `loa`, so this
# cell raises AttributeError. Presumably an unfinished np.load(...) call;
# complete or delete it.
np.loa

In [80]:
# Every third integer from 0 up to (but excluding) 65 — presumably batch start
# offsets for a step of 3; confirm against the batching sketch.
[*range(0, 65, 3)]

[0,
 3,
 6,
 9,
 12,
 15,
 18,
 21,
 24,
 27,
 30,
 33,
 36,
 39,
 42,
 45,
 48,
 51,
 54,
 57,
 60,
 63]