In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from IPython.display import display, clear_output
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.backend import clear_session
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score
from zipfile import ZipFile

Using TensorFlow backend.


In [2]:
zipped_folder = "../data/data.zip"
electr_vectors_file = "electr_vectors_balanced.csv"
movies_vectors_file = "movies_vectors_balanced.csv"

In [3]:
def pad_by_zero_seqs_generator(list_of_matrices, y_list, timesteps, batch_size):
#     max_len = timesteps
#     aligned = [np.pad(i, ((0, max_len - i.shape[0]), (0, 0)), "constant", constant_values=0.0) for i in list_of_matrices]
#     aligned = np.array(aligned)
    while True:
        indices = np.random.choice(np.arange(len(list_of_matrices)), size=[batch_size])
        x = np.array(list_of_matrices)[indices]
        x = [np.pad(i, ((0, timesteps - i.shape[0]), (0, 0)), "constant", constant_values=0.0) for i in x]
        x = np.array(x)
        y = np.array(y_list)[indices].reshape([1, 1, 1])
        yield x, y

In [42]:
def count_lines(zipped_folder: str, fname: str):
    zf = ZipFile(zipped_folder)
    return sum([1 for _ in zf.open(fname)])

In [43]:
def process_vector(vector: list, padding_size: int) -> np.ndarray:
    array = np.array([np.array(sublist) for sublist in vector])
    try:
        return np.pad(array, ((0, padding_size-len(array)), (0, 0)), mode='constant', constant_values=0.0)
    except Exception as e:
        return np.zeros([1, 1, 128])

In [44]:
def process_batch(batch: pd.DataFrame) -> pd.DataFrame:
    max_len = max(map(len, batch["vectors"]))
    batch["vectors"] = batch["vectors"].apply(process_vector, args=[max_len])
#     batch["target_bin"] = batch["target_bin"].values.reshape([-1, 1, 1])

In [45]:
def batch_generator(zipped_folder: str, 
                    fname: str, 
                    batch_size: int, 
                    from_line=None, 
                    to_line=None) -> pd.DataFrame:
    zf = ZipFile(zipped_folder)
    skiprows = None
    if from_line is not None:
        skiprows = range(1, from_line)
    nrows = to_line
    if from_line is not None and to_line is not None:
        nrows = to_line - from_line 
    for batch in pd.read_csv(zf.open(fname), sep="\t", 
                             chunksize=batch_size, 
                             skiprows=skiprows, nrows=nrows):
        batch["vectors"] = batch["vectors"].apply(eval)
        yield batch
        
def train_test_generator(zipped_folder: str, 
                         fname: str, 
                         batch_size: int, 
                         test_percent: float) -> pd.DataFrame:
    generator = batch_generator(zipped_folder=zipped_folder, 
                                     fname=fname, 
                                     batch_size=batch_size)
    for num, batch in enumerate(generator):
        clear_output(True)
        print("Batch", num + 1)
        process_batch(batch)
        X_train, X_test, y_train, y_test = train_test_split(batch["vectors"].values, 
                                                            batch["target_bin"].values, 
                                                            test_size=test_percent)
        y_train = y_train.reshape([-1, 1, 1])
        y_test = y_test.reshape([-1, 1, 1])
        X_train = np.array(list(X_train))
        X_test = np.array(list(X_test))
        yield (X_train, X_test, y_train, y_test)

In [46]:
# electr_lines = count_lines(zipped_folder, electr_vectors_file)
# movies_lines = count_lines(zipped_folder, movies_vectors_file)
# print("electr lines:", count_lines(zipped_folder, electr_vectors_file))
# print("movies lines:", count_lines(zipped_folder, movies_vectors_file))

In [50]:
def train_model(model):
    for X_train, X_test, y_train, y_test in train_test_generator(zipped_folder, 
                                                                 movies_vectors_file, 
                                                                 batch_size=3000, test_percent=0.2):
        model.fit(X_train, y_train, 
                  validation_data=(X_test, y_test), 
                  steps_per_epoch=5, epochs=3, verbose=1, validation_steps=3)

In [51]:
%%time

clear_session()
hidden_size1 = 32
hidden_size2 = 150

model = Sequential()
model.add(LSTM(hidden_size1, return_sequences=True, input_shape=(None, 128)))
# model.add(LSTM(hidden_size2, return_sequences=True))
# model.add(Dense(hidden_size2, activation="hard_sigmoid"))
model.add(Dense(1, activation='hard_sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adagrad', metrics=["accuracy"])
train_model(model)

Batch 5
Train on 2400 samples, validate on 600 samples
Epoch 1/3


ResourceExhaustedError: OOM when allocating tensor with shape[2400,1904,32] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node lstm_1/transpose_1}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32, _class=["loc:@training/Adagrad/gradients/lstm_1/transpose_1_grad/transpose"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](lstm_1/TensorArrayStack/TensorArrayGatherV3, training/Adagrad/gradients/lstm_1/transpose_1_grad/InvertPermutation)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
