# Goal:

We want to build a character-based recurrent autoencoder (using attention and lstm recurrent units)

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import unicodedata
import string

## Load dataset

In [2]:
df = pd.read_csv('../dataset/movies_complete.csv')
df.text = df.text.str.replace('\w+', '')

In [3]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [4]:
texts = df.text.apply(unicodeToAscii).to_list()

## Preprocessing (Pytorch)

In [5]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def textToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [6]:
#texts_encoded = [textToTensor(text) for text in texts[:10]]

# Preprocessing (Keras)

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)

char_dict = {char: index + 1 for index, char in enumerate(all_letters)}
char_dict[tk.oov_token] = max(char_dict.values()) + 1
tk.word_index = char_dict

In [19]:
sequences = tk.texts_to_sequences(texts)
X = tk.texts_to_matrix(texts)

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100

sequences = pad_sequences(sequences=sequences, maxlen=maxlen, padding='post')

## Define model

1. Attempt in keras
2. If that works port to pytorch

In [10]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Nadam
import numpy as np

#import os
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
#os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # use id from $ nvidia-smi

In [65]:
encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
emb_layer = Embedding(n_letters, 150,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)

x = emb_layer(encoder_inputs)
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-Last-LSTM'))(x)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
decoder_lstm = Bidirectional(LSTM(128, return_sequences=True, name='Decoder-LSTM-before'))
decoder_lstm_output = decoder_lstm(decoded)
decoder_dense = Dense(n_letters, activation='softmax', name='Final-Output-Dense-before')
decoder_outputs = decoder_dense(decoder_lstm_output)

seq2seq_Model = Model(encoder_inputs, decoder_outputs)
seq2seq_Model.compile(Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
history = seq2seq_Model.fit(sequences, sequences,
          batch_size=16,
          epochs=10)

Epoch 1/10


InvalidArgumentError:  indices[7,94] = 57 is not in [0, 57)
	 [[node model_5/Encoder-Model/Body-Word-Embedding/embedding_lookup (defined at <ipython-input-65-18db1c3624ac>:17) ]] [Op:__inference_train_function_48757]

Errors may have originated from an input operation.
Input Source operations connected to node model_5/Encoder-Model/Body-Word-Embedding/embedding_lookup:
 model_5/Encoder-Model/Body-Word-Embedding/embedding_lookup/45611 (defined at /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/contextlib.py:113)

Function call stack:
train_function


In [26]:
recurrent_encoder = Sequential([
LSTM(100, return_sequences=True, input_shape=[None, maxlen]),
LSTM(30)
])
recurrent_decoder = Sequential([
RepeatVector(maxlen, input_shape=[30]),
LSTM(100, return_sequences=True),
TimeDistributed(Dense(n_letters, activation="sigmoid"))
])
recurrent_ae = Sequential([recurrent_encoder, recurrent_decoder])
recurrent_ae.compile(loss='binary_crossentropy', optimizer='adam')
recurrent_ae.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_14 (Sequential)   (None, 30)                96120     
_________________________________________________________________
sequential_15 (Sequential)   (None, 100, 57)           58157     
Total params: 154,277
Trainable params: 154,277
Non-trainable params: 0
_________________________________________________________________


In [20]:
recurrent_ae.fit(sequences, X, epochs=1)

ValueError: in user code:

    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:531 train_step  **
        y_pred = self(x, training=True)
    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:885 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_9 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 59]


In [18]:
sequences.shape

(3728, 100)

In [21]:
X.shape

(3728, 59)

In [22]:
X[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 1., 0., 1., 0.])