# Goal:

We want to build a character-based recurrent autoencoder (using attention and lstm recurrent units)

In [3]:
import torch
import torch.nn as nn
import pandas as pd
import unicodedata
import string

## Load dataset

In [4]:
df = pd.read_csv('../dataset/movies_complete.csv')
df.text = df.text.str.replace('\w+', '')

In [5]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [6]:
texts = df.text.apply(unicodeToAscii).to_list()

## Preprocessing (Pytorch)

In [None]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def textToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [None]:
#texts_encoded = [textToTensor(text) for text in texts[:10]]

# Preprocessing (Keras)

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer

tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)

char_dict = {char: index + 1 for index, char in enumerate(all_letters)}
char_dict[tk.oov_token] = max(char_dict.values()) + 1
tk.word_index = char_dict

In [8]:
sequences = tk.texts_to_sequences(texts)

In [64]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 100

sequences = pad_sequences(sequences=sequences, maxlen=maxlen, padding='post')

## Define model

1. Attempt in keras
2. If that works port to pytorch

In [61]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Nadam
import numpy as np

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # use id from $ nvidia-smi

In [63]:
encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
emb_layer = Embedding(n_letters, 150,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)

x = emb_layer(encoder_inputs)
state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-Last-LSTM'))(x)
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
seq2seq_encoder_out = encoder_model(encoder_inputs)

decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
decoder_lstm = Bidirectional(LSTM(128, return_sequences=True, name='Decoder-LSTM-before'))
decoder_lstm_output = decoder_lstm(decoded)
decoder_dense = Dense(n_letters, activation='softmax', name='Final-Output-Dense-before')
decoder_outputs = decoder_dense(decoder_lstm_output)

seq2seq_Model = Model(encoder_inputs, decoder_outputs)
seq2seq_Model.compile(Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
history = seq2seq_Model.fit(sequences, np.expand_dims(sequences, -1),
          batch_size=16,
          epochs=10)

Epoch 1/10


InvalidArgumentError:  indices[0,4549] = 57 is not in [0, 57)
	 [[node model_4/Encoder-Model/Body-Word-Embedding/embedding_lookup (defined at <ipython-input-63-3cfb9ca46319>:17) ]] [Op:__inference_train_function_40369]

Errors may have originated from an input operation.
Input Source operations connected to node model_4/Encoder-Model/Body-Word-Embedding/embedding_lookup:
 model_4/Encoder-Model/Body-Word-Embedding/embedding_lookup/37225 (defined at /Users/lennartkeller/opt/anaconda3/envs/clustering/lib/python3.8/contextlib.py:113)

Function call stack:
train_function
