# Goal:

We want to build a character-based recurrent autoencoder (using attention and lstm recurrent units)

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import unicodedata
import string

## Load dataset

In [47]:
df = pd.read_csv('../dataset/movies_complete.csv')
df.text = df.text.str.replace('\s+', ' ')

In [98]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters) +1 # for unknown token

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [52]:
texts = df.text.apply(unicodeToAscii).to_list()

## Preprocessing (Pytorch)

In [53]:
import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def textToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [54]:
#texts_encoded = [textToTensor(text) for text in texts[:10]]

# Preprocessing (Keras)

In [55]:
from tensorflow.keras.preprocessing.text import Tokenizer

tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(texts)

char_dict = {char: index + 1 for index, char in enumerate(all_letters)}
char_dict[tk.oov_token] = max(char_dict.values()) + 1
tk.word_index = char_dict

In [75]:
sequences = tk.texts_to_sequences(texts)
X = tk.texts_to_matrix(texts)

In [76]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen = 20000

sequences = pad_sequences(sequences=sequences, maxlen=maxlen, padding='post')

In [78]:
(maxlen - np.count_nonzero(sequences, axis=1)).mean()

1080.9324034334763

## Define model

1. Attempt in keras
2. If that works port to pytorch

In [10]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Nadam
import numpy as np

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # use id from $ nvidia-smi

In [11]:
#encoder_inputs = Input(shape=(maxlen,), name='Encoder-Input')
#emb_layer = Embedding(n_letters, 150,input_length = maxlen, name='Body-Word-Embedding', mask_zero=False)
#
#x = emb_layer(encoder_inputs)
#state_h = Bidirectional(LSTM(128, activation='relu', name='Encoder-Last-LSTM'))(x)
#encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
#seq2seq_encoder_out = encoder_model(encoder_inputs)
#
#decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
#decoder_lstm = Bidirectional(LSTM(128, return_sequences=True, name='Decoder-LSTM-before'))
#decoder_lstm_output = decoder_lstm(decoded)
#decoder_dense = Dense(n_letters, activation='softmax', name='Final-Output-Dense-before')
#decoder_outputs = decoder_dense(decoder_lstm_output)
#
#seq2seq_Model = Model(encoder_inputs, decoder_outputs)
#seq2seq_Model.compile(Nadam(lr=0.001), loss='sparse_categorical_crossentropy')
#history = seq2seq_Model.fit(sequences, sequences,
#          batch_size=16,
#          epochs=10)

In [79]:
recurrent_encoder = Sequential([

Embedding(input_dim=n_letters, output_dim=256, input_length=maxlen),
LSTM(100, return_sequences=True),
LSTM(30)
])
recurrent_decoder = Sequential([
RepeatVector(maxlen, input_shape=[30]),
LSTM(100, return_sequences=True),
TimeDistributed(Dense(n_letters, activation="sigmoid"))
])

recurrent_ae = Sequential([recurrent_encoder, recurrent_decoder])
recurrent_ae.compile(loss='binary_crossentropy', optimizer='adam')
recurrent_ae.summary()

Model: "sequential_25"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_23 (Sequential)   (None, 30)                173112    
_________________________________________________________________
sequential_24 (Sequential)   (None, 20000, 57)         58157     
Total params: 231,269
Trainable params: 231,269
Non-trainable params: 0
_________________________________________________________________
