In [None]:
import pandas as pd
import csv
import os
import spacy
import numpy as np
from tensorflow.python.keras import preprocessing
from tensorflow.python.keras import layers
from tensorflow.python.keras import Sequential, callbacks, utils
from tensorflow.python.keras.activations import linear, tanh
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.losses import mse
from tensorflow.python.keras.optimizers import SGD, Adam
from tensorflow.python import keras
# Pad your sequences so they are the same length
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

In [None]:
# Relative path to the biographies CSV (read below with ';' as the field separator).
filename = "../data/biographie_df.csv"

In [None]:
# Load the biography table: the file is semicolon-delimited and UTF-8 encoded.
df_biographie = pd.read_csv(filename, sep=";", encoding="utf-8")
# Print a plain-text preview of the first five rows.
print(df_biographie.head(n=5))

In [None]:
# One-time setup: uncomment (drop the leading '#') to download the spaCy English models.
#!python -m spacy download en_core_web_sm
#!python -m spacy download en_core_web_lg

In [None]:
# Load the 'en_core_web_lg' spaCy pipeline; the model package must already be installed.
nlp_en = spacy.load('en_core_web_lg')

In [None]:
# Display the pipeline object as the cell output (sanity check that it loaded).
nlp_en

In [None]:
# Run every biography through the spaCy pipeline and collect, per document,
# the token texts, the lemmas and the coarse part-of-speech tags.
tokens = []
lemma = []
pos = []

# `n_threads` is deliberately not passed to `pipe`: it has been a deprecated
# no-op since spaCy 2.1 and is rejected as an unknown keyword by spaCy 3.
# `.astype(str)` is the equivalent spelling of the Python-2-era 'unicode' alias;
# note that it turns missing values into the literal string 'nan'.
for doc in nlp_en.pipe(df_biographie['biographie'].astype(str).values, batch_size=50):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

# One list (or None) per row, stored as new object columns on the frame.
df_biographie['species_tokens'] = tokens
df_biographie['species_lemma'] = lemma
df_biographie['species_pos'] = pos

In [None]:
# Show the lemma list of the row labelled 0 (single label-based lookup).
df_biographie.loc[0, 'species_lemma']

In [None]:
# Build the training text from the first biography's tokens, one token per line.
# Each element of the token list is a single token string, so the inner
# ''.join(...) simply re-assembles that token from its characters.
text = '\n'.join(''.join(token) for token in df_biographie['species_tokens'][0])
# Debug check: confirm the result is a plain str.
print(type(text))

In [None]:
# Build a tf.data.Dataset by slicing the DataFrame's underlying array along its first axis.
# NOTE(review): by this point the frame holds object columns (lists of tokens/lemmas/POS tags);
# TensorFlow may be unable to convert such an array to a tensor — confirm this cell runs.
dataset = tf.data.Dataset.from_tensor_slices((df_biographie.values))

In [None]:
# Display the dataset's repr as the cell output.
dataset

In [None]:
# Length of the longest entry in 'cumulative_input_vectors'; all rows are padded to it.
# NOTE(review): this column is not created in the cells shown here — presumably it
# already exists in the CSV; confirm before relying on this cell.
max_sequence_length = df_biographie.cumulative_input_vectors.apply(len).max()
# Pad every sequence to the common length and keep the result as nested Python lists.
padded_sequences = pad_sequences(
    df_biographie.cumulative_input_vectors.tolist(),
    maxlen=max_sequence_length,
).tolist()
# Store each padded row back on the frame as its own NumPy array.
df_biographie['padded_input_vectors'] = pd.Series(padded_sequences).apply(np.asarray)

In [None]:
import numpy as np
from tensorflow.python.keras import preprocessing
from tensorflow.python.keras import layers
from tensorflow.python.keras import Sequential, callbacks, utils
from tensorflow.python.keras.activations import linear, tanh
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.losses import mse
from tensorflow.python.keras.optimizers import SGD, Adam
from tensorflow.python import keras
class charLSTMmodel():
    """Character-level next-token language model built from stacked Keras LSTMs.

    Text is whitespace-normalised and split into single characters, with every
    run of whitespace represented by one '_' token. The model is trained to
    predict the next character from the characters preceding it inside
    fixed-size chunks of the training text.

    Attributes set by ``fit``:
        idx_token: dict mapping integer id -> token (0 is '<PAD>', 1 is '<UNK>').
        token_idx: inverse mapping, token -> integer id.
        x_dim:     length of the (left-padded) input sequences.
        y_dim:     vocabulary size, including the two special tokens.
        X, Y:      training inputs (2-D id array) and next-token targets (1-D id array).
        model:     the compiled Keras model.
    """

    def fit(self,text,epochs=100):
        """Prepare the training data from ``text``, build the network and train it.

        Args:
            text:   training corpus as a single string.
            epochs: maximum number of epochs (early stopping may end sooner).

        Raises:
            ValueError: if ``text`` is too short to form any training pair.
        """
        self._load(text)
        self._build()
        self._train(epochs)

    def _load(self, text):
        """Build the vocabulary and the padded (X, Y) training arrays from ``text``."""
        # Sort the vocabulary so token ids are reproducible across runs
        # (plain set iteration order changes with string hash randomisation).
        vocabulary = sorted(set(self._tokenise(text)))
        self.idx_token = {0: '<PAD>', 1: '<UNK>'}
        self.idx_token.update(enumerate(vocabulary, start=2))
        self.token_idx = {token: idx for idx, token in self.idx_token.items()}
        token_ids = [[self.token_idx[token] for token in self._tokenise(sentence)]
                     for sentence in self._chunk(text)]
        # Every prefix of length >= 2 of a chunk is one example:
        # all but its last id are the input, its last id is the target.
        inouts = [tokens[:i+1] for tokens in token_ids for i in range(1, len(tokens))]
        if not inouts:
            raise ValueError("text is too short: at least two tokens in one chunk are required")
        self.x_dim = max(len(x) for x in inouts) - 1
        self.y_dim = len(self.idx_token)
        # Left-pad with 0 ('<PAD>') so the target always sits in the last column.
        inouts = np.array(keras.preprocessing.sequence.pad_sequences(inouts, maxlen=self.x_dim + 1, padding='pre'))
        self.X, self.Y = inouts[:, :-1], inouts[:, -1]

    def _tokenise(self,text):
        """Return ``text`` as a list of characters, each whitespace run collapsed to one '_'."""
        return list(' '.join(text.split()).replace(" ","_"))

    def generate(self,words,i=150):
        """Greedily extend ``words`` by ``i`` predicted characters.

        Args:
            words: seed text; characters not seen in training map to '<UNK>'.
            i:     number of characters to append.

        Returns:
            The seed followed by the generated characters, with every '_'
            turned back into a space.
        """
        for _ in range(i):
            x = [self.token_idx.get(token, 1) for token in self._tokenise(words)]
            x = keras.preprocessing.sequence.pad_sequences([x], maxlen=self.x_dim, padding='pre')
            # Most probable class; `Sequential.predict_classes` no longer exists
            # in recent TensorFlow releases, argmax over `predict` is equivalent.
            y_hat = int(np.argmax(self.model.predict(x, verbose=0), axis=-1)[0])
            words += self.idx_token[y_hat]
        # Return only after all `i` characters have been appended (the return
        # used to sit inside the loop, which stopped after a single character).
        return words.replace("_"," ")

    def _chunk(self,text,chunk_size = 100):
        """Split ``text`` into consecutive pieces of ``chunk_size`` characters.

        When ``len(text)`` is a positive multiple of ``chunk_size`` the returned
        list ends with an empty string.
        """
        return ''.join([c + '<S>' if not i % chunk_size else c for i,c in enumerate(text,start=1)]).split('<S>')

    def _build(self):
        """Create and compile the network: embedding -> LSTM(150) -> LSTM(100) -> softmax."""
        self.model = keras.models.Sequential()
        self.model.add(keras.layers.Embedding(self.y_dim, 10, input_length=self.x_dim))
        # The first LSTM returns the full sequence so the second LSTM can consume it.
        self.model.add(keras.layers.LSTM(150, return_sequences = True))
        self.model.add(keras.layers.LSTM(100))
        self.model.add(keras.layers.Dense(self.y_dim, activation='softmax'))
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    def _train(self,epochs):
        """Fit the model on (X, one-hot Y), stopping early once the training loss stalls."""
        # Stop when the training loss has not improved for 5 consecutive epochs.
        earlystop =  keras.callbacks.EarlyStopping(monitor='loss', min_delta=0, patience=5, verbose=0, mode='auto')
        onehot_y = keras.utils.to_categorical(self.Y, num_classes=self.y_dim)
        self.model.fit(self.X, onehot_y, epochs=epochs, verbose=1, callbacks=[earlystop])

In [None]:
# Train the character-level model on `text` for at most 10 epochs
# (the model's early-stopping callback may end training sooner).
clstm = charLSTMmodel()
clstm.fit(text, epochs=10)

In [None]:
# Ask the trained model to continue a seed sentence (generate's `i` is left at its default of 150).
clstm.generate("my name is Manitra")