In [1]:
import io 
import numpy as np 
import random

import tensorflow as tf
from tensorflow import keras
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [2]:
# Load the training corpus in one read and lower-case it.
print('Opening file...')
path = "divina_commedia.txt"
with io.open(path, encoding='utf-8') as file:
    text = file.read()
    text = text.lower()

print("text length", len(text))
print()
print('\n\n\n\n\n', '***** first 1000 characters *****', '\n\n\n\n\n')
# Trailing expression: the notebook displays the repr of the opening slice.
text[:1000]

Opening file...
text length 558240






 ***** first 1000 characters ***** 







"inferno\n\n\n\ninferno: canto i\n\n\nnel mezzo del cammin di nostra vita\n  mi ritrovai per una selva oscura\n  che' la diritta via era smarrita.\n\nahi quanto a dir qual era e` cosa dura\n  esta selva selvaggia e aspra e forte\n  che nel pensier rinova la paura!\n\ntant'e` amara che poco e` piu` morte;\n  ma per trattar del ben ch'i' vi trovai,\n  diro` de l'altre cose ch'i' v'ho scorte.\n\nio non so ben ridir com'i' v'intrai,\n  tant'era pien di sonno a quel punto\n  che la verace via abbandonai.\n\nma poi ch'i' fui al pie` d'un colle giunto,\n  la` dove terminava quella valle\n  che m'avea di paura il cor compunto,\n\nguardai in alto, e vidi le sue spalle\n  vestite gia` de' raggi del pianeta\n  che mena dritto altrui per ogne calle.\n\nallor fu la paura un poco queta\n  che nel lago del cor m'era durata\n  la notte ch'i' passai con tanta pieta.\n\ne come quei che con lena affannata\n  uscito fuor del pelago a la riva\n  si volge a l'acqua perigliosa e guata,\n\ncosi` l'animo mio, 

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print('total chars: ', len(chars))

# Two-way lookup between a character and its position in `chars`.
char_indices = {char: index for index, char in enumerate(chars)}
indices_char = dict(enumerate(chars))

print(char_indices)
print(indices_char)

total chars:  40
{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, ':': 10, ';': 11, '<': 12, '>': 13, '?': 14, '`': 15, 'a': 16, 'b': 17, 'c': 18, 'd': 19, 'e': 20, 'f': 21, 'g': 22, 'h': 23, 'i': 24, 'j': 25, 'l': 26, 'm': 27, 'n': 28, 'o': 29, 'p': 30, 'q': 31, 'r': 32, 's': 33, 't': 34, 'u': 35, 'v': 36, 'x': 37, 'y': 38, 'z': 39}
{0: '\n', 1: ' ', 2: '!', 3: '"', 4: "'", 5: '(', 6: ')', 7: ',', 8: '-', 9: '.', 10: ':', 11: ';', 12: '<', 13: '>', 14: '?', 15: '`', 16: 'a', 17: 'b', 18: 'c', 19: 'd', 20: 'e', 21: 'f', 22: 'g', 23: 'h', 24: 'i', 25: 'j', 26: 'l', 27: 'm', 28: 'n', 29: 'o', 30: 'p', 31: 'q', 32: 'r', 33: 's', 34: 't', 35: 'u', 36: 'v', 37: 'x', 38: 'y', 39: 'z'}


In [None]:
maxlen = 30  # number of characters in each input window
step = 3     # distance between the starts of two consecutive windows

# Slide a `maxlen`-character window over the corpus; every window is paired
# with the single character that immediately follows it in the text.
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]

print('number of sequences: ', len(sentences))
# Show one (window, following character) pair as a sanity check.
print(sentences[11])
print(next_chars[11])

number of sequences:  186070
 mezzo del cammin di nostra vi
t


In [5]:
# First point of the assignment: hold out 20% of the (window, next character)
# pairs as a validation set. The fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

(sentences_train, sentences_val,
 next_chars_train, next_chars_val) = train_test_split(
    sentences,
    next_chars,
    test_size=0.2,
    random_state=42,
)

print(f"Training set: {len(sentences_train)}")
print(f"Validation set: {len(sentences_val)}")


Training set: 148856
Validation set: 37214


In [7]:
# Load the second text (used as external data) the same way as the training
# corpus: one read, then lower-cased.
with io.open("vita_nova.txt", encoding='utf-8') as file:
    external_text = file.read()
    external_text = external_text.lower()

print("text 'vita nova' length", len(external_text))
print()
print('\n\n\n\n\n', '***** first 1000 characters *****', '\n\n\n\n\n')
# Trailing expression: the notebook displays the repr of the opening slice.
external_text[:1000]

text 'vita nova' length 104284






 ***** first 1000 characters ***** 







'i\nin quella parte del libro de la mia memoria, dinanzi a la quale poco si potrebbe leggere, si trova una rubrica la quale dice: incipit vita nova. sotto la quale rubrica io trovo scritte le parole le quali è mio intendimento d’asemplare in questo libello; e se non tutte, almeno la loro sentenzia.\n\nii\n[i] nove fiate già appresso lo mio nascimento era tornato lo cielo de la luce quasi a uno medesimo punto, quanto a la sua propria girazione, quando a li miei occhi apparve prima la gloriosa donna de la mia mente, la quale fu chiamata da molti beatrice, li quali non sapeano che si chiamare. ella era in questa vita già stata tanto, che ne lo suo tempo lo cielo stellato era mosso verso la parte d’oriente de le dodici parti l’una d’un grado, sì che quasi dal principio del suo anno nono apparve a me, ed io la vidi quasi da la fine del mio nono. apparve vestita di nobilissimo colore, umile ed onesto, sanguigno, cinta e ornata a la guisa che a la sua giovanissima etade si convenia. in quello

In [12]:
# Distinct characters of the external text, in sorted order. This is a separate
# vocabulary from `chars`: it is derived only from `external_text`.
external_chars = sorted(set(external_text))
print('total external chars: ', len(external_chars))

# Two-way lookup between a character and its position in `external_chars`.
external_char_indices = {char: index for index, char in enumerate(external_chars)}
external_indices_char = dict(enumerate(external_chars))

print(external_char_indices)
print(external_indices_char)

total external chars:  49
{'\n': 0, ' ': 1, '!': 2, "'": 3, '(': 4, ')': 5, ',': 6, '-': 7, '.': 8, ':': 9, ';': 10, '?': 11, '[': 12, ']': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'j': 23, 'l': 24, 'm': 25, 'n': 26, 'o': 27, 'p': 28, 'q': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'v': 34, 'x': 35, 'z': 36, '«': 37, '»': 38, 'à': 39, 'â': 40, 'è': 41, 'é': 42, 'ê': 43, 'ì': 44, 'ò': 45, 'ù': 46, '‘': 47, '’': 48}
{0: '\n', 1: ' ', 2: '!', 3: "'", 4: '(', 5: ')', 6: ',', 7: '-', 8: '.', 9: ':', 10: ';', 11: '?', 12: '[', 13: ']', 14: 'a', 15: 'b', 16: 'c', 17: 'd', 18: 'e', 19: 'f', 20: 'g', 21: 'h', 22: 'i', 23: 'j', 24: 'l', 25: 'm', 26: 'n', 27: 'o', 28: 'p', 29: 'q', 30: 'r', 31: 's', 32: 't', 33: 'u', 34: 'v', 35: 'x', 36: 'z', 37: '«', 38: '»', 39: 'à', 40: 'â', 41: 'è', 42: 'é', 43: 'ê', 44: 'ì', 45: 'ò', 46: 'ù', 47: '‘', 48: '’'}


In [10]:
# Cut the external text into windows using the same `maxlen` and `step` as the
# training corpus; each window is paired with the character that follows it.
external_window_starts = range(0, len(external_text) - maxlen, step)
external_sentences = [
    external_text[start: start + maxlen] for start in external_window_starts
]
external_next_chars = [
    external_text[start + maxlen] for start in external_window_starts
]

print('number of sequences: ', len(external_sentences))
# Show one (window, following character) pair as a sanity check.
print(external_sentences[11])
print(external_next_chars[11])

number of sequences:  34752
 mia memoria, dinanzi a la qua
l


In [13]:
# Encode the text windows in a one-hot representation.
print('generating input and output..')

def one_hot_encoding(sentences, next_chars, maxlen, chars, char_indices):
    """One-hot encode text windows and the character that follows each one.

    Parameters
    ----------
    sentences : sequence of str
        Input windows; a window shorter than ``maxlen`` leaves its trailing
        time steps all-False.
    next_chars : sequence of str
        ``next_chars[i]`` is the target character for ``sentences[i]``.
    maxlen : int
        Number of time steps allocated per window.
    chars : sequence
        Vocabulary; only its length is used (width of the one-hot axis).
    char_indices : mapping
        Maps a character to its column on the one-hot axis.

    Returns
    -------
    (x, y) : tuple of numpy.ndarray of bool
        ``x`` has shape ``(len(sentences), maxlen, len(chars))`` and ``y`` has
        shape ``(len(sentences), len(chars))``.

    Raises
    ------
    KeyError
        If a character is missing from ``char_indices``.
    """
    sample_count = len(sentences)
    vocab_size = len(chars)
    x = np.zeros((sample_count, maxlen, vocab_size), dtype=bool)
    y = np.zeros((sample_count, vocab_size), dtype=bool)

    for row, sentence in enumerate(sentences):
        # One True per time step, in the column of that step's character.
        for position, char in enumerate(sentence):
            x[row, position, char_indices[char]] = True
        target_column = char_indices[next_chars[row]]
        y[row, target_column] = True

    return x, y

# Every split is encoded with the Divina Commedia vocabulary (`chars` /
# `char_indices`): the network's input and output width is len(chars), so all
# tensors fed to it must share that width and that column order.
x_train, y_train = one_hot_encoding(sentences_train, next_chars_train, maxlen, chars, char_indices)
x_val, y_val = one_hot_encoding(sentences_val, next_chars_val, maxlen, chars, char_indices)

# The external text was previously encoded with its own vocabulary
# (`external_chars`), which has a different size and a different
# character-to-column mapping, so the model could not consume the result.
# It also contains characters that never occur in the training text
# (e.g. 'è', '’', '['), which `char_indices` cannot represent. Only windows
# whose characters and target are all in the training vocabulary are kept.
# NOTE(review): normalising the external text's orthography to the training
# text's conventions (e.g. 'è' -> 'e`', '’' -> "'") before windowing would
# retain more test windows; confirm whether that is wanted.
in_vocab_pairs = [
    (sentence, next_char)
    for sentence, next_char in zip(external_sentences, external_next_chars)
    if next_char in char_indices and all(char in char_indices for char in sentence)
]
test_sentences = [sentence for sentence, _ in in_vocab_pairs]
test_next_chars = [next_char for _, next_char in in_vocab_pairs]
print(f"external sequences usable with the training vocabulary: "
      f"{len(test_sentences)} / {len(external_sentences)}")

x_test, y_test = one_hot_encoding(test_sentences, test_next_chars, maxlen, chars, char_indices)

generating input and output..


In [None]:
# Character-level language model: one LSTM layer reads a window of `maxlen`
# one-hot characters, and a softmax layer outputs one probability per
# vocabulary character.
model = Sequential([
    LSTM(128, input_shape=(maxlen, len(chars))),
    Dense(len(chars), activation='softmax'),
])

optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
)
model.summary()

In [None]:
import sys

def testAfterEpoch(epoch, _):
    """Print a 400-character sample generated by the model after an epoch.

    A random window of ``maxlen`` characters from ``text`` seeds the
    generation. At every step the window is one-hot encoded, the model
    predicts the next character, the most probable one (argmax) is written to
    stdout, and the window slides forward by one character.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index; it is printed as ``epoch + 1``.
    _ :
        Second callback argument; unused.
    """
    print()
    print()
    print('***** Epoch: %d *****' % (epoch+1))

    # Choose where the seed window starts so that it fits inside the text.
    seed_start = random.randint(0, len(text)- maxlen - 1)
    window = text[seed_start : seed_start + maxlen]

    print('***** starting sentence *****') 
    print(window)
    print('*****************************')
    # Echo the seed so the generated characters continue on the same line.
    sys.stdout.write(window)

    vocab_size = len(chars)
    for _step in range(400):
        # One-hot encode the current window as a batch of one sample.
        encoded = np.zeros((1, maxlen, vocab_size))
        for position, char in enumerate(window):
            encoded[0, position, char_indices[char]] = 1

        probabilities = model.predict(encoded, verbose=0)[0]
        predicted_char = indices_char[np.argmax(probabilities)]

        # Drop the oldest character and append the prediction.
        window = window[1:] + predicted_char

        sys.stdout.write(predicted_char)
        sys.stdout.flush()
    print()


In [None]:
# Wrap testAfterEpoch in a Keras callback so it is invoked at the end of every epoch.
print_callback = LambdaCallback(on_epoch_end=testAfterEpoch)

In [None]:
# Train on the training split. `x` / `y` are never defined in this notebook;
# the encoded tensors are named x_train / y_train. The held-out split is passed
# as validation data so its loss is reported after each epoch, and
# `print_callback` prints a generated sample at the end of every epoch.
model.fit(x_train, y_train,
          validation_data = (x_val, y_val),
          batch_size = 2048, 
          epochs = 20, 
          callbacks = [print_callback])