# Introdução

Já pensou em ler um livro escrito por três do maiores nomes da literatura mundial? Nesse projeto tentou-se mimificar o estilo de escrita de três grandes escritores: 

- Arthur Conan Doyle - Romance Policial
- Agatha Christie - Romance Policial

# Descrição do dataset

O dataset é constituto de três livros:  

- As Aventuras de Sherlock Holmes (Arthur Conan Doyle)
- O adversário secreto (Agatha Christie)

Todas as obras utilizadas estão em dominio público e foram extraídos do Project Gutenberg, disponível em: https://www.gutenberg.org/. 

# Importando as bibliotecas necessárias 

In [2]:
import re
import nltk
import random
import spacy as sp
import numpy as np
import en_core_web_sm
from tensorflow import keras
from pickle import dump, load
from nltk.corpus import stopwords
from keras.models import load_model
from keras.models import Sequential 
from tensorflow.keras import callbacks
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences

# Carregamento de dados

In [3]:
#Função de leitura de arquivo de texto
def read_file(filepath):
    with open (filepath) as f:
        str_text = f.read()
        return str_text

In [4]:
Sherlock = read_file('/Data/Sherlock Holmes.txt')
Sherlock



In [5]:
Sherlock = Sherlock.replace('http://collegebookshelf.net', '')
Sherlock



In [6]:
Secret = read_file('/Data/The Secret Adversary.txt')
Secret



In [7]:
livros = Sherlock + Secret
livros



# Tramento dos dados e torkenização

In [8]:
nlp = sp.load('en_core_web_sm', disable=['parser', 'tagger', 'ner','lemmatizer'])


In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
stopword = stopwords.words('english')
stopword

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [11]:
# Caracteres especiais a serem deletados. Sendo unidades de medida e símbolos especiais que não agregam valor a análise
filtro = stopword + ['kg', 'm', 'cm', 'mm', 'mg', 'ml', 'm²', '-', '+', '=', 'mcg', 'nº', 'g', 'ª', 'º','\n', '\n\n', '\n\n\n', '\n \n']

# Função de tratamento
def processamento_limpeza(sentence):
    sentence = sentence.lower() # Caixa baixa
    sentence = re.sub(r"[^\w\s]", "",sentence) # Exclusão de caracteres especiais
    sentence = re.sub("[0-9]+","", sentence) # Exclusão de dígitos
    sentence = re.sub("/+","", sentence) # Exclusão de "/"
    sentence = re.sub(' +', ' ', sentence) # Exclusão de espaçamentos repetitivos entre as frases

    doc=nlp(sentence) # Tokenização

    tokens=[token.text for token in doc]
    tokens=[token for token in tokens if token not in filtro] # Filtragem adicional com os caracteres em filter_out

    return tokens

In [12]:
type(stopword)

list

In [13]:
nlp.max_length = 21986233

In [14]:
tokens = processamento_limpeza(livros)
tokens

['adventure',
 'scandal',
 'bohemia',
 'sherlock',
 'holmes',
 'always',
 'woman',
 'seldom',
 'heard',
 'mention',
 'name',
 'eyes',
 'eclipses',
 'predominates',
 'whole',
 'sex',
 'felt',
 'emotion',
 'akin',
 'love',
 'irene',
 'adler',
 'emotions',
 'one',
 'particularly',
 'abhorrent',
 'cold',
 'precise',
 'admirably',
 'balanced',
 'mind',
 'take',
 'perfect',
 'reasoning',
 'observing',
 'machine',
 'world',
 'seen',
 'lover',
 'would',
 'placed',
 'false',
 'position',
 'never',
 'spoke',
 'softer',
 'passions',
 'save',
 'gibe',
 'sneerthey',
 'admirable',
 'things',
 'observer',
 'excellent',
 'drawing',
 'veil',
 'mens',
 'motives',
 'actions',
 'trained',
 'reasoner',
 'admit',
 'intrusions',
 'delicate',
 'finely',
 'adjusted',
 'temperament',
 'introduce',
 'distracting',
 'factor',
 'might',
 'throw',
 'doubt',
 'upon',
 'mental',
 'results',
 'grit',
 'sensitive',
 'instrument',
 'crack',
 'one',
 'highpower',
 'lenses',
 'would',
 'disturbing',
 'strong',
 'emotion',

In [15]:
#verificando texto após limpeza
print(' '.join(tokens))

  good al ways interested mr sherlock holmes cases ungrateful seeing gained one answered go must pack half hour experience camp life afghanistan least effect making prompt ready traveller wants simple less time stated cab valise rattling away paddington station sherlock holmes pacing platform tall gaunt figure made even gaunter taller long grey travellingcloak closefitting cloth cap really good come watson said makes considerable difference someone thoroughly rely local aid always either worthless else biassed keep two corner seats shall get tickets carriage save immense litter papers holmes brought among rummaged read intervals notetaking meditation past reading suddenly rolled gigantic ball tossed onto rack heard anything case asked word seen paper days london press full accounts looking recent papers order master particulars seems gather one simple cases extremely difficult sounds little paradoxical profoundly true singula rity almost invariably clue featureless commonplace crime di

In [16]:
len(tokens)

84980

In [17]:
#Predição da proxima palavra
train_len = 25+1 

text_sequences = []

for i in range(train_len, len(tokens)):
    
   
    seq = tokens[i-train_len:i]
    
    
    text_sequences.append(seq)

In [18]:
text_sequences[80]

['one',
 'highpower',
 'lenses',
 'would',
 'disturbing',
 'strong',
 'emotion',
 'nature',
 'yet',
 'one',
 'woman',
 'woman',
 'late',
 'irene',
 'adler',
 'dubious',
 'questionable',
 'memory',
 'seen',
 'little',
 'holmes',
 'lately',
 'marriage',
 'drifted',
 'us',
 'away']

In [19]:
text_sequences[81]

['highpower',
 'lenses',
 'would',
 'disturbing',
 'strong',
 'emotion',
 'nature',
 'yet',
 'one',
 'woman',
 'woman',
 'late',
 'irene',
 'adler',
 'dubious',
 'questionable',
 'memory',
 'seen',
 'little',
 'holmes',
 'lately',
 'marriage',
 'drifted',
 'us',
 'away',
 'complete']

In [20]:
text_sequences[82]

['lenses',
 'would',
 'disturbing',
 'strong',
 'emotion',
 'nature',
 'yet',
 'one',
 'woman',
 'woman',
 'late',
 'irene',
 'adler',
 'dubious',
 'questionable',
 'memory',
 'seen',
 'little',
 'holmes',
 'lately',
 'marriage',
 'drifted',
 'us',
 'away',
 'complete',
 'happiness']

#Convertendo os tokens em um sistema númerico


In [21]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [22]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [23]:
sequences[0]

[616,
 1327,
 1444,
 133,
 11,
 105,
 127,
 1942,
 58,
 1227,
 79,
 51,
 12222,
 12221,
 176,
 12220,
 96,
 2204,
 4376,
 499,
 991,
 990,
 6156,
 3,
 1443,
 6159]

In [24]:
tokenizer.index_word

{1: 'said',
 2: 'nt',
 3: 'one',
 4: 'would',
 5: 'tuppence',
 6: 'tommy',
 7: 'mr',
 8: 'upon',
 9: 'could',
 10: 'man',
 11: 'holmes',
 12: 'know',
 13: 'little',
 14: 'well',
 15: 'see',
 16: 'think',
 17: 'sir',
 18: 'julius',
 19: 'must',
 20: 'us',
 21: 'time',
 22: 'two',
 23: 'come',
 24: 'back',
 25: 'door',
 26: 'came',
 27: 'like',
 28: 'shall',
 29: 'may',
 30: 'yes',
 31: 'right',
 32: 'might',
 33: 'way',
 34: 'go',
 35: 'say',
 36: 'good',
 37: 'face',
 38: 'get',
 39: 'young',
 40: 'room',
 41: 'hand',
 42: 'got',
 43: 'house',
 44: 'nothing',
 45: 'much',
 46: 'away',
 47: 'never',
 48: 'mrs',
 49: 'tell',
 50: 'head',
 51: 'eyes',
 52: 'thought',
 53: 'quite',
 54: 'james',
 55: 'miss',
 56: 'last',
 57: 'made',
 58: 'heard',
 59: 'oh',
 60: 'something',
 61: 'girl',
 62: 'round',
 63: 'long',
 64: 'went',
 65: 'asked',
 66: 'looked',
 67: 'first',
 68: 'matter',
 69: 'took',
 70: 'seemed',
 71: 'take',
 72: 'look',
 73: 'morning',
 74: 'found',
 75: 'case',
 76: 'thi

In [25]:
for i in sequences[0]:
    print(f'{i} : {tokenizer.index_word[i]}')

616 : adventure
1327 : scandal
1444 : bohemia
133 : sherlock
11 : holmes
105 : always
127 : woman
1942 : seldom
58 : heard
1227 : mention
79 : name
51 : eyes
12222 : eclipses
12221 : predominates
176 : whole
12220 : sex
96 : felt
2204 : emotion
4376 : akin
499 : love
991 : irene
990 : adler
6156 : emotions
3 : one
1443 : particularly
6159 : abhorrent


In [26]:
tokenizer.word_counts

OrderedDict([('adventure', 625),
             ('scandal', 288),
             ('bohemia', 263),
             ('sherlock', 2474),
             ('holmes', 11419),
             ('always', 2866),
             ('woman', 2581),
             ('seldom', 190),
             ('heard', 4429),
             ('mention', 322),
             ('name', 3547),
             ('eyes', 4562),
             ('eclipses', 13),
             ('predominates', 14),
             ('whole', 1809),
             ('sex', 16),
             ('felt', 3033),
             ('emotion', 174),
             ('akin', 71),
             ('love', 774),
             ('irene', 411),
             ('adler', 412),
             ('emotions', 49),
             ('one', 16430),
             ('particularly', 285),
             ('abhorrent', 26),
             ('cold', 962),
             ('precise', 130),
             ('admirably', 130),
             ('balanced', 26),
             ('mind', 3224),
             ('take', 3848),
             ('perfect', 2

In [27]:
#Contando palavras únicas
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

12226

In [28]:
seq_matrix = np.array(sequences)
seq_matrix

array([[  616,  1327,  1444, ...,     3,  1443,  6159],
       [ 1327,  1444,   133, ...,  1443,  6159,   373],
       [ 1444,   133,    11, ...,  6159,   373,  2475],
       ...,
       [ 2452,  2263,     5, ..., 12224, 12225,   543],
       [ 2263,     5,   478, ..., 12225,   543,  4378],
       [    5,   478,   293, ...,   543,  4378, 12226]])

In [29]:
x_features = seq_matrix[:,:-1]
x_features

array([[  616,  1327,  1444, ...,  6156,     3,  1443],
       [ 1327,  1444,   133, ...,     3,  1443,  6159],
       [ 1444,   133,    11, ...,  1443,  6159,   373],
       ...,
       [ 2452,  2263,     5, ..., 12223, 12224, 12225],
       [ 2263,     5,   478, ..., 12224, 12225,   543],
       [    5,   478,   293, ..., 12225,   543,  4378]])

In [30]:
y_labels = seq_matrix[:,-1]
y_labels

array([ 6159,   373,  2475, ...,   543,  4378, 12226])

In [31]:
y_labels = to_categorical(y_labels,num_classes=vocabulary_size+1)
y_labels

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [32]:
x_features.shape

(84954, 25)

In [33]:
seq_len = x_features[1]
seq_len

array([ 1327,  1444,   133,    11,   105,   127,  1942,    58,  1227,
          79,    51, 12222, 12221,   176, 12220,    96,  2204,  4376,
         499,   991,   990,  6156,     3,  1443,  6159])

In [34]:
seq_len.size

25

In [35]:
def create_model(vocabulary_size, seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len.size, input_length = seq_len.size))
    model.add(LSTM(150,return_sequences=True))
    model.add(LSTM(150,return_sequences=True))
    model.add(LSTM(150))
    model.add(Dense(150,activation='relu'))
    model.add(Dense(150,activation='relu'))
    
    opt = keras.optimizers.SGD(learning_rate=0.2, momentum=0.001, nesterov=True, name="SGD")
    
    model.add(Dense(vocabulary_size,activation = 'softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    
    model.summary()
    
    return model

In [36]:
model = create_model(vocabulary_size+1, seq_len);

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            305675    
                                                                 
 lstm (LSTM)                 (None, 25, 150)           105600    
                                                                 
 lstm_1 (LSTM)               (None, 25, 150)           180600    
                                                                 
 lstm_2 (LSTM)               (None, 150)               180600    
                                                                 
 dense (Dense)               (None, 150)               22650     
                                                                 
 dense_1 (Dense)             (None, 150)               22650     
                                                                 
 dense_2 (Dense)             (None, 12227)             1

In [37]:
model.fit(x_features,y_labels,batch_size=128,epochs=300,verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7f9cc81a11d0>

In [38]:
model.save('model.h5')

In [39]:
dump(tokenizer,open('capitulo_tokenizer','wb'))

In [40]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len.size, truncating='pre')
        
        pred_word_ind = model.predict(pad_encoded, verbose=0)[0]
        classes=np.argmax(pred_word_ind)
        
        pred_word = tokenizer.index_word[classes] 
        
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    return ' '.join(output_text) 

In [41]:
# gerando o texto seed
random.seed(42)
random_pick = random.randint(0,len(text_sequences))
random_seed_text = text_sequences[random_pick]
random_seed_text

['us',
 'moment',
 'girls',
 'sight',
 'told',
 'julius',
 'drive',
 'like',
 'hell',
 'london',
 'went',
 'along',
 'told',
 'whole',
 'story',
 'got',
 'soho',
 'house',
 'plenty',
 'time',
 'met',
 'mr',
 'carter',
 'outside',
 'arranging',
 'things']

In [42]:
seed_text = ' '.join(random_seed_text)
seed_text

'us moment girls sight told julius drive like hell london went along told whole story got soho house plenty time met mr carter outside arranging things'

In [43]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)

'went hid behind curtain recess policemen dared dared mere wrong looked hope cried lets nothing mr hersheimmer mixed cheer rita though nt another fellow would think sunday made two girl chance tuppence cheer tuppence looked harm seemed steps sitting leave take success christian coat sure better coat mrs sorry london'

# Treinando o modelo mantendo as stopwords


In [44]:
# Caracteres especiais a serem deletados. Sendo unidades de medida e símbolos especiais que não agregam valor a análise
filtro = ['kg', 'm', 'cm', 'mm', 'mg', 'ml', 'm²', '-', '+', '=', 'mcg', 'nº', 'g', 'ª', 'º','\n', '\n\n', '\n\n\n', '\n \n']

# Função de tratamento
def processamento_limpeza2(sentence):
    sentence = sentence.lower() # Caixa baixa
    sentence = re.sub(r"[^\w\s]", "",sentence) # Exclusão de caracteres especiais
    sentence = re.sub("[0-9]+","", sentence) # Exclusão de dígitos
    sentence = re.sub("/+","", sentence) # Exclusão de "/"
    sentence = re.sub(' +', ' ', sentence) # Exclusão de espaçamentos repetitivos entre as frases

    doc=nlp(sentence) # Tokenização

    tokens=[token.text for token in doc]
    tokens=[token for token in tokens if token not in filtro] # Filtragem adicional com os caracteres em filter_out

    return tokens