In [24]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

import math
import nltk
from nltk.lm.models import Lidstone
from nltk.lm.models import Laplace
from nltk.corpus import machado
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
import re

from nltk.corpus import machado
import nltk

# Funções

In [46]:
def remover_stopwords(texto):
    # Carregue a lista de stopwords da língua desejada (exemplo: português)
    stopwords_lista = nltk.corpus.stopwords.words('portuguese')

    # Tokenize o texto em palavras
    palavras = word_tokenize(texto)

    # Crie uma lista de palavras sem as stopwords
    palavras_sem_stopwords = [palavra for palavra in palavras if palavra.lower() not in stopwords_lista]

    # Recrie o texto sem as stopwords
    texto_sem_stopwords = ' '.join(palavras_sem_stopwords)

    return texto_sem_stopwords

def generate_ngrams(text, n):
    words = text.split()
    bigrams = []
    for i in range(len(words) - n + 1):
        bigram =  " ".join(words[i:i+n])
        bigrams.append(tuple(bigram.split()))

    return bigrams

def remocao_caracteres_especiais(text):
    padrao = r'[^\w]+'
    return re.sub(padrao, ' ', text)

# Função para gerar texto
def generate_text(seed_text, next_words, model, max_sequence_length):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted)
        predicted_word = tokenizer.index_word[predicted_word_index]
        seed_text += " " + predicted_word
    return seed_text

def calculate_perplexity(model, test_input, tokenizer, max_sequence_length):
    token_list = tokenizer.texts_to_sequences([test_input])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted)
    predicted_word_prob = predicted[0][predicted_word_index]
    perplexity = 1 / predicted_word_prob
    return perplexity

def criar_modelo_n_gram(text,n, model=None, gamma=0.1):
    tokenized_text = [list(map(str.lower, word_tokenize(sent)))
                  for sent in sent_tokenize(text)]
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    if model == 'li':
        model_machado = Lidstone(gamma,order=n )
    elif model == 'la':
        model_machado = Laplace(order=n)
    else:
        model_machado = MLE(n)

    model_machado.fit(train_data, padded_sents)
    return model_machado

def trans_corpus_text(corpus):
    text = ''
    for i in corpus:
        text += machado.raw(i) +' '

    #Pre-processamento
    #text = remover_stopwords(text)
    text = remocao_caracteres_especiais(text)

    return text

def model_rnn():
  model = tf.keras.Sequential()
  model.add(tf.keras.layers.Embedding(total_words, 64, input_length=max_sequence_length - 1))
  model.add(tf.keras.layers.SimpleRNN(128))
  model.add(tf.keras.layers.Dense(total_words, activation='softmax'))

  model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

  return model


In [26]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
nltk.download('machado')

[nltk_data] Downloading package machado to /root/nltk_data...


True

# Corpus

In [5]:
corpus = machado.fileids()

text = ''
cont = 0
total_contos = 1
for i in corpus:
    text += machado.raw(i) +' '
    if cont == total_contos:
        break
    cont+=1

# N Grams

In [64]:
model_n_gram = criar_modelo_n_gram(text,4)

Testando a geração de frases

In [67]:
en = 'O amor que vem'
t = f'{en} '
for i in model_n_gram.generate(20, text_seed=en.split()):
    t += i + ' '
print(t)

O amor que vem cá , adelaide , que era um espírito fraco , cederia ao último que lhe falasse , e os olhos 


Perplexidade infinita

In [71]:
model_n_gram.perplexity('O amor que vem')

inf

# Modelo RNN

In [6]:


# Pré-processamento de texto
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# Criar sequências de palavras
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Padronizar sequências
max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='pre')

# Dividir em dados de entrada e saída
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# Model RNN
model = model_rnn()

# Treinamento
model.fit(X, y, epochs=100, verbose=1)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Testando a geração de frases

In [28]:
generated_text = generate_text("O amor que vem", 20, model, max_sequence_length)
print(generated_text)

O amor que vem com este sujeito de quem seria a viúva não sei mas de não é nada menos que ele dizia casar


Perplexidade

In [29]:
test_input = "O amor que vem"
perplexity = calculate_perplexity(model, test_input, tokenizer, max_sequence_length)
print(f"Perplexidade: {perplexity:.2f}")

Perplexidade: 1.28
