In [None]:
import tensorflow as tf
import numpy as np
import os
import codecs

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding

# Carregando os dados

In [None]:
def load_data(data_dir, class_label):
    """Read every ``.txt`` file found under ``data_dir`` and label it.

    Args:
        data_dir: Root directory; it is walked recursively with ``os.walk``.
        class_label: Value recorded once for every file that is read.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists: the decoded
        contents of each file (in sorted path order) and ``class_label``
        repeated once per file. Both lists are empty when no ``.txt`` file
        is found.
    """
    # os.walk yields entries in a filesystem-dependent order; sorting the
    # paths makes the output (and any seeded split built on it) reproducible.
    file_names = sorted(
        os.path.join(dirpath, f)
        for dirpath, _dirnames, filenames in os.walk(data_dir)
        for f in filenames
        if f.endswith(".txt")
    )

    txt_ = []
    for path in file_names:
        # The context manager closes each handle right after reading.
        # newline="" disables newline translation, so the returned text is
        # the plain ISO-8859-1 decoding of the bytes on disk.
        with open(path, "r", encoding="iso8859-1", newline="") as handle:
            txt_.append(handle.read())

    y = [class_label] * len(txt_)

    return txt_, y

# Load both halves of the corpus: fake texts get label 0, true texts label 1.
fake_data, y_fake = load_data('../aula05/Fake.Br Corpus/full_texts/fake/', 0)
true_data, y_true = load_data('../aula05/Fake.Br Corpus/full_texts/true/', 1)

# Hold out 15% of the combined corpus for testing; the fixed random_state
# seeds the shuffle performed by train_test_split.
data_train, data_test, y_train, y_test = train_test_split(
    fake_data + true_data,
    y_fake + y_true,
    test_size=0.15,
    random_state=1447,
)

# Vetorização dos dados

In [None]:
# bibliotecas necessárias para o pré-processamento de dados
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

In [None]:
def padronizacao_customizada(input_data):
    """Standardize raw text: lowercase it, then strip punctuation.

    Used as the ``standardize`` callable of the ``TextVectorization`` layer.

    Args:
        input_data: Strings to standardize (presumably the string tensor the
            layer hands to its ``standardize`` callable).

    Returns:
        The lowercased strings with every character listed in
        ``string.punctuation`` removed.
    """
    # With its default encoding tf.strings.lower only lowercases ASCII
    # letters, so accented capitals (e.g. "É", "Ã") would stay uppercase and
    # become different tokens from their lowercase forms. Decoding as UTF-8
    # lowercases them as well.
    lowercase = tf.strings.lower(input_data, encoding="utf-8")
    return tf.strings.regex_replace(
        lowercase, "[%s]" % re.escape(string.punctuation), ""
    )

# Hyper-parameters shared by the vectorizer and the model built further down.
max_features = 20000
embedding_dim = 128
sequence_length = 500

# Text -> integer token ids, using the custom standardization defined above.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=padronizacao_customizada,
    output_sequence_length=sequence_length,
    output_mode="int",
)

# Fit the layer's vocabulary on the training texts only.
vectorize_layer.adapt(data_train)

# Vetorização do texto

In [None]:
# Turn the texts into sequences of integer token indices.
def vectorize_text(text, label):
    """Run ``text`` through ``vectorize_layer`` and pass ``label`` through.

    Args:
        text: Texts to vectorize; a trailing axis is appended before the
            layer is applied.
        label: Returned unchanged alongside the vectorized text.

    Returns:
        A ``(vectorized_text, label)`` tuple.
    """
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label


# Vectorize both splits; vectorize_text hands the labels back unchanged.
train_ds, y_train = vectorize_text(data_train, y_train)
test_ds, y_test = vectorize_text(data_test, y_test)

# Construindo o modelo

In [None]:
# Embedding -> dropout -> LSTM -> single sigmoid unit for binary output.
model = Sequential([
    Embedding(max_features, embedding_dim),
    Dropout(0.5),
    LSTM(embedding_dim),
    Dense(1, activation="sigmoid"),
])

# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line after the table.
model.summary()

In [None]:
# Adam with gradient-norm clipping at 1.0.
opt = tf.keras.optimizers.Adam(clipnorm=1.0)

# Binary cross-entropy pairs with the single sigmoid output; track accuracy.
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# More epochs are, in theory, better, but at some point the network stops
# learning...
epochs = 3

# Fit the model on the training split only (no validation data is passed).
model.fit(
    x=train_ds,
    y=np.array(y_train),
    batch_size=32,
    epochs=epochs,
)

In [None]:
# Score the trained model on the held-out test split; as the last expression
# of the cell, the returned loss/metric values are displayed.
model.evaluate(x=test_ds, y=np.array(y_test))