In [1]:
# Import needed libraries and classes

import os
import random
import numpy as np

from Code.utils.dataset import Dataset

import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras import models, initializers

from tensorflow.keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package punkt to C:\Users\Guilherme
[nltk_data]     Barbosa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Seed every random number generator used by the notebook

seed_value = 42

# Set TensorFlow's deterministic-ops environment flag
os.environ.update({'TF_DETERMINISTIC_OPS': '1'})

# Python's built-in generator is seeded with its own fixed value
random.seed(666)

# NumPy and TensorFlow share seed_value
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

In [3]:
# Load datasets

dataset = Dataset(
    '../Dataset/DatasetsGerados/dataset_training_input.csv',
    '../Dataset/DatasetsGerados/dataset_training_output.csv',
    '../Dataset/DatasetsGerados/dataset_validation_input.csv',
    '../Dataset/DatasetsGerados/dataset_validation_output.csv',
    '../Dataset/dataset3_inputs.csv',
    None,
)

(X_train, y_train,
 X_validation, y_validation,
 X_test, y_test,
 ids) = dataset.get_datasets_unprocessed('Text', 'Label', sep='\t', rem_punctuation=True)

max_words = 15000
max_len = 500

# Vocabulary is fitted on the training texts only and capped at the top max_words words
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train.iloc[:, 0])


def _texts_to_padded(frame):
    """Turn the first column of `frame` into integer sequences padded to max_len."""
    sequences = tokenizer.texts_to_sequences(frame.iloc[:, 0])
    return preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)


# Apply the same tokenize-and-pad step to every split
X_train, X_validation, X_test = [
    _texts_to_padded(split) for split in (X_train, X_validation, X_test)
]

In [4]:
# Build Model

dim_embed = 100
sequence_length = X_train.shape[1]
embedding_init = initializers.GlorotUniform(seed=44)

# Embedding -> Flatten -> two ReLU dense layers -> single sigmoid output unit
model = Sequential()
model.add(Input(shape=(sequence_length,)))
model.add(Embedding(input_dim=max_words,
                    output_dim=dim_embed,
                    embeddings_initializer=embedding_init))
model.add(Flatten())
model.add(Dense(units=8, activation='relu'))
model.add(Dense(units=16, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])

# Train while tracking metrics on the validation split after every epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_validation, y_validation),
)

Epoch 1/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 2s 18ms/step - acc: 0.4842 - loss: 0.6925 - val_acc: 0.5000 - val_loss: 0.6224
Epoch 2/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 2s 17ms/step - acc: 0.8275 - loss: 0.3898 - val_acc: 0.9620 - val_loss: 0.0913
Epoch 3/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 2s 17ms/step - acc: 0.9974 - loss: 0.0160 - val_acc: 0.9990 - val_loss: 0.0047
Epoch 4/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 2s 16ms/step - acc: 1.0000 - loss: 0.0020 - val_acc: 1.0000 - val_loss: 0.0020
Epoch 5/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 2s 16ms/step - acc: 1.0000 - loss: 3.8144e-04 - val_acc: 1.0000 - val_loss: 0.0013
Epoch 6/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 2s 16ms/step - acc: 1.0000 - loss: 1.1761e-04 - val_acc: 1.0000 - val_loss: 0.0010
Epoch 7/20
94/94 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step -

In [5]:
# Predict test dataset

out = model.predict(X_test)

# Evaluate only when labels for the test split are available
if y_test is not None:
    print(model.evaluate(X_test, y_test))

# Store results

results_filepath = './submissao3-grupo007-s2.csv'

# Ensure the directory exists. os.path.dirname returns '' for a bare
# filename, and os.makedirs('') raises FileNotFoundError, so only create
# the directory when the path actually has a directory component.
results_dir = os.path.dirname(results_filepath)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Combine the ids with the model outputs and write them as a tab-separated file
results = dataset.merge_results(ids, out)
results.to_csv(results_filepath, sep='\t', index=False)

4/4 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step
