## Importação de bibliotecas e de dados

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import string
import random
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [None]:
train_labels = pd.read_csv('train_labels.csv')
test_labels = pd.read_csv('test_labels.csv')

In [None]:
train_input = np.loadtxt('train_inputs.csv')
test_input = np.loadtxt('test_inputs.csv')


ValueError: the number of columns changed from 46 to 40 at row 912; use `usecols` to select a subset and avoid this error

In [None]:
train_input = train_input.astype(int)
test_input = test_input.astype(int)

In [None]:
train_input[0]

array([ 69,   3,   1, 819,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0])

In [None]:
train_input.shape

(153708, 46)

In [None]:
train_labels.shape

(153708, 1)

In [None]:
test_input.shape

(65876, 46)

In [None]:
test_labels.shape

(65876, 1)

In [None]:
class DCNN(tf.keras.Model):

  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters=50,
               ffn_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=True,
               name="dcnn"):
    super(DCNN, self).__init__(name=name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)

    # Camada responsável pela interpretação da distância entre as palavras, realizando uma aproximação de palavras com semântica semelhante
    # Bem parecido com a Word2Vec
    #emb_dim -> Significa o parametro de distancia, não pode ser tão grande nem tão pequeno

    self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding='same', activation='relu')

    self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding='same', activation='relu')

    self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding='same', activation='relu')

    #Camada de Convolução -> Camada responsável pela leitura da imagem, ela entra linha a linha em cada pixel da imagem para identificar características ,
    # Para realizar isso, a camada convolucional não é conectada ao neuronio passado, mas sim a pixels específicos para adquirir suas features e identificar um novo ponto
    # Igual a convolução usada em telecomunicações
    #  filters -> Numero de filtros utilizados
    # kernel_size - > Significa o tamanho do filtro, assim qual a movimentação ele irá realizar na convolução
    # Ex: Frase 'Eu sou o relampago marquinhos'
    # kernel_size = 2 -> Leitura: 'Eu sou' 'sou o' 'o relampago' 'relampago marquinhos
    # kernel_size = 3 -> 'Eu sou o' 'sou relampago marquinhos




    self.pool = layers.GlobalMaxPool1D() # CAMADA DE POOL -> Diminui o tamanho das imagens de saída, assim melhorando desempenho e memória

    self.dense_1 = layers.Dense(units = ffn_units, activation = 'relu') # Camada de aprendizado das caracteristicas obtidas
    self.dropout = layers.Dropout(rate = dropout_rate) # Camada de desligamento automatico dos neuronios
    self.last_dense = layers.Dense(units = 1, activation = 'sigmoid') # Camada de saída
  def call(self, inputs, training):
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool(x_1)

    x_2 = self.trigram(x)
    x_2 = self.pool(x_2)

    x_3 = self.fourgram(x)
    x_3 = self.pool(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis = -1)
    merged = self.dense_1(merged)

    merged = self.dropout(merged, training=training)
    output = self.last_dense(merged)

    return output

In [None]:
data = pd.read_csv('data.csv')
data.shape

(219584, 2)

In [None]:
data_clean = data['tratamento3']
data_clean = data_clean.dropna()

In [None]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(data_clean, target_vocab_size=2**16)

In [None]:
tokenizer.vocab_size

54930

In [None]:
tokenizer.encode('eu sou feliz')

[9198, 31175, 347]

In [None]:
tokenizer.decode([9198, 31175, 347])

'eu sou feliz'

In [None]:
vocab_size = tokenizer.vocab_size
vocab_size

54930

In [None]:
emb_dim = 200
nb_filters = 100
ffn_units = 256
batch_size = 64
nb_classes = len(set(train_labels))
nb_classes

1

In [None]:
dropout_rate = 0.2
nb_epochs = 5

In [None]:
Dcnn = DCNN(vocab_size=vocab_size, emb_dim=emb_dim, nb_filters=nb_filters,
            ffn_units=ffn_units, nb_classes=nb_classes, dropout_rate=dropout_rate)

In [None]:
Dcnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
checkpoint_path = "./"
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored')

In [None]:
history = Dcnn.fit(train_input, train_labels,
                   batch_size = batch_size,
                   epochs = nb_epochs,
                   verbose = 1,
                   validation_split = 0.10)
ckpt_manager.save()

Epoch 1/5
[1m2162/2162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m552s[0m 254ms/step - accuracy: 0.8158 - loss: 0.4191 - val_accuracy: 0.8698 - val_loss: 0.3494
Epoch 2/5
[1m2162/2162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m546s[0m 253ms/step - accuracy: 0.9014 - loss: 0.2821 - val_accuracy: 0.8644 - val_loss: 0.3730
Epoch 3/5
[1m2162/2162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m560s[0m 252ms/step - accuracy: 0.9289 - loss: 0.2017 - val_accuracy: 0.8521 - val_loss: 0.4327
Epoch 4/5
[1m2162/2162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m559s[0m 250ms/step - accuracy: 0.9524 - loss: 0.1346 - val_accuracy: 0.8462 - val_loss: 0.6016
Epoch 5/5
[1m2162/2162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m553s[0m 246ms/step - accuracy: 0.9669 - loss: 0.0939 - val_accuracy: 0.8374 - val_loss: 0.7329


'./ckpt-1'