<a href="https://colab.research.google.com/github/LuanPCunha/TCC/blob/main/Treinamento_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloads e imports

In [45]:
# !pip install tensorflow
# !pip install keras

In [46]:
# Mount Google Drive at /content/drive (Colab-only helper); the data and
# result paths used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [47]:
import json
import sklearn
import numpy as np
import pandas as pd
from numpy import loadtxt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras import optimizers
from keras import layers
from keras.layers import Dropout, Conv1D, MaxPooling1D, Flatten, Dense, SpatialDropout1D, BatchNormalization
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import schedules, Adam, Adadelta, SGD, RMSprop, Adagrad, Adamax, Nadam, Ftrl # Estamos usando só o Nadam
from keras.models import Sequential
from keras.layers.embeddings import Embedding

from mlxtend.plotting import plot_confusion_matrix
from keras.callbacks import Callback, ModelCheckpoint, CSVLogger
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix

In [48]:
def tokenize(tweets_list):
    """Fit a new Keras ``Tokenizer`` on the given texts and encode them.

    Args:
        tweets_list: Collection of text strings; it is used both to fit the
            tokenizer and as the input that gets encoded.

    Returns:
        tuple: ``(sequences, tokenizer)`` where ``sequences`` is the output of
        ``texts_to_sequences`` for ``tweets_list`` and ``tokenizer`` is the
        fitted ``Tokenizer`` instance.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(tweets_list)
    encoded_tweets = fitted_tokenizer.texts_to_sequences(tweets_list)
    return encoded_tweets, fitted_tokenizer

In [49]:
def pad(text_tokenized, length=None):
    """Pad tokenized sequences at the end ("post") via ``pad_sequences``.

    Args:
        text_tokenized: Sequences of token ids to pad.
        length: Value forwarded as ``maxlen``; ``None`` leaves the target
            length up to ``pad_sequences``.

    Returns:
        Whatever ``pad_sequences`` returns for the given arguments.
    """
    padded_sequences = pad_sequences(
        text_tokenized,
        maxlen=length,
        padding='post',
    )
    return padded_sequences

In [50]:
def preprocess(tweets_list, max_text_length):
    """Tokenize the texts and pad the resulting sequences.

    Args:
        tweets_list: Collection of text strings handed to ``tokenize``.
        max_text_length: Target length handed to ``pad`` (``None`` is
            forwarded unchanged).

    Returns:
        tuple: ``(padded_sequences, tokenizer)`` — the padded output of
        ``pad`` and the tokenizer returned by ``tokenize``.
    """
    token_sequences, fitted_tokenizer = tokenize(tweets_list)
    padded_sequences = pad(token_sequences, length=max_text_length)
    return padded_sequences, fitted_tokenizer

In [51]:
def logits_to_text(logits, tokenizer):
    """Turn a 2-D score array into a space-separated string of words.

    For each row of ``logits`` the column with the highest value is taken
    (``np.argmax`` along axis 1) and looked up in the inverse of
    ``tokenizer.word_index``. Index 0 is rendered as ``'<PAD>'``.

    Args:
        logits: Array-like with one row per position and one column per index.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        str: The looked-up words joined by single spaces.
    """
    vocabulary = {}
    for word, index in tokenizer.word_index.items():
        vocabulary[index] = word
    # Assigned last so that index 0 always maps to the padding marker.
    vocabulary[0] = '<PAD>'

    best_indices = np.argmax(logits, axis=1)
    return ' '.join(vocabulary[index] for index in best_indices)

In [52]:
class RocAucEvaluation(Callback):
    """Keras callback that computes ROC-AUC on a held-out set during training.

    At the end of every epoch whose (0-based) number is a multiple of
    ``interval`` the callback predicts on ``X_val`` and stores:

    * ``score`` - the value returned by ``roc_auc_score``;
    * ``false_positive_rate`` / ``true_positive_rate`` - the first two arrays
      returned by ``roc_curve``.

    The attributes hold the values of the most recent evaluated epoch.

    Args:
        validation_data: ``(X_val, y_val)`` pair. The empty-tuple default
            cannot be unpacked, so a real pair must be supplied.
        interval: Evaluate every ``interval`` epochs.
    """

    def __init__(self, validation_data=(), interval=1):
        # Zero-argument super() runs Callback.__init__; the former
        # super(Callback, self).__init__() started the lookup *after*
        # Callback and therefore skipped the base-class initialisation.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Recompute the ROC metrics when ``epoch`` is a multiple of ``interval``.

        ``logs`` is accepted for API compatibility and is not used; it
        defaults to ``None`` instead of a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            self.score = roc_auc_score(self.y_val, y_pred)
            self.false_positive_rate, self.true_positive_rate, _ = roc_curve(self.y_val, y_pred)
            # Optional progress line, disabled to keep the grid-search output short:
            # print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, self.score))

# Carrega arquivo de saída (resultados)

In [53]:
# Constants used to label rows of the results table.

# Network architecture labels.
REDE_CNN = 'CNN'
REDE_LSTM = 'LSTM'
REDE_BILSTM = 'Bi-LSTM'

# Dataset labels.
BASE_1 = 'BASE 1'
BASE_2 = 'BASE 2'

# Activation-function labels.
FUNCAO_ATV_SIGMOID = 'SIGMOID'
FUNCAO_ATV_SOFTMAX = 'SOFTMAX'
FUNCAO_ATV_RELU = 'RELU'

# Optimizer labels.
OTIMIZADOR_1 = '1'
OTIMIZADOR_2 = '2'
OTIMIZADOR_3 = '3'
OTIMIZADOR_4 = '4'

# Path of the results file (read here, written back at the end of the notebook).
PATH_ARQ_SAIDA = "/content/drive/MyDrive/TCC/resultados/resultados.csv"

# Load the previously saved results; the first CSV column is used as the index.
resultados = pd.read_csv(PATH_ARQ_SAIDA, index_col=0)
resultados.head()

Unnamed: 0,rede,base,funcao_ativacao,otimizador,acuracia,roc_curve,train_resume,confusion_matrix


# Carrega base

In [54]:
# Dataset 1, with stop words.
PATH_BASE1_JUNTO_COM_DA_LEILA_BALANCEADA = "/content/drive/MyDrive/TCC/dados/processadas/balanceadas/Base1JuntaHateENotHateDaLeila_balanceada.csv"
#MATRIZ_CBOW_300_BASE_1 = loadtxt("/content/drive/MyDrive/TCC/dados/word_embeddings/Matriz_Base1JuntaHateENotHateDaLeila_balanceada_CBOW300.CSV", delimiter=',')

# Dataset 2, without stop words.
PATH_BASE2_LEILA_LIMPISSIMA_BALANCEADA = "/content/drive/MyDrive/TCC/dados/processadas/balanceadas/Base2_maior4_menor25_limpissima_balanceada.csv"
#MATRIZ_CBOW_300_BASE_2_LEILA = loadtxt("/content/drive/MyDrive/TCC/dados/word_embeddings/Matriz_Base2_maior4_menor25_limpissima_balanceada_CBOW300.CSV", delimiter=',')

PATH_BASE_1_CLASSIFICADA_BALANCEADA = r"/content/drive/MyDrive/TCC/dados/processadas/balanceadas/Base1_classificada_balanceada.csv" # input

PATH_BASE_2_CLASSIFICADA_BALANCEADA = r"/content/drive/MyDrive/TCC/dados/processadas/balanceadas/Base2_classificada_balanceada.csv" # input
# Comma-separated numeric matrix loaded with numpy.loadtxt (original note: "output").
# NOTE(review): within the visible notebook this matrix is only referenced from
# commented-out code - confirm whether the load is still needed.
MATRIZ_CBOW_300_BASE_2 = loadtxt("/content/drive/MyDrive/TCC/dados/word_embeddings/Matriz_Base2_classificada_balanceada_CBOW300.csv", delimiter=',')

# Execução modelo

In [55]:
# Load the dataset (first CSV column becomes the index) and keep the 'text'
# column as the model input texts.
tweets = pd.read_csv(PATH_BASE_2_CLASSIFICADA_BALANCEADA, index_col=0)
text_column = tweets['text']
text_column

0        retwet bahia fazer sendo governada pt sei baia...
1        k imagine atitude mental negativa f idiota fec...
2        general heleno é bolsonaro rosna late late lat...
3        alan ser cara pau achar havendo ruptura nesse ...
4        canalhascomunistas caçarão chapa presidencialn...
                               ...                        
10569               af hein amiga mandar outro número mail
10570    egoísmo é grande parte produto sociedade émile...
10571    abençoado novo linda querida obrigada carinho ...
10572                              leva gente amorzinhos d
10573                                   k k ata meia sonsa
Name: text, Length: 10574, dtype: object

In [56]:
# Largest number of space-separated tokens found in any tweet of the dataset
# (a maximum word count, not an average character count).
max_text_length = int(text_column.apply(lambda x: len(str(x).split(' '))).max())
max_text_length

55

In [57]:
output_label = tweets['label']
# Pad/truncate every sequence to max_text_length. Passing None here made the
# input width depend on the longest *tokenized* tweet, which is not guaranteed
# to equal max_text_length, the value later used as the model's input length.
input_data, text_tokenizer = preprocess(text_column, max_text_length)

# Number of distinct words seen by the tokenizer.
text_vocab_size = len(text_tokenizer.word_index)
print("Vocabulary size:", text_vocab_size)

Vocabulary size: 19043


* Otimizadores: 1 de cada 'família' (Nadam, RMSProp, SGD, Ftrl)
* Batch size: 32 e 64
* Val e test size: 0.2 e 0.33
* Learning rate: 0.0001, 0.001 e 0.0005
* Dropout rate: 0, 0.1 e 0.2
* Número de camadas convolucionais: 1 e 2
* Número de filtros em cada camada: 10, 32 e 64

In [58]:
# Hyper-parameters explored by the grid search (every combination is trained).
BATCH_SIZE = [32, 64]
LEARN_RATE = [0.0001, 0.001]
OTMIZADORES = ['Nadam', 'RMSProp', 'SGD']
DROPOUT = [0.1, 0.2, 0.3]

# Fixed parameters.
EPOCHS = 10
VAL_AND_TST_SIZE = 0.2 # original note: "do it by hand"
EMBEDDING_DIMENSION = 300  
MAX_TEXT_SIZE = max_text_length 
VOCAB_SIZE = text_vocab_size

In [59]:
def optimize(optimize, learning_rate):
    """Build the Keras optimizer identified by ``optimize``.

    Args:
        optimize: Optimizer name; one of ``'Nadam'``, ``'RMSProp'`` or ``'SGD'``.
        learning_rate: Learning rate passed to the optimizer constructor.

    Returns:
        A new ``Nadam``, ``RMSprop`` or ``SGD`` instance.

    Raises:
        ValueError: If ``optimize`` is not one of the supported names
            (previously the function fell through and returned ``None``).
    """
    if optimize == 'Nadam':
        return Nadam(learning_rate=learning_rate, name="Nadam")

    if optimize == 'RMSProp':
        return RMSprop(learning_rate=learning_rate, name="RMSprop")

    if optimize == 'SGD':
        return SGD(learning_rate=learning_rate, name="SGD")

    raise ValueError(
        f"Unsupported optimizer {optimize!r}; expected 'Nadam', 'RMSProp' or 'SGD'."
    )

In [60]:
def create_cnn(vocab_size, embedding_dimen, max_text_size, dropout_rate):
    """Build the (uncompiled) 1-D convolutional binary classifier.

    Layers, in order: Embedding -> Conv1D (kernel size 3, ReLU) -> Flatten ->
    Dense (ReLU) -> Dropout -> Dense(1, sigmoid). The Conv1D filter count and
    the hidden Dense width are both ``max_text_size``.

    Args:
        vocab_size: Number of distinct words, i.e. ``len(tokenizer.word_index)``.
        embedding_dimen: Size of each embedding vector.
        max_text_size: Length of the padded input sequences.
        dropout_rate: Rate of the Dropout layer placed before the output layer.

    Returns:
        keras.Sequential: The assembled model.
    """
    modelo = keras.Sequential([
        # The Keras Tokenizer numbers words from 1 and 0 is the padding value,
        # so the largest index is vocab_size and the table needs vocab_size + 1
        # rows. The function arguments are used instead of the notebook globals.
        keras.layers.Embedding(vocab_size + 1, embedding_dimen, input_length=max_text_size),
        # NOTE(review): an earlier variant initialised this layer from the
        # pretrained CBOW matrix (MATRIZ_CBOW_300_BASE_2) with trainable=True;
        # it was disabled in favour of embeddings learned from scratch.
        keras.layers.Conv1D(max_text_size, 3, activation="relu"),
        # No input_shape here: it only has meaning on a model's first layer.
        keras.layers.Flatten(),
        keras.layers.Dense(max_text_size, activation='relu'),
        keras.layers.Dropout(dropout_rate),
        keras.layers.Dense(1, activation='sigmoid'),
    ])

    return modelo
  

In [61]:
def train_model(input_data, output_label, embedding_dimen, batch_size, epochs, validation_and_test_size, learning_rate, optimizer, dropout_rate):
    """Train one CNN configuration and evaluate it on a held-out test split.

    The data is split once with ``train_test_split`` (``random_state=42``);
    the same fraction is then used as ``validation_split`` inside ``fit``.
    Vocabulary size and sequence length come from the notebook-level
    ``VOCAB_SIZE`` and ``MAX_TEXT_SIZE`` constants.

    Args:
        input_data: Padded token sequences.
        output_label: Binary labels aligned with ``input_data``.
        embedding_dimen: Embedding vector size forwarded to ``create_cnn``.
        batch_size: Mini-batch size for ``fit``.
        epochs: Number of training epochs.
        validation_and_test_size: Fraction used both for the test split and
            for the validation split taken from the training portion.
        learning_rate: Learning rate for the optimizer.
        optimizer: Optimizer name understood by ``optimize``.
        dropout_rate: Dropout rate forwarded to ``create_cnn``.

    Returns:
        tuple: ``(history, model, x_test, y_test, ra_val, scores, csv_logger)``
        where ``scores`` is the output of ``model.evaluate`` on the test split
        (loss followed by accuracy) and ``ra_val`` is the ROC-AUC callback,
        which is evaluated on the test split after each epoch.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        input_data, output_label, test_size=validation_and_test_size, random_state=42)

    opt = optimize(optimizer, learning_rate)

    # Honour the embedding_dimen argument instead of the EMBEDDING_DIMENSION global.
    model = create_cnn(VOCAB_SIZE, embedding_dimen, MAX_TEXT_SIZE, dropout_rate)

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=opt, metrics=['accuracy'])

    ra_val = RocAucEvaluation(validation_data=(x_test, y_test), interval=1)

    # The per-epoch log is overwritten on every call (append=False).
    csv_logger = CSVLogger('log.csv', append=False, separator=';')

    # use_multiprocessing was dropped: Keras documents it as applying only to
    # generator / Sequence inputs, and the inputs here are in-memory arrays.
    history = model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_and_test_size,
        verbose=0,
        callbacks=[ra_val, csv_logger],
    )

    scores = model.evaluate(x_test, y_test, verbose=0)

    return history, model, x_test, y_test, ra_val, scores, csv_logger


In [62]:
# Grid search: train one CNN per (batch size, learning rate, optimizer,
# dropout rate) combination and record its metrics as a new row of `resultados`.
#
# Rows are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0. The `finally` keeps the rows of the combinations that
# finished even when the run is interrupted part-way.
#
# NOTE(review): the results schema has no columns for batch size, learning
# rate or dropout, so rows that share an optimizer cannot be told apart.
novos_registros = []
try:
    for BS in BATCH_SIZE:
        for LR in LEARN_RATE:
            for OT in OTMIZADORES:
                for DO in DROPOUT:

                    history, model1, x_test, y_test, ra_val, scores, csv_logger = train_model(
                        input_data, output_label, EMBEDDING_DIMENSION, BS, EPOCHS, VAL_AND_TST_SIZE, LR, OT, DO)

                    # Hard 0/1 predictions for the confusion matrix.
                    y_prob = model1.predict(x_test)
                    y_labelpred = y_prob.round()
                    y_labeltrue = y_test

                    # Labels describing this run.
                    NOME_REDE = REDE_CNN
                    NOME_BASE = BASE_2
                    NOME_FUNCAO = FUNCAO_ATV_SIGMOID
                    NOME_OTIMIZADOR = OT

                    # Per-epoch training log written by the CSVLogger callback.
                    train_resume = json.dumps(pd.read_csv('log.csv', sep=';').to_dict())

                    roc_curves = json.dumps({
                        "false_positive_rate": list(ra_val.false_positive_rate),
                        "true_positive_rate": list(ra_val.true_positive_rate),
                        "score": ra_val.score
                    })

                    confusion = confusion_matrix(y_labeltrue, y_labelpred)
                    confusion_dict = json.dumps({
                        "00": int(confusion[0][0]),
                        "01": int(confusion[0][1]),
                        "10": int(confusion[1][0]),
                        "11": int(confusion[1][1])
                    })

                    # One result row, keyed by the first eight result columns.
                    registro_resultado = {resultados.columns[0]: NOME_REDE,
                                          resultados.columns[1]: NOME_BASE,
                                          resultados.columns[2]: NOME_FUNCAO,
                                          resultados.columns[3]: NOME_OTIMIZADOR,
                                          resultados.columns[4]: scores[1],
                                          resultados.columns[5]: roc_curves,
                                          resultados.columns[6]: train_resume,
                                          resultados.columns[7]: confusion_dict}

                    novos_registros.append(registro_resultado)
finally:
    if novos_registros:
        resultados = pd.concat([resultados, pd.DataFrame(novos_registros)], ignore_index=True)


 ROC-AUC - epoch: 1 - score: 0.663901

 ROC-AUC - epoch: 2 - score: 0.751966

 ROC-AUC - epoch: 3 - score: 0.948989

 ROC-AUC - epoch: 4 - score: 0.969483

 ROC-AUC - epoch: 5 - score: 0.977256

 ROC-AUC - epoch: 6 - score: 0.980285

 ROC-AUC - epoch: 7 - score: 0.981408

 ROC-AUC - epoch: 8 - score: 0.982217

 ROC-AUC - epoch: 9 - score: 0.982655

 ROC-AUC - epoch: 10 - score: 0.982965

 ROC-AUC - epoch: 1 - score: 0.668187

 ROC-AUC - epoch: 2 - score: 0.785948

 ROC-AUC - epoch: 3 - score: 0.951351

 ROC-AUC - epoch: 4 - score: 0.971018

 ROC-AUC - epoch: 5 - score: 0.977497

 ROC-AUC - epoch: 6 - score: 0.978917

 ROC-AUC - epoch: 7 - score: 0.980933

 ROC-AUC - epoch: 8 - score: 0.981592

 ROC-AUC - epoch: 9 - score: 0.981878

 ROC-AUC - epoch: 10 - score: 0.982020

 ROC-AUC - epoch: 1 - score: 0.664811

 ROC-AUC - epoch: 2 - score: 0.749249

 ROC-AUC - epoch: 3 - score: 0.938072

 ROC-AUC - epoch: 4 - score: 0.967502

 ROC-AUC - epoch: 5 - score: 0.975044

 ROC-AUC - epoch: 6 - 

In [63]:
# Display the accumulated results table.
resultados

Unnamed: 0,rede,base,funcao_ativacao,otimizador,acuracia,roc_curve,train_resume,confusion_matrix
0,CNN,BASE 2,SIGMOID,Nadam,0.933333,"{""false_positive_rate"": [0.0, 0.0, 0.0, 0.0, 0...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 973, ""01"": 74, ""10"": 67, ""11"": 1001}"
1,CNN,BASE 2,SIGMOID,Nadam,0.932388,"{""false_positive_rate"": [0.0, 0.0, 0.0, 0.0, 0...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 986, ""01"": 61, ""10"": 82, ""11"": 986}"
2,CNN,BASE 2,SIGMOID,Nadam,0.93286,"{""false_positive_rate"": [0.0, 0.0, 0.0, 0.0, 0...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 994, ""01"": 53, ""10"": 89, ""11"": 979}"
3,CNN,BASE 2,SIGMOID,RMSProp,0.935225,"{""false_positive_rate"": [0.0, 0.0, 0.0, 0.0, 0...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 978, ""01"": 69, ""10"": 68, ""11"": 1000}"
4,CNN,BASE 2,SIGMOID,RMSProp,0.931915,"{""false_positive_rate"": [0.0, 0.0, 0.0, 0.0, 0...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 966, ""01"": 81, ""10"": 63, ""11"": 1005}"
5,CNN,BASE 2,SIGMOID,RMSProp,0.92766,"{""false_positive_rate"": [0.0, 0.0, 0.0, 0.0, 0...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 958, ""01"": 89, ""10"": 64, ""11"": 1004}"
6,CNN,BASE 2,SIGMOID,SGD,0.495035,"{""false_positive_rate"": [0.0, 0.00095510983763...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 1046, ""01"": 1, ""10"": 1067, ""11"": 1}"
7,CNN,BASE 2,SIGMOID,SGD,0.51253,"{""false_positive_rate"": [0.0, 0.0, 0.000955109...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 981, ""01"": 66, ""10"": 965, ""11"": 103}"
8,CNN,BASE 2,SIGMOID,SGD,0.495035,"{""false_positive_rate"": [0.0, 0.00095510983763...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 1045, ""01"": 2, ""10"": 1066, ""11"": 2}"
9,CNN,BASE 2,SIGMOID,Nadam,0.961229,"{""false_positive_rate"": [0.0, 0.00095510983763...","{""epoch"": {""0"": 0, ""1"": 1, ""2"": 2, ""3"": 3, ""4""...","{""00"": 1020, ""01"": 27, ""10"": 55, ""11"": 1013}"


In [64]:
# Caso queira deletar uma linha use o código abaixo
# Use a propriedade label para especificar o índice da linha
#resultados = resultados.drop(labels=1, axis=0)
#resultados = resultados.reset_index(drop=True)
#resultados

In [65]:
# Write the results table back to the output CSV (overwrites the existing file).
resultados.to_csv(PATH_ARQ_SAIDA)

In [66]:
# y_prob = model1.predict(x_test) 

# y_labelpred = y_prob.round()

# y_labeltrue=y_test

In [67]:
# # Parâmetros
# NOME_REDE = REDE_CNN
# NOME_BASE = BASE_2
# NOME_FUNCAO = FUNCAO_ATV_SIGMOID
# NOME_OTIMIZADOR = OTIMIZADOR_1

# train_resume = json.dumps(pd.read_csv('log.csv',sep=';').to_dict()) 

# roc_curve =  json.dumps({
#     "false_positive_rate": list(ra_val.false_positive_rate),
#     "true_positive_rate": list(ra_val.true_positive_rate),
#     "score": ra_val.score
# })

# confusion = confusion_matrix(y_labeltrue,y_labelpred)
# confusion_dict = json.dumps({
#     "00": int(confusion[0][0]),
#     "01": int(confusion[0][1]),
#     "10": int(confusion[1][0]),
#     "11": int(confusion[1][1])
# })

# # Salvando resultado do modelo
# registro_resultado = {resultados.columns[0]: NOME_REDE, 
#                       resultados.columns[1]: NOME_BASE, 
#                       resultados.columns[2]: NOME_FUNCAO, 
#                       resultados.columns[3]: NOME_OTIMIZADOR, 
#                       resultados.columns[4]: scores[1],
#                       resultados.columns[5]: roc_curve,
#                       resultados.columns[6]: train_resume,
#                       resultados.columns[7]: confusion_dict}

# resultados = resultados.append(registro_resultado, ignore_index=True)
# resultados