# Treinamento do Modelo

In [None]:
# Shell command: print the NVIDIA GPU status for this runtime.
! nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used
# later in this notebook live under drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages imported in the next cell.
!pip install transformers
!pip install nltk
!pip install enelvo

!pip install sentencepiece
!pip install bertopic[all]

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import pickle
import torch
import nltk
import sys
import ast
import json
nltk.download('stopwords')
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss, BCELoss
from nltk.corpus import stopwords
from enelvo import normaliser
from tqdm import tqdm, trange
from ast import literal_eval
from transformers import *
from flair.embeddings import TransformerDocumentEmbeddings

from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name() raises when there is no CUDA device, so only query it
# when at least one GPU is visible; the cell then displays "cpu" instead.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

In [None]:
#Leitura da base de treinamento
from google.colab import files
import io

# Training corpus on the mounted Drive (earlier local file name kept for reference).
#file = 'corpus_elec_para_bert_raw.csv'
file = 'drive/MyDrive/bases_tcc/tweets_eleicao_enelvo_raw.csv'

# Load the CSV into a DataFrame.
df = pd.read_csv(file)

# Discard every row whose sentiment value is 'brasil'.
rows_to_drop = df.index[df.sentiment == 'brasil']
df = df.drop(rows_to_drop)

In [None]:
# Write the loaded DataFrame to df.xlsx in the working directory.
df.to_excel('df.xlsx')

In [None]:
# One-hot encode the 'sentiment' column: one indicator column per class.
one_hot = pd.get_dummies(df['sentiment'])
# Replace the original column with the indicator columns (joined on the index).
df = df.drop(columns='sentiment').join(one_hot)

In [None]:
#Seleciona a linguagem do corretor para português e seleciona o dicionário personalizado.
import re
#spell.word_frequency.load_text_file('palavras.txt')

def removePontuacao(sentenca):
    """Strip punctuation, digits and symbols from a sentence.

    The marks ``? ! ' " #`` are deleted outright. Every other character that
    is not an ASCII letter, one of the accented letters listed in the pattern
    below, a colon or a space (this includes ``. , ( ) | /``, digits and
    newlines) is replaced by a single space.

    Args:
        sentenca: The text to clean.

    Returns:
        The cleaned text.
    """
    # This deletion must run BEFORE the catch-all: previously the catch-all
    # ran first and turned these marks into spaces, so this rule never fired.
    sentenca = re.sub(r'[?!\'"#]', r'', sentenca)
    # Catch-all: anything outside the allowed set becomes a space.
    sentenca = re.sub(u'[^a-zA-ZáéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', r' ', sentenca)
    return sentenca

def removeruidos(sentenca):
    """Remove placeholder noise tokens from a sentence.

    Every occurrence of the substrings "hashtag" and "retweeet" is deleted;
    "username" is intentionally left in place.

    Args:
        sentenca: The text to clean.

    Returns:
        The text with the noise substrings removed.
    """
    # NOTE(review): "retweeet" (three e's) is kept exactly as written in the
    # original -- confirm whether the dataset uses this spelling or "retweet".
    for ruido in ("hashtag", "retweeet"):
        sentenca = sentenca.replace(ruido, "")
    return sentenca

def corrige_internetes(sentenca):
    """Normalise informal internet spelling in a sentence using Enelvo.

    The ``Normaliser`` is created on the first call and stored on the
    function object, so applying this function to a whole column does not
    rebuild the normaliser for every row.

    Args:
        sentenca: The text to normalise.

    Returns:
        The value returned by ``Normaliser.normalise`` for ``sentenca``.
    """
    norm = getattr(corrige_internetes, "_norm", None)
    if norm is None:
        norm = normaliser.Normaliser(sanitize=True, capitalize_inis=True,
                                     capitalize_pns=True, capitalize_acs=True)
        corrige_internetes._norm = norm
    return norm.normalise(sentenca)

def deEmojify(sentenca):
    """Remove characters that fall in four emoji-related Unicode ranges.

    Args:
        sentenca: The text to clean.

    Returns:
        The text with every run of matching characters removed.
    """
    faixas = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    padrao = re.compile("[" + faixas + "]+", flags=re.UNICODE)
    return padrao.sub(r'', sentenca)

In [None]:
# Pre-process the tweet text: strip the placeholder tokens, then the emoji.
# Lower-casing, punctuation removal and spelling normalisation are
# deliberately left disabled.
df["text"] = df["text"].apply(removeruidos).apply(deEmojify)

In [None]:
# Display the DataFrame after pre-processing.
df

In [None]:
# Load the pre-trained Portuguese BERT encoder.
# NOTE(review): `bert` is not referenced again in the visible code (the
# classifier is loaded separately further down) -- confirm this load is needed.
bert = AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased')

# Load the matching Portuguese tokenizer; do_lower_case=False keeps the input casing.
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=False)

In [None]:
# Pick the label columns by position.
# NOTE(review): presumably columns 4..7 are the one-hot columns joined
# earlier; this depends on the CSV's column layout -- confirm.
cols = df.columns
label_cols = list(cols[4:8])

# Number of possible labels.
num_labels = len(label_cols)

# Shuffle the rows and rebuild a 0..n-1 index.
df = df.sample(frac=1).reset_index(drop=True)

# New column holding, per row, the array of values from the label columns.
df['one_hot_labels'] = list(df[label_cols].values)

# Plain lists of all label arrays and all comment texts, in row order.
labels = list(df.one_hot_labels.values)
comments = list(df.text.values)

In [None]:
# Display the selected label column names.
label_cols

In [None]:
# Per-label totals of ones and zeros, to check class balance.
print('Count of 1 per label: \n', df[label_cols].sum(), '\n') # Label counts, may need to downsample or upsample
print('Count of 0 per label: \n', df[label_cols].eq(0).sum())

In [None]:
# Maximum number of tokens per comment. Longer inputs are truncated and
# shorter ones padded to this length, which bounds GPU memory use.
max_length = 140

# BERT tokenizer with the pre-trained Portuguese vocabulary (casing preserved).
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=False)

# Encode every comment to exactly max_length tokens. padding/truncation are
# spelled out explicitly: `pad_to_max_length` is deprecated, and truncation
# previously happened only through a backward-compatibility fallback.
encodings = tokenizer.batch_encode_plus(
    comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)

# Token ids, token type ids and attention masks for the encoded comments.
input_ids = encodings['input_ids']
token_type_ids = encodings['token_type_ids']
attention_masks = encodings['attention_mask']

In [None]:
# Find the rows whose one-hot label combination occurs exactly once. They
# are set aside so the stratified split below does not see a class with a
# single member.
label_counts = df.one_hot_labels.astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
# Descending order so each pop() below leaves the smaller indices valid.
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)

# Pull those single-instance entries out of every parallel list; they are
# appended to the training data after the split.
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [None]:
# Split into 90% training / 10% validation, stratified on the labels.
# train_test_split returns a (train, validation) pair per input array, in the
# order the arrays are passed: inputs, labels, token types, masks.
train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids, attention_masks,
                                                            random_state=2020, test_size=0.10, stratify = labels)

# Add the single-instance entries to the training portion.
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

# Convert everything to torch tensors for the TensorDataset / model.
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

In [None]:
# Mini-batch size for training and evaluation; large values may exhaust GPU memory.
batch_size = 24

# Training set: wrap the tensors in a dataset and draw batches in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation set: same tensor layout, read sequentially.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# Load the pre-trained model with a sequence-classification head sized to num_labels.
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-large-portuguese-cased', num_labels=num_labels)
# Move the model to the selected device. Unlike .cuda(), this also works on
# CPU-only runtimes and matches where the batches are sent (t.to(device)).
model.to(device)

In [None]:
# Custom optimiser parameter groups: weight decay on everything except
# biases and LayerNorm weights.
param_optimizer = list(model.named_parameters())
# 'LayerNorm.weight' is the name Hugging Face BERT uses for these parameters;
# 'gamma'/'beta' are kept for checkpoints that still use the older names.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# The key must be 'weight_decay': AdamW does not read 'weight_decay_rate', so
# with that spelling the decay setting was silently ignored.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# AdamW optimiser. The PyTorch implementation is used because the AdamW class
# exported by transformers is deprecated in favour of torch.optim.AdamW.
# eps=1e-6 and weight_decay=0.0 reproduce the transformers defaults, so any
# parameter group that does not set its own 'weight_decay' behaves as before.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Per-step training losses, kept for plotting.
train_loss_set = []

# Number of passes over the training data.
epochs = 12

# The loss module holds no per-step state, so build it once rather than on
# every training step.
loss_func = BCEWithLogitsLoss()

# trange shows a progress bar over the epochs.
for _ in trange(epochs, desc="Epoch"):

  # ---------------------------- Training ----------------------------

  # Put the model in training mode.
  model.train()

  # Running totals for this epoch.
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  for step, batch in enumerate(train_dataloader):
    # Move the batch to the selected device and unpack it in dataset order.
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear gradients (they accumulate by default).
    optimizer.zero_grad()

    # Multi-label classification: sigmoid + binary cross-entropy on the logits.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    # Labels are cast to the logits' dtype because the loss expects float targets.
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels))
    # Read the scalar once and reuse it below.
    step_loss = loss.item()
    train_loss_set.append(step_loss)

    # Backward pass, then update the parameters.
    loss.backward()
    optimizer.step()
    # scheduler.step()

    # Update the running totals.
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  # max(..., 1) avoids a ZeroDivisionError if the training loader is empty.
  print(" Train loss: {}".format(tr_loss/max(nb_tr_steps, 1)))

  # ---------------------------- Validation ----------------------------

  # Put the model in evaluation mode.
  model.eval()

  # Accumulators for the full validation results.
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass only; sigmoid turns logits into per-label probabilities.
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten the per-batch outputs into per-example lists.
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Threshold the probabilities and score against the true labels.
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print(' Acurácia F1: ' + str(round(val_f1_accuracy,2)) + '%')
  print('Acurácia Flat: ' + str(round(val_flat_accuracy,2)) + '%')

In [None]:
# Save the whole model object (not just a state_dict) to Drive.
torch.save(model, '/content/drive/MyDrive/TCC/model_bert_tcc/bert_model_full')

In [None]:
#model = torch.load('/content/drive/MyDrive/TCC/model_bert_tcc/bert_model_full')

# Aplicação do modelo para Classificação

In [None]:
from google.colab import files
#uploaded = files.upload()

In [None]:
# Load the tweets to be classified from Drive.
df_classificacao = pd.read_csv('drive/MyDrive/bases_tcc/tweets_2turno.csv')

In [None]:
# Display the raw classification DataFrame.
df_classificacao

In [None]:
# Work on a copy so df_classificacao keeps its original columns and rows.
test_df = df_classificacao.copy()
# The message column is renamed to 'comment_text' for the steps below.
test_df = test_df.rename(columns={'Mensagem': 'comment_text'})
# Drop rows with no message text; the surviving rows keep their original index labels.
test_df = test_df[test_df['comment_text'].notna()]

In [None]:
# Same cleaning as the training text: strip the placeholder tokens, then the
# emoji. Lower-casing and spelling correction are left disabled.
test_df["comment_text"] = test_df["comment_text"].apply(removeruidos).apply(deEmojify)

In [None]:
#test_df = test_df.drop('Positiva', 1)
#test_df = test_df.drop('Neutra', 1)
#test_df = test_df.drop('Negativa', 1)
#test_df = test_df.drop('Unnamed: 0', 1)

In [None]:
# Attach the one-hot columns (joined on the index).
# NOTE(review): `one_hot` was built from the TRAINING data's sentiment column,
# so these values are not labels for these tweets -- presumably placeholders
# so the dataset/loader code below has a label tensor. Metrics computed
# against them should not be read as real test scores -- confirm intent.
# Rows whose index has no match in `one_hot` get NaN in these columns.
test_df = test_df.join(one_hot)
test_cols = test_df.columns

In [None]:
# Display the column names after the join.
test_cols

In [None]:
# Pick the label columns by position (everything from column 6 onward).
# NOTE(review): depends on the CSV's column layout -- confirm.
test_label_cols = list(test_cols[6:])
# Per-row array of values from the label columns.
test_df['one_hot_labels'] = list(test_df[test_label_cols].values)

In [None]:
# Display the selected label column names.
test_label_cols

In [None]:
# Display the prepared DataFrame.
test_df

In [None]:
# Gather the label arrays and comment texts, in row order.
test_labels = list(test_df.one_hot_labels.values)
test_comments = list(test_df.comment_text.values)

# Encode every comment to exactly max_length tokens. padding/truncation are
# spelled out explicitly: `pad_to_max_length` is deprecated, and truncation
# previously happened only through a backward-compatibility fallback.
test_encodings = tokenizer.batch_encode_plus(
    test_comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']


# Convert everything to tensors.
test_inputs = torch.tensor(test_input_ids)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_attention_masks)
test_token_types = torch.tensor(test_token_type_ids)

# Sequential loader so predictions come out in the same order as test_df's rows.
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Run the trained model over the classification set and collect predictions.

# Evaluation mode for inference.
model.eval()

# Accumulators: raw logits, true labels, sigmoid probabilities, input token ids.
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict batch by batch, in loader order.
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack in the order the TensorDataset was built: ids, masks, labels, token types.
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass only; sigmoid turns logits into per-label probabilities.
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

    # Move results to the CPU as NumPy arrays.
    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  # The input ids are kept so the text can be decoded back later.
  tokenized_texts.append(b_input_ids)
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten the per-batch outputs into per-example lists.
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Boolean arrays marking where the label value equals 1.
true_bools = [tl==1 for tl in true_labels]

In [None]:
# Threshold the sigmoid probabilities at 0.50 to get boolean predictions.
pred_bools = [pl>0.50 for pl in pred_labels]

# Build the per-label classification report and pickle it to disk.
#print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
#print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
# Context manager so the file handle is flushed and closed deterministically
# (it was previously opened inline and never closed). The on-disk format is
# unchanged: a pickled string.
with open('classification_report.txt','wb') as report_file:
    pickle.dump(clf_report, report_file)
#print(clf_report)

In [None]:
# Map each label position to its column name. enumerate() covers however many
# labels there are; the previous range(14) silently truncated the mapping to
# 14 entries.
idx2label = dict(enumerate(label_cols))
print(idx2label)

In [None]:
# Positions of the True entries in each boolean one-hot vector.
true_label_idxs = [np.where(flags)[0].flatten().tolist() for flags in true_bools]
pred_label_idxs = [np.where(flags)[0].flatten().tolist() for flags in pred_bools]

# Translate positions to label names via idx2label; an empty position list
# is passed through unchanged.
true_label_texts = [
    [idx2label[pos] for pos in positions] if positions else positions
    for positions in true_label_idxs
]
pred_label_texts = [
    [idx2label[pos] for pos in positions] if positions else positions
    for positions in pred_label_idxs
]

# Decode the token ids back to comment text.
comment_texts = [
    tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    for ids in tokenized_texts
]

In [None]:
# Join each list of predicted label names into one comma-separated string,
# then build the table of decoded comments and predictions and write it out.
joined_pred_labels = [','.join(map(str, names)) for names in pred_label_texts]
comparisons_df = pd.DataFrame({'comment_text': comment_texts, 'pred_labels': joined_pred_labels})
comparisons_df.to_csv('comparisons.csv')

In [None]:
# Display the decoded comments next to their predicted labels.
comparisons_df

In [None]:
# Keep only these four columns, in this order; reindex fills any column that
# is missing from the data with NaN.
df_classificacao = df_classificacao.reindex(columns=['Usuario', 'Mensagem', 'Localizacao', 'candidato'])
# Copy so the prediction column is added without touching df_classificacao.
df_final = df_classificacao.copy()

In [None]:
# comparisons_df is numbered 0..n-1 in test_df's row order, while df_final
# keeps df_classificacao's original index. Column assignment aligns on the
# index, so the predictions are re-labelled with test_df's index first.
# Otherwise every row after a dropped (message-less) row would receive its
# neighbour's prediction. Rows that were dropped get NaN.
df_final['Polaridade'] = pd.Series(comparisons_df['pred_labels'].to_numpy(), index=test_df.index)

In [None]:
# Display the final table with the predicted polarity column.
df_final

In [None]:
# Write the final table to Drive, then download that same file. The download
# previously pointed at 'df_final.csv' in the working directory, which is
# never written.
output_path = '/content/drive/MyDrive/TCC/model_bert_tcc/df_final.csv'
df_final.to_csv(output_path)
files.download(output_path)