# Importação das bibliotecas

In [None]:
!pip install transformers

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, precision_score, recall_score

import tensorflow as tf

from transformers import BertTokenizer
from transformers import BertForPreTraining 
from transformers import BertModel

In [None]:
# Mount Google Drive so the dataset and output paths under /content/drive
# used later in the notebook are reachable.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Seed TensorFlow's global random generator so repeated runs are comparable.
# NOTE(review): only TensorFlow is seeded here; the scikit-learn splits pass
# their own random_state explicitly.
tf.random.set_seed(42)

# Inicialização da base de dados

In [None]:
# Load the review dataset from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/ProjetoFinal_ProcessText/datasets/NoStem_StopwordKeep_dataset.csv')
# Bare expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,sent_rating,processed_text_KeepStopwords,data
0,negativo,Bom em questão de som ela é realmente boa no e...,2021-06-09 00:00:00
1,negativo,O produto não responde corretamente aos comand...,2020-11-26 00:00:00
2,negativo,Quem compra uma caixa de som pensa normalmente...,2021-01-05 00:00:00
3,negativo,A funcionalidade é muito boa Comprei as lâmpad...,2021-06-16 00:00:00
4,positivo,Excelente assistente som muito bom e limpo a A...,2020-12-10 00:00:00
...,...,...,...
3805,positivo,A mídia não pôde ser carregada Gostei de tudo ...,2021-07-06 00:00:00
3806,positivo,Bom dia comprei a echo dot preta e o cabo da t...,2021-05-04 00:00:00
3807,positivo,Amei a Alexa ela é incrível Porém fiquei chate...,2021-05-18 00:00:00
3808,positivo,Atendeu minhas expectativas Reconhece voz ente...,2021-01-09 00:00:00


# Processamento dos dados

## Divisão em treino, validação e teste

In [None]:
# First split: hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(df['processed_text_KeepStopwords'], df['sent_rating'], test_size=0.20, random_state=42)
# Second split: take 20% of the remaining training rows as the validation set
# (roughly 64% train / 16% validation / 20% test overall).
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

In [None]:
def build_split_frame(texts, encoded_labels):
  """Build the two-column frame (review text, integer target) for one split.

  Args:
    texts: Series with the review texts of the split.
    encoded_labels: integer labels, aligned positionally with `texts`.

  Returns:
    DataFrame with columns 'processed_text_KeepStopwords' and 'target' and a
    fresh 0..n-1 index.
  """
  frame = pd.DataFrame(data=texts, columns=['processed_text_KeepStopwords'])
  frame.insert(len(frame.columns), 'target', encoded_labels)
  return frame.reset_index(drop=True)


# Fit ONE encoder on the training labels and reuse it for every split, so the
# class -> integer mapping is guaranteed to be identical across train,
# validation and test. A label that never appears in training makes
# `transform` raise instead of being silently mapped to a different integer.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_valid = label_encoder.transform(y_valid)
y_test = label_encoder.transform(y_test)

ds_train = build_split_frame(X_train, y_train)
ds_valid = build_split_frame(X_valid, y_valid)
ds_test = build_split_frame(X_test, y_test)

# Lidando com o BERT

In [None]:
# Tokenizer for the BERTimbau checkpoint. The checkpoint is *cased*, so the
# text must not be lower-cased: with do_lower_case=True BertTokenizer also
# strips accents by default, which no longer matches the cased Portuguese
# vocabulary the model was pretrained with.
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [None]:
# Put a review into the input format expected by BERT
def convert_example_to_feature(review):
  """Tokenize one review into fixed-length BERT inputs.

  Reads the notebook-level `tokenizer` and `max_length`.

  Args:
    review: text of a single review.

  Returns:
    The tokenizer's encoding; encode_examples reads its 'input_ids',
    'token_type_ids' and 'attention_mask' entries.
  """
  return tokenizer.encode_plus(review,
                add_special_tokens = True, # add [CLS], [SEP]
                max_length = max_length, # max length of the text that can go to BERT
                padding = 'max_length', # add [PAD] tokens up to max_length (replaces deprecated pad_to_max_length)
                truncation = True, # explicitly cut reviews longer than max_length
                return_attention_mask = True, # add attention mask to not focus on pad tokens
              )

In [None]:
# Length passed as `max_length` to the tokenizer; every review is padded to it.
# NOTE(review): 512 is presumably the model's maximum sequence length — confirm.
max_length = 512
# Number of examples per batch in the train/validation/test datasets.
batch_size = 6

## Funções auxiliares

In [None]:
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Pack one encoded example into a (features, label) pair.

  The features dict uses the keys 'input_ids', 'token_type_ids' and
  'attention_mask'; the label is passed through untouched.
  """
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

In [None]:
def encode_examples(ds):
  """Tokenize every row of `ds` and wrap the result in a tf.data.Dataset.

  Column 0 of each row is sent through convert_example_to_feature and
  column 1 is used as the label. The resulting slices are mapped with
  map_example_to_dict, so each element is a (features dict, label) pair.
  """
  rows = ds.values
  encodings = [convert_example_to_feature(row[0]) for row in rows]

  ids = [encoding['input_ids'] for encoding in encodings]
  masks = [encoding['attention_mask'] for encoding in encodings]
  segments = [encoding['token_type_ids'] for encoding in encodings]
  # every label is wrapped in its own single-element list
  labels = [[row[1]] for row in rows]

  slices = tf.data.Dataset.from_tensor_slices((ids, masks, segments, labels))
  return slices.map(map_example_to_dict)

## Formação dos datasets de treino, validação e teste

In [None]:
# Encode each split and group it into batches of `batch_size`,
# in the order: train, validation, test.
ds_train_encoded, ds_valid_encoded, ds_test_encoded = [
    encode_examples(split).batch(batch_size)
    for split in (ds_train, ds_valid, ds_test)
]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


# Inicialização do BERT

In [None]:
from transformers import TFBertForSequenceClassification

# Recommended learning rates for fine-tuning with Adam: 5e-5, 3e-5, 2e-5;
# the smallest of the three is used here.
learning_rate = 2e-5
# Train for a single epoch; more epochs might be better as long as the model
# does not overfit.
number_of_epochs = 1
# Model initialization: load the pretrained checkpoint with a
# sequence-classification head. from_pt=True asks for the TensorFlow model to
# be built from the checkpoint's PyTorch weights.
model = TFBertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased',from_pt=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Adam optimizer, using the notebook-level `learning_rate`.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
# Labels are integer class ids rather than one-hot vectors, so the sparse
# categorical cross-entropy and accuracy are used. from_logits=True: the loss
# is told that the model outputs are raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
model.compile(optimizer=optimizer, loss=loss, metrics=metric)

# Treinando o BERT

In [None]:
# Fine-tune the model on the training batches for `number_of_epochs` epochs,
# using the validation batches as validation data; the value returned by fit
# is kept in `bert_history`.
bert_history = model.fit(ds_train_encoded, epochs=number_of_epochs, validation_data=ds_valid_encoded)



# Testando o BERT

In [None]:
# Evaluate the fine-tuned model on the held-out test batches.
test_result = model.evaluate(ds_test_encoded)



## Precisão, Revocação e F1-score

In [None]:
# Run inference on the test batches; the per-class scores are later read
# from y_pred[0].
y_pred = model.predict(ds_test_encoded)

In [None]:
# Turn the two per-class scores of every test example into a hard label:
# class 0 only when its score is strictly greater, otherwise class 1.
y_pred_list = [0 if scores[0] > scores[1] else 1 for scores in y_pred[0]]

In [None]:
# Report precision, recall and F1 of the predicted labels against the
# encoded test labels, one line per score.
for score_label, score_fn in (('Precisão:', precision_score),
                              ('Revocação:', recall_score),
                              ('F1-score:', f1_score)):
  print(score_label, score_fn(y_test, y_pred_list))

Precisão: 0.9391575663026521
Revocação: 0.9376947040498442
F1-score: 0.9384255650818394


## Salvando as predições junto ao dataframe original

In [None]:
# Recreate the test rows of the original frame by repeating the first split
# (same test_size and random_state), then attach the predictions as a new
# column at position 1 and write the result to Drive.
split_parts = train_test_split(df, df['sent_rating'], test_size=0.2, random_state=42)
predict_df = split_parts[1]  # second element is the test portion of `df`
predict_df.insert(1, 'predict', y_pred_list)
predict_df.to_csv('/content/drive/MyDrive/ProjetoFinal_ProcessText/predict_datasets/BERT_predict.csv')