# Experimento com BERT

In [39]:
import pandas as pd

import tensorflow as tf
import tensorflow_addons as tfa

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [4]:
df = pd.read_csv("Dados/total_pares_smartphones.csv", dtype = {'ean_1': str, 'ean_2': str})
df.head(3)

Unnamed: 0,descricao_1,ean_1,titulo_1,url_1,titulo_cb_1,loja_1,descricao_2,ean_2,titulo_2,url_2,titulo_cb_2,loja_2,match
0,O smartphone Samsung Galaxy S22 Ultra chegou p...,7892509122382,Smartphone Samsung Galaxy S22 Ultra 256GB 5G W...,https://www.americanas.com.br/produto/4645251827,smartphone samsung galaxy s22 ultra 256gb 5g w...,americanas,O Smartphone bison X10 da umidigi é ultra-resi...,6973553520636,Smartphone umidigi bison X10 Dual Sim lte 6.53...,https://www.americanas.com.br/produto/4707302977,smartphone umidigi bison x10 dual sim lte 6.53...,americanas,0
1,,8908012587507,Smartphone Xiaomi Redmi 9 India 64GB 4GB RAM T...,https://www.amazon.com.br/Smartphone-Xiaomi-Re...,smartphone xiaomi redmi 9 india 64gb 4gb ram t...,amazon,O Moto E40 é um smartphone projetado para ofer...,7892597351367,Smartphone Motorola Moto E40 64GB 4G Wi-Fi Tel...,https://www.americanas.com.br/produto/4120059496,smartphone motorola moto e40 64gb 4g wi-fi tel...,americanas,0
2,Todas as informações divulgadas são de respons...,7892509117722,Smartphone Samsung Galaxy A32 128GB 4G Wi-Fi T...,https://www.americanas.com.br/produto/2969966651,smartphone samsung galaxy a32 128gb 4g wi-fi t...,americanas,,7892509117722,Smartphone Samsung Galaxy A32 128GB 6.4 Octa C...,https://www.amazon.com.br/Smartphone-Samsung-G...,smartphone samsung galaxy a32 128gb 6.4 octa c...,amazon,1


In [7]:
df.dtypes

descricao_1    object
ean_1          object
titulo_1       object
url_1          object
titulo_cb_1    object
loja_1         object
descricao_2    object
ean_2          object
titulo_2       object
url_2          object
titulo_cb_2    object
loja_2         object
match           int64
dtype: object

In [8]:
df.shape

(480, 13)

In [9]:
df[df["titulo_1"]==df["titulo_2"]]

Unnamed: 0,descricao_1,ean_1,titulo_1,url_1,titulo_cb_1,loja_1,descricao_2,ean_2,titulo_2,url_2,titulo_cb_2,loja_2,match
433,,7892509120623,Smartphone Samsung Galaxy A52s 5G 128GB 6.5 6G...,https://www.amazon.com.br/Smartphone-Galaxy-A5...,smartphone samsung galaxy a52s 5g 128gb 6.5 6g...,amazon,Características: - Marca: Samsung - Modelo: A5...,7892509120623,Smartphone Samsung Galaxy A52s 5G 128GB 6.5 6G...,https://www.americanas.com.br/produto/4472100131,smartphone samsung galaxy a52s 5g 128gb 6.5 6g...,americanas,1


## Otimizador e Inicialização do Modelo

In [31]:
test = None

# can be up to 512 for BERT
MAX_LENGTH = 256
BATCH_SIZE = 1
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case = True)

#######################INICIOS FUNÇÕES DE APOIO#######################

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label


def convert_example_to_feature(titulo_1, titulo_2):
    return tokenizer.encode_plus(titulo_1, titulo_2,
                                 add_special_tokens = True, # adiciona [CLS], [SEP]
                                 max_length = MAX_LENGTH, # comprimento máximo do texto de entrada
                                 padding = 'max_length', # adiciona [PAD] até o tam_max (MAX_LENGTH)
                                 truncation = True, # padrão = 'longest_first'
                                 return_attention_mask = True, # adiciona máscara de atenção para não focar nos tokens do pad
                                )

def encode_examples(df_titulos, labels, limit = -1):
    
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    
    if (limit > 0):
        ds = ds.take(limit)
    
    # for review, label in tfds.as_numpy(ds):
    for titulo_1, titulo_2, label in zip(df_titulos["titulo_1"], df_titulos["titulo_2"], labels):
        
        bert_input = convert_example_to_feature(titulo_1, titulo_2)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
        
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

def get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test):
    
    # train dataset
    ds_train = encode_examples(X_train, y_train).batch(BATCH_SIZE)

    # test dataset
    ds_test = encode_examples(X_test, y_test).batch(BATCH_SIZE)

    #validation dataset
    ds_valid = encode_examples(X_valid, y_valid).batch(BATCH_SIZE)

    return ds_train, ds_valid, ds_test

#######################FIM FUNÇOES DE APOIO#######################

def get_test_metrics(model, ds_test, y_test):

    #Predictin test dataset
    tf_output = model.predict(ds_test)[0]
    tf_prediction = tf.nn.softmax(tf_output, axis=1)
    # labels = ['Negative','Positive'] #(0:negative, 1:positive)
    label = tf.argmax(tf_prediction, axis=1)
    label_pred = label.numpy()
    # print(label_pred)

    print(classification_report(y_test, label_pred))

    print(confusion_matrix(y_test, label_pred))

    return label_pred


def pipeline_bert(name, X_train, y_train, X_valid, y_valid, X_test, y_test): #X_train = [titulos1, titulos2]

    learning_rate = 2e-5
    number_of_epochs = 3
    ds_train, ds_valid, ds_test = get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test)
    
    # model initialization
    model = TFBertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', from_pt = True)

    # choosing Adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric_acc = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric_acc])

    #Training model
    bert_history = model.fit(ds_train, epochs=number_of_epochs, validation_data=ds_valid)
    
    #Predict test data
    result = get_test_metrics(model, ds_test, y_test)
    return result

    # metrics = calc_metrics(y_test, result, name)
    # return metrics
    

In [34]:
X = df[["titulo_1", "titulo_2"]][:50]#[df["titulo_1"].to_list(), df["titulo_2"].to_list()]
y = df["match"][:50].to_list()

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.2, random_state = 42, stratify=y_train_valid)


In [38]:
pipeline_bert("name", X_train, y_train, X_valid, y_valid, X_test, y_test)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       1.00      0.67      0.80         3

    accuracy                           0.93        15
   macro avg       0.96      0.83      0.88        15
weighted avg       0.94      0.93      0.93        15



NameError: name 'confusion_matrix' is not defined

In [40]:
pipeline_bert("name", X_train, y_train, X_valid, y_valid, X_test, y_test)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.92      1.00      0.96        12
           1       1.00      0.67      0.80         3

    accuracy                           0.93        15
   macro avg       0.96      0.83      0.88        15
weighted avg       0.94      0.93      0.93        15

[[12  0]
 [ 1  2]]


array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0], dtype=int64)