# Experimento com BERT

In [1]:
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import funcoes_bert as fb

In [2]:
# Force TensorFlow to run on the CPU only.
# The stable tf.config.list_physical_devices name is used so the whole cell relies on
# the same (non-experimental) tf.config API as the set/get_visible_devices calls below.
# NOTE(review): per the TensorFlow docs, visibility can only be changed before the
# runtime is initialized, so this cell presumably has to run before any model/tensor
# is created -- confirm.
cpus = tf.config.list_physical_devices('CPU')
print(cpus)
tf.config.set_visible_devices([], 'GPU')  # hide every GPU from TensorFlow
tf.config.set_visible_devices(cpus[0], 'CPU')  # make sure the first CPU stays visible
tf.config.get_visible_devices()  # last expression: displays the devices left visible

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [61]:
# Seed passed as random_state to both train_test_split calls in the experiment loop.
# NOTE(review): this cell shows execution count 61 while the loop that reads SEED
# shows 41 -- confirm the notebook still runs top to bottom on a fresh kernel.
SEED = 42

In [3]:
# Base names of the datasets; each one is read from Dados/Datasets/<name>.csv.
arquivos = [
    'hn_balanceado',
    'hn_desbalanceado',
    'sn_balanceado',
    'sn_desbalanceado',
]

# Both EAN columns are read as str instead of letting pandas infer a dtype.
tipos_ean = {'ean_1': str, 'ean_2': str}

# One DataFrame per entry of `arquivos`, in the same order.
# Note: `df` stays bound to the last dataset after the loop; later cells read it.
lista_df = []
for arquivo in arquivos:
    caminho = f"Dados/Datasets/{arquivo}.csv"
    df = pd.read_csv(caminho, dtype = tipos_ean)
    lista_df.append(df)

In [4]:
# Column dtypes of the first dataset in lista_df (arquivos[0], i.e. hn_balanceado).
lista_df[0].dtypes

descricao_1    object
ean_1          object
titulo_1       object
url_1          object
titulo_cb_1    object
loja_1         object
descricao_2    object
ean_2          object
titulo_2       object
url_2          object
titulo_cb_2    object
loja_2         object
match           int64
dtype: object

In [5]:
# Shape of the last dataset loaded (arquivos[-1], i.e. sn_desbalanceado).
# Read through lista_df instead of the loop-leaked `df`: the experiment loop rebinds
# `df`, so `df.shape` would silently describe a different dataset when re-run later.
lista_df[-1].shape

(13290, 13)

In [6]:
# Rows of the last dataset (arquivos[-1], i.e. sn_desbalanceado) whose two titles are
# exactly equal. Read through lista_df instead of the loop-leaked `df`, which the
# experiment loop rebinds, so the cell shows the same rows regardless of run order.
lista_df[-1].loc[lambda pares: pares["titulo_1"] == pares["titulo_2"]]

Unnamed: 0,descricao_1,ean_1,titulo_1,url_1,titulo_cb_1,loja_1,descricao_2,ean_2,titulo_2,url_2,titulo_cb_2,loja_2,match
2231,Esmaltec traz o fogão feito especialmente para...,7899081743783,Fogão 4 Bocas a Gás Esmaltec Esmeralda Glass 4...,https://www.amazon.com.br/Esmaltec-Esmeralda-A...,fogão 4 bocas a gás esmaltec esmeralda glass 4...,amazon,Esmaltec traz o fogão feito especialmente para...,7899081743783,Fogão 4 Bocas a Gás Esmaltec Esmeralda Glass 4...,https://www.americanas.com.br/produto/2599745729,fogão 4 bocas a gás esmaltec esmeralda glass 4...,americanas,1
7817,,7892509119160,Smartphone Samsung Galaxy S20 Fe 128GB 4G Wi-F...,https://www.amazon.com.br/Smartphone-Samsung-G...,smartphone samsung galaxy s20 fe 128gb 4g wi-f...,amazon,Todas as informações divulgadas são de respons...,7892509119160,Smartphone Samsung Galaxy S20 Fe 128GB 4G Wi-F...,https://www.americanas.com.br/produto/3234381133,smartphone samsung galaxy s20 fe 128gb 4g wi-f...,americanas,1
10407,Todas as informações divulgadas são de respons...,7892509118439,Smartphone Samsung Galaxy A32 128GB 4G Wi-Fi T...,https://www.americanas.com.br/produto/2969968331,smartphone samsung galaxy a32 128gb 4g wi-fi t...,americanas,,7892509118439,Smartphone Samsung Galaxy A32 128GB 4G Wi-Fi T...,https://www.amazon.com.br/Smartphone-Samsung-G...,smartphone samsung galaxy a32 128gb 4g wi-fi t...,amazon,1


## Otimizador e Inicialização do Modelo

In [7]:
# Disabled configuration kept as a bare string literal: none of the assignments below
# run, so this cell does not define test, MAX_LENGTH, BATCH_SIZE or tokenizer -- it
# only echoes the string as its output.
# NOTE(review): BertTokenizer is not imported by the import cell of this notebook;
# presumably this setup now lives in funcoes_bert -- confirm, then delete this cell.
'''test = None

# can be up to 512 for BERT
MAX_LENGTH = 256
BATCH_SIZE = 32
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case = False)'''

"test = None\n\n# can be up to 512 for BERT\nMAX_LENGTH = 256\nBATCH_SIZE = 32\ntokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case = False)"

In [8]:
# Disabled helper functions kept as one bare string literal: nothing below is defined
# or executed, the cell only echoes the string as its output. The experiment loop
# calls fb.pipeline_bert (module funcoes_bert) and unpacks four return values, whereas
# the pipeline_bert sketched here returns a single value, so the two are not the same.
# NOTE(review): presumably these helpers were moved into funcoes_bert -- confirm, then
# delete this cell.
# NOTE(review): if this text is ever revived as code, encode_examples reads `ds`
# before any assignment when limit > 0, and BertTokenizer,
# TFBertForSequenceClassification, tokenizer, MAX_LENGTH and BATCH_SIZE are not
# defined by any executed cell visible in this notebook.
'''
#######################INICIOS FUNÇÕES DE APOIO#######################

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
  }, label


def convert_example_to_feature(titulo_1, titulo_2):
    return tokenizer.encode_plus(titulo_1, titulo_2,
                                 add_special_tokens = True, # adiciona [CLS], [SEP]
                                 max_length = MAX_LENGTH, # comprimento máximo do texto de entrada
                                 padding = 'max_length', # adiciona [PAD] até o tam_max (MAX_LENGTH)
                                 truncation = True, # padrão = 'longest_first'
                                 return_attention_mask = True, # adiciona máscara de atenção para não focar nos tokens do pad
                                )

def encode_examples(df_titulos, labels, limit = -1):
    
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    
    if (limit > 0):
        ds = ds.take(limit)
    
    # for review, label in tfds.as_numpy(ds):
    for titulo_1, titulo_2, label in zip(df_titulos["titulo_1"], df_titulos["titulo_2"], labels):
        
        bert_input = convert_example_to_feature(titulo_1, titulo_2)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
        
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

def get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test):
    
    # train dataset
    ds_train = encode_examples(X_train, y_train).batch(BATCH_SIZE)

    # test dataset
    ds_test = encode_examples(X_test, y_test).batch(BATCH_SIZE)

    #validation dataset
    ds_valid = encode_examples(X_valid, y_valid).batch(BATCH_SIZE)

    return ds_train, ds_valid, ds_test

#######################FIM FUNÇOES DE APOIO#######################

def get_test_metrics(model, ds_test, y_test):

    #Predictin test dataset
    tf_output = model.predict(ds_test)[0]
    tf_prediction = tf.nn.softmax(tf_output, axis=1)
    # labels = ['Negative','Positive'] #(0:negative, 1:positive)
    label = tf.argmax(tf_prediction, axis=1)
    label_pred = label.numpy()
    # print(label_pred)

    print(classification_report(y_test, label_pred))

    print(confusion_matrix(y_test, label_pred))

    return label_pred


def pipeline_bert(name, X_train, y_train, X_valid, y_valid, X_test, y_test): #X_train = [titulos1, titulos2]

    learning_rate = 2e-5
    number_of_epochs = 3
    ds_train, ds_valid, ds_test = get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test)
    
    # model initialization
    model = TFBertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', from_pt = True)

    # choosing Adam optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric_acc = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric_acc])

    #Training model
    bert_history = model.fit(ds_train, epochs=number_of_epochs, validation_data=ds_valid)
    
    #Predict test data
    result = get_test_metrics(model, ds_test, y_test)
    return result

    # metrics = calc_metrics(y_test, result, name)
    # return metrics
'''

'\n#######################INICIOS FUNÇÕES DE APOIO#######################\n\ndef map_example_to_dict(input_ids, attention_masks, token_type_ids, label):\n  return {\n      "input_ids": input_ids,\n      "token_type_ids": token_type_ids,\n      "attention_mask": attention_masks,\n  }, label\n\n\ndef convert_example_to_feature(titulo_1, titulo_2):\n    return tokenizer.encode_plus(titulo_1, titulo_2,\n                                 add_special_tokens = True, # adiciona [CLS], [SEP]\n                                 max_length = MAX_LENGTH, # comprimento máximo do texto de entrada\n                                 padding = \'max_length\', # adiciona [PAD] até o tam_max (MAX_LENGTH)\n                                 truncation = True, # padrão = \'longest_first\'\n                                 return_attention_mask = True, # adiciona máscara de atenção para não focar nos tokens do pad\n                                )\n\ndef encode_examples(df_titulos, labels, limit = -1):\n    \n  

## Rodando o pipeline do experimento

In [41]:
# Debug knobs for a cheap CPU smoke test. The defaults reproduce the recorded run;
# for the full experiment set N_AMOSTRAS_DEBUG = None and APENAS_PRIMEIRO_DATASET = False.
N_AMOSTRAS_DEBUG = 20            # max rows taken from each split (None = every row)
APENAS_PRIMEIRO_DATASET = True   # stop after the first entry of `arquivos`

# Same slice is applied to every split (DataFrames and label lists alike).
amostra = slice(None, N_AMOSTRAS_DEBUG)

# One classification-report DataFrame per processed dataset.
lista_df_resultado = []
for nome, df in zip(arquivos, lista_df):

    # Inputs are the two title columns; the target is the `match` column.
    X = df[["titulo_1", "titulo_2"]]
    y = df["match"].to_list()

    # Hold out 30% for test, then 20% of the remainder for validation; both splits
    # are stratified on the label and seeded with SEED.
    X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size = 0.3, random_state = SEED, stratify = y)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.2, random_state = SEED, stratify = y_train_valid)

    # fb.pipeline_bert is defined in funcoes_bert (not visible here). Its four return
    # values rebind nome and y_test; later cells read y_test / y_pred from this loop.
    (nome, historico, y_test, y_pred) = fb.pipeline_bert(
        nome,
        X_train[amostra], y_train[amostra],
        X_valid[amostra], y_valid[amostra],
        X_test[amostra], y_test[amostra],
    )

    # NOTE(review): historico is presumably a Keras History object -- confirm in funcoes_bert.
    pd.DataFrame.from_dict(historico.history).to_csv(f'Dados/Resultados/{nome}_historico.csv', index = False)

    # zero_division = 0 keeps precision/F1 at 0.0 for a class that is never predicted
    # (same values as before) without raising UndefinedMetricWarning.
    report = classification_report(y_test, y_pred, output_dict = True, zero_division = 0)
    df_resultado = pd.DataFrame(report).transpose()
    df_resultado['modelo'] = nome

    lista_df_resultado.append(df_resultado)

    if APENAS_PRIMEIRO_DATASET:
        break

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [42]:
# Text report and confusion matrix for the dataset evaluated last; y_test and y_pred
# are the values left bound by the experiment loop.
# zero_division = 0 reports 0.0 for a class that is never predicted (the same value
# as before) without emitting UndefinedMetricWarning.
print(classification_report(y_test, y_pred, zero_division = 0))

print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      1.00      0.95        18
           1       0.00      0.00      0.00         2

    accuracy                           0.90        20
   macro avg       0.45      0.50      0.47        20
weighted avg       0.81      0.90      0.85        20

[[18  0]
 [ 2  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
