# Experimento com BERT

In [3]:
import pandas as pd

import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification

In [5]:
# Load the labelled smartphone offer pairs; both EAN columns are read as strings.
df = pd.read_csv(
    "Dados/total_pares_smartphones.csv",
    dtype=dict(ean_1=str, ean_2=str),
)
df.head(3)

Unnamed: 0,descricao_1,ean_1,titulo_1,url_1,titulo_cb_1,loja_1,descricao_2,ean_2,titulo_2,url_2,titulo_cb_2,loja_2,match
0,O smartphone Samsung Galaxy S22 Ultra chegou p...,7892509122382,Smartphone Samsung Galaxy S22 Ultra 256GB 5G W...,https://www.americanas.com.br/produto/4645251827,smartphone samsung galaxy s22 ultra 256gb 5g w...,americanas,O Smartphone bison X10 da umidigi é ultra-resi...,6973553520636,Smartphone umidigi bison X10 Dual Sim lte 6.53...,https://www.americanas.com.br/produto/4707302977,smartphone umidigi bison x10 dual sim lte 6.53...,americanas,0
1,,8908012587507,Smartphone Xiaomi Redmi 9 India 64GB 4GB RAM T...,https://www.amazon.com.br/Smartphone-Xiaomi-Re...,smartphone xiaomi redmi 9 india 64gb 4gb ram t...,amazon,O Moto E40 é um smartphone projetado para ofer...,7892597351367,Smartphone Motorola Moto E40 64GB 4G Wi-Fi Tel...,https://www.americanas.com.br/produto/4120059496,smartphone motorola moto e40 64gb 4g wi-fi tel...,americanas,0
2,Todas as informações divulgadas são de respons...,7892509117722,Smartphone Samsung Galaxy A32 128GB 4G Wi-Fi T...,https://www.americanas.com.br/produto/2969966651,smartphone samsung galaxy a32 128gb 4g wi-fi t...,americanas,,7892509117722,Smartphone Samsung Galaxy A32 128GB 6.4 Octa C...,https://www.amazon.com.br/Smartphone-Samsung-G...,smartphone samsung galaxy a32 128gb 6.4 octa c...,amazon,1


In [7]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

descricao_1    object
ean_1          object
titulo_1       object
url_1          object
titulo_cb_1    object
loja_1         object
descricao_2    object
ean_2          object
titulo_2       object
url_2          object
titulo_cb_2    object
loja_2         object
match           int64
dtype: object

In [9]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(480, 13)

In [8]:
# Pairs whose two listings carry exactly the same title string.
df.loc[df["titulo_1"].eq(df["titulo_2"])]

Unnamed: 0,descricao_1,ean_1,titulo_1,url_1,titulo_cb_1,loja_1,descricao_2,ean_2,titulo_2,url_2,titulo_cb_2,loja_2,match
433,,7892509120623,Smartphone Samsung Galaxy A52s 5G 128GB 6.5 6G...,https://www.amazon.com.br/Smartphone-Galaxy-A5...,smartphone samsung galaxy a52s 5g 128gb 6.5 6g...,amazon,Características: - Marca: Samsung - Modelo: A5...,7892509120623,Smartphone Samsung Galaxy A52s 5G 128GB 6.5 6G...,https://www.americanas.com.br/produto/4472100131,smartphone samsung galaxy a52s 5g 128gb 6.5 6g...,americanas,1


## Otimizador e Inicialização do Modelo

In [None]:
from transformers import BertTokenizer
from transformers import TFBertForSequenceClassification
import tensorflow_addons as tfa
test = None  # placeholder; not referenced by any code visible in this notebook section

# Sequence length every example is padded/truncated to (BERT accepts up to 512 tokens).
MAX_LENGTH = 256
BATCH_SIZE = 1
# The checkpoint is *cased*: lower-casing must stay off, otherwise BertTokenizer
# lower-cases the text (and, by default, strips accents along with it), which no
# longer matches the cased Portuguese vocabulary the model was pre-trained with.
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)

####################### HELPER FUNCTIONS: BEGIN #######################

def get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test):
    """Encode the three splits and return them as batched tf.data datasets.

    Each (X, y) pair is passed to ``encode_examples`` and batched with
    ``BATCH_SIZE``. Only the training split is shuffled.

    Args:
        X_train, y_train: training inputs and labels.
        X_valid, y_valid: validation inputs and labels.
        X_test, y_test: test inputs and labels.

    Returns:
        Tuple ``(ds_train, ds_valid, ds_test)`` -- note the order: train,
        validation, test.
    """
    # Train dataset. The shuffle buffer spans the whole split so the order is
    # uniformly random; a tiny fixed buffer would leave examples almost in their
    # original order. The cardinality is known because the dataset is built
    # from in-memory slices. Clamp to 1 so an empty split stays valid.
    ds_train = encode_examples(X_train, y_train)
    one = tf.constant(1, dtype=tf.int64)
    shuffle_buffer = tf.maximum(ds_train.cardinality(), one)
    ds_train = ds_train.shuffle(shuffle_buffer).batch(BATCH_SIZE)

    # Test dataset (kept in order).
    ds_test = encode_examples(X_test, y_test).batch(BATCH_SIZE)

    # Validation dataset (kept in order).
    ds_valid = encode_examples(X_valid, y_valid).batch(BATCH_SIZE)

    return ds_train, ds_valid, ds_test


def encode_examples(texts, labels, limit=-1):
    """Tokenize ``texts`` and pack them with ``labels`` into a tf.data.Dataset.

    Args:
        texts: iterable of inputs; each item is passed to
            ``convert_example_to_feature``.
        labels: iterable of labels, paired positionally with ``texts``.
        limit: when greater than 0, only the first ``limit`` (text, label)
            pairs are encoded; any other value (default ``-1``) encodes all.

    Returns:
        A ``tf.data.Dataset`` whose elements are the ``(features, label)``
        pairs produced by ``map_example_to_dict``. Every label is wrapped in a
        one-element list before slicing.
    """
    from itertools import islice

    pairs = zip(texts, labels)
    if limit > 0:
        # Cap the examples before encoding: no dataset object exists yet at
        # this point, so the limit has to be applied to the raw pairs.
        pairs = islice(pairs, limit)

    # Per-field lists, so the final dataset can be built from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    for text, label in pairs:
        bert_input = convert_example_to_feature(text)
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])

    # Tuple order must match the parameter order of map_example_to_dict.
    slices = (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    return tf.data.Dataset.from_tensor_slices(slices).map(map_example_to_dict)


def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    """Regroup one flat example into a ``(features, label)`` pair.

    The three encoder inputs are placed in a dict under the keys
    ``input_ids``, ``token_type_ids`` and ``attention_mask``; the label is
    returned unchanged as the second element of the pair.
    """
    features = dict(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
        attention_mask=attention_masks,
    )
    return features, label


def convert_example_to_feature(review):
    """Tokenize one example with the module-level ``tokenizer``.

    Args:
        review: the input to encode; forwarded unchanged to
            ``tokenizer.encode_plus``.

    Returns:
        The encoding returned by ``tokenizer.encode_plus``; callers read its
        ``input_ids``, ``token_type_ids`` and ``attention_mask`` entries.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,     # add [CLS], [SEP]
        max_length=MAX_LENGTH,       # fixed length of every encoded example
        # Explicit replacements for the deprecated ``pad_to_max_length=True``
        # and for the implicit truncation that ``max_length`` used to trigger.
        padding='max_length',        # add [PAD] tokens up to MAX_LENGTH
        truncation=True,             # cut inputs longer than MAX_LENGTH
        return_attention_mask=True,  # mask so attention ignores [PAD] tokens
    )


####################### HELPER FUNCTIONS: END #######################


def pipeline_bert(name, X_train, y_train, X_valid, y_valid, X_test, y_test):
    """Fine-tune the Portuguese BERT sequence classifier on the given splits.

    Args:
        name: experiment label. Currently unused by this function.
        X_train, y_train: training inputs and labels.
        X_valid, y_valid: validation inputs and labels (used during ``fit``).
        X_test, y_test: test inputs and labels. They are encoded, but no test
            evaluation is performed yet (see TODO below).

    Returns:
        Tuple ``(model, bert_history)``: the fitted model and the object
        returned by ``model.fit``.
    """
    learning_rate = 2e-5
    number_of_epochs = 3
    ds_train, ds_valid, ds_test = get_bert_data(X_train, y_train, X_valid, y_valid, X_test, y_test)

    # Model initialization. Use the namespaced hub id so the weights come from
    # the same checkpoint the tokenizer is loaded from.
    model = TFBertForSequenceClassification.from_pretrained(
        'neuralmind/bert-base-portuguese-cased', from_pt=True
    )

    # Adam optimizer; the model outputs logits, hence from_logits=True.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric_acc = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric_acc])

    # Train, validating on ds_valid after each epoch.
    bert_history = model.fit(ds_train, epochs=number_of_epochs, validation_data=ds_valid)

    # TODO: evaluate on ds_test and report metrics under ``name``. The earlier
    # draft referenced bert.get_test_metrics(model, ds_test, y_test) and
    # calc_metrics(y_test, result, name), which are not defined in this notebook.

    # Hand back the results so the trained model is not lost to the caller.
    return model, bert_history
    
    

