# Bibliotecas

In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import funcoes_modelos as fmod
import funcoes_bow as fbow
import funcoes_cos as fcos

import time

## Configurações

In [2]:
# flag_cpu = True
# if flag_cpu == True:
#     # force CPU (make CPU visible)
#     cpus = tf.config.experimental.list_physical_devices('CPU')
#     print(cpus)
#     tf.config.set_visible_devices([], 'GPU')  # hide the GPU
#     tf.config.set_visible_devices(cpus[0], 'CPU') # unhide potentially hidden CPU
#     tf.config.get_visible_devices()

## Constantes

In [3]:
# Random seed passed as random_state to both train_test_split calls in train_test_valid.
SEED = 42

# Leitura dos arquivos

In [4]:
# Base names (without extension) of the dataset CSV files under Dados/Datasets/.
arquivos = ['hn_balanceado', 'hn_desbalanceado', 'sn_balanceado', 'sn_desbalanceado']

# One DataFrame per entry of `arquivos`, in the same order.
# Both EAN columns are forced to str instead of letting pandas infer a dtype.
lista_df = [
    pd.read_csv(f"Dados/Datasets/{nome_arquivo}.csv", dtype = {'ean_1': str, 'ean_2': str})
    for nome_arquivo in arquivos
]

In [5]:
#lista_df[0].dtypes

In [6]:
#print(f"Tamanho dos Datasets:\n\n\t\t| Hard\t| Soft\t|\nBalanceado\t| {lista_df[0].shape[0]}\t| {lista_df[2].shape[0]}\t|\nDesbalanceado\t| {lista_df[1].shape[0]}\t| {lista_df[3].shape[0]}\t|")

# Separando o Dataset

In [7]:
def train_test_valid(df, test_size = 0.3, valid_size = 0.285):
    """Split a title-pair dataset into stratified train, test and validation sets.

    Two consecutive ``train_test_split`` calls are made, both stratified on the
    labels and both using the module-level ``SEED`` as ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"titulo_1"``, ``"titulo_2"`` and ``"match"``.
    test_size : float, default 0.3
        ``test_size`` of the first split (whole dataset -> test set).
    valid_size : float, default 0.285
        ``test_size`` of the second split, i.e. it is applied to what is left
        after the test set was removed. With the defaults the validation set is
        about 0.7 * 0.285 = 0.2 of the whole dataset.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_valid, y_valid)``. The ``X_*``
        items are slices of ``df[["titulo_1", "titulo_2"]]``; the ``y_*`` items
        are the matching slices of ``df["match"].to_list()``.
    """
    X = df[["titulo_1", "titulo_2"]]
    y = df["match"].to_list()

    # First cut: hold out the test set.
    X_train_valid, X_test, y_train_valid, y_test = train_test_split(
        X, y, test_size = test_size, random_state = SEED, stratify = y
    )
    # Second cut: carve the validation set out of the remaining rows.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_valid, y_train_valid, test_size = valid_size, random_state = SEED, stratify = y_train_valid
    )

    # Note the order: test comes before validation in the returned tuple.
    return X_train, y_train, X_test, y_test, X_valid, y_valid


In [8]:
# Split every loaded dataset; each entry is the 6-tuple returned by train_test_valid,
# in the same order as `lista_df`.
lista_train_test_valid = [train_test_valid(dataset_df) for dataset_df in lista_df]

In [9]:
#for nome, dataset in zip(arquivos, lista_train_test_valid):    
#    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

# Experimentos Com Os Modelos

## Funções

In [10]:
def salvar_df_resultado(report, modelo, nome_dataset, tempo_exec):
    """Turn a classification report into a tagged DataFrame and save it as CSV.

    Parameters
    ----------
    report : dict
        Mapping accepted by ``pd.DataFrame`` (the callers pass the result of
        ``classification_report(..., output_dict = True)``). It is transposed so
        that each top-level key becomes a row.
    modelo : str
        Model/method label; stored in the ``modelo`` column and used as the
        output sub-folder name.
    nome_dataset : str
        Dataset label; stored in the ``dataset`` column and used as the file
        name prefix.
    tempo_exec : float
        Execution time; stored as-is in the ``tempo`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The frame that was written to
        ``Dados/Resultados/{modelo}/{nome_dataset}_resultado.csv``.
    """
    import os  # local import: os is not part of the notebook's import cell

    df_resultado = pd.DataFrame(report).transpose()
    df_resultado['modelo'] = modelo
    df_resultado['dataset'] = nome_dataset
    df_resultado['tempo'] = tempo_exec

    # to_csv does not create missing folders, so make sure the per-model folder
    # exists (callers also pass ad-hoc labels such as one per threshold).
    pasta_saida = f'Dados/Resultados/{modelo}'
    os.makedirs(pasta_saida, exist_ok = True)
    df_resultado.to_csv(f'{pasta_saida}/{nome_dataset}_resultado.csv', index = True)

    return df_resultado


def salvar_modelo(modelo, dataset, metodo):
    """Pickle a trained model to ``Dados/Modelos/modelo_{dataset}_{metodo}``.

    Parameters
    ----------
    modelo : object
        Any picklable object.
    dataset : str
        Dataset label used in the file name.
    metodo : str
        Method label used in the file name.
    """
    import os  # local import: os is not part of the notebook's import cell

    nome_arquivo = f"Dados/Modelos/modelo_{dataset}_{metodo}"
    # open() does not create missing folders.
    os.makedirs(os.path.dirname(nome_arquivo), exist_ok = True)
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises.
    with open(nome_arquivo, 'wb') as arquivo_saida:
        pickle.dump(modelo, arquivo_saida)

def carregar_modelo(nome, metodo):
    """Load the object pickled at ``Dados/Modelos/modelo_{nome}_{metodo}``.

    Parameters
    ----------
    nome : str
        Dataset label used in the file name.
    metodo : str
        Method label used in the file name.

    Returns
    -------
    object
        Whatever object was stored in that pickle file.
    """
    # NOTE: pickle.load can execute arbitrary code; only use this on files
    # produced by this project.
    # The context manager closes the handle, which the bare open() call never did.
    with open(f"Dados/Modelos/modelo_{nome}_{metodo}", 'rb') as arquivo_entrada:
        return pickle.load(arquivo_entrada)


def salvar_modelo_bert(modelo, dataset, metodo):
    """Save a model through its own ``save_pretrained`` method.

    The target folder is ``Dados/Modelos/{metodo}/{dataset}/``; ``modelo`` only
    needs to expose ``save_pretrained(path)``.
    """
    modelo.save_pretrained(f"Dados/Modelos/{metodo}/{dataset}/")

def carregar_modelo_bert(nome, dataset, metodo):
    """Load a saved transformer model from ``Dados/Modelos/{metodo}/{dataset}/``.

    ``nome`` is accepted but not used; the folder is selected by ``metodo`` and
    ``dataset`` only.

    NOTE(review): the training log in this notebook mentions
    TFBertForSequenceClassification, while this loads through TFAutoModel, which
    may not restore the classification head. Confirm this is intended.
    """
    # Imported here so transformers is only required when a model is reloaded.
    from transformers import TFAutoModel

    return TFAutoModel.from_pretrained(f"Dados/Modelos/{metodo}/{dataset}/")

## BERT, roBERTa, XLMR e ELECTRA

In [11]:
# Method labels iterated by the experiment loop: each one is passed to fmod.pipeline_bert
# and is also used as the sub-folder name for that method's results and saved model.
metodos = ("BERT", "roBERTa",  "XLMR", "ELECTRA")

In [12]:
# Row cap: every split is sliced to its first `fim` items before being handed to the pipeline.
fim = 200
for metodo in metodos:

    # NOTE(review): re-created for every method, so once the outer loop ends only the
    # last method's result frames are left in this list. Confirm that is intended.
    lista_df_resultado = []
    
    for nome, dataset in zip(arquivos, lista_train_test_valid):    
        
        # Order matches the tuple returned by train_test_valid (test before validation).
        X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

        # Wall-clock time of the whole pipeline_bert call.
        start_time = time.time()
        # Full-data variant, kept for reference; the active call below uses the capped splits.
        #(nome, historico, y_test, y_pred, modelo) = fmod.pipeline_bert(metodo, nome, X_train, y_train, X_valid, y_valid, X_test, y_test)
        # `nome` and `y_test` are rebound to what the pipeline returns, so the paths and the
        # report below use the returned values rather than the loop variables.
        (nome, historico, y_test, y_pred, modelo) = fmod.pipeline_bert(metodo, nome, X_train[:fim], y_train[:fim], X_valid[:fim], y_valid[:fim], X_test[:fim], y_test[:fim])
        end_time = time.time()
        runtime = end_time - start_time
        
        # Save the contents of historico.history as a CSV next to the result file.
        pd.DataFrame.from_dict(historico.history).to_csv(f'Dados/Resultados/{metodo}/{nome}_historico.csv', index = False)

        report = classification_report(y_test, y_pred, output_dict = True)
        
        # Save the metrics table and the trained model for this (method, dataset) pair.
        df_resultado = salvar_df_resultado(report, metodo, nome, runtime)
        salvar_modelo_bert(modelo, nome, metodo)

        lista_df_resultado.append(df_resultado)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ResourceExhaustedError:  OOM when allocating tensor with shape[64,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node tf_bert_for_sequence_classification/bert/encoder/layer_._0/attention/self/Softmax (defined at c:\Users\mcso\anaconda3\envs\tf_gpu\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:306) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_24376]

Errors may have originated from an input operation.
Input Source operations connected to node tf_bert_for_sequence_classification/bert/encoder/layer_._0/attention/self/Softmax:
 tf_bert_for_sequence_classification/bert/encoder/layer_._0/attention/self/Add (defined at c:\Users\mcso\anaconda3\envs\tf_gpu\lib\site-packages\transformers\models\bert\modeling_tf_bert.py:303)

Function call stack:
train_function


## BoW Coocorrência

In [None]:
# Disabled cell: the whole BoW experiment is wrapped in a triple-quoted string, so none
# of the statements inside run; the cell only evaluates a string literal.
'''metodo = "BoW"
lista_df_resultado = []
fim = 150

for nome, dataset in zip(arquivos, lista_train_test_valid):
    
    #print(f"\n\nNome: {nome}\n\n")
    
    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

    start_time = time.time()
    
    name_dataset, y_test, y_pred, modelo = fbow.pipeline_rf(nome, X_train, y_train, X_valid, y_valid, X_test, y_test)
    # name_dataset, y_test, y_pred = fbow.pipeline_rf(nome, X_train[:fim], y_train[:fim], X_valid[:fim], y_valid[:fim], X_test[:fim], y_test[:fim])
    
    #print(f"\n\n{nome}\n\n")

    end_time = time.time()
    runtime = end_time - start_time

    report = classification_report(y_test, y_pred, output_dict = True)
    
    df_resultado = salvar_df_resultado(report, metodo, nome, runtime)
    salvar_modelo(modelo, nome, "BoWCo")

    lista_df_resultado.append(df_resultado)'''



Nome: hn_balanceado




[32m[I 2022-07-13 16:21:13,544][0m A new study created in memory with name: no-name-644541cf-a519-4301-9f11-72a5486de4cd[0m
[32m[I 2022-07-13 16:21:33,306][0m Trial 0 finished with value: 0.9092092417832736 and parameters: {'n_estimators': 148, 'max_depth': 41, 'criterion': 'gini'}. Best is trial 0 with value: 0.9092092417832736.[0m
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




Nome: hn_desbalanceado




[32m[I 2022-07-13 16:22:06,089][0m A new study created in memory with name: no-name-94560645-55fb-4b98-80d4-bfc649094fe9[0m
[32m[I 2022-07-13 16:24:50,624][0m Trial 0 finished with value: 0.9090909090909091 and parameters: {'n_estimators': 586, 'max_depth': 60, 'criterion': 'entropy'}. Best is trial 0 with value: 0.9090909090909091.[0m
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




Nome: sn_balanceado




[32m[I 2022-07-13 16:27:44,431][0m A new study created in memory with name: no-name-71ed33ef-ed21-4a73-b3b6-ab371f0d356a[0m


KeyboardInterrupt: 

## Similaridade de Cosseno

In [None]:
# Disabled cell: the cosine-similarity threshold sweep is wrapped in a triple-quoted
# string, so none of the statements inside run; the cell only evaluates a string literal.
'''metodo = "Cos"
lista_df_resultado = []
thresh_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for nome, dataset in zip(arquivos, lista_train_test_valid):

    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

    for threshold in thresh_list:
        start_time = time.time()

        name_dataset, y_test, y_pred = fcos.cos_threshold(nome, X_test, y_test, threshold)

        end_time = time.time()
        runtime = end_time - start_time

        report = classification_report(y_test, y_pred, output_dict = True)
        
        df_resultado = salvar_df_resultado(report, f'{metodo}_0{int(threshold*10)}', nome, runtime)

        lista_df_resultado.append(df_resultado)'''

    

In [None]:
#len(lista_df_resultado)

28