# Bibliotecas

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import funcoes_modelos as fmod
import funcoes_bow as fbow

import time

## Configurações

In [None]:
# Set to True to restrict TensorFlow to the CPU even when a GPU is present.
flag_cpu = True
if flag_cpu:
    # force CPU (make CPU visible)
    cpus = tf.config.list_physical_devices('CPU')
    print(cpus)
    tf.config.set_visible_devices([], 'GPU')  # hide the GPU
    tf.config.set_visible_devices(cpus[0], 'CPU')  # unhide potentially hidden CPU
    # Print explicitly: an expression nested inside an `if` block is not
    # echoed by the notebook, so the bare call showed nothing.
    print(tf.config.get_visible_devices())

## Constantes

In [None]:
# Seed passed as `random_state` to both splits in train_test_valid so the partitions are reproducible.
SEED = 42

# Leitura dos arquivos

In [None]:
# Base names of the CSV files under Dados/Datasets/. The order matters:
# later cells index lista_df by position.
arquivos = [
    'hn_balanceado',
    'hn_desbalanceado',
    'sn_balanceado',
    'sn_desbalanceado',
]

# Both EAN columns are read as strings rather than letting pandas infer a type.
tipos_ean = {'ean_1': str, 'ean_2': str}

lista_df = []
for arquivo in arquivos:
    caminho_csv = f"Dados/Datasets/{arquivo}.csv"
    df = pd.read_csv(caminho_csv, dtype=tipos_ean)
    lista_df.append(df)

In [None]:
#lista_df[0].dtypes

In [None]:
# Print a small table with the row count of each dataset (columns Hard/Soft, rows Balanceado/Desbalanceado).
# lista_df is indexed by position, following the order of `arquivos`:
# 0 = hn_balanceado, 1 = hn_desbalanceado, 2 = sn_balanceado, 3 = sn_desbalanceado.
print(f"Tamanho dos Datasets:\n\n\t\t| Hard\t| Soft\t|\nBalanceado\t| {lista_df[0].shape[0]}\t| {lista_df[2].shape[0]}\t|\nDesbalanceado\t| {lista_df[1].shape[0]}\t| {lista_df[3].shape[0]}\t|")

# Separando o Dataset

In [None]:
def train_test_valid(df):
    """Split a dataset into stratified train, test and validation partitions.

    The two title columns are the features and the ``match`` column is the
    label. 30% of the rows are held out for testing; 28.5% of the remaining
    rows are then held out for validation. Both splits are stratified on the
    label and use ``SEED`` as ``random_state``.

    Args:
        df: DataFrame with ``titulo_1``, ``titulo_2`` and ``match`` columns.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test, X_valid, y_valid)``.
    """
    features = df[["titulo_1", "titulo_2"]]
    labels = df["match"].to_list()

    # First cut: hold out the test partition.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features,
        labels,
        test_size=0.3,
        random_state=SEED,
        stratify=labels,
    )

    # Second cut: carve the validation partition out of what is left.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest,
        y_rest,
        test_size=0.285,
        random_state=SEED,
        stratify=y_rest,
    )

    return X_train, y_train, X_test, y_test, X_valid, y_valid


In [None]:
# One entry per loaded dataset, in the same order as lista_df; each entry is
# the 6-tuple returned by train_test_valid.
lista_train_test_valid = [train_test_valid(df) for df in lista_df]

In [None]:
# The loop body only unpacks each split; once it finishes, `nome`, `dataset`
# and the six partition variables hold the values of the last dataset.
for nome, dataset in zip(arquivos, lista_train_test_valid):    
    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

In [None]:
# Upper bound used to slice the partitions; in the visible code it is only
# referenced by a commented-out pipeline call — presumably a quick-run setting.
fim = 1

In [None]:
# Model label for this section; it is not read inside the loop below.
modelo = "BoW"

# Trial run of fbow.pipeline_rf: only the first dataset is processed because
# of the unconditional `break` at the end of the loop body.
for nome, dataset in zip(arquivos, lista_train_test_valid):    
    
    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

    # NOTE(review): start_time is recorded but no elapsed time is computed in this cell.
    start_time = time.time()
    
    # The three return values are kept as globals and inspected by the next cell.
    X_train_vec, X_valid_vec, X_test_vec = fbow.pipeline_rf(nome, X_train, y_train, X_valid, y_valid, X_test, y_test)
    #X_train_vec, X_valid_vec, X_test_vec = fbow.pipeline_rf(nome, X_train[:fim], y_train[:fim], X_valid[:fim], y_valid[:fim], X_test[:fim], y_test[:fim])
    
    print(f"\n\n{nome}\n\n")

    break

In [None]:
# Display the first element of the training output returned by fbow.pipeline_rf.
X_train_vec[0]

In [None]:
    # NOTE(review): `a` is not defined anywhere in the visible code, so this cell
    # presumably fails on purpose to stop a "Run All" at this point — confirm.
    a

# Experimentos Com Os Modelos

## Funções

In [None]:
def salvar_df_resultado(report, modelo, nome_dataset, tempo_exec):
    """Turn a classification report into a DataFrame, tag it and save it as CSV.

    Args:
        report: Mapping such as the one produced by
            ``classification_report(..., output_dict=True)``; each top-level
            key becomes one row of the result.
        modelo: Model name; stored in the ``modelo`` column and used as the
            output sub-directory.
        nome_dataset: Dataset name; stored in the ``dataset`` column and used
            as the output file prefix.
        tempo_exec: Execution time, stored as-is in the ``tempo`` column.

    Returns:
        The tagged DataFrame that was written to
        ``Dados/Resultados/{modelo}/{nome_dataset}_resultado.csv``.
    """
    from pathlib import Path  # local import keeps the cell self-contained

    df_resultado = pd.DataFrame(report).transpose()
    df_resultado['modelo'] = modelo
    df_resultado['dataset'] = nome_dataset
    df_resultado['tempo'] = tempo_exec

    # to_csv does not create missing directories; without this, a model that
    # has no results folder yet would lose its finished run to an OSError.
    caminho_saida = Path(f'Dados/Resultados/{modelo}/{nome_dataset}_resultado.csv')
    caminho_saida.parent.mkdir(parents=True, exist_ok=True)
    df_resultado.to_csv(caminho_saida, index=True)

    return df_resultado

## BERT, roBERTa, XLMR e ELECTRA

In [None]:
    # NOTE(review): `a` is not defined anywhere in the visible code, so this cell
    # presumably fails on purpose to stop a "Run All" before the experiments — confirm.
    a

In [None]:
# Model names to evaluate; each one is passed to fmod.pipeline_bert and also
# used as the sub-directory name under Dados/Resultados/.
modelos = ("BERT", "roBERTa", "XLMR", "ELECTRA")

In [None]:
# For every model and every dataset: run fmod.pipeline_bert, time the call,
# then save the training history and the classification report as CSV files.
for modelo in modelos:
    
    # NOTE(review): reset for each model and never read afterwards in this cell.
    lista_df_resultado = []
    
    for nome, dataset in zip(arquivos, lista_train_test_valid):    
        
        X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

        start_time = time.time()
        # NOTE(review): every partition is cut to its first 5 rows — looks like a
        # smoke-test setting; confirm before running the real experiments.
        # The call also rebinds `nome` and `y_test` with the values the pipeline returns.
        (nome, historico, y_test, y_pred) = fmod.pipeline_bert(modelo, nome, X_train[:5], y_train[:5], X_valid[:5], y_valid[:5], X_test[:5], y_test[:5])
        end_time = time.time()
        runtime = end_time - start_time
        
        # One history CSV per model/dataset. The target directory must already exist.
        # presumably `historico` is a Keras History object — verify in funcoes_modelos.
        pd.DataFrame.from_dict(historico.history).to_csv(f'Dados/Resultados/{modelo}/{nome}_historico.csv', index = False)

        report = classification_report(y_test, y_pred, output_dict = True)
        
        df_resultado = salvar_df_resultado(report, modelo, nome, runtime)

        lista_df_resultado.append(df_resultado)

## BoW

In [None]:
# Model label for the BoW section; only the disabled code after the `break` refers to it.
modelo = "BoW"

# NOTE(review): this loop stops after the first dataset (unconditional `break`)
# and passes only the first row of each partition to fbow.pipeline_rf. The
# timing/report/save steps after the `break` sit inside a string literal and never run.
for nome, dataset in zip(arquivos, lista_train_test_valid):    
    
    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

    # NOTE(review): start_time is only consumed by the disabled code below.
    start_time = time.time()
    #(nome, y_test, y_pred) = fbow.pipeline_rf(nome, X_train, y_train, X_valid, y_valid, X_test, y_test)
    X_train_vec, X_valid_vec, X_test_vec = fbow.pipeline_rf(nome, X_train[:1], y_train[:1], X_valid[:1], y_valid[:1], X_test[:1], y_test[:1])

    break
    '''end_time = time.time()
    runtime = end_time - start_time

    report = classification_report(y_test, y_pred, output_dict = True)
    
    df_resultado = salvar_df_resultado(report, modelo, nome, runtime)

    lista_df_resultado.append(df_resultado)'''

    

In [None]:
# Preview the training features left bound by the preceding loop.
X_train.head()

In [None]:
# Length of the first element of the training output returned by fbow.pipeline_rf.
len(X_train_vec[0])

In [None]:
# NOTE(review): in the visible code y_pred is only assigned by the
# fmod.pipeline_bert loop, while y_test is rebound afterwards by the BoW loop —
# the two may not come from the same run; confirm before trusting these numbers.
print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

In [None]:
# Display the most recently built result table. In the visible code it is only
# assigned inside the fmod.pipeline_bert loop; the BoW assignment is disabled.
df_resultado

In [None]:
# Disabled experiment loop kept inside a string literal; nothing in it executes.
'''lista_df_resultado = []
for nome, df in zip(arquivos, lista_df):
    
    X = df[["titulo_1", "titulo_2"]]#[df["titulo_1"].to_list(), df["titulo_2"].to_list()]
    y = df["match"].to_list()

    X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size = 0.3, random_state = SEED, stratify = y)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.2, random_state = SEED, stratify = y_train_valid)

    (nome, historico, y_test, y_pred) = fmod.pipeline_bert(nome, X_train[:5], y_train[:5], X_valid[:5], y_valid[:5], X_test[:5], y_test[:5])

    pd.DataFrame.from_dict(historico.history).to_csv(f'Dados/Resultados/BERTo/{nome}_historico.csv', index = False)

    report = classification_report(y_test, y_pred, output_dict = True)
    df_resultado = pd.DataFrame(report).transpose()
    df_resultado['modelo'] = nome

    break

    #df_resultado.to_csv(f'Dados/Resultados/BERTo/{nome}_resultado.csv', index = True)

    lista_df_resultado.append(df_resultado)'''