# Bibliotecas

In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import funcoes_bow as fbow
import funcoes_modelos as fmod

## Configurações

In [2]:
# Set to False to leave TensorFlow's default device visibility untouched.
flag_cpu = True
if flag_cpu:
    # Restrict TensorFlow to the first physical CPU.
    cpus = tf.config.experimental.list_physical_devices('CPU')
    print(cpus)
    # An empty list for 'GPU' hides every GPU from the runtime.
    tf.config.set_visible_devices([], 'GPU')
    # Explicitly mark the first CPU as visible in case it was hidden earlier.
    tf.config.set_visible_devices(cpus[0], 'CPU')
    # The return value is discarded here (an expression inside `if` is not displayed).
    tf.config.get_visible_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


## Constantes

In [3]:
# Seed passed as random_state to every train_test_split call below, so the splits are reproducible.
SEED = 42

# Leitura dos arquivos

In [4]:
# Base names of the CSV files under Dados/Datasets; lista_df keeps the same order.
arquivos = ['hn_balanceado', 'hn_desbalanceado', 'sn_balanceado', 'sn_desbalanceado']

lista_df = []
for arquivo in arquivos:
    caminho_csv = f"Dados/Datasets/{arquivo}.csv"
    # The two EAN columns are read as strings rather than parsed as numbers.
    df = pd.read_csv(caminho_csv, dtype = {'ean_1': str, 'ean_2': str})
    lista_df.append(df)

In [5]:
#lista_df[0].dtypes

In [6]:
# Row counts per dataset. lista_df follows the order of `arquivos`, so indices 0/2 are
# the balanced files (printed under Hard/Soft) and indices 1/3 the unbalanced ones.
print(f"Tamanho dos Datasets:\n\n\t\t| Hard\t| Soft\t|\nBalanceado\t| {lista_df[0].shape[0]}\t| {lista_df[2].shape[0]}\t|\nDesbalanceado\t| {lista_df[1].shape[0]}\t| {lista_df[3].shape[0]}\t|")

Tamanho dos Datasets:

		| Hard	| Soft	|
Balanceado	| 8400	| 8400	|
Desbalanceado	| 13290	| 13290	|


# Separando o Dataset

In [7]:
def train_test_valid(df):
    """Split a pair dataset into stratified train, test and validation parts.

    The features are the two title columns ("titulo_1", "titulo_2") and the
    target is the "match" column, converted to a plain list.

    Two chained stratified splits are made with the module-level SEED:
    30% of the rows go to test, then 28.5% of the remaining 70%
    (about 20% of the total) go to validation, leaving about 50% for training.

    Args:
        df: DataFrame containing "titulo_1", "titulo_2" and "match".

    Returns:
        Tuple (X_train, y_train, X_test, y_test, X_valid, y_valid).
    """
    features = df[["titulo_1", "titulo_2"]]
    labels = df["match"].to_list()

    # First cut: hold out the test set.
    X_restante, X_test, y_restante, y_test = train_test_split(
        features,
        labels,
        test_size = 0.3,
        random_state = SEED,
        stratify = labels,
    )

    # Second cut: carve the validation set out of what is left.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_restante,
        y_restante,
        test_size = 0.285,
        random_state = SEED,
        stratify = y_restante,
    )

    return X_train, y_train, X_test, y_test, X_valid, y_valid


In [8]:
# One (X_train, y_train, X_test, y_test, X_valid, y_valid) tuple per dataset,
# in the same order as lista_df.
lista_train_test_valid = [train_test_valid(df) for df in lista_df]

In [9]:
#for nome, dataset in zip(arquivos, lista_train_test_valid):    
#    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

# Experimentos Com Os Modelos

## Funções

In [11]:
def salvar_df_resultado(report, modelo, nome_dataset, tempo_exec):
    """Turn a classification report into a DataFrame and save it as CSV.

    The report is transposed so each report entry becomes a row, then tagged
    with the model name, dataset name and execution time. The file is written
    to Dados/Resultados/<modelo>/<nome_dataset>_resultado.csv; the folder is
    created when missing so a new model name does not make the write fail.

    Args:
        report: Dict-like report; callers pass the result of
            classification_report(..., output_dict=True).
        modelo: Model label, stored in the "modelo" column and used as the
            output folder name.
        nome_dataset: Dataset label, stored in the "dataset" column and used
            as the file-name prefix.
        tempo_exec: Elapsed time stored in the "tempo" column (callers pass a
            difference of time.time() values, i.e. seconds).

    Returns:
        The DataFrame that was written to disk.
    """
    df_resultado = pd.DataFrame(report).transpose()
    df_resultado['modelo'] = modelo
    df_resultado['dataset'] = nome_dataset
    df_resultado['tempo'] = tempo_exec

    # to_csv does not create missing parent folders, so make sure it exists first.
    pasta_saida = Path('Dados/Resultados') / str(modelo)
    pasta_saida.mkdir(parents = True, exist_ok = True)

    df_resultado.to_csv(pasta_saida / f'{nome_dataset}_resultado.csv', index = True)

    return df_resultado

## BERT, roBERTa, XLMR e ELECTRA

In [12]:
# NOTE(review): `a` is not defined in any visible cell, so this cell raises NameError.
# Presumably a deliberate stop so "Run All" does not reach the model-training cells
# below — confirm. The next cell iterates over `modelos`, which no visible cell
# defines; this cell is the natural place to define it.
a

NameError: name 'a' is not defined

In [None]:
# NOTE(review): `modelos` is not defined in any visible cell. It must be an iterable of
# model identifiers accepted by fmod.pipeline_bert — define it before running this cell.

# Number of leading rows taken from each split. 5 is a smoke-test size;
# set to None to train and evaluate on the full splits.
N_AMOSTRAS = 5

# Collected across every model and dataset, so no result is dropped when the
# outer loop moves on to the next model.
lista_df_resultado = []

for modelo in modelos:

    # The training history is written here; create the folder up front
    # because to_csv does not create missing parent folders.
    pasta_modelo = Path(f'Dados/Resultados/{modelo}')
    pasta_modelo.mkdir(parents = True, exist_ok = True)

    for nome, dataset in zip(arquivos, lista_train_test_valid):

        X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

        start_time = time.time()
        # The pipeline's return values get their own names so the loop variable
        # `nome` and the unpacked `y_test` are not overwritten.
        (nome_retornado, historico, y_true, y_pred) = fmod.pipeline_bert(
            modelo,
            nome,
            X_train[:N_AMOSTRAS],
            y_train[:N_AMOSTRAS],
            X_valid[:N_AMOSTRAS],
            y_valid[:N_AMOSTRAS],
            X_test[:N_AMOSTRAS],
            y_test[:N_AMOSTRAS],
        )
        end_time = time.time()
        runtime = end_time - start_time

        # Per-epoch training history, one CSV per (model, dataset).
        pd.DataFrame.from_dict(historico.history).to_csv(pasta_modelo / f'{nome_retornado}_historico.csv', index = False)

        report = classification_report(y_true, y_pred, output_dict = True)

        df_resultado = salvar_df_resultado(report, modelo, nome_retornado, runtime)

        lista_df_resultado.append(df_resultado)

## BoW Coocorrência

In [None]:
modelo = "BoW"
lista_df_resultado = []

# Dataset names in `arquivos` and split tuples in lista_train_test_valid share the same order.
for nome, (X_train, y_train, X_test, y_test, X_valid, y_valid) in zip(arquivos, lista_train_test_valid):

    # Only the pipeline call is timed.
    inicio = time.time()
    name_dataset, y_true, y_pred = fbow.pipeline_rf(nome, X_train, y_train, X_valid, y_valid, X_test, y_test)
    runtime = time.time() - inicio

    report = classification_report(y_true, y_pred, output_dict = True)

    lista_df_resultado.append(salvar_df_resultado(report, modelo, nome, runtime))

In [None]:
# Display the first stored result frame (index 0 of lista_df_resultado).
lista_df_resultado[0]

## Similaridade de Cosseno

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

###Bag of Words###
def get_cos(dataframe):
    """Cosine similarity for each consecutive pair of vectors built from `dataframe`.

    Vectors come from fbow.vectorize_dataframe(dataframe, binario=False) and are
    consumed two at a time: positions (0, 1), (2, 3), and so on.
    NOTE(review): presumably the vectors alternate between the two titles of
    each row — confirm in funcoes_bow.

    Args:
        dataframe: Input handed unchanged to fbow.vectorize_dataframe.

    Returns:
        List with one similarity (1 - cosine distance) per pair.
    """
    # The fitted vectorizer is returned as well but is not needed here.
    vetores, _vetorizador = fbow.vectorize_dataframe(dataframe, binario = False)

    inicios_de_par = range(0, len(vetores), 2)
    return [
        1 - spatial.distance.cosine(vetores[k], vetores[k + 1])
        for k in inicios_de_par
    ]

In [14]:
def cos_threshold(name_dataset, df_X, y_test, threshold):
    """Predict a match when the cosine similarity of a pair exceeds `threshold`.

    The similarities are kept in a local Series instead of being written into
    `df_X`, so the caller's frame is left unchanged. This matters because the
    same frame is passed in again for every threshold and is handed to get_cos
    each time.

    Args:
        name_dataset: Dataset label, returned unchanged.
        df_X: DataFrame forwarded to get_cos; one similarity per row is expected.
        y_test: True labels, returned unchanged.
        threshold: Similarity strictly greater than this value is predicted as 1.

    Returns:
        Tuple (name_dataset, y_test, y_pred) where y_pred is an integer Series
        named "cos<threshold>" sharing df_X's index.

    Raises:
        ValueError: If get_cos returns a number of values different from len(df_X).
    """
    cos_sim = pd.Series(get_cos(df_X), index = df_X.index, name = "cos_sim")

    y_pred = (cos_sim > threshold).astype(int).rename(f'cos{threshold}')

    return name_dataset, y_test, y_pred

In [15]:
modelo = "Cos"
thresh_list = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
lista_df_resultado = []

for nome, dataset in zip(arquivos, lista_train_test_valid):

    # Only the test split is passed to cos_threshold; train/valid are unpacked but unused.
    X_train, y_train, X_test, y_test, X_valid, y_valid = dataset

    for threshold in thresh_list:
        # Each threshold is timed separately, similarity computation included.
        inicio = time.time()
        name_dataset, y_true, y_pred = cos_threshold(nome, X_test, y_test, threshold)
        runtime = time.time() - inicio

        report = classification_report(y_true, y_pred, output_dict = True)

        # Label such as "Cos_03" for threshold 0.3; also used as the output folder name.
        rotulo_modelo = f'{modelo}_0{int(threshold*10)}'
        lista_df_resultado.append(salvar_df_resultado(report, rotulo_modelo, nome, runtime))

    

In [16]:
# Expect 28 frames: 4 datasets x 7 thresholds.
len(lista_df_resultado)

28