# Importando as Bibliotecas

In [1]:
import ranking

import pandas as pd
import numpy as np
import string

from scipy import spatial

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

import time

# Lendo os Datasets

In [2]:
# Base names of the dataset files; each one has a train and a test CSV.
arquivos = ['ale_1_1', 'ale_5_1', 'hn_1_1', 'hn_5_1']

# Train / test DataFrames, stored in the same order as `arquivos`.
lista_df_treino = []
lista_df_teste = []

for arquivo in arquivos:
    caminho_treino = f"Dados/Datasets/Treino/{arquivo}_treino.csv"
    caminho_teste = f"Dados/Datasets/Teste/{arquivo}_teste.csv"

    # Both EAN columns are read as strings instead of letting pandas infer a dtype.
    df_treino = pd.read_csv(caminho_treino, dtype = {'ean_1': str, 'ean_2': str})
    df_teste = pd.read_csv(caminho_teste, dtype = {'ean_1': str, 'ean_2': str})

    lista_df_treino.append(df_treino)
    lista_df_teste.append(df_teste)

In [3]:
# Work on the test split of the first dataset listed in `arquivos`
# (lista_df_teste is filled in the same order as `arquivos`).
df_teste = lista_df_teste[0]

# Aplicando o BoW

In [4]:
# Remove punctuation and accents. The return value is not used, so this
# presumably modifies df_teste in place — TODO confirm in ranking.fazer_pre_processamento.
ranking.fazer_pre_processamento(df_teste)

# Put the titles into a DataFrame with a single title column ('titulo_sa');
# later cells also read its 'ean' and 'categoria' columns.
df_concat = ranking.concatenar_df(df_teste)

# Compute the maximum title size (unit — characters or tokens — not visible here; verify in ranking).
tam_max = ranking.calcular_tam_max(df_concat['titulo_sa'])

In [5]:
# Compute the bag-of-words representation of the titles. tam_max is passed as `mf`
# (presumably a max-features limit — TODO confirm in ranking.formatar_entrada_bow).
# `cv` is presumably the fitted vectorizer and `titulo_bow` the encoded titles — verify.
cv, titulo_bow = ranking.formatar_entrada_bow(df_concat['titulo_sa'], mf = tam_max)
#np.unique(titulo_bow)

# Criando um Dataframe com Produtos com EAN Repetido

In [6]:
# Number of rows of df_concat carrying each EAN.
vc = df_concat['ean'].value_counts()

# EANs that show up on more than one row, as a plain array of values.
mais_de_uma_vez = vc.gt(1)
ean_repetido = vc.loc[mais_de_uma_vez].index.values

In [7]:
# Column names (and column order) of the DataFrame built by criar_df_match.
COLUNAS = (
           "indice", "titulo_sa", "ean", "categoria"
          )


def criar_dicionario(indice, titulo_sa, ean, categoria):
    """Pack the fields of a product into a dict keyed by the COLUNAS names.

    Parameters
    ----------
    indice, titulo_sa, ean, categoria
        Values stored under the keys of the same name. They are stored as
        given, so scalars (one product) and array-likes (one entry per
        product) are both accepted.

    Returns
    -------
    dict
        ``{'indice': ..., 'titulo_sa': ..., 'ean': ..., 'categoria': ...}``
    """
    return {
            'indice' : indice, 'titulo_sa' : titulo_sa, 'ean' : ean, 'categoria' : categoria
           }


def criar_df_match(df_concat, ean_repetido):
    """Build one row per repeated EAN, taken from its first occurrence in df_concat.

    Parameters
    ----------
    df_concat : pandas.DataFrame
        Must contain the columns 'titulo_sa', 'ean' and 'categoria'.
    ean_repetido : array-like
        EAN values to look up (may be empty).

    Returns
    -------
    pandas.DataFrame
        Columns in COLUNAS order. 'indice' holds the df_concat index label of
        the first row carrying each EAN. Rows are sorted by 'indice' and the
        result has a fresh 0..n-1 index.

    Notes
    -----
    The previous implementation grew the frame with ``DataFrame.append`` inside
    a loop. That method was removed in pandas 2.0 and the per-EAN scan was
    quadratic, so the selection is now done in a single vectorised pass. An EAN
    listed more than once in ``ean_repetido`` yields a single row.
    """
    # Rows whose EAN is one of the requested ones, still in df_concat order.
    repetidos = df_concat.loc[df_concat['ean'].isin(ean_repetido)]

    # Keep only the first row seen for each EAN.
    primeiros = repetidos.loc[~repetidos['ean'].duplicated(keep = 'first')]

    # Plain arrays are used so the new frame gets its own positional index
    # instead of aligning on the df_concat labels.
    df_matches = pd.DataFrame(
        criar_dicionario(
            indice = primeiros.index.to_numpy(),
            titulo_sa = primeiros['titulo_sa'].to_numpy(),
            ean = primeiros['ean'].to_numpy(),
            categoria = primeiros['categoria'].to_numpy(),
        ),
        columns = list(COLUNAS),
    )

    return df_matches.sort_values('indice').reset_index(drop = True)

In [8]:
df_matches = criar_df_match(df_concat, ean_repetido)

# Calculando a Distância Entre os Vetores

Calculando a distância entre os vetores

In [9]:
resultado = ranking.calcular_dis_2_vetores_cond(titulo_bow, df_matches)

Colocando o resultado em ordem (menor distância até maior distância)

In [10]:
indices, valores = ranking.ordenar_resultado(resultado)

# Calculando as Métricas

Calculando o accuracy@k

In [11]:
# For each k, record per evaluated product:
#   k-{k}        1 if any of the first k ranked items has the same EAN, else 0
#   k-{k}-qtd    how many of the first k ranked items have the same EAN
#   k-{k}-qtd/k  that count divided by k
#
# The values are collected per row and assigned as whole columns. The previous
# chained form (df_matches[col].loc[row] = value) triggers SettingWithCopyWarning
# and writes nothing when pandas copy-on-write is enabled.
# NOTE(review): unlike the match_rank cell, the product itself is not excluded
# here (no i != j check) — confirm whether `indices[i]` can contain i.
eans = df_concat['ean']
indices_avaliados = df_matches['indice'].to_list()

for k in [1, 10, 50]:

    qtd_por_linha = []
    for i in indices_avaliados:

        ean_alvo = eans.loc[i]

        # indices[i][j] is the df_concat label of the j-th ranked item for product i
        qtd = sum(1 for j in range(k) if eans.loc[indices[i][j]] == ean_alvo)
        qtd_por_linha.append(qtd)

    # Whole-column assignment is positional; df_matches has a 0..n-1 index,
    # so this matches the row order of indices_avaliados.
    df_matches[f'k-{k}'] = [1 if qtd > 0 else 0 for qtd in qtd_por_linha]
    df_matches[f'k-{k}-qtd'] = qtd_por_linha
    df_matches[f'k-{k}-qtd/{k}'] = [qtd / k for qtd in qtd_por_linha]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [12]:
# How many evaluated products have (1) / do not have (0) a same-EAN item at the first ranked position.
df_matches['k-1'].value_counts()

1    376
0    172
Name: k-1, dtype: int64

In [13]:
# How many evaluated products have (1) / do not have (0) a same-EAN item among the first 10 ranked positions.
df_matches['k-10'].value_counts()

1    477
0     71
Name: k-10, dtype: int64

In [14]:
# How many evaluated products have (1) / do not have (0) a same-EAN item among the first 50 ranked positions.
df_matches['k-50'].value_counts()

1    534
0     14
Name: k-50, dtype: int64

In [15]:
#df_matches[["k-1", "k-1-qtd", "k-1-qtd/1", "k-10", "k-10-qtd", "k-10-qtd/10", "k-50", "k-50-qtd", "k-50-qtd/50"]]

In [16]:
#df_matches[["k-1", "k-1-qtd", "k-1-qtd/1", "k-10", "k-10-qtd", "k-10-qtd/10", "k-50", "k-50-qtd", "k-50-qtd/50"]].describe()

In [17]:
# criando uma coluna nova no df
df_matches['match_rank'] = 0
df_matches['1/match_rank'] = 0
df_matches['match_rank/total'] = 0
tam_df = df_concat.shape[0]

# para cada linha do dataframe
for enum, i in enumerate(df_matches['indice'].to_list()):

    # para cada uma das distâncias encontradas rankeadas em ordem decrescente
    for cont, j in enumerate(indices[i]):
    
        # se for match
        if (df_concat['ean'].loc[i] == df_concat['ean'].loc[j]) and (i != j):

            # guarda a posição do primeiro match, lembrando que o enumerate começa em 0
            df_matches['match_rank'].loc[enum] = (cont + 1)
            df_matches['1/match_rank'].loc[enum] = 1/(cont + 1)
            df_matches['match_rank/total'].loc[enum] = (cont + 1)/tam_df
            
            # para o for
            break

In [25]:
# Show the per-product hit flags for k = 1, 10, 50 next to the rank of the first match and its reciprocal.
df_matches[["k-1", "k-10", "k-50", 'match_rank', '1/match_rank']]

Unnamed: 0,k-1,k-10,k-50,match_rank,1/match_rank
0,0,0,1,21,0.047619
1,0,0,1,18,0.055556
2,1,1,1,1,1.000000
3,1,1,1,1,1.000000
4,1,1,1,1,1.000000
...,...,...,...,...,...
543,1,1,1,1,1.000000
544,1,1,1,1,1.000000
545,0,1,1,2,0.500000
546,1,1,1,1,1.000000


In [32]:
# Look up 'ean_1' at row label 1176 of df_teste; 1176 is the 'indice' of the
# product with match_rank 342 shown in a nearby cell.
df_teste['ean_1'].loc[1176]

'3002801019257'

In [33]:
# Look up 'ean_2' at the same row label (1176) of df_teste, to compare with its 'ean_1'.
df_teste['ean_2'].loc[1176]

'3002801019257'

In [27]:
# Show the product(s) whose first same-EAN match is ranked worst. The threshold
# is taken from the data (it was 342 for the run captured here) so the cell
# still selects the worst case when another dataset is evaluated.
df_matches[df_matches['match_rank'] == df_matches['match_rank'].max()]

Unnamed: 0,indice,titulo_sa,ean,categoria,k-1,k-1-qtd,k-1-qtd/1,k-10,k-10-qtd,k-10-qtd/10,k-50,k-50-qtd,k-50-qtd/50,match_rank,1/match_rank,match_rank/total
483,1176,telefone robusto doogee s40 pro 4 gb de ram 64...,3002801019257,celulares,0,0,0,0,0,0.0,0,0,0.0,342,0.002924,0.128668


In [26]:
# Summary statistics of the metrics. The mean of each 0/1 'k-{k}' column is the
# fraction of products with a hit within the first k positions, and the mean of
# '1/match_rank' is the mean reciprocal rank over these rows.
df_matches[["k-1", "k-10", "k-50", 'match_rank', '1/match_rank']].describe()

Unnamed: 0,k-1,k-10,k-50,match_rank,1/match_rank
count,548.0,548.0,548.0,548.0,548.0
mean,0.686131,0.870438,0.974453,7.989051,0.750285
std,0.464488,0.336128,0.157925,27.656813,0.383151
min,0.0,0.0,0.0,1.0,0.002924
25%,0.0,1.0,1.0,1.0,0.5
50%,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,2.0,1.0
max,1.0,1.0,1.0,342.0,1.0


In [34]:
#df_matches[["k-1", "k-10", "k-50", "match_rank"]][df_matches['match_rank'] > 50]

Unnamed: 0,k-1,k-10,k-50,match_rank
70,0,0,0,69
84,0,0,0,127
206,0,0,0,90
215,0,0,0,143
301,0,0,0,80
323,0,0,0,54
339,0,0,0,262
377,0,0,0,180
483,0,0,0,342
486,0,0,0,246


In [21]:
# Write the evaluation table of the first dataset in `arquivos` to CSV, without the row index.
caminho_resultado = f"Dados/Resultados/Cos_Rankeado/{arquivos[0]}.csv"
df_matches.to_csv(caminho_resultado, index = False)