# Importando as Bibliotecas

In [1]:
import pandas as pd
import numpy as np
import string

from scipy import spatial

from sklearn.feature_extraction.text import CountVectorizer

# Lendo os Datasets

In [2]:
arquivos = ['celulares']
#arquivos = ['fogoes']
#arquivos = ['geladeiras']
#arquivos = ['notebooks']
#arquivos = ['tvs']

lista_df_pp = []
for arquivo in arquivos:

    df_pp = pd.read_csv(f"Dados/Produtos Processados Pares/ppp_{arquivo}.csv", dtype = {'ean': str})
    lista_df_pp.append(df_pp)

In [3]:
df_pp = lista_df_pp[0]
df_pp.head(1)

Unnamed: 0,categoria,descricao,ean,preco,titulo,url,titulo_cb,loja
0,/categoria/celulares-e-smartphones/smartphone/...,Smartphone Samsung Galaxy S22 Ultra 512GB 5G c...,7892509122580,"R$ 8.858,63",Smartphone Samsung Galaxy S22 Ultra 512GB 5G c...,https://www.americanas.com.br/produto/4987525162,smartphone samsung galaxy s22 ultra 512gb 5g c...,americanas


# Aplicando o BoW

In [4]:
def remove_pontuacao(texto):
    
    texto_sp = texto.translate(str.maketrans('', '', string.punctuation))
    
    return texto_sp   


def formatar_entrada_bow(dados, mf = 1000):
    
    cv = CountVectorizer(
                         lowercase = True,
                         strip_accents = 'unicode',
                         max_features = mf
                        )

    cv.fit(dados)
    dados_transformados = cv.transform(dados).toarray()

    #X = matriz.fit_transform(dados).toarray()
    
    return cv, dados_transformados


def calcular_dis_cos(vetor_1, vetor_2):
                
    return (1 - spatial.distance.cosine(vetor_1, vetor_2))


def ordenar_resultado(res):

    indices = []
    valores = []

    for i in range(len(res)):
        
        ind = res[i].argsort()
        val = res[i][res[i].argsort()]

        indices.append( list(reversed(ind)) )
        valores.append( list(reversed(val)) )

    return indices, valores

In [5]:
# removendo a pontuação do título
df_pp["titulo_pp"] = df_pp["titulo"].apply(lambda x: remove_pontuacao(x))

# calculando o tamanho máximo do título
tam_max = max(df_pp.apply(lambda row: len(row["titulo_pp"]), axis = 1))

In [6]:
# calculando o BoW do título
cv, titulo_bow = formatar_entrada_bow(df_pp['titulo_pp'], mf = tam_max)
#np.unique(titulo_bow)

Calculando a distância entre os vetores

In [7]:
resultado = np.zeros((len(titulo_bow), len(titulo_bow)))

for i in range(len(titulo_bow)):

    for j in range(len(titulo_bow)):

        if i == j:
            resultado[i][j] = -1
        else:
            resultado[i][j] = calcular_dis_cos(titulo_bow[i], titulo_bow[j])

Calculando o acccuracy@k

In [8]:
indices, valores = ordenar_resultado(resultado)

In [9]:
for k in [1, 10, 50]:

    # criando uma coluna nova no df
    df_pp[f'k-{k}'] = 0
    df_pp[f'k-{k}-qtd'] = 0
    df_pp[f'k-{k}-qtd/{k}'] = 0

    for i in range(len(indices)):

        for j in range(k):
            
            if df_pp['ean'].loc[i] == df_pp['ean'].loc[indices[i][j]]:

                df_pp[f'k-{k}'].loc[i] = 1
                df_pp[f'k-{k}-qtd'].loc[i] += 1

        df_pp[f'k-{k}-qtd/{k}'].loc[i] = (df_pp[f'k-{k}-qtd'].loc[i])/k
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [10]:
df_pp['k-1'].value_counts()

0    271
1    136
Name: k-1, dtype: int64

In [11]:
df_pp['k-10'].value_counts()

1    306
0    101
Name: k-10, dtype: int64

In [12]:
df_pp['k-50'].value_counts()

1    369
0     38
Name: k-50, dtype: int64

In [13]:
df_pp[["k-1", "k-1-qtd", "k-1-qtd/1", "k-10", "k-10-qtd", "k-10-qtd/10", "k-50", "k-50-qtd", "k-50-qtd/50"]]

Unnamed: 0,k-1,k-1-qtd,k-1-qtd/1,k-10,k-10-qtd,k-10-qtd/10,k-50,k-50-qtd,k-50-qtd/50
0,0,0,0,1,1,0.1,1,1,0.02
1,1,1,1,1,3,0.3,1,4,0.08
2,0,0,0,1,1,0.1,1,2,0.04
3,0,0,0,1,1,0.1,1,2,0.04
4,0,0,0,1,1,0.1,1,1,0.02
...,...,...,...,...,...,...,...,...,...
402,1,1,1,1,1,0.1,1,1,0.02
403,0,0,0,1,1,0.1,1,1,0.02
404,0,0,0,1,1,0.1,1,1,0.02
405,0,0,0,0,0,0.0,1,1,0.02


In [14]:
df_pp[["k-1", "k-1-qtd", "k-1-qtd/1", "k-10", "k-10-qtd", "k-10-qtd/10", "k-50", "k-50-qtd", "k-50-qtd/50"]].describe()

Unnamed: 0,k-1,k-1-qtd,k-1-qtd/1,k-10,k-10-qtd,k-10-qtd/10,k-50,k-50-qtd,k-50-qtd/50
count,407.0,407.0,407.0,407.0,407.0,407.0,407.0,407.0,407.0
mean,0.334152,0.334152,0.334152,0.751843,0.990172,0.099017,0.906634,1.287469,0.025749
std,0.472274,0.472274,0.472274,0.432475,0.794006,0.079401,0.291303,0.838375,0.016767
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,1.0,0.1,1.0,1.0,0.02
50%,0.0,0.0,0.0,1.0,1.0,0.1,1.0,1.0,0.02
75%,1.0,1.0,1.0,1.0,1.0,0.1,1.0,2.0,0.04
max,1.0,1.0,1.0,1.0,4.0,0.4,1.0,4.0,0.08


In [15]:
# criando uma coluna nova no df
df_pp['match_rank'] = 0
df_pp['1/match_rank'] = 0
df_pp['match_rank/total'] = 0
tam_df = df_pp.shape[0]

# para cada linha do dataframe
for i in range(tam_df):

    # para cada uma das distâncias encontradas rankeadas em ordem decrescente
    for cont, j in enumerate(indices[i]):
    
        # se for match
        if (df_pp['ean'].loc[i] == df_pp['ean'].loc[j]) and (i != j):

            # guarda a posição do primeiro match, lembrando que o enumerate começa em 0
            df_pp['match_rank'].loc[i] = (cont + 1)
            df_pp['1/match_rank'].loc[i] = 1/(cont + 1)
            df_pp['match_rank/total'].loc[i] = (cont + 1)/tam_df
            
            # para o for
            break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [16]:
#tirando a média e dividindo pela quantidade total de produtos
df_pp[['match_rank', 'match_rank/total', '1/match_rank']].describe()

Unnamed: 0,match_rank,match_rank/total,1/match_rank
count,407.0,407.0,407.0
mean,15.265356,0.037507,0.4607
std,34.911194,0.085777,0.405064
min,1.0,0.002457,0.003096
25%,1.0,0.002457,0.1
50%,3.0,0.007371,0.333333
75%,10.0,0.02457,1.0
max,323.0,0.793612,1.0


In [17]:
#df_pp[["k-1", "k-10", "k-50", "match_rank"]][df_pp['match_rank'] > 50]

In [18]:
df_pp.to_csv(f"Dados/Produtos Processados Pares/Metricas/{arquivos[0]}.csv", index = False)