# Sistema de Recomendação de Compras - Modeling

## Preparação do ambiente

### Instalando as bibliotecas

In [57]:
# !pip install --user -r requirements.txt

### Importando os pacotes

In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from scipy.stats import norm
import scipy.stats as st
import sweetviz as sv
import seaborn as sns
import statistics  as sts
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from tpot import TPOTClassifier
from yellowbrick.model_selection import FeatureImportances
import chardet
from datetime import datetime
from datetime import date
from calendar import monthrange
import os
import random

## Funções

In [59]:
def EliminarDuplicadas (dataframe): 
    return dataframe.drop_duplicates()

In [60]:
def EliminarOutliers (coluna):
    limitesuperior = coluna.mean() + 3*coluna.std()
    limiteinferior = coluna.mean() - 3*coluna.std()
    return coluna.apply(lambda x: limitesuperior if x>= limitesuperior else limiteinferior if x<= limiteinferior else x)

In [61]:
def Corresp(dataset1, dataset2, coluna):
    dtTeste1 = dataset1[coluna].to_frame()
    dtTeste1['count1'] = 1
    dtTeste1

    dtTeste2 = dataset2[coluna].to_frame()
    dtTeste2['count2'] = 1
    dtTeste2

    dtTesteM1 = dtTeste1.merge(dtTeste2, how='outer')
    dtTesteM1

    print('Valores de ' + coluna + ' não inclusos no primeiro dataset: ' + str(len(dtTesteM1[dtTesteM1['count2'].isna()])))

    dtTesteM2 = dtTeste2.merge(dtTeste1, how='outer')
    dtTesteM2

    print('Valores de ' + coluna + ' não inclusos no segundo dataset: ' + str(len(dtTesteM2[dtTesteM2['count1'].isna()])))

In [62]:
def ExportCSV (dataframe, nomearquivo, diretorio):
    outname = nomearquivo + '.csv'
    outdir = './' + diretorio
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    fullname = os.path.join(outdir, outname)    
    dataframe.to_csv(fullname, index=False)

## Leitura de dados preparados

In [63]:
df = pd.read_csv('Dados\df.csv')
dfProducts = pd.read_csv('Dados\dfProducts.csv')

In [64]:
df

Unnamed: 0,customer_id,price,product_name_lenght,product_description_lenght,product_weight_g,Succeed,review_score_count,review_score_nanmean,Size_big,Size_medium,...,product_category_name_pet_shop,product_category_name_portateis_casa_forno_e_cafe,product_category_name_portateis_cozinha_e_preparadores_de_alimentos,product_category_name_relogios_presentes,product_category_name_seguros_e_servicos,product_category_name_sinalizacao_e_seguranca,product_category_name_tablets_impressao_imagem,product_category_name_telefonia,product_category_name_telefonia_fixa,product_category_name_utilidades_domesticas
0,00012a2ce6f8dcda20d059ce98491703,89.80,55.0,889.0,4267.0,1,1.0,1.0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,000161a058600d5901f007fab4c27140,54.90,57.0,2563.0,150.0,1,1.0,4.0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0001fd6190edaaf884bcaf3d49edf079,179.99,45.0,1042.0,750.0,1,1.0,5.0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0002414f95344307404f0ace7a26f1d5,149.90,38.0,840.0,1600.0,1,1.0,5.0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,000379cdec625522490c315e70c7a9fb,93.00,57.0,364.0,800.0,1,1.0,4.0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98661,fffcb937e9dd47a13f05ecb8290f4d3e,78.00,39.0,2244.0,1250.0,1,1.0,5.0,0,0,...,0,0,0,0,0,0,0,0,0,0
98662,fffecc9f79fd8c764f843e9951b11341,54.90,48.0,641.0,2050.0,1,1.0,3.0,0,1,...,0,0,0,0,0,0,0,0,0,0
98663,fffeda5b6d849fbd39689bb92087f431,47.90,59.0,458.0,350.0,1,1.0,4.0,0,1,...,0,0,0,0,0,0,0,1,0,0
98664,ffff42319e9b2d713724ae527742af25,199.90,60.0,268.0,2700.0,1,1.0,5.0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [65]:
dfProducts

Unnamed: 0,product_id,product_name_lenght,product_description_lenght,product_weight_g,Size_big,Size_medium,Size_small,product_category_name_agro_industria_e_comercio,product_category_name_alimentos,product_category_name_alimentos_bebidas,...,product_category_name_pet_shop,product_category_name_portateis_casa_forno_e_cafe,product_category_name_portateis_cozinha_e_preparadores_de_alimentos,product_category_name_relogios_presentes,product_category_name_seguros_e_servicos,product_category_name_sinalizacao_e_seguranca,product_category_name_tablets_impressao_imagem,product_category_name_telefonia,product_category_name_telefonia_fixa,product_category_name_utilidades_domesticas
0,1e9e8ef04dbcff4541ed26657ea517e5,40.0,287.0,225.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3aa071139cb16b67ca9e5dea641aaa2f,44.0,276.0,1000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,96bd76ec8810374ed1b65e291975717f,46.0,250.0,154.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,cef67bcfe19066a932b7673e239eb23d,27.0,261.0,371.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9dc1a7de274444849c219cff195d0b71,37.0,402.0,625.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32946,a0b7d5a992ccda646f2d34e418fff5a0,45.0,67.0,12300.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32947,bf4538d88321d0fd4412a93c974510e6,41.0,971.0,1700.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32948,9a7c6041fa9592d9d9ef6cfe62a71f8c,50.0,799.0,1400.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32949,83808703fc0706a22e264b9d75f04a2e,60.0,156.0,700.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Sistema de recomendação

### Criação do vetor de histórico

In [66]:
CategoryColumns = [col for col in df if col.startswith('product_category_name')]
SizeColumns = [col for col in df if col.startswith('Size')]
VectorColumns = CategoryColumns + SizeColumns
VectorColumns

['product_category_name_agro_industria_e_comercio',
 'product_category_name_alimentos',
 'product_category_name_alimentos_bebidas',
 'product_category_name_artes',
 'product_category_name_artes_e_artesanato',
 'product_category_name_artigos_de_festas',
 'product_category_name_artigos_de_natal',
 'product_category_name_audio',
 'product_category_name_automotivo',
 'product_category_name_bebes',
 'product_category_name_bebidas',
 'product_category_name_beleza_saude',
 'product_category_name_brinquedos',
 'product_category_name_cama_mesa_banho',
 'product_category_name_casa_conforto',
 'product_category_name_casa_conforto_2',
 'product_category_name_casa_construcao',
 'product_category_name_cds_dvds_musicais',
 'product_category_name_cine_foto',
 'product_category_name_climatizacao',
 'product_category_name_consoles_games',
 'product_category_name_construcao_ferramentas_construcao',
 'product_category_name_construcao_ferramentas_ferramentas',
 'product_category_name_construcao_ferramentas

In [67]:
Colunas = df.columns
DropColumns = Colunas.drop(VectorColumns)

In [68]:
Vector = df.drop(columns=DropColumns)
Vector

Unnamed: 0,Size_big,Size_medium,Size_small,product_category_name_agro_industria_e_comercio,product_category_name_alimentos,product_category_name_alimentos_bebidas,product_category_name_artes,product_category_name_artes_e_artesanato,product_category_name_artigos_de_festas,product_category_name_artigos_de_natal,...,product_category_name_pet_shop,product_category_name_portateis_casa_forno_e_cafe,product_category_name_portateis_cozinha_e_preparadores_de_alimentos,product_category_name_relogios_presentes,product_category_name_seguros_e_servicos,product_category_name_sinalizacao_e_seguranca,product_category_name_tablets_impressao_imagem,product_category_name_telefonia,product_category_name_telefonia_fixa,product_category_name_utilidades_domesticas
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98661,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98662,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98663,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
98664,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [69]:
Vector = Vector.to_numpy()
Vector

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

### Criação do vetor de produto

In [70]:
CategoryColumns = [col for col in df if col.startswith('product_category_name')]
SizeColumns = [col for col in df if col.startswith('Size')]
VectorColumns = CategoryColumns + SizeColumns
VectorColumns

['product_category_name_agro_industria_e_comercio',
 'product_category_name_alimentos',
 'product_category_name_alimentos_bebidas',
 'product_category_name_artes',
 'product_category_name_artes_e_artesanato',
 'product_category_name_artigos_de_festas',
 'product_category_name_artigos_de_natal',
 'product_category_name_audio',
 'product_category_name_automotivo',
 'product_category_name_bebes',
 'product_category_name_bebidas',
 'product_category_name_beleza_saude',
 'product_category_name_brinquedos',
 'product_category_name_cama_mesa_banho',
 'product_category_name_casa_conforto',
 'product_category_name_casa_conforto_2',
 'product_category_name_casa_construcao',
 'product_category_name_cds_dvds_musicais',
 'product_category_name_cine_foto',
 'product_category_name_climatizacao',
 'product_category_name_consoles_games',
 'product_category_name_construcao_ferramentas_construcao',
 'product_category_name_construcao_ferramentas_ferramentas',
 'product_category_name_construcao_ferramentas

In [71]:
ColunasProducts = dfProducts.columns
DropColumnsProducts = Colunas.drop(VectorColumns)

In [72]:
VectorProducts = df.drop(columns=DropColumnsProducts)
VectorProducts

Unnamed: 0,Size_big,Size_medium,Size_small,product_category_name_agro_industria_e_comercio,product_category_name_alimentos,product_category_name_alimentos_bebidas,product_category_name_artes,product_category_name_artes_e_artesanato,product_category_name_artigos_de_festas,product_category_name_artigos_de_natal,...,product_category_name_pet_shop,product_category_name_portateis_casa_forno_e_cafe,product_category_name_portateis_cozinha_e_preparadores_de_alimentos,product_category_name_relogios_presentes,product_category_name_seguros_e_servicos,product_category_name_sinalizacao_e_seguranca,product_category_name_tablets_impressao_imagem,product_category_name_telefonia,product_category_name_telefonia_fixa,product_category_name_utilidades_domesticas
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98661,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98662,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98663,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
98664,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [73]:
VectorProducts = VectorProducts.to_numpy()
VectorProducts

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)