In [49]:
import pandas as pd
import zipfile
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import squarify
import plotly.express as px
from scipy.stats import chi2_contingency
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [50]:
# Read the source CSV straight out of the zip archive, without extracting it to disk.
zip_path = '../BD/Supermercado.zip'
csv_filename = 'Supermercado.csv'

with zipfile.ZipFile(zip_path, 'r') as archive, archive.open(csv_filename) as csv_file:
    df = pd.read_csv(csv_file)


In [52]:
# Show floats with exactly two decimal places in every pandas display.
pd.set_option('display.float_format', '{:.2f}'.format)

In [53]:
# Lookup table from the numeric day-of-week code to its English name.
# NOTE(review): assumes order_dow encodes 0 = Monday ... 6 = Sunday — confirm
# against the dataset's documentation.
week_days = dict(enumerate([
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]))

# Codes missing from the table would become NaN in the new column.
df['day'] = df['order_dow'].map(week_days)

In [58]:
# Number of missing values in each column, displayed as the cell output.
df.isna().sum()

order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    124342
product_id                     0
add_to_cart_order              0
reordered                      0
department_id                  0
department                     0
product_name                   0
day                            0
dtype: int64

Dado que el objetivo es crear un algoritmo de clasificación, se eliminan del dataset las filas que contienen valores nulos.

In [59]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis=0, how='any')

Finalmente, se valida que efectivamente se hayan eliminado los valores nulos.

In [60]:
# Re-count missing values per column to verify the result of the drop.
df.isna().sum()

order_id                  0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
product_id                0
add_to_cart_order         0
reordered                 0
department_id             0
department                0
product_name              0
day                       0
dtype: int64

Se valida que no existan valores duplicados en el dataset.

#### Ventas según Día y Momento del Día

Con el objetivo de segmentar los horarios por parte del día, se realiza un mapeo de horas, agrupándolas en los momentos: "Morning", "Afternoon", "Night" y "Dawn".

In [66]:
def order_time(x):
    """Map an hour of the day to a part-of-day label.

    Parameters
    ----------
    x
        Hour value; it is matched by equality against the whole hours 0-24.

    Returns
    -------
    'Morning' for 6-12, 'Afternoon' for 13-17, 'Night' for 18-22 and
    'Dawn' for 23, 24 and 0-5. Any value outside those sets is returned
    unchanged.
    """
    hours_by_label = (
        ('Morning', (6, 7, 8, 9, 10, 11, 12)),
        ('Afternoon', (13, 14, 15, 16, 17)),
        ('Night', (18, 19, 20, 21, 22)),
        ('Dawn', (23, 24, 0, 1, 2, 3, 4, 5)),
    )
    for label, hours in hours_by_label:
        if x in hours:
            return label
    # Unrecognised values pass through untouched.
    return x

Se almacena el valor del momento del día en la columna "order_time_list".

In [67]:
# Label every row with its part of the day, then preview five random rows.
df['order_time_list'] = df['order_hour_of_day'].map(order_time)
df.sample(n=5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list
645489,1240832,119528,2,1,11,6.0,24,6,0,4,produce,fresh fruits,Tuesday,Morning
1240307,818556,199521,48,1,8,7.0,24,10,1,4,produce,fresh fruits,Tuesday,Morning
1062989,1638088,114948,63,3,8,4.0,24,7,1,4,produce,fresh fruits,Thursday,Morning
1308962,2297954,85458,49,3,6,17.0,120,9,1,16,dairy eggs,yogurt,Thursday,Morning
1462153,1732715,199485,3,2,14,30.0,21,12,0,16,dairy eggs,packaged cheese,Wednesday,Afternoon


#### Segmentación de Clientes por Cantidad de Órdenes

Se maneja la hipótesis de que el comportamiento de los clientes varía según la cantidad de órdenes o compras que realizan a lo largo del período de análisis.
Por tal motivo, se plantea segmentar a los clientes en grupos según la cantidad de órdenes realizadas.

In [78]:
# Highest order_number recorded for each customer, exposed as 'max_order'.
max_order_numbers = (
    df.groupby('user_id', as_index=False)['order_number']
    .max()
    .rename(columns={'order_number': 'max_order'})
)

# Attach each customer's max_order to every one of their rows.
df = df.merge(max_order_numbers, how='left', on='user_id')

El enfoque que se plantea es generar 5 grupos de clientes según la cantidad de compras realizadas, comenzando con el grupo de entre 1 y 20 órdenes y finalizando con el grupo de clientes que realizaron entre 81 y 100 pedidos (siendo 100 el máximo de órdenes por cliente registrado en el dataset).

In [79]:
def order_number_group(x):
    """Bucket an order count into one of five range labels.

    Parameters
    ----------
    x
        Numeric order count.

    Returns
    -------
    '1-20 order' when x <= 20, '21-40 order' when x <= 40,
    '41-60 order' when x <= 60, '61-80 order' when x <= 80, and
    '81-100 order' for every other value.
    """
    upper_bounds = (
        (20, '1-20 order'),
        (40, '21-40 order'),
        (60, '41-60 order'),
        (80, '61-80 order'),
    )
    for bound, label in upper_bounds:
        if x <= bound:
            return label
    # Anything that passed none of the bounds lands in the last bucket;
    # there is no upper limit check at 100.
    return '81-100 order'

Luego de realizar el mapeo de grupos de clientes, se procede a almacenar la variable en la columna 'order_number_group'. 

In [80]:
# Bucket each row by its customer's max_order, then preview five random rows.
df['order_number_group'] = df['max_order'].map(order_number_group)
df.sample(n=5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list,max_order,order_number_group
137333,1766913,8359,55,4,9,2.0,31,17,1,7,beverages,refrigerated,Friday,Morning,72,61-80 order
1785746,1768889,125135,13,6,19,6.0,115,4,1,7,beverages,water seltzer sparkling water,Sunday,Night,13,1-20 order
665604,416398,43206,35,0,13,1.0,83,4,1,4,produce,fresh vegetables,Monday,Afternoon,82,81-100 order
1461813,1084891,182101,2,5,20,7.0,96,14,0,20,deli,lunch meat,Saturday,Night,5,1-20 order
495258,2227487,176923,14,6,13,30.0,98,13,0,7,beverages,juice nectars,Sunday,Afternoon,14,1-20 order


#### Se exporta el DF con los cambios realizados

In [94]:
# Output locations: the intermediate CSV and the zip archive that wraps it.
csv_filename = '../BD/Supermercado_mod.csv'
zip_filename = '../BD/Supermercado_mod.zip'

# Write the frame to CSV without the index column.
df.to_csv(csv_filename, index=False)

# Store that CSV in a deflate-compressed archive under its bare file name.
# The uncompressed CSV is left on disk next to the archive.
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_filename, arcname='Supermercado_mod.csv')