In [1]:
import pandas as pd
import zipfile
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import squarify
import plotly.express as px
from scipy.stats import chi2_contingency
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from IPython.display import display, HTML

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [4]:
zip_path = '../00_Data_Bases/Supermercado.zip' 
csv_filename = 'Supermercado.csv'

with zipfile.ZipFile(zip_path, 'r') as z:
    with z.open(csv_filename) as f:
        df = pd.read_csv(f)


In [5]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [6]:
week_days = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

df['day'] = df['order_dow'].map(week_days)

In [7]:
df.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    124342
product_id                     0
add_to_cart_order              0
reordered                      0
department_id                  0
department                     0
product_name                   0
day                            0
dtype: int64

Dado que el objetivo es crear un aloritmo de clasificacion, se eliminan las filas nulas del dataset.

In [8]:
df = df.dropna()

Finalmente, se valida de que efectivamente se hayan eliminado los valores nulos.

In [9]:
df.isnull().sum()

order_id                  0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
product_id                0
add_to_cart_order         0
reordered                 0
department_id             0
department                0
product_name              0
day                       0
dtype: int64

Se valida de que no existan valores duplicados en el dataset.

#### Ventas segun Dia y Momento del Dia

Con el objetivo de realizar una segmentación de horarios por parte del día, se procede a realizar un mapeo de horas, agrupandolas por los momentos: "Morning", "Afternoon", "Night" y "Down".

In [10]:
def order_time(x):
    if x in[6, 7, 8, 9, 10, 11, 12]:
        return 'Morning'
    if x in[13, 14, 15, 16, 17]:
        return 'Afternoon'
    if x in[18, 19, 20, 21, 22]:
        return 'Night'
    if x in[23, 24, 0, 1, 2, 3, 4, 5]:
        return 'Dawn'
    return x

Se almacena la el valor de momento del día en la variable "order_time_list".

In [11]:
df['order_time_list']=df['order_hour_of_day'].apply(order_time)
df.sample(5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list
1754334,2124132,17448,26,1,15,4.0,54,20,0,17,household,paper goods,Tuesday,Afternoon
491952,1083867,55030,67,3,13,5.0,24,1,1,4,produce,fresh fruits,Thursday,Afternoon
266566,2464705,196961,51,2,0,5.0,112,20,1,3,bakery,bread,Wednesday,Dawn
406144,2409770,195303,3,4,14,19.0,112,7,0,3,bakery,bread,Friday,Afternoon
1939002,2294343,30888,56,3,10,2.0,120,7,1,16,dairy eggs,yogurt,Thursday,Morning


#### Segmentacion de Clientes por Cantidad de Ordenes

Se maneja la hipótesis de que el comportamiento de los clientes varia según la cantidad de ordenes o compras que realizan a lo largo del período de tiempo de análisis.
Por tal motivo, se plantea segmentar en grupos de clientes por cantidad de ordenes realizadas.

In [12]:
max_order_numbers = df.groupby('user_id')['order_number'].max().reset_index()
max_order_numbers.rename(columns={'order_number': 'max_order'}, inplace=True)

df = df.merge(max_order_numbers, on='user_id', how='left')

El enfoque que se plantea es el de generar 5 grupos de clientes según la cantidad de compras realizadas, comenzando con ordenes entre 1 y 20, y finalizando con el último grupo que posee clientes que realizaron entre 81 y 100 pedidos (siendo 100 el máximo de ordenes realizadas por clientes registradas en el dataset).

In [13]:
def order_number_group(x):
    if x <= 20:
        x = '1-20 order'
    elif x >20 and x <=40:
        x = '21-40 order'
    elif x >40 and x <=60:
        x = '41-60 order'
    elif x >60 and x <=80:
        x = '61-80 order'
    else:
        x = '81-100 order'
    return x

Luego de realizar el mapeo de grupos de clientes, se procede a almacenar la variable en la columna 'order_number_group'. 

In [14]:
df['order_number_group']=df['max_order'].apply(order_number_group)
df.sample(5)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list,max_order,order_number_group
822979,2187034,93456,14,3,17,5.0,93,5,1,3,bakery,breakfast bakery,Thursday,Afternoon,14,1-20 order
1521736,2339916,72027,37,3,15,4.0,24,5,1,4,produce,fresh fruits,Thursday,Afternoon,37,21-40 order
279314,3234699,128664,14,0,20,11.0,98,3,1,7,beverages,juice nectars,Monday,Night,14,1-20 order
267716,729498,139618,13,1,16,5.0,112,8,1,3,bakery,bread,Tuesday,Afternoon,13,1-20 order
621330,2730337,51439,12,4,12,6.0,83,12,1,4,produce,fresh vegetables,Friday,Morning,29,21-40 order


#### One Hot Encoding

Tomamos todas las variables categóricas para realizar el one hot encoding 

In [15]:
df_one_hot = pd.get_dummies(df, columns=['order_number_group', 'department', 'product_name', 'day', 'order_time_list'], dtype=int, drop_first=True)

In [16]:
df_one_hot.sample(3)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,...,product_name_yogurt,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,order_time_list_Dawn,order_time_list_Morning,order_time_list_Night
1243624,1310189,59595,6,0,5,7.0,83,3,1,4,...,0,1,0,0,0,0,0,1,0,0
681529,1329777,9757,25,4,10,5.0,51,15,1,13,...,0,0,0,0,0,0,0,0,1,0
235394,1602166,148666,52,0,9,11.0,24,5,1,4,...,0,1,0,0,0,0,0,0,1,0


In [17]:
df_one_hot.columns.value_counts()

order_id                                   1
product_name_frozen breads doughs          1
product_name_laundry                       1
product_name_lunch meat                    1
product_name_marinades meat preparation    1
                                          ..
product_name_cocoa drink mixes             1
product_name_coffee                        1
product_name_cold flu allergy              1
product_name_condiments                    1
order_time_list_Night                      1
Name: count, Length: 177, dtype: int64

In [18]:
columns_to_group = [col for col in df_one_hot.columns if 'product_name_' in col or 'department_' in col]

df_one_hot_ag_counts = df_one_hot.groupby('user_id')[columns_to_group].sum()

df_other_columns = df_one_hot.drop(columns=columns_to_group).drop_duplicates(subset='user_id').set_index('user_id')

df_one_hot_ag = df_other_columns.join(df_one_hot_ag_counts).reset_index()

In [19]:
df_one_hot_ag.sample(3)

Unnamed: 0,user_id,order_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,max_order,...,product_name_spreads,product_name_tea,product_name_tofu meat alternatives,product_name_tortillas flat bread,product_name_trail mix snack mix,product_name_trash bags liners,product_name_vitamins supplements,product_name_water seltzer sparkling water,product_name_white wines,product_name_yogurt
77287,187038,2874011,16,1,18,18.0,72,1,0,16,...,0,0,0,0,0,0,0,3,0,1
43484,122472,663582,25,0,14,30.0,93,1,1,25,...,0,0,0,1,0,0,0,0,0,0
10890,133743,3306076,18,2,14,24.0,49,1,1,18,...,0,0,0,0,0,0,0,0,0,5


Verifico

In [23]:
df_one_hot_ag.drop(['order_id'], axis=1, inplace=True)


In [None]:
user_id_to_check = 1226918

original_data = df[df['user_id'] == user_id_to_check]
print("Datos en el dataset original para user_id", user_id_to_check)
print(original_data)

aggregated_data = df_one_hot_ag[df_one_hot_ag['user_id'] == user_id_to_check]
print("\nDatos en el dataset agrupado para user_id", user_id_to_check)

display(HTML(aggregated_data.to_html(max_rows=20, max_cols=100, notebook=True)))

Datos en el dataset original para user_id 1226918
Empty DataFrame
Columns: [order_id, user_id, order_number, order_dow, order_hour_of_day, days_since_prior_order, product_id, add_to_cart_order, reordered, department_id, department, product_name, day, order_time_list, max_order, order_number_group]
Index: []

Datos en el dataset agrupado para user_id 1226918


Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,max_order,order_number_group_21-40 order,order_number_group_41-60 order,order_number_group_61-80 order,order_number_group_81-100 order,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,order_time_list_Dawn,order_time_list_Morning,order_time_list_Night,department_id,department_babies,department_bakery,department_beverages,department_breakfast,department_bulk,department_canned goods,department_dairy eggs,department_deli,department_dry goods pasta,department_frozen,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks,product_name_asian foods,product_name_baby accessories,product_name_baby bath body care,product_name_baby food formula,product_name_bakery desserts,product_name_baking ingredients,product_name_baking supplies decor,...,product_name_mint gum,product_name_missing,product_name_more household,product_name_muscles joints pain relief,product_name_nuts seeds dried fruit,product_name_oils vinegars,product_name_oral hygiene,product_name_other,product_name_other creams cheeses,product_name_packaged cheese,product_name_packaged meat,product_name_packaged poultry,product_name_packaged produce,product_name_packaged seafood,product_name_packaged vegetables fruits,product_name_paper goods,product_name_pasta sauce,product_name_pickled goods olives,product_name_plates bowls cups flatware,product_name_popcorn jerky,product_name_poultry counter,product_name_prepared meals,product_name_prepared soups salads,product_name_preserved dips spreads,product_name_protein meal replacements,product_name_red wines,product_name_refrigerated,product_name_refrigerated pudding desserts,product_name_salad dressing toppings,product_name_seafood counter,product_name_shave needs,product_name_skin care,product_name_soap,product_name_soft drinks,product_name_soup broth bouillon,product_name_soy lactosefree,product_name_specialty cheeses,product_name_specialty wines champagnes,product_name_spices seasonings,product_name_spirits,product_name_spreads,product_name_tea,product_name_tofu meat alternatives,product_name_tortillas flat bread,product_name_trail mix snack mix,product_name_trash bags liners,product_name_vitamins supplements,product_name_water seltzer sparkling water,product_name_white wines,product_name_yogurt


#### Se exporta el DF con los cambios realizados

In [25]:
df_one_hot_ag.to_csv('../00_Data_Bases/Supermercado_onehot.csv', index=False)

zip_filename = '../00_Data_Bases/Supermercado_onehot.zip'
csv_filename = '../00_Data_Bases/Supermercado_onehot.csv'

with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(csv_filename, arcname='Supermercado_onehot.csv')