In [1]:
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import squarify
from scipy.stats import chi2_contingency
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Location of the zipped dataset and the name of the CSV member inside it.
zip_path = '../BD/Supermercado.zip'
csv_filename = 'Supermercado.csv'

# Open the archive and read the CSV member straight into a DataFrame,
# without extracting it to disk first.
with zipfile.ZipFile(zip_path) as archive, archive.open(csv_filename) as csv_file:
    df = pd.read_csv(csv_file)


In [3]:
def _two_decimals(value):
    """Format a number as text with exactly two decimal places."""
    return '%.2f' % value


# Show floats with two decimals in every DataFrame/Series display.
pd.set_option('display.float_format', _two_decimals)

In [4]:
# Day names keyed by their numeric code, 0 through 6.
# NOTE(review): assumes the dataset encodes order_dow with 0 as Monday — confirm
# against the dataset documentation.
week_days = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

# Translate each numeric day-of-week code into its name.
df['day'] = df['order_dow'].map(week_days)

In [5]:
# Count the missing values in each column.
df.isnull().sum(axis=0)

order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    124342
product_id                     0
add_to_cart_order              0
reordered                      0
department_id                  0
department                     0
product_name                   0
day                            0
dtype: int64

Dado que el objetivo es crear un algoritmo de clasificación, se eliminan del dataset las filas con valores nulos.

In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

Finalmente, se valida que efectivamente se hayan eliminado los valores nulos.

In [7]:
# Re-count missing values per column to confirm that none remain.
df.isnull().sum(axis=0)

order_id                  0
user_id                   0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
product_id                0
add_to_cart_order         0
reordered                 0
department_id             0
department                0
product_name              0
day                       0
dtype: int64

Se valida que no existan valores duplicados en el dataset.

#### Ventas según Día y Momento del Día

Con el objetivo de realizar una segmentación de horarios por parte del día, se procede a realizar un mapeo de horas, agrupándolas por los momentos: "Morning", "Afternoon", "Night" y "Dawn".

In [8]:
def order_time(x):
    """Map an hour of the day to its part-of-day label.

    Parameters
    ----------
    x : hour value to classify.

    Returns
    -------
    'Morning' for 6-12, 'Afternoon' for 13-17, 'Night' for 18-22 and
    'Dawn' for 23, 24 and 0-5. Any other value is returned unchanged.
    """
    day_parts = (
        ('Morning', [6, 7, 8, 9, 10, 11, 12]),
        ('Afternoon', [13, 14, 15, 16, 17]),
        ('Night', [18, 19, 20, 21, 22]),
        ('Dawn', [23, 24, 0, 1, 2, 3, 4, 5]),
    )
    for label, hours in day_parts:
        if x in hours:
            return label
    # Values outside every list fall through untouched.
    return x

Se almacena el valor del momento del día en la columna "order_time_list".

In [9]:
# Classify each order's hour into a part-of-day label.
df['order_time_list'] = df['order_hour_of_day'].apply(order_time)

# Preview a few random rows. The fixed seed makes the preview identical on
# every run, so the notebook output is reproducible after a kernel restart.
df.sample(5, random_state=42)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list
1261340,3170111,197176,10,0,16,14.0,69,2,0,15,canned goods,soup broth bouillon,Monday,Afternoon
417452,3248455,205165,7,6,16,9.0,123,11,1,4,produce,packaged vegetables fruits,Sunday,Afternoon
303901,982598,97395,4,0,14,27.0,26,1,1,7,beverages,coffee,Monday,Afternoon
311518,101903,77790,7,6,7,7.0,52,5,1,1,frozen,frozen breakfast,Sunday,Morning
1950213,1917809,94922,12,1,11,3.0,83,5,0,4,produce,fresh vegetables,Tuesday,Morning


#### Segmentación de Clientes por Cantidad de Órdenes

Se maneja la hipótesis de que el comportamiento de los clientes varía según la cantidad de órdenes o compras que realizan a lo largo del período de análisis.
Por tal motivo, se plantea segmentar a los clientes en grupos según la cantidad de órdenes realizadas.

In [10]:
# Highest order_number seen for each user, as a two-column lookup table
# (user_id, max_order).
max_order_numbers = (
    df.groupby('user_id')['order_number']
    .max()
    .reset_index()
    .rename(columns={'order_number': 'max_order'})
)

# Attach each user's maximum to every one of that user's rows.
df = df.merge(max_order_numbers, how='left', on='user_id')

El enfoque que se plantea es el de generar 5 grupos de clientes según la cantidad de compras realizadas, comenzando con ordenes entre 1 y 20, y finalizando con el último grupo que posee clientes que realizaron entre 81 y 100 pedidos (siendo 100 el máximo de ordenes realizadas por clientes registradas en el dataset).

In [11]:
def order_number_group(x):
    """Return the order-count bucket label for a number of orders.

    Parameters
    ----------
    x : number of orders to classify.

    Returns
    -------
    '1-20 order' when x <= 20, '21-40 order' up to 40, '41-60 order' up to
    60, '61-80 order' up to 80, and '81-100 order' for everything else.
    """
    buckets = (
        (20, '1-20 order'),
        (40, '21-40 order'),
        (60, '41-60 order'),
        (80, '61-80 order'),
    )
    # The first upper bound that x does not exceed decides the bucket.
    for upper_bound, label in buckets:
        if x <= upper_bound:
            return label
    return '81-100 order'

Luego de realizar el mapeo de grupos de clientes, se procede a almacenar la variable en la columna 'order_number_group'. 

In [12]:
# Label every row with the order-count bucket of its customer.
df['order_number_group'] = df['max_order'].apply(order_number_group)

# Preview a few random rows. The fixed seed makes the preview identical on
# every run, so the notebook output is reproducible after a kernel restart.
df.sample(5, random_state=42)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list,max_order,order_number_group
313732,748336,46624,2,0,13,8.0,3,1,1,19,snacks,energy granola bars,Monday,Afternoon,14,1-20 order
1071861,1138805,166102,13,0,15,8.0,105,7,1,13,pantry,doughs gelatins bake mixes,Monday,Afternoon,31,21-40 order
413319,1832488,197297,33,5,16,13.0,63,13,1,9,dry goods pasta,grains rice dried goods,Saturday,Afternoon,33,21-40 order
904780,3011336,74798,34,4,21,8.0,96,24,1,20,deli,lunch meat,Friday,Night,40,21-40 order
1380059,2164906,97705,20,2,20,3.0,131,14,0,9,dry goods pasta,dry pasta,Wednesday,Night,32,21-40 order


#### Eliminación de Department con valor "Missing"

In [13]:
# Count how many rows carry the placeholder department 'missing'.
missing_count = len(df[df['department'] == 'missing'])
print("Number of rows with 'missing' in the department column:", missing_count)


Number of rows with 'missing' in the department column: 4608


In [14]:
# Keep the rows whose department is 'missing' in their own frame for inspection,
# and display up to 5000 of them.
missing_departments_df = df.loc[df['department'] == 'missing']
missing_departments_df.head(n=5000)


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list,max_order,order_number_group
145,887348,109205,13,1,8,28.00,100,1,1,21,missing,missing,Tuesday,Morning,13,1-20 order
156,1750132,57626,13,0,10,12.00,100,1,1,21,missing,missing,Monday,Morning,13,1-20 order
983,1218457,160124,13,0,13,20.00,100,25,1,21,missing,missing,Monday,Afternoon,13,1-20 order
1050,3152951,67226,50,1,19,2.00,100,1,1,21,missing,missing,Tuesday,Night,50,41-60 order
1477,2949066,166141,29,4,13,1.00,100,4,0,21,missing,missing,Friday,Afternoon,60,41-60 order
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1892785,273225,106209,13,2,13,30.00,100,11,0,21,missing,missing,Wednesday,Afternoon,13,1-20 order
1894262,2098439,195841,46,3,17,0.00,100,10,1,21,missing,missing,Thursday,Afternoon,46,41-60 order
1894546,283361,129128,21,5,14,3.00,100,10,1,21,missing,missing,Saturday,Afternoon,21,21-40 order
1895064,3089868,145480,54,6,14,3.00,100,7,0,21,missing,missing,Sunday,Afternoon,54,41-60 order


In [15]:
# Keep only the rows whose department is not the placeholder 'missing'.
df = df.loc[df['department'] != 'missing']

In [16]:
# Sanity check: the count of 'missing' departments is expected to be zero now.
print("Number of rows with 'missing' in the department column:", len(df[df['department'] == 'missing']))

Number of rows with 'missing' in the department column: 0


Dado que representa una porción ínfima del dataset y no aporta información sobre lo comprado, se decide eliminarla.

In [17]:
# List the distinct product names present in the dataset.
unique_values = df.loc[:, 'product_name'].unique()
print(unique_values)

['tea' 'fresh vegetables' 'fresh fruits' 'yogurt' 'soy lactosefree'
 'bakery desserts' 'frozen breakfast' 'butter' 'cereal' 'eggs'
 'buns rolls' 'cream' 'water seltzer sparkling water' 'baking ingredients'
 'pickled goods olives' 'packaged poultry' 'packaged cheese'
 'other creams cheeses' 'honeys syrups nectars' 'coffee' 'refrigerated'
 'energy granola bars' 'plates bowls cups flatware' 'paper goods'
 'oral hygiene' 'diapers wipes' 'food storage' 'nuts seeds dried fruit'
 'soap' 'packaged vegetables fruits' 'hot dogs bacon sausage' 'lunch meat'
 'chips pretzels' 'soft drinks' 'meat counter' 'poultry counter'
 'fresh dips tapenades' 'milk' 'ice cream ice' 'prepared soups salads'
 'condiments' 'bread' 'juice nectars' 'canned fruit applesauce'
 'preserved dips spreads' 'packaged produce' 'canned jarred vegetables'
 'fresh pasta' 'pasta sauce' 'frozen produce' 'frozen appetizers sides'
 'soup broth bouillon' 'dry pasta' 'prepared meals' 'fresh herbs'
 'hot cereal pancake mixes' 'spices se

In [18]:
# LabelEncoder is imported in the notebook's top import cell together with the
# other dependencies, instead of in the middle of the analysis.

# Encode each product_name as an integer label stored in 'product_label'.
le = LabelEncoder()
df['product_label'] = le.fit_transform(df['product_name'])

# Lookup from product name to its encoded label.
# NOTE: despite its name, this maps product names, not departments; the name
# is kept unchanged in case other cells reference it.
department_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Product Encoding Mapping:")
print(department_mapping)

df.head()


Product Encoding Mapping:
{'air fresheners candles': 0, 'asian foods': 1, 'baby accessories': 2, 'baby bath body care': 3, 'baby food formula': 4, 'bakery desserts': 5, 'baking ingredients': 6, 'baking supplies decor': 7, 'beauty': 8, 'beers coolers': 9, 'body lotions soap': 10, 'bread': 11, 'breakfast bakery': 12, 'breakfast bars pastries': 13, 'bulk dried fruits vegetables': 14, 'bulk grains rice dried goods': 15, 'buns rolls': 16, 'butter': 17, 'candy chocolate': 18, 'canned fruit applesauce': 19, 'canned jarred vegetables': 20, 'canned meals beans': 21, 'canned meat seafood': 22, 'cat food care': 23, 'cereal': 24, 'chips pretzels': 25, 'cleaning products': 26, 'cocoa drink mixes': 27, 'coffee': 28, 'cold flu allergy': 29, 'condiments': 30, 'cookies cakes': 31, 'crackers': 32, 'cream': 33, 'deodorants': 34, 'diapers wipes': 35, 'digestion': 36, 'dish detergents': 37, 'dog food care': 38, 'doughs gelatins bake mixes': 39, 'dry pasta': 40, 'eggs': 41, 'energy granola bars': 42, 'energ

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,department_id,department,product_name,day,order_time_list,max_order,order_number_group,product_label
0,1201011,147243,14,0,16,3.0,94,1,0,7,beverages,tea,Monday,Afternoon,38,21-40 order,124
1,1201011,147243,14,0,16,3.0,83,2,0,4,produce,fresh vegetables,Monday,Afternoon,38,21-40 order,53
2,1201011,147243,14,0,16,3.0,83,3,1,4,produce,fresh vegetables,Monday,Afternoon,38,21-40 order,53
3,1201011,147243,14,0,16,3.0,24,4,1,4,produce,fresh fruits,Monday,Afternoon,38,21-40 order,50
4,1201011,147243,14,0,16,3.0,120,5,1,16,dairy eggs,yogurt,Monday,Afternoon,38,21-40 order,132


#### Se exporta el DF con los cambios realizados

In [19]:
# Output locations: the plain CSV and the zip archive that wraps it.
csv_filename = '../BD/Supermercado_mod.csv'
zip_filename = '../BD/Supermercado_mod.zip'

# Write the modified DataFrame to CSV without the index column.
df.to_csv(csv_filename, index=False)

# Compress that CSV into the archive under a path-free member name.
with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_filename, arcname='Supermercado_mod.csv')