## Setup

In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
customer_data = pd.read_csv("../data/raw/eci_customer_data.csv")
product_groups_data = pd.read_csv("../data/raw/eci_product_groups.csv")
product_master_data = pd.read_csv("../data/raw/eci_product_master.csv")
stores_clusters_data = pd.read_csv("../data/raw/eci_stores_clusters.csv")
stores_data = pd.read_csv("../data/raw/eci_stores.csv")
transactions_data = pd.read_csv("../data/raw/eci_transactions.csv")

## Exploracion de datos / Data imputation

### Transactions

In [4]:
transactions_data.head()

Unnamed: 0,TRANSACTION_ID,DATE,STORE_ID,SKU,QUANTITY,PRICE,TOTAL_SALES,SUBGROUP,STORE_SUBGROUP_DATE_ID
0,17185,2021-01-01,S00148,CLOMEJA001,1.0,52.8,52.8,Jackets,S00148_Jackets_2021-01-01
1,4108,2021-01-01,S00035,CLOCHIN014,1.0,55.25,55.25,Infants,S00035_Infants_2021-01-01
2,2400,2021-01-01,S00020,BEAHASH005,1.0,40.19,40.19,Shampoo,S00020_Shampoo_2021-01-01
3,7978,2021-01-01,S00068,TOYGABO004,1.0,35.41,35.41,Board Games,S00068_Board Games_2021-01-01
4,6946,2021-01-01,S00059,BEAHASH006,1.0,38.28,38.28,Shampoo,S00059_Shampoo_2021-01-01


In [3]:
# Convertimos la columna DATE a tipo datetime
transactions_data["DATE"] = pd.to_datetime(transactions_data["DATE"], format="%Y-%m-%d")

In [4]:
# Tenemos data desde 2021-01-01 hasta 2023-12-31 (3 años)
transactions_data.describe().round(2)

Unnamed: 0,TRANSACTION_ID,DATE,QUANTITY,PRICE,TOTAL_SALES
count,19004759.0,19004759,18909672.0,19004759.0,19004759.0
mean,9502380.0,2022-06-22 20:26:04.793325568,1.44,68.69,97.37
min,1.0,2021-01-01 00:00:00,1.0,4.02,4.02
25%,4751190.5,2021-09-23 00:00:00,1.0,22.94,25.5
50%,9502380.0,2022-06-21 00:00:00,1.0,41.24,48.01
75%,14253569.5,2023-03-16 00:00:00,2.0,74.02,93.02
max,19004759.0,2023-12-31 00:00:00,85.0,431.87,8369.26
std,5486201.51,,1.01,84.06,155.9


In [7]:
transactions_data.isna().sum()

TRANSACTION_ID                0
DATE                          0
STORE_ID                      0
SKU                           0
QUANTITY                  95087
PRICE                         0
TOTAL_SALES                   0
SUBGROUP                      0
STORE_SUBGROUP_DATE_ID        0
dtype: int64

In [5]:
# Si la columna de Quantity es NaN, llenamos los valores de Quantitiy como Total_sales / Price
transactions_data['QUANTITY'] = transactions_data.apply(
    lambda row: row['TOTAL_SALES'] / row['PRICE'] if pd.isna(row['QUANTITY']) else row['QUANTITY'], axis=1
)

### Clientes

In [9]:
customer_data.head()

Unnamed: 0,client_id,phone_number,email_address,city,state,zip_code,education_level,occupation,loyalty_member,loyalty_number,loyalty_points
0,500001,,n.edwards+spam@outlook.com,Spokane,WA,99201,High School,,No,,339.0
1,500002,(412) 698-7960,lisa_cook@yahoo.com,Pittsburgh,PA,15222,High School,Sales Representative,Yes,LP525082,406.0
2,500003,(303) 491-3774,,Fort Collins,CO,80526,College Graduate,Analyst,True,950139,14.0
3,500004,(803) 255-0104,betty-campbell@gmail.com,Charleston,SC,29401,Some College,Supervisor,No,,
4,500005,(303) 596-8987,,Aurora,CO,80012,Associates,,Yes,,


In [6]:
# Cantidad de clientes
customer_data["client_id"].nunique()

801923

### Productos

In [11]:
product_groups_data.head()

Unnamed: 0,sku,product_name,price_group_id,price_group_name,group_type
0,BOOEDTE001,PageTurn Essential Textbooks,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal
1,BOOEDTE002,Bookworm Advanced Textbooks,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal
2,BOOEDTE004,PageTurn Essential Textbooks,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal
3,CLOMESH002,DressRight Shirts Collection,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal
4,CLOMESH005,StyleX Relaxed Shirts,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal


In [12]:
product_groups_data.isna().sum()

sku                                   0
 product_name                         0
 price_group_id                       0
 price_group_name                     0
 group_type                           0
dtype: int64

In [9]:
# Renombro columnas para que coincidan con las de transactions_data
product_master_data.rename(columns={"sku": "SKU"}, inplace=True)
product_groups_data.rename(columns={"sku": "SKU"}, inplace=True)

In [14]:
len(product_groups_data)

80

In [10]:
product_groups_data["SKU"].nunique()

73

In [11]:
# Hay repetidos, que se deben a que un producto puede pertenecer a varios grupos
product_groups_data[product_groups_data["SKU"].duplicated(keep=False)].sort_values("SKU")

Unnamed: 0,SKU,product_name,price_group_id,price_group_name,group_type
7,ELECOLA006,Electra Laptops Edge XL,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal
31,ELECOLA006,Electra Laptops Edge XL,COMP_SETUP_01,Computer Setup Essentials,Bundle
51,ELECOLA006,Electra Laptops Edge XL,MOBILE_DEVICES_01,Personal Computing Devices,Substitute
8,ELECOLA009,ByteWare Laptops Smart 2025,BACK_TO_SCHOOL_01,Back to School Essentials,Seasonal
52,ELECOLA009,ByteWare Laptops Smart 2025,MOBILE_DEVICES_01,Personal Computing Devices,Substitute
34,ELEMOAC008,ByteWare Accessories Air 2025,COMP_SETUP_01,Computer Setup Essentials,Bundle
48,ELEMOAC008,ByteWare Accessories Air 2025,GIFT_UNDER_50_01,Gift Ideas Under $50,Promotional
35,ELEMOAC009,TechPro Accessories Air X,COMP_SETUP_01,Computer Setup Essentials,Bundle
49,ELEMOAC009,TechPro Accessories Air X,GIFT_UNDER_50_01,Gift Ideas Under $50,Promotional
38,SPOFIEQ004,FitZone Sport Equipment,FITNESS_COMBO_01,Complete Fitness Package,Bundle


In [12]:
# Dado que no se diferencian en las transacciones, los eliminamos por simplicidad
product_groups_data.drop_duplicates(subset=["SKU"], inplace=True)

In [18]:
product_master_data.head()

Unnamed: 0,SKU,product_name,category,group,subgroup,brand,base_price,initial_ticket_price,costos
0,ELECOLA001,TechPro Laptops Smart 2025,Electronics,Computing,Laptops,TechPro,284.94,355.14,225.84
1,ELECOLA002,Electra Laptops Ultra Pro,Electronics,Computing,Laptops,Electra,311.83,352.74,209.99
2,ELECOLA003,CircuitCity Laptops Air 2024,Electronics,Computing,Laptops,CircuitCity,246.96,301.35,192.61
3,ELECOLA004,TechPro Laptops Elite 2025,Electronics,Computing,Laptops,TechPro,324.96,420.49,212.23
4,ELECOLA005,CircuitCity Laptops Pro S,Electronics,Computing,Laptops,CircuitCity,339.88,386.23,231.75


In [13]:
product_master_data.isna().sum()

SKU                     0
product_name            0
category                0
group                   0
subgroup                0
brand                   0
base_price              0
initial_ticket_price    0
costos                  0
dtype: int64

In [14]:
# Notar el formato de las columnas de product_groups_data
product_groups_data.columns

Index(['SKU', 'product_name', 'price_group_id', 'price_group_name',
       'group_type'],
      dtype='object')

In [15]:
# Unimos los datos de productos con los grupos de productos
productos = pd.merge(product_master_data, product_groups_data, on="SKU", how="left")

In [16]:
# Hay muchos productos que no tienen grupo asignado
productos.isna().sum()

SKU                       0
product_name_x            0
category                  0
group                     0
subgroup                  0
brand                     0
base_price                0
initial_ticket_price      0
costos                    0
product_name_y          788
price_group_id          788
price_group_name        788
group_type              788
dtype: int64

In [18]:
# Asignamos Comun a los productos que no tienen grupo asignado (price_group_id y group_type, el resto no las utilizamos)
productos["price_group_id"] = productos['price_group_id'].fillna('Comun')
productos['group_type']= productos['group_type'].fillna('Comun')


In [19]:
# Notar que hay 74 subproductos, pero en las transacciones solo aparecen 73. En concreto, no aparece 'Basketball'
productos["subgroup"].nunique()

74

### Stores

In [20]:
stores_clusters_data.head()

Unnamed: 0,STORE_ID,BRAND,STORE_NAME,CLUSTER
0,S00001,PrimeGoods,PrimeGoods - Hillsboro,Cluster_West_Coast
1,S00002,MegaShop,MegaShop Indianapolis Eastside,Midwest_Cluster
2,STORE0003,MarketExpress,MarketExpress - Las Vegas,Mountain_Cluster
3,STORE0004,MegaShop,MegaShop - Oklahoma City,Texas_Cluster
4,S00005,PrimeGoods,PrimeGoods - Charlotte,NC_Cluster


In [26]:
stores_clusters_data.describe()

Unnamed: 0,STORE_ID,BRAND,STORE_NAME,CLUSTER
count,157,157,157,140
unique,157,9,144,13
top,S00001,EssentialsPlus,EssentialsPlus - Spartanburg,Cluster_West_Coast
freq,1,25,3,16


In [21]:
# Aunque Cluster tenga NaN, no es necesario el dato dado que ya sabemos el estado y la ciudad de cada tienda
stores_clusters_data.isna().sum()

STORE_ID       0
BRAND          0
STORE_NAME     0
CLUSTER       17
dtype: int64

In [22]:
# Notar que hay diferentes formatos de STORE_ID, pero coinciden con los de stores_data y los de transactions_data
(stores_clusters_data["STORE_ID"].unique() == stores_data["STORE_ID"].unique()).sum()

np.int64(157)

In [23]:
stores_data.head()

Unnamed: 0,STORE_ID,BRAND,STORE_NAME,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,OPENDATE,CLOSEDATE,STORE_TYPE,REGION
0,S00001,PrimeGoods,PrimeGoods - Hillsboro,5735 NE Washington Pl,,Hillsboro,OR,74820,2015-01-10 00:00:00,,Express,West
1,S00002,MegaShop,MegaShop Indianapolis Eastside,3557 Cedar Ct,Unit 89,Indianapolis,IN,33483-1775,2012-03-23 00:00:00,,Express,Midwest
2,STORE0003,MarketExpress,MarketExpress - Las Vegas,7870 W Church Ave,,Las Vegas,NV,49099,2021-09-23 00:00:00,,Outlet,West
3,STORE0004,MegaShop,MegaShop - Oklahoma City,3105 N Washington Pl,,Oklahoma City,OK,80592,06/30/2015,,Mall,Southwest
4,S00005,PrimeGoods,PrimeGoods - Charlotte,8756 SW Maple Ln,Suite 848,Charlotte,NC,32299,2019-06-10 00:00:00,,Express,Southeast


In [30]:
stores_data.describe()

Unnamed: 0,STORE_ID,BRAND,STORE_NAME,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,OPENDATE,CLOSEDATE,STORE_TYPE,REGION
count,157,157,157,157,43,157,157,151,157,18,155,157
unique,157,9,144,157,34,77,22,151,155,18,8,5
top,S00001,EssentialsPlus,EssentialsPlus - Spartanburg,5735 NE Washington Pl,Building E,Hillsboro,SC,74820,2017-09-27 00:00:00,2028-02-21 00:00:00,Mall,Southeast
freq,1,25,3,1,3,5,13,1,2,1,44,47


In [24]:
stores_data.isna().sum()

STORE_ID        0
BRAND           0
STORE_NAME      0
ADDRESS1        0
ADDRESS2      114
CITY            0
STATE           0
ZIP             6
OPENDATE        0
CLOSEDATE     139
STORE_TYPE      2
REGION          0
dtype: int64

In [25]:
# Completamos los NaN en STORE_TYPE con el valor mas frecuente
stores_data["STORE_TYPE"]= stores_data["STORE_TYPE"].fillna(stores_data["STORE_TYPE"].mode()[0])

In [26]:
# Hay distintos formatos de fechas, lo unificamos
stores_data_fixed = stores_data.copy()
stores_data_fixed["OPENDATE"] = pd.to_datetime(stores_data["OPENDATE"], errors='coerce')
mask = stores_data_fixed[stores_data_fixed["OPENDATE"].isna()]
stores_data_fixed.loc[mask.index, "OPENDATE"] = pd.to_datetime(stores_data.loc[mask.index, "OPENDATE"], errors='coerce')

In [27]:
# Reemplazamos los valores de CLOSDATE que son NaN o mayor que la fecha actual, por la ultima fecha más un año (indicando que todavia no cerro)
stores_data_fixed["CLOSEDATE"] = pd.to_datetime(stores_data["CLOSEDATE"], format="mixed")

In [28]:
# Fecha de hoy
fecha_hoy = pd.Timestamp("2024-01-01")
stores_data_fixed.loc[
    (stores_data_fixed["CLOSEDATE"].isna()) | 
    (stores_data_fixed["CLOSEDATE"] > fecha_hoy), 
    "CLOSEDATE"
] = fecha_hoy + pd.DateOffset(years=1)

In [29]:
# Como no utilizamos Adress, podemos dejar que haya NaN
stores_data_fixed.isna().sum()

STORE_ID        0
BRAND           0
STORE_NAME      0
ADDRESS1        0
ADDRESS2      114
CITY            0
STATE           0
ZIP             6
OPENDATE        0
CLOSEDATE       0
STORE_TYPE      0
REGION          0
dtype: int64

Notar que, aunque haya 157 tiendas, varias cerraron antes del 2021 y hay dos que cerraron entre 2021 y 2024 (104  21)

In [30]:
stores_data_fixed

Unnamed: 0,STORE_ID,BRAND,STORE_NAME,ADDRESS1,ADDRESS2,CITY,STATE,ZIP,OPENDATE,CLOSEDATE,STORE_TYPE,REGION
0,S00001,PrimeGoods,PrimeGoods - Hillsboro,5735 NE Washington Pl,,Hillsboro,OR,74820,2015-01-10,2025-01-01,Express,West
1,S00002,MegaShop,MegaShop Indianapolis Eastside,3557 Cedar Ct,Unit 89,Indianapolis,IN,33483-1775,2012-03-23,2025-01-01,Express,Midwest
2,STORE0003,MarketExpress,MarketExpress - Las Vegas,7870 W Church Ave,,Las Vegas,NV,49099,2021-09-23,2025-01-01,Outlet,West
3,STORE0004,MegaShop,MegaShop - Oklahoma City,3105 N Washington Pl,,Oklahoma City,OK,80592,2015-06-30,2025-01-01,Mall,Southwest
4,S00005,PrimeGoods,PrimeGoods - Charlotte,8756 SW Maple Ln,Suite 848,Charlotte,NC,32299,2019-06-10,2025-01-01,Express,Southeast
...,...,...,...,...,...,...,...,...,...,...,...,...
152,S00153,MegaShop,MegaShop - Oklahoma City,3894 Park Pl,,Oklahoma City,OK,86274,2020-04-01,2025-01-01,Boutique,Southwest
153,S00154,BudgetShop,BudgetShop - Newark,5992 View Rd,,Newark,NJ,61756,2009-01-05,2025-01-01,Outlet,Northeast
154,S00155,EssentialsPlus,EssentialsPlus - Columbia,6240 N River Ave,,Columbia,SC,99211-2505,2015-05-20,2025-01-01,Mall,Southeast
155,S00156,FamilyStore,FamilyStore - Naperville,4679 Market Dr,,Naperville,IL,86674,2014-12-23,2025-01-01,Street,Midwest


## Merge de datos

In [31]:
stores = pd.merge(stores_data_fixed, stores_clusters_data, on="STORE_ID", how="left")

In [32]:
datos_unidos = pd.merge(transactions_data, stores, on="STORE_ID", how="left")

In [33]:
del transactions_data

In [34]:
datos_unidos = pd.merge(datos_unidos, productos, on="SKU", how="left")

In [35]:
datos_unidos.columns

Index(['TRANSACTION_ID', 'DATE', 'STORE_ID', 'SKU', 'QUANTITY', 'PRICE',
       'TOTAL_SALES', 'SUBGROUP', 'STORE_SUBGROUP_DATE_ID', 'BRAND_x',
       'STORE_NAME_x', 'ADDRESS1', 'ADDRESS2', 'CITY', 'STATE', 'ZIP',
       'OPENDATE', 'CLOSEDATE', 'STORE_TYPE', 'REGION', 'BRAND_y',
       'STORE_NAME_y', 'CLUSTER', 'product_name_x', 'category', 'group',
       'subgroup', 'brand', 'base_price', 'initial_ticket_price', 'costos',
       'product_name_y', 'price_group_id', 'price_group_name', 'group_type'],
      dtype='object')

In [39]:
# Renombramos las columnas
datos_unidos.rename(columns={"price_group_id": "PRICE_GROUP_ID", "BRAND_x": "BRAND",
                             "category":"CATEGORY", "group":"GROUP", "base_price": "BASE_PRICE", 
                             "initial_ticket_price": "INITIAL_TICKET_PRICE", "costos": "COSTOS",
                             "group_type": "GROUP_TYPE"}, inplace=True)

In [40]:
# Nos quedamos con las columnas que nos interesan
datos_unidos = datos_unidos[["DATE",  "SKU", "QUANTITY", "PRICE", "TOTAL_SALES",
    "REGION", "CITY", "STATE", "STORE_TYPE", "STORE_ID", "OPENDATE", "CLOSEDATE", "STORE_SUBGROUP_DATE_ID",
    "CATEGORY", "GROUP", "SUBGROUP","GROUP_TYPE", "PRICE_GROUP_ID", "BRAND", "INITIAL_TICKET_PRICE", "BASE_PRICE", "COSTOS" ]]

In [41]:
len(datos_unidos)

19159425

In [42]:
# No aseguramos que no haya valores NaN
datos_unidos.isna().sum()

DATE                      0
SKU                       0
QUANTITY                  0
PRICE                     0
TOTAL_SALES               0
REGION                    0
CITY                      0
STATE                     0
STORE_TYPE                0
STORE_ID                  0
OPENDATE                  0
CLOSEDATE                 0
STORE_SUBGROUP_DATE_ID    0
CATEGORY                  0
GROUP                     0
SUBGROUP                  0
GROUP_TYPE                0
PRICE_GROUP_ID            0
BRAND                     0
INITIAL_TICKET_PRICE      0
BASE_PRICE                0
COSTOS                    0
dtype: int64

## Feature Engineering

In [48]:
# Nos aseguramos que este ordenado por fecha
datos_unidos = datos_unidos.copy()
datos_unidos.sort_values(by="DATE")

Unnamed: 0,DATE,SKU,QUANTITY,PRICE,TOTAL_SALES,REGION,CITY,STATE,STORE_TYPE,STORE_ID,...,STORE_SUBGROUP_DATE_ID,CATEGORY,GROUP,SUBGROUP,GROUP_TYPE,PRICE_GROUP_ID,BRAND,INITIAL_TICKET_PRICE,BASE_PRICE,COSTOS
0,2021-01-01,CLOMEJA001,1.0,52.80,52.80,Midwest,Akron,OH,Flagship,S00148,...,S00148_Jackets_2021-01-01,Clothing,Men,Jackets,Comun,Comun,BudgetShop,67.45,59.21,34.69
7,2021-01-01,GROPABA010,1.0,9.16,9.16,West,Hillsboro,OR,Mall,S00144,...,S00144_Baking_2021-01-01,Groceries,Pantry,Baking,Comun,Comun,QuickBuy,11.97,9.63,8.06
1,2021-01-01,CLOCHIN014,1.0,55.25,55.25,Southeast,Tampa,FL,Outlet,S00035,...,S00035_Infants_2021-01-01,Clothing,Children,Infants,Comun,Comun,BudgetShop,65.54,51.39,29.00
2,2021-01-01,BEAHASH005,1.0,40.19,40.19,Midwest,Cleveland,OH,Street,S00020,...,S00020_Shampoo_2021-01-01,Beauty,Haircare,Shampoo,Comun,Comun,FamilyStore,48.18,40.85,14.75
3,2021-01-01,TOYGABO004,1.0,35.41,35.41,West,Lakewood,CO,Express,S00068,...,S00068_Board Games_2021-01-01,Toys,Games,Board Games,Comun,Comun,MarketExpress,37.99,34.15,19.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19159395,2023-12-31,GROPAPA013,1.0,8.68,8.68,West,North Las Vegas,NV,Flagship,S00145,...,S00145_Pasta_2023-12-31,Groceries,Pantry,Pasta,Comun,Comun,PrimeGoods,10.46,8.31,6.11
19159394,2023-12-31,GROSNCH011,1.0,9.28,9.28,West,Salem,OR,Mall,S00040,...,S00040_Chips_2023-12-31,Groceries,Snacks,Chips,Comun,Comun,EssentialsPlus,12.48,10.01,8.27
19159393,2023-12-31,GROPACA015,1.0,11.28,11.28,Southwest,Phoenix,AZ,Street,S00055,...,S00055_Canned Goods_2023-12-31,Groceries,Pantry,Canned Goods,Comun,Comun,BudgetShop,14.21,11.87,8.90
19159412,2023-12-31,HOMAPSP005,1.0,87.16,87.16,Midwest,Carmel,IN,Outlet,000053,...,000053_Specialty_2023-12-31,Home & Kitchen,Appliances,Specialty,Comun,Comun,MegaShop,112.07,94.74,50.19


In [49]:
datos_unidos["OPENDATE"] = pd.to_datetime(datos_unidos["OPENDATE"])
datos_unidos["CLOSEDATE"] = pd.to_datetime(datos_unidos["CLOSEDATE"])

In [50]:
datos_unidos["YEAR_OPEN"] = datos_unidos["OPENDATE"].dt.year
datos_unidos["YEAR_CLOSE"] = datos_unidos["CLOSEDATE"].dt.year
datos_unidos["MONTH_OPEN"] = datos_unidos["OPENDATE"].dt.month
datos_unidos["MONTH_CLOSE"] = datos_unidos["CLOSEDATE"].dt.month

In [51]:
datos_unidos["YEAR"] = datos_unidos["DATE"].dt.year
datos_unidos["MONTH"] = datos_unidos["DATE"].dt.month
datos_unidos["DAY"] = datos_unidos["DATE"].dt.day
datos_unidos["DAY_OF_WEEK"] = datos_unidos["DATE"].dt.day_name()
datos_unidos["WEEK"] = datos_unidos["DATE"].dt.isocalendar().week

In [52]:
# La mayoria de datos son transacciones repetidas
datos_unidos.duplicated(subset=["SKU", "DATE", "STORE_ID"], keep=False).sum()

np.int64(18451229)

In [53]:
# Las agregamos para que tengamos una sola fila por SKU, DATE y STORE_ID, promediando el precio y sumando la cantidad y las ventas totales
otras_cols = [c for c in datos_unidos.columns if c not in ['SKU', 'DATE', 'STORE_ID'] + ['PRICE', 'QUANTITY', "TOTAL_SALES"]]

agg_dict = {'PRICE': 'mean', 'QUANTITY': 'sum', "TOTAL_SALES": 'sum'}
agg_dict.update({col: 'first' for col in otras_cols})

datos_unidos = datos_unidos.groupby(['SKU', 'DATE', 'STORE_ID'], as_index=False).agg(agg_dict)

In [54]:
len(datos_unidos)

6859693

Ahorramos memoria cambiando el tipo de datos

In [55]:
cols_category= ["SKU", "REGION", "CITY", "STATE", "STORE_TYPE", "STORE_ID", "STORE_SUBGROUP_DATE_ID",
    "CATEGORY", "GROUP", "SUBGROUP","GROUP_TYPE", "PRICE_GROUP_ID", "BRAND", "DAY_OF_WEEK"]
for col in cols_category:
    datos_unidos[col] = datos_unidos[col].astype("category")

In [56]:
for col in ["PRICE", "TOTAL_SALES", "INITIAL_TICKET_PRICE", "BASE_PRICE", "COSTOS" ]:
    datos_unidos[col] = datos_unidos[col].astype("float32")

In [57]:
for col in ["QUANTITY", "DAY", "WEEK", "MONTH", "YEAR", "YEAR_OPEN", "YEAR_CLOSE", "MONTH_OPEN", "MONTH_CLOSE"]:
    datos_unidos[col] = datos_unidos[col].astype("int16")

In [58]:
# Memoria utilizada (GB)
datos_unidos.memory_usage(deep=True).sum() / (1024**3)

np.float64(1.0472219986841083)

## Outliers

In [59]:
datos_unidos.describe().round(2)

Unnamed: 0,DATE,PRICE,QUANTITY,TOTAL_SALES,OPENDATE,CLOSEDATE,INITIAL_TICKET_PRICE,BASE_PRICE,COSTOS,YEAR_OPEN,YEAR_CLOSE,MONTH_OPEN,MONTH_CLOSE,YEAR,MONTH,DAY,WEEK
count,6859693,6859693.0,6859693.0,6859693.0,6859693,6859693,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0,6859693.0
mean,2022-07-01 15:24:47.001414656,69.69,4.0,272.53,2015-08-13 14:25:29.189950976,2024-12-26 08:00:01.998457600,84.78,70.71,44.5,2015.08,2024.97,6.87,1.13,2022.0,6.55,15.72,26.66
min,2021-01-01 00:00:00,4.02,1.0,4.75,2001-08-29 00:00:00,2023-10-29 00:00:00,8.88,8.02,5.76,2001.0,2023.0,1.0,1.0,2021.0,1.0,1.0,1.0
25%,2021-10-01 00:00:00,23.29,2.0,60.37,2012-05-29 00:00:00,2025-01-01 00:00:00,28.49,24.06,11.21,2012.0,2025.0,4.0,1.0,2021.0,4.0,8.0,14.0
50%,2022-07-02 00:00:00,41.82,4.0,137.41,2016-02-26 00:00:00,2025-01-01 00:00:00,50.29,42.63,19.76,2016.0,2025.0,7.0,1.0,2022.0,7.0,16.0,27.0
75%,2023-04-01 00:00:00,74.78,5.0,284.2,2019-10-14 00:00:00,2025-01-01 00:00:00,92.68,77.56,45.22,2019.0,2025.0,10.0,1.0,2023.0,10.0,23.0,40.0
max,2023-12-31 00:00:00,431.87,108.0,13211.92,2023-10-14 00:00:00,2025-01-01 00:00:00,452.69,358.79,281.26,2023.0,2025.0,12.0,11.0,2023.0,12.0,31.0,53.0
std,,85.01,2.51,410.86,,,102.98,85.73,63.88,4.86,0.23,3.37,1.09,0.82,3.45,8.8,15.05


In [60]:
datos_unidos.QUANTITY.quantile(0.999)

np.float64(22.0)

In [61]:
# Eliminamos los outliers de la columna QUANTITY, aquellos que superen el 99.9%
datos_unidos = datos_unidos[datos_unidos.QUANTITY <= datos_unidos.QUANTITY.quantile(0.999)]

In [63]:
# Lo mismo con PRICE y TOTAL SALES
datos_unidos = datos_unidos[datos_unidos.PRICE <= datos_unidos.PRICE.quantile(0.999)]
datos_unidos = datos_unidos[datos_unidos.TOTAL_SALES <= datos_unidos.TOTAL_SALES.quantile(0.999)]

In [64]:
datos_unidos.describe().round(2)

Unnamed: 0,DATE,PRICE,QUANTITY,TOTAL_SALES,OPENDATE,CLOSEDATE,INITIAL_TICKET_PRICE,BASE_PRICE,COSTOS,YEAR_OPEN,YEAR_CLOSE,MONTH_OPEN,MONTH_CLOSE,YEAR,MONTH,DAY,WEEK
count,6825953,6825953.0,6825953.0,6825953.0,6825953,6825953,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0,6825953.0
mean,2022-07-02 00:37:01.328155904,68.59,3.97,262.26,2015-08-13 12:45:25.080586496,2024-12-26 08:03:11.842399744,83.5,69.65,43.69,2015.08,2024.97,6.87,1.13,2022.0,6.55,15.71,26.67
min,2021-01-01 00:00:00,4.02,1.0,4.75,2001-08-29 00:00:00,2023-10-29 00:00:00,8.88,8.02,5.76,2001.0,2023.0,1.0,1.0,2021.0,1.0,1.0,1.0
25%,2021-10-01 00:00:00,23.22,2.0,60.06,2012-05-29 00:00:00,2025-01-01 00:00:00,28.32,24.03,11.16,2012.0,2025.0,4.0,1.0,2021.0,4.0,8.0,14.0
50%,2022-07-02 00:00:00,41.67,4.0,136.48,2016-02-26 00:00:00,2025-01-01 00:00:00,50.05,42.57,19.57,2016.0,2025.0,7.0,1.0,2022.0,7.0,16.0,27.0
75%,2023-04-01 00:00:00,74.08,5.0,280.8,2019-10-14 00:00:00,2025-01-01 00:00:00,90.94,77.07,44.34,2019.0,2025.0,10.0,1.0,2023.0,10.0,23.0,40.0
max,2023-12-31 00:00:00,375.75,22.0,2948.51,2023-10-14 00:00:00,2025-01-01 00:00:00,452.69,358.79,281.26,2023.0,2025.0,12.0,11.0,2023.0,12.0,31.0,53.0
std,,83.31,2.36,370.05,,,101.21,84.26,62.72,4.86,0.23,3.37,1.09,0.82,3.45,8.8,15.05


In [65]:
len(datos_unidos)

6825953

In [66]:
# Guardamos los datos
datos_unidos.to_csv("../data/procesados/datos_unidos.csv", index=False)

In [68]:
print(datos_unidos.columns)

Index(['SKU', 'DATE', 'STORE_ID', 'PRICE', 'QUANTITY', 'TOTAL_SALES', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE', 'OPENDATE', 'CLOSEDATE',
       'STORE_SUBGROUP_DATE_ID', 'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'INITIAL_TICKET_PRICE', 'BASE_PRICE',
       'COSTOS', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE',
       'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK'],
      dtype='object')
