In [3]:
import polars as pl

In [2]:
# Maximum number of table rows polars shows when a DataFrame is displayed.
MAX_TABLE_ROWS = 25
pl.Config.set_tbl_rows(MAX_TABLE_ROWS)

polars.config.Config

In [3]:
# Lazily scan the training sales CSV; rows are only read when a query on
# `lzdf` is collected.
lzdf = pl.scan_csv(
    "./data/favorita_dataset/train.csv",
    has_header=True,
    separator=",",
)

In [29]:
# Per (store, item) pair, count the rows that have positive unit sales and
# keep only the pairs with more than 1600 such rows.
# NOTE: "total_sales" is a row count (pl.len), not a sum of units sold.
sub = (
    lzdf
    .filter(pl.col("unit_sales") > 0)
    .group_by(["store_nbr", "item_nbr"])
    .agg(total_sales=pl.len())
    .filter(pl.col("total_sales") > 1600)
    .collect()
)

In [31]:
# Number of retained (store, item) pairs per store; show the ten stores
# with the most pairs.
(
    sub
    .group_by("store_nbr")
    .agg(total_items=pl.len())
    .sort("total_items", descending=True)
    .head(10)
)

store_nbr,total_items
i64,u32
45,612
44,583
3,536
47,527
49,462
46,453
48,399
8,394
51,332
6,320


In [6]:
# Store numbers and item numbers that make up the exported subset.
stores = [
    44,
    3,
    45,
    47,
]
items = [
    502331,
    314384,
    213652,
    507478,
    258396,
]
# Item ids tried earlier and currently unused:
#   1047679, 584028, 1167614, 1430040, 165704, 1503844, 1473474

In [148]:
# Restrict the per-pair counts to the selected stores, then average the
# count per item across those stores and rank items by that average.
# NOTE: the output column is called "total_items" but holds the mean of
# "total_sales" for each item.
(
    sub
    .filter(pl.col("store_nbr").is_in(stores))
    .group_by("item_nbr")
    .agg(total_items=pl.col("total_sales").mean())
    .sort("total_items", descending=True)
)

item_nbr,total_items
i64,f64
502331,1679.0
314384,1679.0
582865,1679.0
213652,1679.0
507478,1679.0
258396,1678.75
567623,1678.75
261052,1678.75
311994,1678.75
692537,1678.75


In [23]:
# Materialise the rows with positive unit sales that belong to the selected
# stores and items (the predicates passed to filter are AND-ed together).
sales_df = (
    lzdf
    .filter(
        pl.col("unit_sales") > 0,
        pl.col("store_nbr").is_in(stores),
        pl.col("item_nbr").is_in(items),
    )
    .collect()
)

# Overview table: summed unit sales, one row per store and one column per item.
sales_df.pivot(
    on="item_nbr",
    index="store_nbr",
    values="unit_sales",
    aggregate_function="sum",
)

store_nbr,213652,258396,314384,502331,507478
i64,f64,f64,f64,f64,f64
3,46218.0,46500.0,170131.0,127136.0,47970.0
44,52193.0,50796.0,175134.0,144292.0,47541.0
45,33146.0,36737.0,233796.0,120938.0,48940.0
47,28411.0,38632.0,193371.0,146289.0,34795.0


In [27]:
# Turn 'onpromotion' (stored as the strings 'True' / 'False') into a boolean;
# null values become False. Also parse 'date' into a Date, drop the 'id'
# column and rename the columns for the exported subset.
# NOTE: this cell rebinds `sales_df` with renamed columns, so it cannot be
# re-run on its own; re-run the cell that builds `sales_df` first.
sales_df = (
    sales_df
    .with_columns(
        pl.col("onpromotion").is_not_null() & pl.col("onpromotion").eq("True"),
        pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
    )
    .drop("id")
    .rename(
        {
            "onpromotion": "is_on_promotion",
            "store_nbr": "store_id",
            "item_nbr": "product_id",
            "unit_sales": "units_sold",
        }
    )
)

sales_df.write_parquet("./data/favorita_dataset/subset/sales_train.parquet")


In [29]:
# Rebind `lzdf` to a lazy scan of the items CSV.
lzdf = pl.scan_csv(
    "./data/favorita_dataset/items.csv",
    has_header=True,
    separator=",",
)

In [31]:
# Keep only the selected items, cast 'perishable' to boolean and
# 'class' / 'family' to categoricals, and expose 'item_nbr' as 'product_id'.
products_df = (
    lzdf
    .filter(pl.col("item_nbr").is_in(items))
    .with_columns(
        pl.col("perishable").cast(pl.Boolean),
        pl.col("class").cast(pl.String).cast(pl.Categorical),
        pl.col("family").cast(pl.Categorical),
    )
    .rename({"item_nbr": "product_id"})
    .collect()
)
products_df.write_parquet("./data/favorita_dataset/subset/products.parquet")
products_df

product_id,family,class,perishable
i64,cat,cat,bool
213652,"""GROCERY I""","""1048""",False
258396,"""GROCERY I""","""1010""",False
314384,"""GROCERY I""","""1004""",False
502331,"""BREAD/BAKERY""","""2702""",True
507478,"""EGGS""","""2502""",True


In [4]:
# Rebind `lzdf` to a lazy scan of the stores CSV.
lzdf = pl.scan_csv(
    "./data/favorita_dataset/stores.csv",
    has_header=True,
    separator=",",
)

In [9]:
# Keep only the selected stores, store the id as Int32, cast the descriptive
# columns to categoricals, add a constant 'country' column and expose
# 'store_nbr' as 'store_id'.
stores_df = (
    lzdf
    .filter(pl.col("store_nbr").is_in(stores))
    .with_columns(
        pl.col("store_nbr").cast(pl.Int32),
        # Cast through String first, then to Categorical.
        pl.col("city", "state", "type", "cluster").cast(pl.String).cast(pl.Categorical),
        country=pl.lit("Ecuador").cast(pl.Categorical),
    )
    .rename({"store_nbr": "store_id"})
    .collect()
)
stores_df.write_parquet("./data/favorita_dataset/subset/stores.parquet")
stores_df

store_id,city,state,type,cluster,country
i32,cat,cat,cat,cat,cat
3,"""Quito""","""Pichincha""","""D""","""8""","""Ecuador"""
44,"""Quito""","""Pichincha""","""A""","""5""","""Ecuador"""
45,"""Quito""","""Pichincha""","""A""","""11""","""Ecuador"""
47,"""Quito""","""Pichincha""","""A""","""14""","""Ecuador"""


In [168]:
# Rebind `lzdf` to a lazy scan of the holidays/events CSV.
lzdf = pl.scan_csv(
    "./data/favorita_dataset/holidays_events.csv",
    has_header=True,
    separator=",",
)

In [170]:
from datetime import date

# NOTE: Pay special attention to the `transferred` column. A holiday that is
# transferred officially falls on that calendar day, but was moved to another
# date by the government. A transferred day is more like a normal day than a
# holiday. To find the day it was actually celebrated, look for the
# corresponding row where `type` is Transfer. For example, the holiday
# Independencia de Guayaquil was transferred from 2012-10-09 to 2012-10-12,
# which means it was celebrated on 2012-10-12. Days of type Bridge are extra
# days added to a holiday (e.g. to extend the break across a long weekend).
# These are frequently made up by the type Work Day, a day not normally
# scheduled for work (e.g. Saturday) that is meant to pay back the Bridge.
# Additional holidays are days added to a regular calendar holiday, as
# typically happens around Christmas (making Christmas Eve a holiday).
#
# So in this cell we handle both transferred and non-transferred holidays:
# rows flagged `transferred` are dropped, and the "Traslado " prefix is
# stripped from descriptions.
holidays_df = (
    lzdf
    .filter(pl.col("locale_name").is_in(["Quito", "Ecuador", "Pichincha"]))
    .with_columns(
        pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"),
        pl.col("description").cast(pl.String).str.replace("Traslado ", "", literal=True),
        pl.col("transferred").cast(pl.Boolean),
    )
    .filter(pl.col("transferred").not_())
    .drop("transferred")
    .collect()
)

# Add earthquake "holidays" for April 16-20, 2016.
# NOTE: The earthquake on April 16, 2016, was a significant event in Ecuador,
# causing widespread devastation and loss of life.
earthquake_dates = [date(2016, 4, day) for day in range(16, 21)]
earthquake_holidays = pl.DataFrame(
    {
        "date": earthquake_dates,
        "type": ["Holiday"] * len(earthquake_dates),
        "locale": ["National"] * len(earthquake_dates),
        "locale_name": ["Ecuador"] * len(earthquake_dates),
        "description": ["Terremoto"] * len(earthquake_dates),
    }
)

# Stack the hand-made rows under the filtered events and cast the
# low-cardinality text columns to categoricals before exporting.
holidays_df = (
    pl.concat([holidays_df, earthquake_holidays], how="vertical")
    .with_columns(
        pl.col("date").cast(pl.Date),
        pl.col("type", "locale", "locale_name").cast(pl.Categorical),
    )
)
holidays_df.write_parquet("./data/favorita_dataset/subset/events.parquet")
holidays_df

date,type,locale,locale_name,description
date,cat,cat,cat,str
2012-08-10,"""Holiday""","""National""","""Ecuador""","""Primer Grito de Independencia"""
2012-10-12,"""Transfer""","""National""","""Ecuador""","""Independencia de Guayaquil"""
2012-11-02,"""Holiday""","""National""","""Ecuador""","""Dia de Difuntos"""
2012-11-03,"""Holiday""","""National""","""Ecuador""","""Independencia de Cuenca"""
2012-12-05,"""Additional""","""Local""","""Quito""","""Fundacion de Quito-1"""
2012-12-06,"""Holiday""","""Local""","""Quito""","""Fundacion de Quito"""
2012-12-21,"""Additional""","""National""","""Ecuador""","""Navidad-4"""
2012-12-22,"""Additional""","""National""","""Ecuador""","""Navidad-3"""
2012-12-23,"""Additional""","""National""","""Ecuador""","""Navidad-2"""
2012-12-24,"""Bridge""","""National""","""Ecuador""","""Puente Navidad"""


In [171]:
# Plot the units sold over time for each selected (item, store) combination,
# one figure per pair.
from plotly.graph_objects import Figure

for item in items:
    for store in stores:
        # Read from `sales_df`, the sales frame built earlier in this notebook.
        # Its columns were renamed to store_id / product_id / units_sold and
        # its 'date' column is already a Date, so no cast is needed here.
        # (The previous version read an undefined `subset` frame with the
        # pre-rename column names and failed on a fresh kernel.)
        pair_sales = sales_df.filter(
            (pl.col("store_id") == store) & (pl.col("product_id") == item)
        ).sort("date")

        # Nothing to draw for pairs without any rows.
        if pair_sales.is_empty():
            continue

        fig = Figure()
        fig.add_scatter(
            x=pair_sales["date"].to_list(),
            y=pair_sales["units_sold"].to_list(),
            mode="lines",
            name=f"Store {store} - Item {item}",
        )
        fig.update_layout(
            title=f"Sales for Store {store} - Item {item}",
            xaxis_title="Date",
            yaxis_title="Unit Sales",
            template="plotly_white",
        )
        fig.show()
            