In [3]:
import polars as pl

In [4]:
# Retailers whose parquet files are read from data/silver/with_id/ further down.
retailers_with_id = [
    "coup_de_pates",
    "ds_restauration",
    "even",
    "metro",
    "pomona",
    "pro_a_pro",
    "sysco",
    "transgourmet",
]

# Retailers whose parquet files are read from data/silver/without_id/ further down.
retailers_without_id = [
    "ducreux",
    "relais_dor",
]

# Every retailer: the with-id ones first, followed by the without-id ones.
retailers = [*retailers_with_id, *retailers_without_id]

# Extract a sample of products

In [8]:
# For each retailer in `retailers_with_id`, keep one row per distinct
# combination of the `level*` columns and `volume_unit`, tag it with the
# retailer name, then export everything as a single ';'-separated CSV.
datasets = []

for retailer_with_id in retailers_with_id:
    suffix = f"_{retailer_with_id}"
    dataset = pl.read_parquet(f'data/silver/with_id/{retailer_with_id}.parquet')
    # Drop the `_<retailer>` marker from the column names
    # (presumably so all frames share one schema for the concat below).
    dataset = dataset.rename(lambda column_name: column_name.replace(suffix, ""))

    dedup_cols = [
        name
        for name in dataset.columns
        if name.startswith('level') or name == 'volume_unit'
    ]
    # NOTE(review): `unique` keeps an arbitrary row per combination by default,
    # so the remaining columns only show one example product.
    dataset = dataset.unique(subset=dedup_cols)
    dataset = dataset.with_columns(pl.lit(retailer_with_id).alias('retailer'))
    datasets.append(dataset)

dataset_concat = pl.concat(datasets)
level_cols = [name for name in dataset_concat.columns if name.startswith('level')]
dataset_concat = dataset_concat.sort(['retailer', *level_cols, 'volume_unit'])

dataset_concat.write_csv('data/log/model_4/products_subset_withId.csv', separator=';')

In [9]:
# Same extraction as for the with-id retailers, applied to the retailers in
# `retailers_without_id`: one row per distinct (`level*`, `volume_unit`)
# combination per retailer, written to a ';'-separated CSV.
datasets = []

for retailer_without_id in retailers_without_id:
    marker = f"_{retailer_without_id}"
    parquet_path = f'data/silver/without_id/{retailer_without_id}.parquet'

    dataset = pl.read_parquet(parquet_path)
    # Remove the `_<retailer>` marker from every column name.
    dataset = dataset.rename(lambda column_name: column_name.replace(marker, ""))

    key_cols = []
    for column_name in dataset.columns:
        if column_name.startswith('level') or column_name == 'volume_unit':
            key_cols.append(column_name)

    # NOTE(review): which row survives per key is unspecified (`unique`
    # defaults to keeping any row), so non-key columns are just an example.
    dataset = dataset.unique(subset=key_cols)
    dataset = dataset.with_columns(pl.lit(retailer_without_id).alias('retailer'))
    datasets.append(dataset)

dataset_concat = pl.concat(datasets)
sort_cols = ['retailer']
sort_cols += [column_name for column_name in dataset_concat.columns if column_name.startswith('level')]
sort_cols += ['volume_unit']
dataset_concat = dataset_concat.sort(sort_cols)

dataset_concat.write_csv('data/log/model_4/products_subset_withoutId.csv', separator=';')

# Extract volume units per retailer

In [10]:
# List the distinct `volume_unit` values of every retailer and export them as
# a two-column (retailer, volume_unit) ';'-separated CSV.
datasets = []

for retailer in retailers:
    # The two retailer groups only differ by the directory holding their file.
    subdir = 'with_id' if retailer in retailers_with_id else 'without_id'
    source_col = f'volume_unit_{retailer}'

    dataset = (
        # Only the volume-unit column is needed, so avoid loading the rest.
        pl.read_parquet(f'data/silver/{subdir}/{retailer}.parquet', columns=[source_col])
        .rename({source_col: 'volume_unit'})
        .unique()
        .with_columns(pl.lit(retailer).alias('retailer'))
    )
    datasets.append(dataset)

# `unique` does not preserve row order, so sort on both columns to get a
# reproducible file from one run to the next.
dataset_concat = (
    pl.concat(datasets)
    .select('retailer', 'volume_unit')
    .sort(['retailer', 'volume_unit'])
)
dataset_concat.write_csv('data/log/model_4/volume_unit_subset.csv', separator=';')

In [17]:
import xlsxwriter

from src.utils.profiling import incremental_sublists

# NOTE(review): these three lists duplicate the ones declared near the top of
# the notebook; keep both copies in sync (or drop this one after confirming
# nothing relies on the rebinding).

# Retailers read from data/silver/with_id/ (product column: `product_id`).
retailers_with_id = [
    "coup_de_pates",
    "ds_restauration",
    "even",
    "metro",
    "pomona",
    "pro_a_pro",
    "sysco",
    "transgourmet",
]

# Retailers read from data/silver/without_id/ (product column: `product_code`).
retailers_without_id = [
    "ducreux",
    "relais_dor",
]

# All retailers, with-id ones first.
retailers = [*retailers_with_id, *retailers_without_id]

# Count distinct products per (`volume_unit`, level columns) for every
# retailer and write one worksheet per aggregation depth ("level_1", ...).

# Column groups to aggregate on, one group per worksheet.
# NOTE(review): presumably ['level_1_standard'], ['level_1_standard',
# 'level_2_standard'], ... — confirm against src.utils.profiling.
# Materialised as a list because it is iterated several times below.
agg_cols_lst = list(incremental_sublists([f'level_{i}_standard' for i in range(1, 5)]))

# One list of per-retailer aggregates for each entry of `agg_cols_lst`.
datasets_by_level = [[] for _ in agg_cols_lst]

for retailer in retailers:
    if retailer in retailers_with_id:
        subdir = 'with_id'
        product_col = 'product_id'
    else:
        subdir = 'without_id'
        product_col = 'product_code'

    # Read and rename each retailer file once, then reuse it for every
    # aggregation depth (instead of re-reading it for each worksheet).
    retailer_dataset = pl.read_parquet(f'data/silver/{subdir}/{retailer}.parquet').rename(
        lambda column_name: column_name.replace(f"_{retailer}", "")
    )

    for level_datasets, agg_cols in zip(datasets_by_level, agg_cols_lst):
        dataset = (
            # group_by already yields one row per key, no extra dedup needed.
            retailer_dataset.group_by(['volume_unit'] + agg_cols)
            .agg(pl.col(product_col).n_unique().alias('nb_products'))
            .with_columns(pl.lit(retailer).alias('retailer'))
            .sort(agg_cols + ['retailer', 'volume_unit'])
        )
        level_datasets.append(dataset)

# The context manager closes the workbook even if writing a sheet fails.
with xlsxwriter.Workbook("data/log/model_4/volume_unit_subset_detailed.xlsx") as wb:
    for i, datasets in enumerate(datasets_by_level, start=1):
        dataset_concat = pl.concat(datasets)
        dataset_concat.write_excel(workbook=wb, worksheet=f"level_{i}")