## Setup

In [19]:
# Import packages
import setup
import pandas as pd
from ds_common_utils.aux.io.snowflake_tools import SnowflakeTools

In [20]:
# Setup analysis and get connection
con = SnowflakeTools().get_snowflake_ctx(
     method = 'token',
    user = '250807',
    role = 'INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD',
    warehouse = 'INSIGHT_ANALYST_WH'
)
setup.set_dates(con, setup.dates)

snowflake_tools - 2025-06-25 23:54:58.489829+10:00 - Generating new token with 'INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD' role and 'SESSION:ROLE-ANY' scope...


snowflake_tools - 2025-06-25 23:55:05.350206+10:00 - Saved token with '2025-06-26 03:55:05+10:00' expiry...


Unnamed: 0,START_DATE,END_DATE,PP_START_DATE
0,2024-06-01,2025-05-31,2022-10-01


In [21]:
# Possible granularities for purchase paths
granularity = {
    'dept': {'sub_dept': '-- ', 'class': '-- ', 'sub_class': '-- '}, 
    'sub_dept': {'sub_dept': '', 'class': '-- ', 'sub_class': '-- '}, 
    'class': {'sub_dept': '', 'class': '', 'sub_class': '-- '}, 
    'sub_class': {'sub_dept': '', 'class': '', 'sub_class': ''}
    }

## Get data

In [22]:
# Get data at different granularities
consumer_data = {}

for pft_level in granularity:
    # Set parquet path
    parquet_path = 'parquets/df_purchase_path_consumer_AU_rodent_pest_control_' + pft_level + '_20_limit.parquet'
    # Get data
    try:
        consumer_data[pft_level] = pd.read_parquet(parquet_path)
    except:
        # Comment out necessary lines in query depending on granularity
        with open('sql/project-path-consumer.sql', 'r') as query:
            pp_query = query.read().format(pp_sub_dept=granularity[pft_level]['sub_dept'], pp_class=granularity[pft_level]['class'], pp_sub_class=granularity[pft_level]['sub_class'])
            df = pd.read_sql_query(pp_query, con)
        consumer_data[pft_level] = df
        df.to_parquet(parquet_path)

KeyboardInterrupt: 

In [None]:
# Get data at different granularities
commercial_data = {}

for pft_level in granularity:
    # Set parquet path
    parquet_path = 'parquets/df_purchase_path_commercial_NZ_rodent_pest_control_' + pft_level + '_20_limit.parquet'
    # Get data
    try:
        commercial_data[pft_level] = pd.read_parquet(parquet_path)
    except:
        # Comment out necessary lines in query depending on granularity
        with open('sql/project-path-commercial.sql', 'r') as query:
            pp_query = query.read().format(pp_sub_dept=granularity[pft_level]['sub_dept'], pp_class=granularity[pft_level]['class'], pp_sub_class=granularity[pft_level]['sub_class'])
            df = pd.read_sql_query(pp_query, con)
        commercial_data[pft_level] = df
        df.to_parquet(parquet_path)

# Results

In [None]:
with pd.ExcelWriter("data/consumer_purchase_path_raw_data.xlsx") as writer:
    for df in consumer_data:
        consumer_data[df].to_excel(writer, sheet_name=df)

In [None]:
with pd.ExcelWriter("data/commercial_purchase_path_raw_data.xlsx") as writer:
    for df in commercial_data:
        commercial_data[df].to_excel(writer, sheet_name=df)

In [None]:
# Pivot results
df_consumer_results = {}
family_tree_levels = ['ITEM_CATEGORY_NAME', 'ITEM_DEPARTMENT_NAME', 'ITEM_SUB_DEPARTMENT_NAME', 'ITEM_CLASS_NAME', 'ITEM_SUB_CLASS_NAME']

for i, pft_level in enumerate(granularity):
    df = consumer_data[pft_level]
    df = df.loc[df['SALES_RANK'] <= 10, :]
    df = df.pivot_table(index=family_tree_levels[:i + 2], columns='WEEKS_SINCE_PURCHASE', values='SALES_RANK')
    df = df.astype('Int64')
    df = df.style.background_gradient(subset=pd.IndexSlice[:, df.columns], cmap='Greens_r')
    df_consumer_results[pft_level] = df

In [None]:
# Pivot results
df_commercial_results = {}
family_tree_levels = ['ITEM_CATEGORY_NAME', 'ITEM_DEPARTMENT_NAME', 'ITEM_SUB_DEPARTMENT_NAME', 'ITEM_CLASS_NAME', 'ITEM_SUB_CLASS_NAME']

for i, pft_level in enumerate(granularity):
    df = commercial_data[pft_level]
    df = df.loc[df['SALES_RANK'] <= 10, :]
    df = df.pivot_table(index=family_tree_levels[:i + 2], columns='WEEKS_SINCE_PURCHASE', values='SALES_RANK')
    df = df.astype('Int64')
    df = df.style.background_gradient(subset=pd.IndexSlice[:, df.columns], cmap='Greens_r')
    df_commercial_results[pft_level] = df

TypeError: cannot safely cast non-equivalent float64 to int64

In [None]:
# Write results to excel
with pd.ExcelWriter("data/purchase_path_consumer_rodent_pest_control.xlsx") as writer:
    for df in df_consumer_results:
        df_consumer_results[df].to_excel(writer, sheet_name=df)

In [None]:
# Write results to excel
with pd.ExcelWriter("data/purchase_path_commercial_rodent_pest_control.xlsx") as writer:
    for df in df_commercial_results:
        df_commercial_results[df].to_excel(writer, sheet_name=df)

IndexError: At least one sheet must be visible