# Profiling

## Setup

In [1]:
# Import packages
import pandas as pd
from ds_common_utils.aux.io.snowflake_tools import SnowflakeTools
from datetime import datetime

In [2]:
# Open the Snowflake session used by every query in this notebook.
# The role / warehouse / database / schema are fixed here in one place.
snowflake_settings = {
    'role': "INSIGHT_ANALYST_MERCH_DE_GENERAL_PRD",
    'warehouse': "INSIGHT_ANALYST_WH",
    'database': "BDWPRD_DE",
    'schema': "IA_MERCH_DE",
}
con = SnowflakeTools(**snowflake_settings)

In [3]:
# Set item range table name
# The name is the fixed base plus today's date as YYYY_MM_DD. The underscore
# before the date keeps the two parts readable (ev_item_range_2025_01_31
# instead of ev_item_range2025_01_31). Every later cell refers to the table
# through `table_name`, so create / read / drop all stay in sync.
table_name = 'ev_item_range_' + datetime.today().strftime('%Y_%m_%d')

In [4]:
# Reporting window shared by every data pull below.
# Each value carries its own single quotes inside the Python string --
# presumably so it drops into the SQL templates as a string literal
# (TODO confirm against the .sql files).
start_date, end_date = "'2024-02-01'", "'2025-01-01'"

In [6]:
# Build the item range table by running the SQL file with the dated
# table name substituted in. The table is dropped again in the last cell.
item_range_args = {'table_name': table_name}
con.execute_statement_from_sql_file(file='sql/0-item-range.sql', formatting=item_range_args)

## Data Pull

### Demographic Segments

In [7]:
# Demographic segments: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
ds_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_ds = con.read_sql_file_into_pandas(file="sql/1-demographic-segments.sql", formatting=ds_query_args)

In [8]:
# Check data: display the demographic-segment profile.
# In the saved output each *_INDEX column equals the matching RANGE_*_SHARE
# divided by the OVERALL_*_SHARE, so values above 1 mean the segment is
# over-represented in the item range.
df_ds

Unnamed: 0,DEMOGRAPHIC_SEGMENT,RANGE_SALES,RANGE_QUANTITY,RANGE_CUSTOMERS,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,RANGE_CUSTOMER_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_CUSTOMERS,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,OVERALL_CUSTOMER_SHARE,SALES_INDEX,QUANTITY_INDEX,CUSTOMER_INDEX
0,1.Renter no kids,70415.07,892.0,54,0.017952,0.019178,0.017269,331558900.0,22295958.0,949050,0.079773,0.08183,0.16457,0.225033,0.234363,0.104934
1,2.Renter with kids,56515.65,684.0,37,0.014408,0.014706,0.011832,252484400.0,16915255.0,625842,0.060748,0.062082,0.108524,0.237178,0.236879,0.109027
2,3.Younger homeowner,759800.7,8956.0,550,0.193704,0.192552,0.175887,612860000.0,37651620.0,556166,0.147455,0.138188,0.096442,1.313648,1.393413,1.823759
3,4.Homeowner younger kids,875109.7,10289.0,647,0.2231,0.221212,0.206908,797134100.0,48228958.0,771081,0.191791,0.177008,0.133709,1.163246,1.249726,1.54745
4,5.Homeowner older kids,438863.7,5180.0,365,0.111884,0.111369,0.116725,371090800.0,22491019.0,396067,0.089285,0.082546,0.06868,1.253112,1.349181,1.699549
5,6.Older homeowner,1721788.13,20511.0,1474,0.438953,0.440983,0.471378,1791132000.0,124884611.0,2468644,0.430948,0.458347,0.428075,1.018574,0.962116,1.101158


### DIY Proficiency

In [9]:
# DIY proficiency: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
diy_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_diy = con.read_sql_file_into_pandas(file="sql/2-diy-proficiency.sql", formatting=diy_query_args)

In [10]:
# Check data: display the DIY-proficiency profile (one row per
# PROFICIENCY_GROUP in the saved output).
df_diy

Unnamed: 0,PROFICIENCY_GROUP,RANGE_SALES,RANGE_QUANTITY,RANGE_CUSTOMERS,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,RANGE_CUSTOMER_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_CUSTOMERS,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,OVERALL_CUSTOMER_SHARE,SALES_INDEX,QUANTITY_INDEX,CUSTOMER_INDEX
0,1. DIY Master,1981186.23,23530.0,1658,0.551451,0.552309,0.570151,1163842000.0,71949511.0,964469,0.31401,0.293764,0.210648,1.756157,1.880107,2.706653
1,2. DIY Skilled,919603.24,10859.0,727,0.255966,0.254888,0.25,1033219000.0,67424653.0,1225229,0.278767,0.27529,0.2676,0.918206,0.925891,0.93423
2,3. DIY Maintainer,446461.9,5300.0,352,0.12427,0.124404,0.121045,855547800.0,58279219.0,1226056,0.230831,0.23795,0.267781,0.538359,0.522818,0.45203
3,4. DIY Rookie,245426.64,2914.0,171,0.068313,0.068399,0.058803,653775100.0,47269163.0,1162828,0.176392,0.192996,0.253971,0.38728,0.354405,0.231534


### RFM (Annual Segment)

In [11]:
# RFM annual segment: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
rfm_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_rfm = con.read_sql_file_into_pandas(file="sql/3-rfm.sql", formatting=rfm_query_args)

In [12]:
# Check data: display the RFM profile (one row per ANNUAL_SEGMENT in the
# saved output).
df_rfm

Unnamed: 0,ANNUAL_SEGMENT,RANGE_SALES,RANGE_QUANTITY,RANGE_CUSTOMERS,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,RANGE_CUSTOMER_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_CUSTOMERS,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,OVERALL_CUSTOMER_SHARE,SALES_INDEX,QUANTITY_INDEX,CUSTOMER_INDEX
0,High Spend High Frequency,3039025.95,36023.0,2412,0.771626,0.771304,0.769378,2417658000.0,153470676.0,1389069,0.586034,0.565179,0.263189,1.316691,1.364708,2.923291
1,High Spend Low Frequency,543486.67,6479.0,399,0.137994,0.138725,0.127273,762737500.0,43530323.0,1284405,0.184886,0.160307,0.243358,0.746377,0.865369,0.522987
2,Low Spend High Frequency,206133.75,2484.0,200,0.052339,0.053186,0.063796,500740500.0,41692067.0,787133,0.121378,0.153537,0.149139,0.431202,0.346404,0.427762
3,Low Spend Low Frequency,149823.57,1718.0,124,0.038041,0.036785,0.039553,444318300.0,32850443.0,1817235,0.107702,0.120977,0.344314,0.353208,0.304066,0.114875


### Commercial BOT

In [13]:
# Commercial BOT: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
bot_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_bot = con.read_sql_file_into_pandas(file="sql/4-commercial-bot.sql", formatting=bot_query_args)

In [14]:
# Check data: display the commercial BOT profile.
# In the saved output the "Unknown" group has no range sales or quantity,
# so its SALES_INDEX and QUANTITY_INDEX are missing (NaN).
df_bot

Unnamed: 0,COMMERCIAL_INDUSTRY_SEGMENT_REPORT_GROUP_CODE,RANGE_SALES,RANGE_QUANTITY,RANGE_CUSTOMERS,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,RANGE_CUSTOMER_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_CUSTOMERS,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,OVERALL_CUSTOMER_SHARE,SALES_INDEX,QUANTITY_INDEX,CUSTOMER_INDEX
0,B&O,15083597.05,197324.0,5521,0.2686,0.267521,0.315973,2201105000.0,122771087.0,538361,0.374803,0.393591,0.50817,0.716642,0.679694,0.621786
1,Builder,23653019.42,313654.0,6048,0.421199,0.425235,0.346134,1915227000.0,91116091.0,146239,0.326124,0.292108,0.138038,1.291529,1.455746,2.507527
2,Trades,17419834.4,226623.0,5904,0.310202,0.307243,0.337893,1760491000.0,98037273.0,374811,0.299776,0.314297,0.353792,1.03478,0.977558,0.955061
3,Unknown,,,0,,,0.0,-4126597.0,1295.0,1,-0.000703,4e-06,1e-06,,,0.0


### Commercial Industry Segment

In [15]:
# Commercial industry segment: same inputs as the other pulls, plus the
# number of segments to request.
# Can change, 25-40 recommended otherwise additional segments are too small
n_commercial_segments = 25
is_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'n_commercial_segments': n_commercial_segments,
    'table_name': table_name,
}
df_is = con.read_sql_file_into_pandas(file="sql/5-commercial-industry-segment.sql", formatting=is_query_args)

In [16]:
# Check data: industry segments ranked from highest to lowest sales index
df_is.sort_values('SALES_INDEX', ascending=False)

Unnamed: 0,COMMERCIAL_INDUSTRY_SEGMENT_CODE,RANGE_SALES,RANGE_QUANTITY,RANGE_CUSTOMERS,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,RANGE_CUSTOMER_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_CUSTOMERS,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,OVERALL_CUSTOMER_SHARE,SALES_INDEX,QUANTITY_INDEX,CUSTOMER_INDEX
2,Carpentry Services,7746647.48,101461.0,2201,0.145537,0.145108,0.134199,356025300.0,17328440.0,53133,0.066116,0.060797,0.055513,2.20124,2.386779,2.417434
5,Construction Services,3261790.06,43700.0,882,0.06128,0.062499,0.053777,232202700.0,13311173.0,34740,0.043121,0.046702,0.036296,1.421095,1.338251,1.481623
4,Commercial Builder,6133881.51,81254.0,1708,0.115238,0.116208,0.10414,440024800.0,20389615.0,29320,0.081715,0.071537,0.030633,1.410239,1.624457,3.399602
10,Landscaper and Gardening Services,3155882.1,40764.0,987,0.05929,0.0583,0.060179,240971900.0,16283854.0,49256,0.04475,0.057132,0.051462,1.324917,1.020451,1.169387
19,Residential Builder,11908064.36,158015.0,2778,0.223718,0.225991,0.16938,999168500.0,44758034.0,51179,0.185552,0.157033,0.053472,1.205693,1.439129,3.167639
21,Site Preparation and Development,2349283.49,30685.0,680,0.044136,0.043885,0.041461,243831000.0,12657269.0,31000,0.045281,0.044408,0.032389,0.974722,0.988231,1.280095
8,Financial and Insurance Services,866976.75,11378.0,321,0.016288,0.016273,0.019572,102629300.0,5646607.0,28327,0.019059,0.019811,0.029596,0.854615,0.821393,0.661306
16,Professional Computer and Scientific Services,3496643.84,45316.0,1301,0.065692,0.06481,0.079324,465707000.0,25792940.0,125335,0.086485,0.090494,0.130949,0.759579,0.716182,0.605763
3,Cleaning Services,1463799.09,18811.0,632,0.027501,0.026903,0.038534,197259600.0,11331920.0,43241,0.036632,0.039758,0.045178,0.75072,0.676676,0.852937
12,Owner Builder,563598.08,7262.0,246,0.010588,0.010386,0.014999,76283660.0,4006455.0,19688,0.014166,0.014057,0.02057,0.747433,0.738871,0.729169


### Location

In [17]:
# Location: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
loc_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_loc = con.read_sql_file_into_pandas(file="sql/6-location.sql", formatting=loc_query_args)

In [18]:
# Check data: locations ranked from highest to lowest sales index.
# Locations with no range sales have a missing SALES_INDEX and end up at the
# bottom (sort_values places NaN last by default).
df_loc.sort_values('SALES_INDEX', ascending=False)

Unnamed: 0,LOCATION_NAME,RANGE_SALES,RANGE_QUANTITY,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,SALES_INDEX,QUANTITY_INDEX
200,Maroochydore Trade Centre,586768.00,7983.0,0.008617,0.009074,11967201.60,417777.0,0.000752,4.232677e-04,11.458507,21.437469
217,Mitchell Trade Centre,409760.90,5436.0,0.006017,0.006179,9208233.83,316111.0,0.000579,3.202655e-04,10.399403,19.292643
72,Cameron Park Trade Centre,505543.55,7020.0,0.007424,0.007979,11544576.45,393973.0,0.000725,3.991508e-04,10.233750,19.990449
99,Cromer Trade Centre,320405.92,4152.0,0.004705,0.004719,8534033.68,263842.0,0.000536,2.673096e-04,8.774056,17.654902
335,Tuggerah Trade Centre,465465.78,6386.0,0.006835,0.007259,12676985.36,381066.0,0.000797,3.860742e-04,8.580765,18.800986
...,...,...,...,...,...,...,...,...,...,...,...
340,Unanderra F&T,,,,,11789014.51,1653.0,0.000741,1.674720e-06,,
349,WA Support Centre Cafe,,,,,204243.28,82037.0,0.000013,8.311518e-05,,
350,Wacol F&T,,,,,8133351.14,838.0,0.000511,8.490100e-07,,
355,Warnervale F&T,,,,,19991586.37,2552.0,0.001256,2.585540e-06,,


### Location Region

In [19]:
# Location region: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
reg_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_reg = con.read_sql_file_into_pandas(file="sql/7-location-region.sql", formatting=reg_query_args)

In [20]:
# Check data: display the region profile.
# In the saved output "Trade Strategies" has no range sales or quantity,
# so its index columns are missing (NaN).
df_reg

Unnamed: 0,LOCATION_REGION_NAME,RANGE_SALES,RANGE_QUANTITY,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,SALES_INDEX,QUANTITY_INDEX
0,East Division,23792714.48,306336.0,0.349394,0.348193,4319151000.0,261698214.0,0.271404,0.265138,1.28736,1.313255
1,North Division,17159536.79,221782.0,0.251987,0.252086,4350527000.0,264063594.0,0.273375,0.267534,0.92176,0.942258
2,South Division,16325037.36,211021.0,0.239732,0.239855,4191870000.0,263808733.0,0.263406,0.267276,0.910124,0.897405
3,Trade Strategies,,,,,28764400.0,141824.0,0.001807,0.000144,,
4,West Division,10819734.28,140648.0,0.158887,0.159866,3023797000.0,197315563.0,0.190007,0.199909,0.836215,0.799695


### Metro/Regional

In [21]:
# Metro / regional split: run the SQL file with the shared date window and
# item range table, and read the result into pandas.
mr_query_args = {
    'start_date': start_date,
    'end_date': end_date,
    'table_name': table_name,
}
df_mr = con.read_sql_file_into_pandas(file="sql/8-location-metro-regional.sql", formatting=mr_query_args)

In [22]:
# Check data: display the metro vs regional profile (one row per
# TRADE_REGION_CODE in the saved output).
df_mr

Unnamed: 0,TRADE_REGION_CODE,RANGE_SALES,RANGE_QUANTITY,RANGE_SALES_SHARE,RANGE_QUANTITY_SHARE,OVERALL_SALES,OVERALL_QUANTITY,OVERALL_SALES_SHARE,OVERALL_QUANTITY_SHARE,SALES_INDEX,QUANTITY_INDEX
0,Metro,48207478.23,622159.0,0.707923,0.70717,11903400000.0,747412725.0,0.747977,0.757236,0.94645,0.933884
1,Regional,19889544.68,257628.0,0.292077,0.29283,4010714000.0,239615203.0,0.252023,0.242764,1.15893,1.206231


### Drop item range table

In [23]:
# Drop item range table
# Cleanup for the table created in the Setup section. IF EXISTS keeps this
# cell safe to re-run: if the table is already gone (cell run twice, or the
# create step never ran) the statement succeeds instead of raising an error.
# The table is addressed by its fully qualified name, which matches the
# database and schema passed to the connection.
con.execute_statement_from_sql_string(
    statement='DROP TABLE IF EXISTS bdwprd_de.ia_merch_de.{table_name};',
    formatting={'table_name': table_name}
)