In [109]:
import os
import sys

import numpy as np
import pandas as pd

sys.path.append('../src')
# import importlib
# import predict_inputs_view
# importlib.reload(predict_inputs_view)
from sales_statistics import get_col_stats_filename
from predict_inputs_view import prepare_and_save_user_inputs_file

In [83]:
# NOTE: the prepared data file must already be present at the path below
# before this cell is run.
filename = '../data/raw/rossman_prepared.csv'
raw_df = pd.read_csv(filepath_or_buffer=filename)
# Name of the sales column.
target_col = 'Sales'

# Statistics

In [84]:
# Keep only the rows whose 'Open' flag equals 1.
is_open = raw_df['Open'] == 1
open_df = raw_df[is_open]

# The statistics below are computed on the 2013 and 2014 rows only.
in_stats_years = open_df['Year'].isin([2013, 2014])
stats_df = open_df[in_stats_years]

In [85]:
# Display labels keyed by the 1-based integer codes of the grouping columns.
month_labels = dict(enumerate(
    [
        "January", "February", "March", "April",
        "May", "June", "July", "August",
        "September", "October", "November", "December",
    ],
    start=1,
))
weekday_labels = dict(enumerate(
    [
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    ],
    start=1,
))

# One entry per statistics table:
#   group_by        - column the rows are grouped on
#   values_map      - integer code -> label shown instead of the code
#   additional_cols - list of {column: aggregation} dicts computed per group
statistic_setups = [
    {
        'group_by': 'Month',
        'values_map': month_labels,
        'additional_cols': [{'Open': 'count', 'Promo': 'sum'}],
    },
    {
        'group_by': 'DayOfWeek',
        'values_map': weekday_labels,
        'additional_cols': [{'Open': 'count', 'Promo': 'sum'}],
    },
]

In [86]:
# Build one aggregated statistics table per entry in `statistic_setups` and
# write each table to its own CSV file in the Streamlit data folder.
stats_output_dir = '../data/streamlit/'
# Create the folder up front so that to_csv does not fail when it is missing.
os.makedirs(stats_output_dir, exist_ok=True)

for setup in statistic_setups:
    group_by_col = setup['group_by']
    values_map = setup['values_map']
    # Use a dedicated name here: `filename` already holds the raw-data path
    # set in the data-loading cell and must not be overwritten by this loop.
    stats_filename = get_col_stats_filename(group_by_col)

    # Aggregation spec: average sales is always included; every configured
    # {column: aggregation} dict is merged on top of it.
    agg_dict = {'Sales': 'mean'}
    for col_cfg in setup['additional_cols']:
        agg_dict.update(col_cfg)

    # rename() returns a new frame, so no in-place mutation is needed.
    df_avg = (
        stats_df
        .groupby(group_by_col, as_index=False)
        .agg(agg_dict)
        .rename(columns={
            'Sales': 'Average_Sales',
            'Open': 'Total_Open_Days',
            'Promo': 'Total_Days_With_Promo',
        })
    )
    df_avg['Average_Sales'] = df_avg['Average_Sales'].round(2)
    # Replace the integer group codes with their labels.
    df_avg[group_by_col] = df_avg[group_by_col].map(values_map)

    # Save df_avg to a CSV file
    df_avg.to_csv(stats_output_dir + stats_filename, index=False)

In [87]:
# Show the statistics table left over from the last loop iteration
# (the one grouped by DayOfWeek).
df_avg

Unnamed: 0,DayOfWeek,Average_Sales,Total_Open_Days,Total_Days_With_Promo
0,Monday,8236.63,101905,56119
1,Tuesday,7063.32,106405,56275
2,Wednesday,6697.33,102983,55111
3,Thursday,6752.63,98542,53654
4,Friday,7087.08,101865,53194
5,Saturday,5845.51,106203,0
6,Sunday,8096.82,2695,0


# Predict Form user input fields

In [110]:
# Columns offered as user input fields, assembled from thematic groups.
# The order of the groups (and of the names inside them) is preserved.
general_cols = [
    'Date', 'Promo', 'StoreType', 'Assortment',
    'StateHoliday', 'SchoolHoliday',
]
promo2_cols = ['Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
competition_cols = [
    'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
]
holiday_distance_cols = ['DaysAfterHoliday', 'DaysBeforeHoliday']
lag_cols = [
    'Sales_Lag1', 'Sales_Lag2', 'Sales_Lag3', 'Sales_Lag7', 'Sales_Lag14',
    'Sales_Lag30', 'Customers_Lag1', 'Customers_Lag7', 'SalesPerCustomer_Lag1',
]

user_inpute_cols = [
    *general_cols,
    *promo2_cols,
    *competition_cols,
    *holiday_distance_cols,
    *lag_cols,
]

In [111]:
# Describe the selected input columns using the Open == 1 rows and keep the
# returned summary for inspection.
# NOTE(review): judging by its name, the helper also writes a file under
# `relative_filepath` - confirm in predict_inputs_view.
summary_df = prepare_and_save_user_inputs_file(
    open_df,
    user_inpute_cols,
    date_cols=['Date'],
    relative_filepath='../data/streamlit/',
)

In [112]:
# Display the summary returned by prepare_and_save_user_inputs_file.
summary_df

Unnamed: 0,Column,Type,Min,Max,Unique_Values,Default_Value
0,Date,date,2013-01-31,2015-07-31,,
1,Promo,boolean,,,0|1,0
2,StoreType,categorical,,,c|a|d|b,a
3,Assortment,categorical,,,a|c|b,a
4,StateHoliday,categorical,,,0|a|b|c,0
5,SchoolHoliday,boolean,,,0|1,0
6,Promo2,boolean,,,0|1,0
7,Promo2SinceWeek,numeric,1.0,50.0,,23
8,Promo2SinceYear,numeric,2009.0,2015.0,,2011
9,PromoInterval,categorical,,,"nan|Jan,Apr,Jul,Oct|Feb,May,Aug,Nov|Mar,Jun,Se...","Jan,Apr,Jul,Oct"
