# **Dashboard**

# 1. Imports and Loading

In [177]:
# Import libraries
import pandas as pd
import numpy as np

from dash import Dash, html, dash_table, dcc, callback, Output, Input, MATCH, callback_context
import plotly.express as px
import dash_bootstrap_components as dbc

In [178]:
# Load the raw customer dataset from CSV into `df`.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
data_path = '/home/shadybea/OneDrive/General/Data Mining/Project/DM2425_ABCDEats_DATASET.csv'
df = pd.read_csv(data_path)

# 2. Data Transformations

In [179]:
# Raw column name -> shorter alias used throughout the notebook.
# Cuisine columns simply lose their 'CUI_' prefix.
_rename_dict = {
    'customer_region': 'cust_region',
    'payment_method': 'pay_method',
    'customer_age': 'cust_age',
    'vendor_count': 'n_vendor',
    'product_count': 'n_product',
    'n_order': 'n_order',
    'is_chain': 'n_chain',
    'CUI_American': 'American',
    'CUI_Asian': 'Asian',
    'CUI_Beverages': 'Beverages',
    'CUI_Cafe': 'Cafe',
    'CUI_Chicken Dishes': 'Chicken Dishes',
    'CUI_Chinese': 'Chinese',
    'CUI_Desserts': 'Desserts',
    'CUI_Healthy': 'Healthy',
    'CUI_Indian': 'Indian',
    'CUI_Italian': 'Italian',
    'CUI_Japanese': 'Japanese',
    'CUI_Noodle Dishes': 'Noodle Dishes',
    'CUI_OTHER': 'OTHER',
    'CUI_Street Food / Snacks': 'Street Food / Snacks',
    'CUI_Thai': 'Thai',
}

# Apply the aliases (rebinding `df` rather than mutating it in place).
df = df.rename(columns=_rename_dict)

In [180]:
# Parse the hexadecimal customer ids into integers.
df['customer_id'] = df['customer_id'].map(lambda hex_id: int(hex_id, 16))

# Keep only the first row for each customer id, then use the id as the index.
df = df.drop_duplicates(subset='customer_id', keep='first').set_index('customer_id')

In [181]:
# Missing first_order values are replaced with 0.
df['first_order'] = df['first_order'].fillna(0)

In [182]:
# Fill missing HR_0 with the gap between the day-of-week total and the hourly total
# (the hourly total is computed while the missing HR_0 entries are still skipped by sum()).
sum_week = df[[f"DOW_{n}" for n in range(7)]].sum(axis=1)
sum_day = df[[f"HR_{n}" for n in range(24)]].sum(axis=1)

df['HR_0'] = df['HR_0'].fillna(sum_week - sum_day)

In [183]:
# Row-level validity checks; a row is kept only if every check passes.

# At least one vendor and at least one product
has_vendor = df['n_vendor'].ne(0)
has_product = df['n_product'].ne(0)

# At least one non-zero day-of-week counter and one non-zero hour counter
some_day = df[[f"DOW_{n}" for n in range(7)]].ne(0).any(axis=1)
some_hour = df[[f"HR_{n}" for n in range(24)]].ne(0).any(axis=1)

# At least one non-zero value among the columns at positions 9..23
# (used as the cuisine columns throughout this notebook)
some_food = df.iloc[:, 9:24].ne(0).any(axis=1)

# Drop every row that fails any of the checks
valid_rows = has_vendor & has_product & some_day & some_hour & some_food
df = df[valid_rows]

In [184]:
# Customer Region: the '-' placeholder becomes '8670'; '2440' and '2490' are merged into '2400'.
df['cust_region'] = df['cust_region'].replace({'-': '8670', '2440': '2400', '2490': '2400'})

# Add the feature Customer City: the first character of the region code.
df['cust_city'] = df['cust_region'].map(lambda region: region[0])

In [185]:
# Last Promo: the '-' placeholder is relabelled as 'NO_PROMO'.
df['last_promo'] = df['last_promo'].replace('-', 'NO_PROMO')

In [186]:
# Tidying up datatypes, by column position.
# NOTE(review): the positional slices assume a fixed column order — TODO confirm against the CSV.

# Positions 0..8: the two text columns stay as object, everything else becomes nullable integer.
text_columns = ('last_promo', 'pay_method')
for col in df.columns[:9]:
    df[col] = df[col].astype(object if col in text_columns else 'Int64')

# Positions 9..23: floats.
for col in df.columns[9:24]:
    df[col] = df[col].astype(float)

# Position 24 onwards: nullable integers.
for col in df.columns[24:]:
    df[col] = df[col].astype('Int64')

In [187]:
# Categorical variables
non_metric_features = ['cust_region', 'last_promo', 'pay_method', 'cust_city']

# Positional column groups: cuisines (9..23), days of week (24..30), hours of day (31..54)
cuisine_features = df.columns[9:24].to_list()
day_features = df.columns[24:31].to_list()
hour_features = df.columns[31:55].to_list()

# Metric variables: every remaining column, in its original order
metric_features = df.columns.drop(
    non_metric_features + hour_features + day_features + cuisine_features
).to_list()

In [188]:
# Derived features. `assign` evaluates the entries in order, so later entries
# can read the columns created by earlier ones.
df = df.assign(
    # Total amount spent by the customer on all types of cuisine
    total_amt=lambda d: d[cuisine_features].sum(axis=1),
    # Number of orders made by the customer (sum of the day-of-week counters)
    n_order=lambda d: d[day_features].sum(axis=1),
    # Average amount spent per product / per order / per vendor
    avg_amt_per_product=lambda d: d['total_amt'] / d['n_product'],
    avg_amt_per_order=lambda d: d['total_amt'] / d['n_order'],
    avg_amt_per_vendor=lambda d: d['total_amt'] / d['n_vendor'],
    # Total days as customer
    days_cust=lambda d: d['last_order'] - d['first_order'],
    # Average days between orders
    avg_days_to_order=lambda d: d['days_cust'] / d['n_order'],
    # Days the customer is due, according to their average days between orders
    days_due=lambda d: 90 - d['last_order'] + d['avg_days_to_order'],
    # Share of orders placed to restaurants that are part of a chain
    per_chain_order=lambda d: d['n_chain'] / d['n_order'],
)

# And we add these features to the metric features list.
metric_features += [
    'n_order',
    'per_chain_order',
    'total_amt',
    'avg_amt_per_order',
    'avg_amt_per_product',
    'avg_amt_per_vendor',
    'days_cust',
    'avg_days_to_order',
    'days_due',
]

In [189]:
# True wherever a day-of-week counter is positive
mask = df[[f'DOW_{i}' for i in range(7)]].gt(0)

# Number of distinct weekdays with at least one purchase, per customer
df['n_days_week'] = mask.sum(axis=1)

# Register the new column as a metric feature
metric_features += ['n_days_week']

In [190]:
# True wherever an hour-of-day counter is positive
mask = df[hour_features].gt(0)

# Number of distinct hours with at least one purchase, per customer
df['n_times_day'] = mask.sum(axis=1)

# Register the new column as a metric feature
metric_features += ['n_times_day']

In [191]:
# Flag customers whose first and last orders are more than one day apart.
# NOTE(review): if "purchased on more than one day" is the intent, the threshold may need to be 0 — confirm.
df['regular'] = df['days_cust'].gt(1)

non_metric_features += ['regular']

In [192]:
# True wherever the value in a cuisine column is positive
mask = df[cuisine_features].gt(0)

# Number of distinct cuisines with a positive value, per customer
df['n_cuisines'] = mask.sum(axis=1)

# Register the new column as a metric feature
metric_features += ['n_cuisines']

In [193]:
# Columns that are NOT log-transformed: the listed columns plus all hour and day counters
excluded_from_log = [
    'cust_age',
    'first_order',
    'last_order',
    'days_cust',
    'days_due',
    'avg_days_to_order',
    'per_chain_order',
    'cust_region',
    'cust_city',
    'last_promo',
    'pay_method',
    'n_cuisines',
    'regular',
] + hour_features + day_features

# Every remaining column is a log-transform target
targets = df.drop(columns=excluded_from_log).columns.tolist()

# Build the log1p-transformed columns in one go, each prefixed with 'log_'
log_transformed = pd.DataFrame({f"log_{target}": np.log1p(df[target]) for target in targets})

# We keep the list of log features to assist us in our exploration
log_features = log_transformed.columns.tolist()

# Concatenate the original DataFrame with the new log-transformed DataFrame
df = pd.concat([df, log_transformed], axis=1)

In [194]:
# Feature groups: group name -> columns whose upper outliers raise that group's flag
feature_groups = {
    'foodie': ['n_vendor', 'n_product', 'n_order', 'n_cuisines'],
    'gluttonous': ['avg_amt_per_order', 'total_amt', 'n_chain'],
    'loyal': ['avg_amt_per_vendor'] + cuisine_features
}


# One integer flag column per group, all starting at 0
df = df.assign(foodie_flag=0, gluttonous_flag=0, loyal_flag=0)

# IQR-based outlier fences
def calculate_bounds(series):
    """Return the (lower, upper) outlier fences of `series`.

    The fences sit 1.5 interquartile ranges below the first quartile and
    above the third quartile, respectively.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    return first_quartile - whisker, third_quartile + whisker

# Assign flags for each feature group.
# Only rows with regular == 1 can be flagged; the mask does not change inside the loop.
regular_mask = df['regular'] == 1

for group, features in feature_groups.items():
    # Each group writes to its own '<group>_flag' column.
    flag_col = f"{group}_flag"

    for feature in features:
        # Bounds are computed on the log-transformed column, except for
        # 'n_cuisines', which is used as-is.
        log_feature = feature if feature == 'n_cuisines' else f"log_{feature}"

        # Fences come from regular customers with a positive value for the feature;
        # only the upper fence is used.
        _, upper_bound = calculate_bounds(df.loc[regular_mask & (df[feature] > 0), log_feature])

        # A customer is flagged as soon as any feature of the group exceeds its upper fence.
        df.loc[regular_mask, flag_col] |= (
            df.loc[regular_mask, log_feature] > upper_bound
        ).astype(int)

# Display results
for group in ['foodie_flag', 'gluttonous_flag', 'loyal_flag']:
    print(f"Number of customers flagged as {group.split('_')[0]}:", df[group].sum())

non_metric_features.extend([
    'foodie_flag',
    'gluttonous_flag',
    'loyal_flag',
])

Number of customers flagged as foodie: 2857
Number of customers flagged as gluttonous: 648
Number of customers flagged as loyal: 455


In [195]:
def top_n(row, col_list, n):
    """Return the label in `col_list` holding the n-th largest distinct value of `row`.

    Parameters
    ----------
    row : pandas.Series
        A single record, indexable by the labels in `col_list`.
    col_list : list
        Labels to rank.
    n : int
        1-based rank among the distinct, non-missing values (1 = largest).

    Returns
    -------
    label or None
        The first label (in descending sorted order) whose value equals the
        n-th largest distinct value. None is returned when `n` is smaller
        than 1, when there are fewer than `n` distinct non-missing values,
        or when the n-th value is 0.
    """
    # Ranks are 1-based; anything smaller has no meaningful answer.
    if n < 1:
        return None

    # Sort the specified columns in descending order (missing values go last).
    sorted_row = row[col_list].sort_values(ascending=False)

    # Distinct values in descending order. Missing values are excluded so that
    # a NaN can never be picked as the n-th value (it matches no label).
    unique_sorted_values = sorted_row.dropna().unique()

    # Not enough distinct values to have an n-th largest.
    if len(unique_sorted_values) < n:
        return None

    nth_value = unique_sorted_values[n - 1]

    # A value of 0 is treated as "no value" and is never reported.
    if nth_value == 0:
        return None

    # Return the label of the first entry carrying the n-th largest value.
    return sorted_row[sorted_row == nth_value].index[0]

In [196]:
# Label of the cuisine column with the largest value, per customer (None when top_n finds none)
df['top_cuisine'] = df.apply(lambda row: top_n(row, col_list=cuisine_features, n=1), axis=1)

non_metric_features += ['top_cuisine']

In [197]:
# Per-day averages over the customer's lifetime, rounded to 4 decimals:
#   avg_amt_per_day     - amount spent per day as customer
#   avg_product_per_day - products ordered per day as customer
#   avg_order_per_day   - orders per day as customer
# NOTE(review): rows where days_cust is 0 divide by zero here — confirm how they should be handled.
per_day_sources = (
    ('avg_amt_per_day', 'total_amt'),
    ('avg_product_per_day', 'n_product'),
    ('avg_order_per_day', 'n_order'),
)
for per_day_col, source_col in per_day_sources:
    df[per_day_col] = np.round(df[source_col] / df['days_cust'], 4)

metric_features += [per_day_col for per_day_col, _ in per_day_sources]

In [198]:
# Missing ages are replaced with the mean age, truncated to an integer.
mean_age = int(df['cust_age'].mean())
df['cust_age'] = df['cust_age'].fillna(mean_age)

In [199]:
# Creating age buckets.
# Start from the two oldest buckets, then let each younger threshold override
# the result, so the lowest matching upper limit wins.
age = df['cust_age']
age_bucket = np.where(age < 65, '55-64', '65+')
for upper_limit, label in ((55, '45-54'), (45, '35-44'), (35, '25-34'), (25, '15-24')):
    age_bucket = np.where(age < upper_limit, label, age_bucket)
df['age_bucket'] = age_bucket

# Keep the bucket next to the other categorical features
non_metric_features.insert(4, 'age_bucket')

# 3. Dashboard

In [432]:
# Initialize the app
# The Dash application is styled with the Bootstrap "VAPOR" theme from dash_bootstrap_components.
external_stylesheet = [dbc.themes.VAPOR]
app = Dash(__name__, external_stylesheets=external_stylesheet)

In [433]:
# Every column that is not listed as a non-metric (categorical) feature, in column order
applicable_columns = list(filter(lambda col: col not in non_metric_features, df.columns))

In [434]:
# App layout: a title row, a row with the column selector, and a row with two graphs.
# Dash style dictionaries take camelCased CSS property names (maxWidth, marginBottom),
# not the hyphenated CSS spelling.
app.layout = dbc.Container([
    dbc.Row([
        html.Div('My Dashboard', className="text-primary text-center fs-3")
    ]),
    dbc.Row([
        dbc.Col([
            dcc.Dropdown(
                id='column-dropdown',
                options=[{'label': col, 'value': col} for col in df.columns],
                value=df.columns[0],  # Set default value to the first column
                style={'maxWidth': '250px'}
            )
        ], width=3),
        # Range slider is currently disabled:
        # dbc.Col([
        #     dcc.RangeSlider(
        #         id='value-slider',
        #         min=df[applicable_columns[0]].min(),
        #         max=df[applicable_columns[0]].max(),
        #         step=1,
        #         marks={},
        #         value=[df[applicable_columns[0]].min(), df[applicable_columns[0]].max()]
        #     )
        # ])
    ], style={'marginBottom': '15px'}),
    dbc.Row([
        dbc.Col([
            dcc.Graph(figure={}, id='col_histogram')
        ], width=6),
        dbc.Col([
            dcc.Graph(figure={}, id='col_boxplot')
        ], width=6)
    ])
])

In [435]:
# Dropdown Menu Interaction
# Pattern-matching callback: each {"type": "dropdown-item", "index": i} input is paired
# with the {"type": "output", "index": i} component sharing the same index.
@app.callback(
    Output({"type": "output", "index": MATCH}, "children"),
    [Input({"type": "dropdown-item", "index": MATCH}, "n_clicks")],
)
def update_output(n_clicks):
    """Return a message naming the property that triggered the callback.

    When nothing has triggered the callback yet, a prompt to select an item
    is returned instead.
    """
    triggered = callback_context.triggered
    if triggered:
        return f"You clicked {triggered[0]['prop_id']}"
    return "Select an item."

In [436]:
# Slider Interaction
# NOTE(review): the 'value-slider' component is commented out of the layout, so this
# callback currently targets an id that does not exist — re-enable the slider or drop this cell.
@callback(
    Output('value-slider', 'min'),
    Output('value-slider', 'max'),
    Output('value-slider', 'marks'),
    Output('value-slider', 'value'),
    Input('column-dropdown', 'value')
)
def update_slider(col_chosen):
    """Re-range the slider to span the values of the selected column.

    Parameters
    ----------
    col_chosen : str or None
        Column selected in the dropdown (None when the dropdown is cleared).

    Returns
    -------
    tuple
        (min, max, marks, [min, max]) where min/max are the column bounds
        widened to whole numbers and marks has one entry per integer in
        between.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When no column is selected, the column is not numeric, or its bounds
        are missing or infinite; the slider is then left untouched.
    """
    from dash.exceptions import PreventUpdate

    # A cleared dropdown sends None; there is nothing to range over.
    if col_chosen is None:
        raise PreventUpdate

    column = df[col_chosen]

    # Text / categorical columns have no numeric range.
    if not pd.api.types.is_numeric_dtype(column):
        raise PreventUpdate

    # Get the minimum and maximum values for the column
    lowest, highest = column.min(), column.max()
    if pd.isna(lowest) or pd.isna(highest):
        raise PreventUpdate
    lowest, highest = float(lowest), float(highest)
    if not (np.isfinite(lowest) and np.isfinite(highest)):
        raise PreventUpdate

    # range() needs integers: widen float bounds outwards to whole numbers.
    # Integer columns are unaffected by the rounding.
    min_value = int(np.floor(lowest))
    max_value = int(np.ceil(highest))

    # One mark per integer between the bounds
    marks = {i: str(i) for i in range(min_value, max_value + 1)}

    return min_value, max_value, marks, [min_value, max_value]

In [437]:
# # Graphs Interaction
# @callback(
#     [Output(component_id='col_histogram', component_property='figure'),
#     Output(component_id='col_boxplot', component_property='figure')],
#     [Input(component_id='column-dropdown', component_property='value'),
#     Input('value-slider', 'value')]
# )

# def update_graph(col_chosen, value_range):
#     min_value, max_value = value_range
#     filtered_df = df[(df[col_chosen] >= min_value) & (df[col_chosen] <= max_value)]

#     fig_hist = px.histogram(filtered_df, x=col_chosen)
#     fig_box = px.box(filtered_df, y=col_chosen)  # TODO: change color
    
#     return fig_hist, fig_box

In [438]:
# Graphs Interaction
@callback(
    [Output(component_id='col_histogram', component_property='figure'),
    Output(component_id='col_boxplot', component_property='figure')],
    [Input(component_id='column-dropdown', component_property='value')]
)
def update_graph(col_chosen):
    """Redraw the histogram and the box plot for the selected column.

    Parameters
    ----------
    col_chosen : str or None
        Column selected in the dropdown (None when the dropdown is cleared).

    Returns
    -------
    tuple
        (histogram figure, box-plot figure) of `col_chosen` over `df`.

    Raises
    ------
    dash.exceptions.PreventUpdate
        When no column is selected; both graphs keep their current figure.
    """
    from dash.exceptions import PreventUpdate

    # A cleared dropdown sends None; keep the current figures instead of
    # handing plotly a missing column name.
    if col_chosen is None:
        raise PreventUpdate

    fig_hist = px.histogram(df, x=col_chosen)
    fig_box = px.box(df, y=col_chosen)  # TODO: change color

    return fig_hist, fig_box

In [439]:
# Run the app
# The server is started with debug=True, and only when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)