In [1]:
# IMPORTS
import pandas as pd
import numpy as np
import joblib

In [2]:
# GLOBALS
PATH = '../dashboard/data/'

FILENAME = 'regulars.csv'
DATA = pd.read_csv(f'{PATH}{FILENAME}')

SCALER = joblib.load('models/std_scaler.pkl')
ENCODER = joblib.load('models/hot_encoder.pkl')

In [13]:
OG_DICT = {
        'customer_region' : 'cust_region'
        , 'payment_method' : 'pay_method'
        , 'customer_age' : 'cust_age'
        , 'vendor_count' : 'n_vendor'
        , 'product_count' : 'n_product'
        , 'n_order' : 'n_order'
        , 'is_chain' : 'n_chain'
        , 'CUI_American' : 'american'
        , 'CUI_Asian' : 'asian'
        , 'CUI_Beverages' : 'beverages'
        , 'CUI_Cafe' : 'cafe'
        , 'CUI_Chicken Dishes' : 'chicken_dishes'
        , 'CUI_Chinese' : 'chinese'
        , 'CUI_Desserts' : 'desserts'
        , 'CUI_Healthy' : 'healthy'
        , 'CUI_Indian' : 'indian'
        , 'CUI_Italian' : 'italian'
        , 'CUI_Japanese' : 'japanese'
        , 'CUI_Noodle Dishes' : 'noodle_dishes'
        , 'CUI_OTHER' : 'other'
        , 'CUI_Street Food / Snacks' : 'street_food_snacks'
        , 'CUI_Thai' : 'thai'
}

METRIC = \
{
    'cust_age': 27.505,
    'n_vendor': 3.626,
    'n_product': 6.392,
    'n_chain': 3.267,
    'first_order': 23.081,
    'last_order': 68.927,
    'american': 5.574,
    'asian': 11.208,
    'beverages': 2.489,
    'cafe': 0.778,
    'chicken_dishes': 0.86,
    'chinese': 1.509,
    'desserts': 0.902,
    'healthy': 0.97,
    'indian': 1.677,
    'italian': 3.586,
    'japanese': 3.241,
    'noodle_dishes': 0.739,
    'other': 3.317,
    'street_food_snacks': 3.965,
    'thai': 0.907,
    'total_amt': 41.723,
    'n_order': 5.008,
    'avg_amt_per_product': 6.879,
    'avg_amt_per_order': 9.066,
    'avg_amt_per_vendor': 12.67,
    'days_cust': 45.846,
    'avg_days_to_order': 11.276,
    'days_due': 32.349,
    'per_chain_order': 0.637,
    'n_days_week': 3.185,
    'n_times_day': 3.42,
    'n_cuisines': 2.628,
    'log_n_vendor': 1.418,
    'log_n_product': 1.808,
    'log_n_chain': 1.203,
    'log_american': 0.94,
    'log_asian': 1.223,
    'log_beverages': 0.447,
    'log_cafe': 0.124,
    'log_chicken_dishes': 0.227,
    'log_chinese': 0.287,
    'log_desserts': 0.167,
    'log_healthy': 0.18,
    'log_indian': 0.289,
    'log_italian': 0.556,
    'log_japanese': 0.552,
    'log_noodle_dishes': 0.162,
    'log_other': 0.592,
    'log_street_food_snacks': 0.425,
    'log_thai': 0.192,
    'log_total_amt': 3.436,
    'log_n_order': 1.634,
    'log_avg_amt_per_product': 1.97,
    'log_avg_amt_per_order': 2.157,
    'log_avg_amt_per_vendor': 2.41,
    'log_n_days_week': 1.372,
    'log_n_times_day': 1.403,
    'avg_amt_per_day': 1.345,
    'avg_product_per_day': 0.189,
    'avg_order_per_day': 0.149
}

TIME_LIKELYHOODS = \
{ 
    'DAY':
    {
        'DOW_0': 0.638,
        'DOW_1': 0.65,
        'DOW_2': 0.679,
        'DOW_3': 0.71,
        'DOW_4': 0.777,
        'DOW_5': 0.746,
        'DOW_6': 0.808        
    },

    'HOUR':
    {
        'HR_0': 0.053,
        'HR_1': 0.06,
        'HR_2': 0.07,
        'HR_3': 0.136,
        'HR_4': 0.114,
        'HR_5': 0.094,
        'HR_6': 0.078,
        'HR_7': 0.084,
        'HR_8': 0.142,
        'HR_9': 0.263,
        'HR_10': 0.374,
        'HR_11': 0.436,
        'HR_12': 0.369,
        'HR_13': 0.276,
        'HR_14': 0.247,
        'HR_15': 0.318,
        'HR_16': 0.414,
        'HR_17': 0.452,
        'HR_18': 0.391,
        'HR_19': 0.287,
        'HR_20': 0.166,
        'HR_21': 0.083,
        'HR_22': 0.053,
        'HR_23': 0.051   
    }
}

In [11]:
# HELPER FUNCTIONS

def top_n(row, col_list, n):
    # Sort the specified columns in descending order
    sorted_row = row[col_list].sort_values(ascending=False)

    # Get the unique sorted values
    unique_sorted_values = sorted_row.unique()

    # Ensure there are enough unique values to determine the n-th largest
    if len(unique_sorted_values) >= n:
        nth_value = unique_sorted_values[n - 1]  # Get the n-th largest unique value

        # If the n-th value is 0, return None
        if nth_value == 0:
            return None
        
        # If n > 1, check for uniqueness against the (n-1)-th largest
        if n > 1:
            prev_value = unique_sorted_values[n - 2]  # (n-1)-th largest unique value
            # If nth_value is equal to the (n-1)-th value, we don't want to return it
            if nth_value == prev_value:
                return None
        
        # Return the index of the n-th largest value
        return sorted_row[sorted_row == nth_value].index[0]

    # Return None if conditions are not met
    return None

def throw_dice(colnames, likelyhoods: list) -> str:
    return np.random.choice(colnames, p=np.array(likelyhoods)/np.sum(likelyhoods))

In [12]:
def process_input(data_point: dict) -> dict:
    if not isinstance(data_point, dict):
        raise TypeError("The input must be a dictionary.")

    # Workflow
    # Apply preprocessment to datapoint
    point = preproc(data_point)
    # call scaler
    # call models
    
    return data_point


In [14]:
def preproc(raw_data_point: dict) -> dict: 
    # 1. Enforce datatypes
    for key in raw_data_point:
        if key in METRIC_FEATURE_LIST:
            if type(raw_data_point[key]) not in 'Int64':
                try:
                    raw_data_point[key] = raw_data_point[key].astype('Int64')
                except ValueError:
                    raw_data_point[key] = np.nan
        else:
            if type(raw_data_point[key]) not in 'str':
                raw_data_point[key] = raw_data_point[key].astype('Object')

    # Assume user is leaving blank non-natural primary key items in the table.
    
    # Fill missing HR, and DAY with 0
    raw_data_point[[f"DOW_{n}" for n in range(7)]].fillna(0, inplace=True)
    raw_data_point[[f"HR_{n}" for n in range(7)]].fillna(0, inplace=True)
    
    # But correct the number of WEEK and Day purchases to match
    sum_week = raw_data_point[[f"DOW_{n}" for n in range(7)]].sum(axis=1)
    sum_day = raw_data_point[[f"HR_{n}" for n in range(24)]].sum(axis=1)
    diff = sum_week - sum_day
    
    # Correct if not equal
    likelyhood = None
    if diff < 0:
        likelyhood = TIME_LIKELYHOODS['DAY']
    elif diff > 0:
        likelyhood = TIME_LIKELYHOODS['WEEK']
    else:
        pass

    if likelyhoods:
        for _ in range(diff):
            raw_data_point[throw_dice(**likelyhoods)] += 1

    # Finally, set n_order equal to either sum of either 
    n_order = sum_week 
    
    # Fill missing ammounts in cuisines with 0
    raw_data_point[list({v for k, v in OG_DICT.items() if k.startswith('CUI')})].fillna(0, inplace=True)
    
    columns_to_add = ['log_total_amt', 'log_avg_amt_per_product']
    columns_to_add = ['total_amt', 'avg_amt_per_product', 'n_chain', 'n_cuisines']
    
    columns_to_add = ['total_amt', 'n_cuisines', 'n_vendor', 'n_product']
    
    , 'vendor_count' : 'n_vendor'
    , 'is_chain' : 'n_chain'
    , 'product_count' : 'n_product'

    'customer_region' : 'cust_region'
    , 'payment_method' : 'pay_method'
    , 'customer_age' : 'cust_age'

SyntaxError: illegal target for annotation (379136628.py, line 52)

In [None]:
# has at least one vendor
has_vendor = data['n_vendor'] != 0 

# has at least one product
has_product = data['n_product'] != 0 

# purchase must have been made on a valid dow
some_day = (data[[f"DOW_{n}" for n in range(7)]] != 0).any(axis = 1) 

# purchase must have been made at a valid hour
some_hour = (data[[f"HR_{n}" for n in range(24)]] != 0).any(axis = 1)  

# some type of cuisine must have been ordered
some_food = (data[data.columns[9:24]] != 0).any(axis = 1) 

data = data[(has_vendor & has_product & some_day & some_hour & some_food)]  # And we drop these values

In [None]:
# Customer Region
data.loc[data['cust_region'] == '-', 'cust_region'] = '8670'
data.loc[data['cust_region'].isin(['2440', '2490']), 'cust_region'] = '2400'

# Add the feature Customer CIty
data['cust_city'] = data['cust_region'].apply(lambda x: x[0])

In [None]:
# Last Promo
data.loc[data['last_promo'] == '-', 'last_promo'] = 'NO_PROMO'

In [None]:
# Tidying up datatypes
for col in data.iloc[:, 0:9]:
    if col in ['last_promo', 'pay_method']:
        data[col] = data[col].astype(object)
    else:
        data[col] = data[col].astype('Int64')

for col in data.iloc[:, 9:24]:
    data[col] = data[col].astype(float)

for col in data.iloc[:, 24:]:
    data[col] = data[col].astype('Int64')

In [None]:
# Categorical variables
non_metric_features = []

# Hour of day variables
hour_features = data.columns[31:55].to_list()

# Day of week variables
day_features = data.columns[24:31].to_list()

# Cuisine features
cuisine_features = data.columns[9:24].to_list()

# Metric variables, that are not above
metric_features = data.columns.drop(non_metric_features).drop(hour_features).drop(day_features).drop(cuisine_features).to_list()

In [None]:
# Total amount spent by customer on all types of cuisine
data['total_amt'] = data[cuisine_features].sum(axis=1)

# Number of orders made by the customer
data['n_order'] = data[day_features].sum(axis=1)

# Amount spent on average per product
data['avg_amt_per_product'] = data['total_amt'] / data['n_product']

# Amount spent on average per order
data['avg_amt_per_order'] = data['total_amt'] / data['n_order']

# Amount spent on average per vendor
data['avg_amt_per_vendor'] = data['total_amt'] / data['n_vendor']

# Total days as customer
data['days_cust'] = data['last_order'] - data['first_order']

# Average days between orders
data['avg_days_to_order'] = data['days_cust'] / data['n_order']

# Days the customer is due, according to their average days between orders
data['days_due'] = 90 - data['last_order'] + data['avg_days_to_order']

# Percentage of orders placed to restaurants that are part of a chain
data['per_chain_order'] = data['n_chain'] / data['n_order']

# And we add these tese features to the metric features list.
metric_features.extend([
    'n_order'
    ,'per_chain_order'
    ,'total_amt'
    ,'avg_amt_per_order'
    ,'avg_amt_per_product'
    ,'avg_amt_per_vendor'
    ,'days_cust'
    ,'avg_days_to_order'
    ,'days_due'
])

In [None]:
# Create a mask to check if each day column is populated
mask = data[[f'DOW_{i}' for i in range(7)]] > 0

# Sum over the mask to get the count of days with purchases for each row
data.loc[:, 'n_days_week'] = mask.sum(axis=1)

# Updating the list of metric features
metric_features.append('n_days_week')

In [None]:
# Create a mask to check if each hour column is populated
mask = data[hour_features] > 0

# Sum over the mask to get the count of hours with purchases for each row
data.loc[:, 'n_times_day'] = mask.sum(axis=1)

# Updating the list of metric features
metric_features.append('n_times_day')

In [None]:
# Flag customers who have purchased in more than one day
data['regular'] = (data['days_cust'] > 1)

non_metric_features.append('regular')

In [None]:
# Create a mask where values are greater than zero (indicating an order)
mask = data[cuisine_features] > 0

# Use mask to get the number of cuisines for each row
data.loc[:, 'n_cuisines'] = mask.sum(axis=1)

# Updating the metric_features_list
metric_features.append('n_cuisines')

In [None]:
# Dropping specified columns and getting remaining columns as a list
targets = data.drop(columns=[
    'cust_age'
    , 'first_order'
    , 'last_order'
    , 'days_cust'
    , 'days_due'
    , 'avg_days_to_order'
    , 'per_chain_order'
    , 'cust_region'
    , 'cust_city'
    , 'last_promo'
    , 'pay_method'
    , 'n_cuisines'
    , 'regular'
] + hour_features + day_features).columns.tolist()

# Initialize an empty dfFrame to store log-transformed columns
log_transformed = pd.DataFrame()

# Apply log1p to each column in targets and add it to log_transformed with the prefix 'log_'
for col in targets:
    log_transformed[f"log_{col}"] = np.log1p(data[col])

# We create a list of log_features to assist us in our exploration
log_features = log_transformed.columns.tolist()

# Concatenate the original dfFrame with the new log-transformed dfFrame
data = pd.concat([data, log_transformed], axis=1)

In [None]:
# Initialize dictionaries for feature groups with flags and relevant columns
feature_groups = {
    'foodie': ['n_vendor', 'n_product', 'n_order', 'n_cuisines'],
    'gluttonous': ['avg_amt_per_order', 'total_amt', 'n_chain'],
    'loyal': ['avg_amt_per_vendor'] + cuisine_features
}

# Create columns to hold the flags for each feature group
data['foodie_flag'] = 0
data['gluttonous_flag'] = 0
data['loyal_flag'] = 0

# Function to calculate IQR bounds
def calculate_bounds(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

# Assign flags for each feature group
for group, features in feature_groups.items():
    for feature in features:
        log_feature = f"log_{feature}"
        
        if feature == 'n_cuisines':
            log_feature = feature
        
        lower_bound, upper_bound = calculate_bounds(data.loc[(data['regular'] == 1) & (data[feature] > 0), log_feature])
        
        # Mark outliers for each group
        if group == 'foodie':
            data.loc[data['regular'] == 1, 'foodie_flag'] |= (
                data.loc[data['regular'] == 1, log_feature] > upper_bound
            ).astype(int)
        elif group == 'gluttonous':
            data.loc[data['regular'] == 1, 'gluttonous_flag'] |= (
                data.loc[data['regular'] == 1, log_feature] > upper_bound
            ).astype(int)
        elif group == 'loyal':
            data.loc[data['regular'] == 1, 'loyal_flag'] |= (
                data.loc[data['regular'] == 1, log_feature] > upper_bound
            ).astype(int)

# Display results
for group in ['foodie_flag', 'gluttonous_flag', 'loyal_flag']:
    print(f"Number of customers flagged as {group.split('_')[0]}:", data[group].sum())

non_metric_features.extend([
    'foodie_flag'
    ,'gluttonous_flag'
    ,'loyal_flag'
])

In [None]:
data['top_cuisine'] = data.apply(top_n, col_list=cuisine_features, n=1, axis=1)

non_metric_features.append('top_cuisine')

In [None]:
# Average amount spent per day as customer
data['avg_amt_per_day'] = np.round(data['total_amt'] / data['days_cust'], 4)

# Average number of products ordered per day as customer
data['avg_product_per_day'] = np.round(data['n_product'] / data['days_cust'], 4)

# Average number of orders per day as customer
data['avg_order_per_day'] = np.round(data['n_order'] / data['days_cust'], 4)

metric_features.extend([
    'avg_amt_per_day'
    ,'avg_product_per_day'
    ,'avg_order_per_day'
])

In [None]:
data.loc[data['cust_age'].isna(), 'cust_age'] = data['cust_age'].mean().astype('int')

In [None]:
# Creating age buckets
data['age_bucket'] = np.where(
    data['cust_age'] < 25, '15-24', np.where(
        data['cust_age'] < 35, '25-34', np.where(
            data['cust_age'] < 45, '35-44', np.where(
                data['cust_age'] < 55, '45-54', np.where(
                    data['cust_age'] < 65, '55-64', '65+'
                )
            )
        )
    )
)

non_metric_features.insert(4, 'age_bucket')

In [None]:
data.loc[data['cust_age'].isna(), 'cust_age'] = np.ceil(data['cust_age'].mean())

In [None]:
data.topcs