In [19]:
# IMPORTS
import joblib
import pandas as pd
import numpy as np

In [20]:
# GLOBALS
PATH = '../dashboard/data/'

FILENAME = 'raw_data.csv'
raw_data_point = pd.read_csv(f'{PATH}{FILENAME}')

SCALER = joblib.load('models/std_scaler.pkl')
ENCODER = joblib.load('models/hot_encoder.pkl')

In [109]:
OG_LIST = [
    'cust_region',
    'last_promo',
    'pay_method',
    'cust_age',   
    'n_vendor',
    'n_product',
    'n_chain',
    'first_order',
    'last_order',
    'american',
    'asian',
    'beverages'
    'cafe',
    'chicken_dishes',
    'chinese',
    'desserts'
    'healthy',
    'indian',
    'italian',
    'japanese',
    'noodle_dishes',
    'other',
    'street_food_snacks',
    'thai',
    'DOW_0',
    'DOW_1',
    'DOW_2',
    'DOW_3',
    'DOW_4',
    'DOW_5',
    'DOW_6',
    'HR_0',
    'HR_1',
    'HR_2',
    'HR_3',
    'HR_4',
    'HR_5',
    'HR_6',
    'HR_7',
    'HR_8',
    'HR_9',
    'HR_10',
    'HR_11',
    'HR_12',
    'HR_13',
    'HR_14',
    'HR_15',
    'HR_16',
    'HR_17',
    'HR_18',
    'HR_19',
    'HR_20',
    'HR_21',
    'HR_22',
    'HR_23'   
]

NON_METRIC_KEYS = OG_LIST[:3]  
METRIC_KEYS = OG_LIST[3:]      
CUISINE_KEYS = OG_LIST[9:22]

MEANS = \
{
    'cust_age': 27.505,
    'first_order': 23.081,
    'last_order': 68.927,
}

TIME_LIKELYHOODS = \
{ 
    'DAY':
    {
        'DOW_0': 0.638,
        'DOW_1': 0.65,
        'DOW_2': 0.679,
        'DOW_3': 0.71,
        'DOW_4': 0.777,
        'DOW_5': 0.746,
        'DOW_6': 0.808        
    },

    'HOUR':
    {
        'HR_0': 0.053,
        'HR_1': 0.06,
        'HR_2': 0.07,
        'HR_3': 0.136,
        'HR_4': 0.114,
        'HR_5': 0.094,
        'HR_6': 0.078,
        'HR_7': 0.084,
        'HR_8': 0.142,
        'HR_9': 0.263,
        'HR_10': 0.374,
        'HR_11': 0.436,
        'HR_12': 0.369,
        'HR_13': 0.276,
        'HR_14': 0.247,
        'HR_15': 0.318,
        'HR_16': 0.414,
        'HR_17': 0.452,
        'HR_18': 0.391,
        'HR_19': 0.287,
        'HR_20': 0.166,
        'HR_21': 0.083,
        'HR_22': 0.053,
        'HR_23': 0.051   
    }
}

In [120]:
# HELPER FUNCTIONS

def top_n(row, col_list, n):
    # Sort the specified columns in descending order
    sorted_row = row[col_list].sort_values(ascending=False)

    # Get the unique sorted values
    unique_sorted_values = sorted_row.unique()

    # Ensure there are enough unique values to determine the n-th largest
    if len(unique_sorted_values) >= n:
        nth_value = unique_sorted_values[n - 1]  # Get the n-th largest unique value

        # If the n-th value is 0, return None
        if nth_value == 0:
            return None
        
        # If n > 1, check for uniqueness against the (n-1)-th largest
        if n > 1:
            prev_value = unique_sorted_values[n - 2]  # (n-1)-th largest unique value
            # If nth_value is equal to the (n-1)-th value, we don't want to return it
            if nth_value == prev_value:
                return None
        
        # Return the index of the n-th largest value
        return sorted_row[sorted_row == nth_value].index[0]

    # Return None if conditions are not met
    return None

def throw_dice(likelihood_dict):
    # Unzip the dictionary into choices and probabilities
    choices, probabilities = zip(*likelihood_dict.items())
    
    # Normalize probabilities
    probabilities = np.array(probabilities) / np.sum(probabilities)
    
    # Return a random choice based on the probabilities
    return np.random.choice(choices, p=probabilities)

In [117]:
def process_input(data_point: dict) -> dict:
    if not isinstance(data_point, dict):
        raise TypeError("The input must be a dictionary.")

    # Workflow
    # Apply preprocessment to datapoint
    point = preproc(data_point)
    # call scaler
    # call models
    
    return data_point


In [121]:
test_data = {
    "n_order": 3,
    "n_cuisines": 5,
    "total_amt": 150.0,
    "n_vendor": 2,
    "n_chain": 1,
    "n_product": 5,
    "first_order": 10.5,
    "last_order": 45.3,
    "DOW_0": 1, "DOW_1": 1, "DOW_2": 0, "DOW_3": 0, "DOW_4": 0, "DOW_5": 0, "DOW_6": 0,
    "HR_0": 0, "HR_1": 0, "HR_2": 0, "HR_3": 1, "HR_4": 0, "HR_5": 0, "HR_6": 0, "HR_7": 1,
    "HR_8": 0, "HR_9": 0, "HR_10": 0, "HR_11": 0,
    "thai": 2, "italian": 3, "asian": 0, 'american':100
}

In [124]:
preproc(test_data)

{'n_order': 3.0,
 'n_cuisines': 3,
 'total_amt': 105.0,
 'n_vendor': 3,
 'n_chain': 1.0,
 'n_product': 5.0,
 'first_order': 23.081,
 'last_order': 68.927,
 'DOW_0': 1.0,
 'DOW_1': 1.0,
 'DOW_2': 1.0,
 'DOW_3': 0.0,
 'DOW_4': 0.0,
 'DOW_5': 0.0,
 'DOW_6': 0.0,
 'HR_0': 0.0,
 'HR_1': 0.0,
 'HR_2': 0.0,
 'HR_3': 1.0,
 'HR_4': 0.0,
 'HR_5': 0.0,
 'HR_6': 0.0,
 'HR_7': 1.0,
 'HR_8': 0.0,
 'HR_9': 0.0,
 'HR_10': 0.0,
 'HR_11': 0.0,
 'thai': 2.0,
 'italian': 3.0,
 'asian': 0.0,
 'american': 100.0,
 'cust_age': 27.505,
 'beveragescafe': 0.0,
 'chicken_dishes': 0.0,
 'chinese': 0.0,
 'dessertshealthy': 0.0,
 'indian': 0.0,
 'japanese': 0.0,
 'noodle_dishes': 0.0,
 'other': 0.0,
 'street_food_snacks': 0.0,
 'HR_12': 0.0,
 'HR_13': 0.0,
 'HR_14': 0.0,
 'HR_15': 0.0,
 'HR_16': 1.0,
 'HR_17': 0.0,
 'HR_18': 0.0,
 'HR_19': 0.0,
 'HR_20': 0.0,
 'HR_21': 0.0,
 'HR_22': 0.0,
 'HR_23': 0.0,
 'cust_region': nan,
 'last_promo': nan,
 'pay_method': nan}

In [123]:
def preproc(raw_data_point: dict) -> dict: 
    
    # Enforce datatypes for metric elements
    for key in METRIC_KEYS:
        try:
            raw_data_point[key] = np.float64(raw_data_point[key])
            
            # Check if the value is less than 0
            if raw_data_point[key] < 0:
                raise ValueError
        except (ValueError, KeyError) as e:
            raw_data_point[key] = np.nan
    
    # Enforce string types for non-metric elements
    for key in NON_METRIC_KEYS:
        try:
            if not isinstance(raw_data_point[key], str):
                raw_data_point[key] = np.nan
        except KeyError:
            raw_data_point[key] = np.nan
            
    # Initialize sums
    n_week = 0
    n_day = 0
    
    # Calculate n_week and n_day directly, filling missing values with 0
    for n in range(7):
        dow_key = f"DOW_{n}"
        if pd.isna(raw_data_point[dow_key]):
            raw_data_point[dow_key] = 0
        n_week += raw_data_point[dow_key]
    
    for n in range(24):
        hr_key = f"HR_{n}"
        if pd.isna(raw_data_point[hr_key]):
            raw_data_point[hr_key] = 0
        n_day += raw_data_point[hr_key]

    diff = int(np.ceil(n_week - n_day))

    # Correct if not equal
    likelyhood = None
    if diff < 0:
        likelyhood = TIME_LIKELYHOODS['DAY']
    elif diff > 0:
        likelyhood = TIME_LIKELYHOODS['HOUR']
    else:
        pass

    if diff != 0:
        for _ in range(diff):
            raw_data_point[throw_dice(likelyhood)] += 1

    # Finally, set n_order equal to the sum of either 
    n_order = n_week 

    # Fill missing amounts in cuisines with 0
    for key in CUISINE_KEYS:
        if pd.isna(raw_data_point[key]):
            raw_data_point[key] = 0
            
    # Calculate number of cuisines ordered
    n_cuisines = sum(
        (raw_data_point[key] > 0) 
        for key in CUISINE_KEYS 
    )

    # Check if customer to segment has spent any money
    if n_cuisines == 0:
        return "Error: Invalid customer. Specify customer ammounts spent per cuisine."        
       
    # Check if number of cuisines ordered is larger than the number of orders made
    diff = int(np.ceil(n_cuisines > n_order))
    
    if diff > 0:
        for _ in range(diff):
            # Assign another HR stochastically
            raw_data_point[throw_dice(TIME_LIKELYHOODS['HOUR'])] += 1
            # Assign another DAY stochastically
            raw_data_point[throw_dice(TIME_LIKELYHOODS['DAY'])] += 1
            n_order += 1
            
    # Sum total cuisines to obtain the total amount spent
    total_amt = sum(
        raw_data_point[key] for key in CUISINE_KEYS
    )

    raw_data_point['n_order'] = n_order
    raw_data_point['n_cuisines'] = n_cuisines
    raw_data_point['total_amt'] = total_amt
        
    try:
        # Assuming each vendor only serves one type of cuisine, which might introduce some bias, but how else to solve?
        if raw_data_point['n_vendor'] < n_cuisines:
            raw_data_point['n_vendor'] = n_cuisines
    except ValueError:
        raw_data_point['n_vendor'] = n_cuisines

    try:
        if pd.isna(raw_data_point['n_chain']): 
            raise ValueError
        # Check if the customer made a consistent number of purchases from chained restaurants 
        if raw_data_point['n_chain'] > raw_data_point['n_vendor']:
            raw_data_point['n_chain'] = raw_data_point['n_vendor']

        # Check if the customer made an illegal number of purchases from chained restaurants
        if raw_data_point['n_chain'] < 0: 
            raw_data_point['n_chain'] = 0
            
    except ValueError:
        # If here then definitely illegal
        raw_data_point['n_chain'] = 0
    
    try:
        # Check if the number of products is at least equal to the number of orders made
        if raw_data_point['n_product'] < raw_data_point['n_order']:
            raw_data_point['n_product'] = raw_data_point['n_order']
    except ValueError:
        raw_data_point['n_product'] = raw_data_point['n_order']

    if raw_data_point['first_order']:
        raw_data_point['first_order'] = MEANS['first_order']
        
    if raw_data_point['last_order']:
        raw_data_point['last_order'] = MEANS['last_order']

    if raw_data_point['cust_age']:
        raw_data_point['cust_age'] = MEANS['cust_age']
        
    return raw_data_point


In [None]:
# Amount spent on average per product
raw_data_point['avg_amt_per_product'] = raw_data_point['total_amt'] / raw_data_point['n_product']

# Amount spent on average per order
raw_data_point['avg_amt_per_order'] = raw_data_point['total_amt'] / raw_data_point['n_order']

# Amount spent on average per vendor
raw_data_point['avg_amt_per_vendor'] = raw_data_point['total_amt'] / raw_data_point['n_vendor']

# Total days as customer
raw_data_point['days_cust'] = raw_data_point['last_order'] - raw_data_point['first_order']

# Average days between orders
raw_data_point['avg_days_to_order'] = raw_data_point['days_cust'] / raw_data_point['n_order']

# Days the customer is due, according to their average days between orders
raw_data_point['days_due'] = 90 - raw_data_point['last_order'] + raw_data_point['avg_days_to_order']

# Percentage of orders placed to restaurants that are part of a chain
raw_data_point['per_chain_order'] = raw_data_point['n_chain'] / raw_data_point['n_order']

# And we add these tese features to the metric features list.
metric_features.extend([
    'n_order'
    ,'per_chain_order'
    ,'total_amt'
    ,'avg_amt_per_order'
    ,'avg_amt_per_product'
    ,'avg_amt_per_vendor'
    ,'days_cust'
    ,'avg_days_to_order'
    ,'days_due'
])

In [None]:
# Create a mask to check if each day column is populated
mask = raw_data_point[[f'DOW_{i}' for i in range(7)]] > 0

# Sum over the mask to get the count of days with purchases for each row
raw_data_point.loc[:, 'n_days_week'] = mask.sum(axis=1)

# Updating the list of metric features
metric_features.append('n_days_week')

In [None]:
# Create a mask to check if each hour column is populated
mask = raw_data_point[hour_features] > 0

# Sum over the mask to get the count of hours with purchases for each row
raw_data_point.loc[:, 'n_times_day'] = mask.sum(axis=1)

# Updating the list of metric features
metric_features.append('n_times_day')

In [None]:
# Flag customers who have purchased in more than one day
raw_data_point['regular'] = (raw_data_point['days_cust'] > 1)

non_metric_features.append('regular')

In [None]:
# Create a mask where values are greater than zero (indicating an order)
mask = raw_data_point[cuisine_features] > 0

# Use mask to get the number of cuisines for each row
raw_data_point.loc[:, 'n_cuisines'] = mask.sum(axis=1)

# Updating the metric_features_list
metric_features.append('n_cuisines')

In [None]:
# Dropping specified columns and getting remaining columns as a list
targets = raw_data_point.drop(columns=[
    'cust_age'
    , 'first_order'
    , 'last_order'
    , 'days_cust'
    , 'days_due'
    , 'avg_days_to_order'
    , 'per_chain_order'
    , 'cust_region'
    , 'cust_city'
    , 'last_promo'
    , 'pay_method'
    , 'n_cuisines'
    , 'regular'
] + hour_features + day_features).columns.tolist()

# Initialize an empty dfFrame to store log-transformed columns
log_transformed = pd.DataFrame()

# Apply log1p to each column in targets and add it to log_transformed with the prefix 'log_'
for col in targets:
    log_transformed[f"log_{col}"] = np.log1p(raw_data_point[col])

# We create a list of log_features to assist us in our exploration
log_features = log_transformed.columns.tolist()

# Concatenate the original dfFrame with the new log-transformed dfFrame
raw_data_point = pd.concat([raw_data_point, log_transformed], axis=1)

In [None]:
# Initialize dictionaries for feature groups with flags and relevant columns
feature_groups = {
    'foodie': ['n_vendor', 'n_product', 'n_order', 'n_cuisines'],
    'gluttonous': ['avg_amt_per_order', 'total_amt', 'n_chain'],
    'loyal': ['avg_amt_per_vendor'] + cuisine_features
}

# Create columns to hold the flags for each feature group
raw_data_point['foodie_flag'] = 0
raw_data_point['gluttonous_flag'] = 0
raw_data_point['loyal_flag'] = 0

# Function to calculate IQR bounds
def calculate_bounds(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

# Assign flags for each feature group
for group, features in feature_groups.items():
    for feature in features:
        log_feature = f"log_{feature}"
        
        if feature == 'n_cuisines':
            log_feature = feature
        
        lower_bound, upper_bound = calculate_bounds(raw_data_point.loc[(raw_data_point['regular'] == 1) & (raw_data_point[feature] > 0), log_feature])
        
        # Mark outliers for each group
        if group == 'foodie':
            raw_data_point.loc[raw_data_point['regular'] == 1, 'foodie_flag'] |= (
                raw_data_point.loc[raw_data_point['regular'] == 1, log_feature] > upper_bound
            ).astype(int)
        elif group == 'gluttonous':
            raw_data_point.loc[raw_data_point['regular'] == 1, 'gluttonous_flag'] |= (
                raw_data_point.loc[raw_data_point['regular'] == 1, log_feature] > upper_bound
            ).astype(int)
        elif group == 'loyal':
            raw_data_point.loc[raw_data_point['regular'] == 1, 'loyal_flag'] |= (
                raw_data_point.loc[raw_data_point['regular'] == 1, log_feature] > upper_bound
            ).astype(int)

# Display results
for group in ['foodie_flag', 'gluttonous_flag', 'loyal_flag']:
    print(f"Number of customers flagged as {group.split('_')[0]}:", raw_data_point[group].sum())

non_metric_features.extend([
    'foodie_flag'
    ,'gluttonous_flag'
    ,'loyal_flag'
])

In [None]:
raw_data_point['top_cuisine'] = raw_data_point.apply(top_n, col_list=cuisine_features, n=1, axis=1)

non_metric_features.append('top_cuisine')

In [None]:
# Average amount spent per day as customer
raw_data_point['avg_amt_per_day'] = np.round(raw_data_point['total_amt'] / raw_data_point['days_cust'], 4)

# Average number of products ordered per day as customer
raw_data_point['avg_product_per_day'] = np.round(raw_data_point['n_product'] / raw_data_point['days_cust'], 4)

# Average number of orders per day as customer
raw_data_point['avg_order_per_day'] = np.round(raw_data_point['n_order'] / raw_data_point['days_cust'], 4)

metric_features.extend([
    'avg_amt_per_day'
    ,'avg_product_per_day'
    ,'avg_order_per_day'
])

In [None]:
raw_data_point.loc[raw_data_point['cust_age'].isna(), 'cust_age'] = raw_data_point['cust_age'].mean().astype('int')

In [None]:
# Creating age buckets
raw_data_point['age_bucket'] = np.where(
    raw_data_point['cust_age'] < 25, '15-24', np.where(
        raw_data_point['cust_age'] < 35, '25-34', np.where(
            raw_data_point['cust_age'] < 45, '35-44', np.where(
                raw_data_point['cust_age'] < 55, '45-54', np.where(
                    raw_data_point['cust_age'] < 65, '55-64', '65+'
                )
            )
        )
    )
)

non_metric_features.insert(4, 'age_bucket')

In [None]:
raw_data_point.loc[raw_data_point['cust_age'].isna(), 'cust_age'] = np.ceil(raw_data_point['cust_age'].mean())

In [None]:
raw_data_point.topcs