### Imports

Imports for this script.

In [150]:
# IMPORTS
import joblib
import pandas as pd
import numpy as np

# PROCESSING
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# MODELS

### Globals

List of global variables used.

In [154]:
# PROCESSING PICKLES
# NOTE(review): joblib.load unpickles arbitrary objects; only load these files
# from a trusted location.
SCALER = joblib.load('../dashboard/models/std_scaler.pkl')
# NOTE(review): this rebinds the name `PCA`, shadowing the `PCA` class imported
# from sklearn.decomposition earlier in the file. From here on `PCA` is the
# object loaded from the pickle, not the class.
PCA = joblib.load('../dashboard/models/pca_components.pkl')

# INTERMEDIATE MODEL PICKLES
MINISOM = joblib.load('../dashboard/models/minisom.pkl')
NMF = joblib.load('../dashboard/models/nmf.pkl')

# MODEL PICKLES
# NOTE(review): the recorded cell output reports
# "Can't get attribute 'SpectralCluster' on <module '__main__'>", so presumably
# one of these pickles needs that class to be defined/importable before
# loading - confirm which one.
SPENDING = joblib.load('../dashboard/models/spending_clustering.pkl')
CITY = joblib.load('../dashboard/models/city_clustering.pkl')
CUISINE = joblib.load('../dashboard/models/cuisine_clustering.pkl')
TIME = joblib.load('../dashboard/models/hour_clustering.pkl')

# VARIABLE LISTS
# Ordered list of 122 column names (identifier, raw inputs, engineered
# features, log features, indicator columns and PCA components).
# NOTE(review): the constant's name is spelled with a lowercase "l"
# (FULl_VAR_LIST); it is kept unchanged here.
FULl_VAR_LIST = [
    # Identifier, age, counts and first/last order
    'customer_id', 'cust_age', 'n_vendor', 'n_product', 'n_chain',
    'first_order', 'last_order',
    # Cuisine columns
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes', 'chinese',
    'desserts', 'healthy', 'indian', 'italian', 'japanese', 'noodle_dishes',
    'other', 'street_food_snacks', 'thai',
    # DOW_0 .. DOW_6 followed by HR_0 .. HR_23
    *(f'DOW_{day}' for day in range(7)),
    *(f'HR_{hour}' for hour in range(24)),
    # Engineered totals and averages
    'total_amt', 'n_order', 'avg_amt_per_product', 'avg_amt_per_order',
    'avg_amt_per_vendor', 'days_cust', 'avg_days_to_order', 'days_due',
    'per_chain_order', 'n_days_week', 'n_times_day', 'n_cuisines',
    # log_-prefixed features
    'log_n_vendor', 'log_n_product', 'log_n_chain',
    'log_american', 'log_asian', 'log_beverages', 'log_cafe',
    'log_chicken_dishes', 'log_chinese', 'log_desserts', 'log_healthy',
    'log_indian', 'log_italian', 'log_japanese', 'log_noodle_dishes',
    'log_other', 'log_street_food_snacks', 'log_thai',
    'log_total_amt', 'log_n_order', 'log_avg_amt_per_product',
    'log_avg_amt_per_order', 'log_avg_amt_per_vendor',
    'log_n_days_week', 'log_n_times_day',
    # Per-day averages
    'avg_amt_per_day', 'avg_product_per_day', 'avg_order_per_day',
    # Indicator-style columns (names look like encoded categories)
    'cust_region_2360.0', 'cust_region_2400.0', 'cust_region_4140.0',
    'cust_region_4660.0', 'cust_region_8370.0', 'cust_region_8550.0',
    'cust_region_8670.0',
    'last_promo_DELIVERY', 'last_promo_DISCOUNT', 'last_promo_FREEBIE',
    'last_promo_NO_PROMO',
    'pay_method_CARD', 'pay_method_CASH', 'pay_method_DIGI',
    'cust_city_2.0', 'cust_city_4.0', 'cust_city_8.0',
    'age_bucket_15-24', 'age_bucket_25-34', 'age_bucket_35-44',
    'age_bucket_45-54', 'age_bucket_55-64', 'age_bucket_65+',
    # Flags
    'regular', 'foodie_flag', 'gluttonous_flag', 'loyal_flag',
    # Names given to the first two PCA components in scale()
    'transaction_volume', 'interaction_rate',
]

# Raw input fields, ordered so that the slices below select each group:
#   [0:3]   non-metric (string) fields
#   [3:]    metric (numeric) fields
#   [9:24]  the 15 cuisine columns ('american' .. 'thai')
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (e.g. 'beverages' 'cafe' -> 'beveragescafe').
OG_LIST = [
    # Non-metric fields
    'cust_region',
    'last_promo',
    'pay_method',
    # Age, counts and first/last order
    'cust_age',
    'n_vendor',
    'n_product',
    'n_chain',
    'first_order',
    'last_order',
    # Cuisine columns
    'american',
    'asian',
    'beverages',
    'cafe',
    'chicken_dishes',
    'chinese',
    'desserts',
    'healthy',
    'indian',
    'italian',
    'japanese',
    'noodle_dishes',
    'other',
    'street_food_snacks',
    'thai',
    # DOW_0 .. DOW_6
    'DOW_0',
    'DOW_1',
    'DOW_2',
    'DOW_3',
    'DOW_4',
    'DOW_5',
    'DOW_6',
    # HR_0 .. HR_23
    'HR_0',
    'HR_1',
    'HR_2',
    'HR_3',
    'HR_4',
    'HR_5',
    'HR_6',
    'HR_7',
    'HR_8',
    'HR_9',
    'HR_10',
    'HR_11',
    'HR_12',
    'HR_13',
    'HR_14',
    'HR_15',
    'HR_16',
    'HR_17',
    'HR_18',
    'HR_19',
    'HR_20',
    'HR_21',
    'HR_22',
    'HR_23',
]

NON_METRIC_KEYS = OG_LIST[:3]
METRIC_KEYS = OG_LIST[3:]
# With all 15 cuisines present as separate entries, they occupy indices 9..23.
CUISINE_KEYS = OG_LIST[9:24]

# Columns for which process() creates a 'log_<name>' feature.
LOGS = [
    # Counts
    'n_vendor', 'n_product', 'n_chain',
    # Cuisine columns
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes', 'chinese',
    'desserts', 'healthy', 'indian', 'italian', 'japanese', 'noodle_dishes',
    'other', 'street_food_snacks', 'thai',
    # Totals and averages
    'total_amt', 'n_order',
    'avg_amt_per_product', 'avg_amt_per_order', 'avg_amt_per_vendor',
    'n_days_week', 'n_times_day',
]

# Columns passed to the PCA transform in scale() and dropped afterwards.
PCA_FEATURES = [
    'avg_amt_per_day', 'avg_product_per_day', 'avg_order_per_day',
    'n_product', 'n_order',
]

# Columns passed to SCALER.transform in scale(), in this exact order.
SCALER_FEATURES = [
    # Age, counts and first/last order
    'cust_age', 'n_vendor', 'n_product', 'n_chain',
    'first_order', 'last_order',
    # Cuisine columns
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes',
    'chinese', 'desserts', 'healthy', 'indian', 'italian',
    'japanese', 'noodle_dishes', 'other', 'street_food_snacks', 'thai',
    # DOW_0 .. DOW_6 followed by HR_0 .. HR_23
    *(f'DOW_{day}' for day in range(7)),
    *(f'HR_{hour}' for hour in range(24)),
    # Engineered totals and averages
    'total_amt', 'n_order',
    'avg_amt_per_product', 'avg_amt_per_order', 'avg_amt_per_vendor',
    'days_cust', 'avg_days_to_order', 'days_due', 'per_chain_order',
    'n_days_week', 'n_times_day', 'n_cuisines',
    # log_-prefixed features
    'log_n_vendor', 'log_n_product', 'log_n_chain',
    'log_american', 'log_asian', 'log_beverages', 'log_cafe',
    'log_chicken_dishes', 'log_chinese', 'log_desserts', 'log_healthy',
    'log_indian', 'log_italian', 'log_japanese', 'log_noodle_dishes',
    'log_other', 'log_street_food_snacks', 'log_thai',
    'log_total_amt', 'log_n_order',
    'log_avg_amt_per_product', 'log_avg_amt_per_order',
    'log_avg_amt_per_vendor',
    'log_n_days_week', 'log_n_times_day',
    # Per-day averages
    'avg_amt_per_day', 'avg_product_per_day', 'avg_order_per_day',
]

# FEATURE DICTIONARIES
# Fallback values used by preproc() when the corresponding input is missing.
MEANS = dict(
    cust_age=27.505,
    first_order=23.081,
    last_order=68.927,
)

# Relative weights used by preproc() (via throw_dice) to pick a DOW_* or HR_*
# column at random; throw_dice normalises them, so they need not sum to 1.
TIME_LIKELYHOODS = {
    'DAY': {
        'DOW_0': 0.638, 'DOW_1': 0.65, 'DOW_2': 0.679, 'DOW_3': 0.71,
        'DOW_4': 0.777, 'DOW_5': 0.746, 'DOW_6': 0.808,
    },
    'HOUR': {
        'HR_0': 0.053, 'HR_1': 0.06, 'HR_2': 0.07, 'HR_3': 0.136,
        'HR_4': 0.114, 'HR_5': 0.094, 'HR_6': 0.078, 'HR_7': 0.084,
        'HR_8': 0.142, 'HR_9': 0.263, 'HR_10': 0.374, 'HR_11': 0.436,
        'HR_12': 0.369, 'HR_13': 0.276, 'HR_14': 0.247, 'HR_15': 0.318,
        'HR_16': 0.414, 'HR_17': 0.452, 'HR_18': 0.391, 'HR_19': 0.287,
        'HR_20': 0.166, 'HR_21': 0.083, 'HR_22': 0.053, 'HR_23': 0.051,
    },
}

# Feature groups: each key names a '<group>_flag' column set by process(),
# each value lists the columns inspected for that flag.
FEATURE_GROUPS = dict(
    foodie=['n_vendor', 'n_product', 'n_order', 'n_cuisines'],
    gluttonous=['avg_amt_per_order', 'total_amt', 'n_chain'],
    loyal=['avg_amt_per_vendor', *CUISINE_KEYS],
)

AttributeError: Can't get attribute 'SpectralCluster' on <module '__main__'>

### Helpers

Helpers for some of the routines.

In [134]:
# Get the label of the column holding the n-th largest distinct value of a row
def top_n(row, col_list, n):
    """Return the label of the column with the n-th largest distinct value.

    Args:
        row: pandas Series (for example one row of a DataFrame).
        col_list: labels in ``row`` to rank.
        n: 1-based rank among the *distinct* values, largest first.

    Returns:
        The label of the first entry (in descending sort order) whose value
        equals the n-th largest distinct value, or ``None`` when ``n`` is
        smaller than 1, when there are fewer than ``n`` distinct non-missing
        values, or when the n-th value is 0.
    """
    # A rank below 1 would otherwise index from the end of the array.
    if n < 1:
        return None

    # Sort the specified columns in descending order
    ranked = row[col_list].sort_values(ascending=False)

    # Distinct values, largest first. Missing values are excluded: they can
    # never be matched with == below, so they cannot identify a column.
    distinct = ranked.dropna().unique()

    if len(distinct) < n:
        return None

    nth_value = distinct[n - 1]

    # A value of 0 is treated as "nothing there": no column is reported
    if nth_value == 0:
        return None

    # The values are already distinct, so no extra check against the
    # (n-1)-th value is needed.
    return ranked[ranked == nth_value].index[0]

# Weighted random draw of one key from a {choice: weight} dictionary.
def throw_dice(likelihood_dict):
    """Pick one key of ``likelihood_dict`` at random, weighted by its value.

    The values are divided by their sum before the draw, so they only need to
    be relative weights. The draw uses ``np.random.choice``.
    """
    options = tuple(likelihood_dict.keys())
    weights = np.array(tuple(likelihood_dict.values()))

    # Turn the weights into probabilities and draw a single option
    return np.random.choice(options, p=weights / np.sum(weights))

# Function to calculate IQR bounds
def calculate_bounds(series):
    """Return the ``(lower, upper)`` bounds at 1.5 * IQR beyond the quartiles.

    Args:
        series: an object with a ``quantile`` method (e.g. a pandas Series),
            or any list/array of numbers, which is converted to a Series.

    Returns:
        Tuple ``(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)``.
    """
    # Plain lists/arrays have no .quantile; wrap them so they work as well.
    if not hasattr(series, 'quantile'):
        series = pd.Series(series)

    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

### Preprocess

Wrangle the user input data so that it is ready for variable engineering.

In [135]:
def preproc(raw_data_point):
    """Clean one raw input record and derive its basic order counts.

    Works on a copy of ``raw_data_point`` (a dict keyed by the names in
    METRIC_KEYS / NON_METRIC_KEYS) and:

    * coerces metric fields to float, turning missing, non-numeric or
      negative values into NaN;
    * turns non-string categorical fields into NaN;
    * fills missing DOW_*/HR_* counts with 0 and tops up whichever side has
      fewer orders with randomly drawn entries until both totals match;
    * fills missing cuisine amounts with 0 and makes sure there are at least
      as many orders as cuisines;
    * stores n_order, n_times_day, n_days_week, n_cuisines and total_amt;
    * repairs n_vendor, n_chain and n_product so they are consistent with the
      other counts;
    * fills missing first_order, last_order and cust_age from MEANS.

    Returns:
        The cleaned copy, or an error string when no cuisine has a positive
        amount.
    """
    # Make a copy of the data point
    data = raw_data_point.copy()

    # Enforce datatypes for metric elements
    for key in METRIC_KEYS:
        try:
            value = np.float64(data[key])

            # Negative values are not valid for any metric field
            if value < 0:
                raise ValueError(f"negative value for {key}")
        except (KeyError, TypeError, ValueError):
            # Missing key, unsupported type or unparsable/negative value
            value = np.nan
        data[key] = value

    # Enforce string types for non-metric elements
    for key in NON_METRIC_KEYS:
        try:
            if not isinstance(data[key], str):
                data[key] = np.nan
        except KeyError:
            data[key] = np.nan

    # Sum the orders per weekday and per hour, filling missing values with 0
    n_week = 0
    for n in range(7):
        dow_key = f"DOW_{n}"
        if pd.isna(data[dow_key]):
            data[dow_key] = 0
        n_week += data[dow_key]

    n_day = 0
    for n in range(24):
        hr_key = f"HR_{n}"
        if pd.isna(data[hr_key]):
            data[hr_key] = 0
        n_day += data[hr_key]

    # Both totals count the same orders; top up the smaller side with
    # randomly drawn entries so that they agree.
    diff = int(np.ceil(n_week - n_day))
    if diff < 0:
        # Fewer weekday entries than hour entries: add weekday entries
        for _ in range(-diff):
            data[throw_dice(TIME_LIKELYHOODS['DAY'])] += 1
        n_week += -diff
    elif diff > 0:
        # Fewer hour entries than weekday entries: add hour entries
        for _ in range(diff):
            data[throw_dice(TIME_LIKELYHOODS['HOUR'])] += 1
        n_day += diff

    # The number of orders is the (now reconciled) weekday total
    n_order = n_week

    # Fill missing amounts in cuisines with 0
    for key in CUISINE_KEYS:
        if pd.isna(data[key]):
            data[key] = 0

    # Calculate number of cuisines ordered
    n_cuisines = sum(
        (data[key] > 0)
        for key in CUISINE_KEYS
    )

    # Check if customer to segment has spent any money
    if n_cuisines == 0:
        return "Error: Invalid customer. Specify customer ammounts spent per cuisine."

    # A customer cannot have ordered more cuisines than orders: add one
    # order (hour + weekday) for every cuisine in excess.
    diff = int(np.ceil(n_cuisines - n_order))
    for _ in range(max(diff, 0)):
        # Assign another HR stochastically
        data[throw_dice(TIME_LIKELYHOODS['HOUR'])] += 1
        n_day += 1

        # Assign another DAY stochastically
        data[throw_dice(TIME_LIKELYHOODS['DAY'])] += 1
        n_order += 1
        n_week += 1

    # Sum total cuisines to obtain the total amount spent
    total_amt = sum(
        data[key] for key in CUISINE_KEYS
    )

    data['n_order'] = n_order
    data['n_times_day'] = n_day
    data['n_days_week'] = n_week
    data['n_cuisines'] = n_cuisines
    data['total_amt'] = total_amt

    # Assuming each vendor only serves one type of cuisine, which might
    # introduce some bias. A NaN compares False with everything, so missing
    # values are checked explicitly.
    if pd.isna(data['n_vendor']) or data['n_vendor'] < n_cuisines:
        data['n_vendor'] = n_cuisines

    # Chained-restaurant count: missing or negative -> 0, capped at n_vendor
    if pd.isna(data['n_chain']) or data['n_chain'] < 0:
        data['n_chain'] = 0
    elif data['n_chain'] > data['n_vendor']:
        data['n_chain'] = data['n_vendor']

    # The number of products is at least the number of orders made
    if pd.isna(data['n_product']) or data['n_product'] < data['n_order']:
        data['n_product'] = data['n_order']

    # Fill in low risk values with known dataset means.
    for key in ('first_order', 'last_order', 'cust_age'):
        if pd.isna(data[key]):
            data[key] = MEANS[key]

    return data

### Process

Process the data so that it follows the same data structure as the training data.

In [136]:
def process(preprocessed_data):
    """Add the engineered features to a preprocessed record.

    Works on a copy of ``preprocessed_data`` (the mapping returned by
    ``preproc``) and adds: per-product/order/vendor averages, days_cust,
    avg_days_to_order, days_due, per_chain_order, per-day averages, a
    ``log_<name>`` entry for every name in LOGS, ``age_bucket``, ``regular``
    and the three group flags (foodie/gluttonous/loyal).

    Returns:
        The enriched copy.
    """
    # Make a copy of the data point
    data = preprocessed_data.copy()

    total_amt = data['total_amt']
    n_order = data['n_order']
    n_product = data['n_product']

    # Amount spent on average per product / order / vendor
    data['avg_amt_per_product'] = total_amt / n_product if n_product > 0 else 0
    data['avg_amt_per_order'] = total_amt / n_order if n_order > 0 else 0
    data['avg_amt_per_vendor'] = total_amt / data['n_vendor'] if data['n_vendor'] > 0 else 0

    # Total days as customer
    days_cust = data['last_order'] - data['first_order']
    data['days_cust'] = days_cust

    # Average days between orders
    data['avg_days_to_order'] = days_cust / n_order if n_order > 0 else 0

    # Days the customer is due, according to their average days between orders
    # NOTE(review): 90 is presumably the length of the observation window in
    # days - confirm against the training data.
    data['days_due'] = 90 - data['last_order'] + data['avg_days_to_order']

    # Percentage of orders placed to restaurants that are part of a chain
    data['per_chain_order'] = data['n_chain'] / n_order if n_order > 0 else 0

    # Average amount / products / orders per day as customer
    data['avg_amt_per_day'] = round(total_amt / days_cust, 4) if days_cust > 0 else 0
    data['avg_product_per_day'] = round(n_product / days_cust, 4) if days_cust > 0 else 0
    data['avg_order_per_day'] = round(n_order / days_cust, 4) if days_cust > 0 else 0

    # Apply log1p to each column in LOGS and store it with the prefix 'log_'
    for col in LOGS:
        if col in data and data[col] > 0:
            data[f"log_{col}"] = np.log1p(data[col])
        else:
            data[f"log_{col}"] = 0

    # Creating age buckets
    data['age_bucket'] = (
        '15-24' if data['cust_age'] < 25 else
        '25-34' if data['cust_age'] < 35 else
        '35-44' if data['cust_age'] < 45 else
        '45-54' if data['cust_age'] < 55 else
        '55-64' if data['cust_age'] < 65 else
        '65+'
    )

    # Flag customers whose first and last order are more than one day apart
    data['regular'] = data['days_cust'] > 1

    # Create columns to hold the flags for each feature group
    flag_keys = ('foodie_flag', 'gluttonous_flag', 'loyal_flag')
    for flag_key in flag_keys:
        data[flag_key] = 0

    # Assign flags for each feature group
    for group, features in FEATURE_GROUPS.items():
        flag_key = f"{group}_flag"
        if flag_key not in flag_keys:
            continue
        for feature in features:
            # n_cuisines has no log_ counterpart, so it is used as is
            log_feature = f"log_{feature}" if feature != 'n_cuisines' else feature
            value = data[log_feature]
            if value > 0:
                # calculate_bounds needs an object with .quantile, so the
                # single value is wrapped in a Series.
                # NOTE(review): bounds computed from one observation equal
                # that observation, so `value > upper_bound` is never True and
                # the flags stay 0. The bounds presumably need to come from
                # the training data instead - confirm and supply them.
                _, upper_bound = calculate_bounds(pd.Series([value]))
                data[flag_key] |= int(value > upper_bound)
    return data

SyntaxError: invalid syntax (3664856685.py, line 74)

### Scaling

Scale the data so that it is on the same scale as the data the models were fitted on and can be used in `predict`.

In [139]:
def scale(processed_data):
    """Scale the engineered features and replace PCA_FEATURES by components.

    Args:
        processed_data: a pandas DataFrame, or a dict describing a single
            record (as returned by ``process``), holding at least every
            column in SCALER_FEATURES.

    Returns:
        A new DataFrame with the SCALER_FEATURES columns transformed by
        SCALER, the PCA_FEATURES columns removed, and the first two outputs
        of the module-level ``PCA`` object added as 'transaction_volume' and
        'interaction_rate'. The input is not modified.

    Raises:
        KeyError: if any column in SCALER_FEATURES is missing.
    """
    # Work on a copy; a single record given as a dict becomes a one-row frame
    if isinstance(processed_data, dict):
        data = pd.DataFrame([processed_data])
    else:
        data = pd.DataFrame(processed_data.copy(), columns=processed_data.columns)

    # Fail early, with a readable message, before handing data to the scaler
    missing_columns = [col for col in SCALER_FEATURES if col not in data.columns]
    if missing_columns:
        raise KeyError(f"Missing columns required by the scaler: {missing_columns}")

    # Identify features not in SCALER_FEATURES
    other_features = [col for col in data.columns if col not in SCALER_FEATURES]

    # Step 1: Scale the SCALER_FEATURES
    scaled_data = SCALER.transform(data[SCALER_FEATURES])
    scaled_df = pd.DataFrame(scaled_data, columns=SCALER_FEATURES, index=data.index)

    # Concatenate the scaled features with the unscaled features
    data = pd.concat([scaled_df, data[other_features]], axis=1)

    # Step 2: Apply PCA transformation to the (already scaled) PCA_FEATURES.
    # `PCA` here is the module-level object loaded from the pickle.
    pca_data = PCA.transform(data[PCA_FEATURES])
    pca_df = pd.DataFrame(
        pca_data,
        columns=[f'PC{i}' for i in range(pca_data.shape[1])],
        index=data.index,
    )

    # Step 3: Drop 'PC2' (ignored if absent) and rename 'PC0' and 'PC1'
    pca_df = pca_df.drop(columns='PC2', errors='ignore')
    pca_df = pca_df.rename(columns={'PC0': 'transaction_volume', 'PC1': 'interaction_rate'})

    # Step 4: Concatenate the PCA-transformed features with the rest of the data
    return pd.concat([pca_df, data.drop(columns=PCA_FEATURES)], axis=1)


### Predicting

In [153]:
def predict(scaled_point):
    """Placeholder for the prediction step of the pipeline.

    The original body stopped at an incomplete ``MINISOM.`` expression, so the
    intended model calls are unknown. Rather than guessing them, this raises
    so that callers cannot mistake unprocessed data for a prediction.

    Args:
        scaled_point: the scaled data (output of ``scale``).

    Raises:
        NotImplementedError: always, until the model calls are written.
    """
    # TODO: apply the intermediate models (MINISOM, NMF) and the clustering
    # models (SPENDING, CITY, CUISINE, TIME) to a copy of `scaled_point`.
    raise NotImplementedError("predict() has not been implemented yet.")
# NOTE(review): the loads below repeat the assignments already made in the
# "Globals" cell (same names, same files) and sit directly after the
# unfinished `predict` body - presumably pasted here for reference; confirm
# whether they can be removed.
# INTERMEDIATE MODEL PICKLES
MINISOM = joblib.load('../dashboard/models/minisom.pkl')
NMF = joblib.load('../dashboard/models/nmf.pkl')

# MODEL PICKLES
SPENDING = joblib.load('../dashboard/models/spending_clustering.pkl')
CITY = joblib.load('../dashboard/models/city_clustering.pkl')
CUISINE = joblib.load('../dashboard/models/cuisine_clustering.pkl')
TIME = joblib.load('../dashboard/models/hour_clustering.pkl')


Unnamed: 0,transaction_volume,interaction_rate,cust_age,n_vendor,n_chain,first_order,last_order,american,asian,beverages,...,age_bucket_45-54,age_bucket_55-64,age_bucket_65+,regular,foodie_flag,gluttonous_flag,loyal_flag,top_cuisine,transaction_volume.1,interaction_rate.1
0,32.898381,-8.932696,-3.898937,-1.629598,-1.089055,-1.187410,-3.813539,-0.529285,-0.485634,-0.358491,...,0.0,0.0,0.0,True,0,0,0,indian,9.861899,-3.578223
1,46.171226,-11.783180,-4.038629,-1.629598,-1.164803,-1.187410,-3.813539,-0.420625,-0.434944,-0.358491,...,0.0,0.0,0.0,True,0,0,0,asian,15.648610,-5.018780
2,26.600708,-7.633377,-3.639510,-1.629598,-1.164803,-1.187410,-3.813539,-0.529285,-0.464701,-0.358491,...,0.0,0.0,0.0,True,0,0,0,asian,7.659009,-3.114611
3,25.481555,-7.217356,-3.958805,-1.629598,-1.013307,-1.187410,-3.813539,-0.485433,-0.511194,-0.334214,...,0.0,0.0,0.0,True,0,0,0,american,5.377867,-2.266646
4,33.189194,-9.040799,-3.898937,-1.629598,-1.013307,-1.187410,-3.813539,-0.441954,-0.511194,0.028672,...,0.0,0.0,0.0,True,0,0,0,beverages,10.454657,-3.798568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23429,32.119256,-8.643073,-3.998717,-1.629598,-1.089055,-0.976951,-3.572919,-0.501990,-0.511194,-0.358491,...,0.0,0.0,0.0,True,0,0,0,other,8.273828,-2.987892
23430,26.150107,-7.465875,-3.878981,-1.774272,-1.013307,-0.976951,-3.572919,-0.529285,-0.477854,-0.358491,...,0.0,0.0,0.0,True,0,0,0,asian,6.740559,-2.773197
23431,26.545741,-7.612944,-3.998717,-1.629598,-1.013307,-0.976951,-3.572919,-0.529285,-0.511194,-0.358491,...,0.0,0.0,0.0,True,0,0,0,desserts,7.546971,-3.072963
23432,26.194208,-7.482269,-3.539731,-1.629598,-1.089055,-0.976951,-3.572919,-0.483494,-0.488022,-0.358491,...,1.0,0.0,0.0,True,0,0,0,asian,6.830450,-2.806612


## Run

Run the full pipeline on a single input data point: preprocess, process, scale, then predict.

In [None]:
def process_input(data_point: dict) -> dict:
    """Run one raw data point through the whole pipeline.

    The steps are, in order: ``preproc`` -> ``process`` -> ``scale`` ->
    ``predict``.

    Args:
        data_point: raw input record.

    Returns:
        Whatever ``predict`` returns for the scaled data point.
        NOTE(review): the ``-> dict`` annotation is kept from the original
        signature; confirm it matches what ``predict`` actually returns.

    Raises:
        TypeError: if ``data_point`` is not a dictionary.
        ValueError: if ``preproc`` rejects the data point (it reports this by
            returning an error string, which becomes the exception message).
    """
    if not isinstance(data_point, dict):
        raise TypeError("The input must be a dictionary.")

    # Preprocess
    preproc_point = preproc(data_point)
    # preproc signals an invalid customer with a message instead of a record
    if isinstance(preproc_point, str):
        raise ValueError(preproc_point)

    # Process
    processed_point = process(preproc_point)
    # Scale
    scaled_point = scale(processed_point)
    # Predict
    predicted_point = predict(scaled_point)

    # Return the pipeline result, not the untouched input
    return predicted_point