### Imports

Imports for this script.

In [184]:
# IMPORTS
import joblib
import pandas as pd
import numpy as np

# PROCESSING
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# MODELS
from gower import gower_matrix
from scipy.linalg import svd
from sklearn.preprocessing import normalize

### Globals

List of global variables used.

In [185]:
# CONTROL
# Reference dataset, read when this cell runs (path is relative to the working directory).
DATA = pd.read_csv('data/regulars.csv')

# PROCESSING PICKLES
# NOTE(review): HOT_ENCODER is read from 'models/' while the other artefacts below come
# from '../dashboard/models/' — confirm both locations hold the intended files.
HOT_ENCODER = joblib.load('models/hot_encoder.pkl')
SCALER = joblib.load('../dashboard/models/std_scaler.pkl')
MINMAXSCALER = joblib.load('../dashboard/models/minmax_scaler.pkl')
# NOTE(review): this rebinds the name PCA, hiding the sklearn PCA class imported above.
# scale() relies on PCA being the loaded object, so rename both together if this is changed.
PCA = joblib.load('../dashboard/models/pca_components.pkl')

# INTERMEDIATE MODEL PICKLES
MINISOM = joblib.load('../dashboard/models/minisom.pkl')
NMF = joblib.load('../dashboard/models/nmf.pkl')
SPCA = joblib.load('../dashboard/models/spca.pkl')

# MODEL PICKLES
SPENDING = joblib.load('../dashboard/models/spending_clustering.pkl')
# CITY = joblib.load('../dashboard/models/city_clustering.pkl') implemented manually
CUISINE = joblib.load('../dashboard/models/cuisine_clustering.pkl')
TIME = joblib.load('../dashboard/models/hour_clustering.pkl')

# Maps a cuisine-cluster label to the city code that cuisine_clustering() falls back to
# when the customer's region is missing.
CUISINE_CLUSTER_TO_CITY_DICT = {
    0 : 4,
    1 : 8,
    2 : 2
}

# VARIABLE LISTS
# Complete, ordered list of variable names: identifier, raw metrics, engineered
# features, one-hot dummies, flags and the two PCA-derived scores.
FULl_VAR_LIST = [
    'customer_id', 'cust_age',
    'n_vendor', 'n_product', 'n_chain',
    'first_order', 'last_order',
    # one column per cuisine
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes',
    'chinese', 'desserts', 'healthy', 'indian', 'italian',
    'japanese', 'noodle_dishes', 'other', 'street_food_snacks', 'thai',
    # one column per day of week, then one per hour of day
    *[f'DOW_{day}' for day in range(7)],
    *[f'HR_{hour}' for hour in range(24)],
    # engineered totals, averages and counts
    'total_amt', 'n_order',
    'avg_amt_per_product', 'avg_amt_per_order', 'avg_amt_per_vendor',
    'days_cust', 'avg_days_to_order', 'days_due', 'per_chain_order',
    'n_days_week', 'n_times_day', 'n_cuisines',
    # log-transformed counterparts
    'log_n_vendor', 'log_n_product', 'log_n_chain',
    'log_american', 'log_asian', 'log_beverages', 'log_cafe', 'log_chicken_dishes',
    'log_chinese', 'log_desserts', 'log_healthy', 'log_indian', 'log_italian',
    'log_japanese', 'log_noodle_dishes', 'log_other', 'log_street_food_snacks', 'log_thai',
    'log_total_amt', 'log_n_order',
    'log_avg_amt_per_product', 'log_avg_amt_per_order', 'log_avg_amt_per_vendor',
    'log_n_days_week', 'log_n_times_day',
    # per-day rates
    'avg_amt_per_day', 'avg_product_per_day', 'avg_order_per_day',
    # one-hot dummies
    'cust_region_2360.0', 'cust_region_2400.0', 'cust_region_4140.0', 'cust_region_4660.0',
    'cust_region_8370.0', 'cust_region_8550.0', 'cust_region_8670.0',
    'last_promo_DELIVERY', 'last_promo_DISCOUNT', 'last_promo_FREEBIE', 'last_promo_NO_PROMO',
    'pay_method_CARD', 'pay_method_CASH', 'pay_method_DIGI',
    'cust_city_2.0', 'cust_city_4.0', 'cust_city_8.0',
    'age_bucket_15-24', 'age_bucket_25-34', 'age_bucket_35-44',
    'age_bucket_45-54', 'age_bucket_55-64', 'age_bucket_65+',
    # flags and PCA-derived scores
    'regular', 'foodie_flag', 'gluttonous_flag', 'loyal_flag',
    'transaction_volume', 'interaction_rate',
]

# Raw input fields expected for one customer, in a fixed order:
# three categorical profile fields first, then every numeric metric.
OG_LIST = [
    # categorical (non-metric) fields
    'cust_region', 'last_promo', 'pay_method',
    # numeric customer basics
    'cust_age', 'n_vendor', 'n_product', 'n_chain', 'first_order', 'last_order',
    # one column per cuisine
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes',
    'chinese', 'desserts', 'healthy', 'indian', 'italian',
    'japanese', 'noodle_dishes', 'other', 'street_food_snacks', 'thai',
    # one column per day of week
    'DOW_0', 'DOW_1', 'DOW_2', 'DOW_3', 'DOW_4', 'DOW_5', 'DOW_6',
    # one column per hour of day
    'HR_0', 'HR_1', 'HR_2', 'HR_3', 'HR_4', 'HR_5', 'HR_6', 'HR_7',
    'HR_8', 'HR_9', 'HR_10', 'HR_11', 'HR_12', 'HR_13', 'HR_14', 'HR_15',
    'HR_16', 'HR_17', 'HR_18', 'HR_19', 'HR_20', 'HR_21', 'HR_22', 'HR_23',
]

NON_METRIC_KEYS = OG_LIST[:3]
METRIC_KEYS = OG_LIST[3:]
# The 15 cuisine columns sit at indices 9..23 ('american' .. 'thai'); the end
# index is 24 so that 'street_food_snacks' and 'thai' are included.
CUISINE_KEYS = OG_LIST[9:24]

# Columns that process() log1p-transforms into a 'log_'-prefixed counterpart.
LOGS = [
    'n_vendor', 'n_product', 'n_chain',
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes',
    'chinese', 'desserts', 'healthy', 'indian', 'italian',
    'japanese', 'noodle_dishes', 'other', 'street_food_snacks', 'thai',
    'total_amt', 'n_order',
    'avg_amt_per_product', 'avg_amt_per_order', 'avg_amt_per_vendor',
    'n_days_week', 'n_times_day',
]

# Columns fed to the loaded PCA object in scale(), in this order.
PCA_FEATURES = ['avg_amt_per_day', 'avg_product_per_day', 'avg_order_per_day', 'n_product', 'n_order']

# Columns passed to SCALER (transform in scale(), inverse_transform in
# time_clustering()), in this exact order.
SCALER_FEATURES = [
    # customer basics
    'cust_age',
    'n_vendor', 'n_product', 'n_chain',
    'first_order', 'last_order',
    # one column per cuisine
    'american', 'asian', 'beverages', 'cafe', 'chicken_dishes',
    'chinese', 'desserts', 'healthy', 'indian', 'italian',
    'japanese', 'noodle_dishes', 'other', 'street_food_snacks', 'thai',
    # one column per day of week, then one per hour of day
    *[f'DOW_{day}' for day in range(7)],
    *[f'HR_{hour}' for hour in range(24)],
    # engineered totals, averages and counts
    'total_amt', 'n_order',
    'avg_amt_per_product', 'avg_amt_per_order', 'avg_amt_per_vendor',
    'days_cust', 'avg_days_to_order', 'days_due', 'per_chain_order',
    'n_days_week', 'n_times_day', 'n_cuisines',
    # log-transformed counterparts
    'log_n_vendor', 'log_n_product', 'log_n_chain',
    'log_american', 'log_asian', 'log_beverages', 'log_cafe', 'log_chicken_dishes',
    'log_chinese', 'log_desserts', 'log_healthy', 'log_indian', 'log_italian',
    'log_japanese', 'log_noodle_dishes', 'log_other', 'log_street_food_snacks', 'log_thai',
    'log_total_amt', 'log_n_order',
    'log_avg_amt_per_product', 'log_avg_amt_per_order', 'log_avg_amt_per_vendor',
    'log_n_days_week', 'log_n_times_day',
    # per-day rates
    'avg_amt_per_day', 'avg_product_per_day', 'avg_order_per_day',
]

# FEATURE DICTIONARIES
# Fallback values that preproc() substitutes for missing low-risk fields.
MEANS = dict(cust_age=27.505, first_order=23.081, last_order=68.927)

# Relative weights used by throw_dice() when preproc() has to invent an extra
# day-of-week ('DAY') or hour-of-day ('HOUR') order; throw_dice normalises them.
TIME_LIKELYHOODS = {
    'DAY': {
        f'DOW_{day}': weight
        for day, weight in enumerate(
            [0.638, 0.65, 0.679, 0.71, 0.777, 0.746, 0.808]
        )
    },
    'HOUR': {
        f'HR_{hour}': weight
        for hour, weight in enumerate([
            0.053, 0.06, 0.07, 0.136, 0.114, 0.094,      # HR_0  .. HR_5
            0.078, 0.084, 0.142, 0.263, 0.374, 0.436,    # HR_6  .. HR_11
            0.369, 0.276, 0.247, 0.318, 0.414, 0.452,    # HR_12 .. HR_17
            0.391, 0.287, 0.166, 0.083, 0.053, 0.051,    # HR_18 .. HR_23
        ])
    },
}

# Columns associated with each behavioural flag (foodie / gluttonous / loyal).
FEATURE_GROUPS = dict(
    foodie=['n_vendor', 'n_product', 'n_order', 'n_cuisines'],
    gluttonous=['avg_amt_per_order', 'total_amt', 'n_chain'],
    loyal=['avg_amt_per_vendor', *CUISINE_KEYS],
)

In [186]:
# Internal column name -> human-readable label. Built in three steps so that
# the insertion order stays: raw/engineered fields, log_* fields, flags/extras.
mapping_dict = {
    'cust_region': 'Region',
    'cust_age': 'Age',
    'n_vendor': 'Vendor Count',
    'n_product': 'Product Count',
    'n_chain': 'Chain Restaurant Order Count',
    'first_order': 'First Order Date',
    'last_order': 'Last Order Date',
    'last_promo': 'Promotion',
    'pay_method': 'Payment Method',
    # cuisines
    'american': 'American',
    'asian': 'Asian',
    'beverages': 'Beverages',
    'cafe': 'Cafe',
    'chicken_dishes': 'Chicken Dishes',
    'chinese': 'Chinese',
    'desserts': 'Desserts',
    'healthy': 'Healthy',
    'indian': 'Indian',
    'italian': 'Italian',
    'japanese': 'Japanese',
    'noodle_dishes': 'Noodle Dishes',
    'other': 'Other Cuisines',
    'street_food_snacks': 'Street Food & Snacks',
    'thai': 'Thai',
    # DOW_0 .. DOW_6
    **{
        f'DOW_{day}': name
        for day, name in enumerate(
            ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
        )
    },
    # HR_0 .. HR_23 as 12-hour clock labels ('12AM', '1AM', ..., '12PM', ..., '11PM')
    **{
        f'HR_{hour}': f"{hour % 12 or 12}{'AM' if hour < 12 else 'PM'}"
        for hour in range(24)
    },
    'cust_city': 'City',
    'total_amt': 'Total Amount',
    'n_order': 'Order Count',
    'avg_amt_per_product': 'Avg Amount per Product',
    'avg_amt_per_order': 'Avg Amount per Order',
    'avg_amt_per_vendor': 'Avg Amount per Vendor',
    'days_cust': 'Days as Customer',
    'avg_days_to_order': 'Avg Days to Order',
    'days_due': 'Order Days Due',
    'per_chain_order': '% Orders in Chain Restaurant',
    'n_days_week': 'Days of Week Ordered Count',
    'n_times_day': 'Hours Ordered Count',
    'regular': 'Is Regular',
    'n_cuisines': 'Cuisines Count',
}

# Every log_* label is 'Log ' followed by the label of the untransformed column.
mapping_dict.update({
    f'log_{name}': f'Log {mapping_dict[name]}'
    for name in [
        'n_vendor', 'n_product', 'n_chain',
        'american', 'asian', 'beverages', 'cafe', 'chicken_dishes',
        'chinese', 'desserts', 'healthy', 'indian', 'italian',
        'japanese', 'noodle_dishes', 'other', 'street_food_snacks', 'thai',
        'total_amt', 'n_order',
        'avg_amt_per_product', 'avg_amt_per_order', 'avg_amt_per_vendor',
        'n_days_week', 'n_times_day',
    ]
})

mapping_dict.update({
    'foodie_flag': 'Is Foodie',
    'gluttonous_flag': 'Is Gluttonous',
    'loyal_flag': 'Is Loyal',
    'top_cuisine': 'Top Cuisine',
    'avg_amt_per_day': 'Avg Amount Spent per Day',
    'avg_product_per_day': 'Avg Products Ordered per Day',
    'avg_order_per_day': 'Avg Orders Placed per Day',
    'age_bucket': 'Age Bucket',
    'transaction_volume': 'Transaction Volume',
    'interaction_rate': 'Interaction Rate',
})

### Helpers

Helpers for some of the routines.

In [187]:
# Get the column holding the n-th largest distinct value of a row

def top_n(row, col_list, n):
    """Return the label of the column holding the n-th largest distinct value.

    Args:
        row: pandas Series (e.g. one DataFrame row) indexed by column name.
        col_list: labels of the columns to rank.
        n: 1-based rank among the distinct values (1 = largest).

    Returns:
        The label of the first column (in descending sort order) whose value
        equals the n-th largest distinct value, or None when ``n`` is smaller
        than 1, when there are fewer than ``n`` distinct non-missing values,
        or when that value is 0.
    """
    # Ranks start at 1; a non-positive rank would otherwise index from the end
    if n < 1:
        return None

    # Missing values cannot be ranked, so drop them before sorting
    ranked = row[col_list].dropna().sort_values(ascending=False)

    # Distinct values, still in descending order
    distinct_values = ranked.unique()
    if len(distinct_values) < n:
        return None

    nth_value = distinct_values[n - 1]

    # A value of 0 is treated as "nothing there"
    if nth_value == 0:
        return None

    # Label of the first column carrying the n-th largest value
    return ranked.index[ranked == nth_value][0]

# Draws one key at random, weighted by the values of the given dictionary.
def throw_dice(likelihood_dict):
    """Pick one key of ``likelihood_dict`` at random, weighted by its value.

    The values do not need to sum to 1; they are rescaled by their total
    before being handed to ``np.random.choice``.
    """
    outcomes = list(likelihood_dict.keys())

    # Rescale the weights so that they form a probability vector
    weights = np.array(list(likelihood_dict.values()))
    weights = weights / np.sum(weights)

    return np.random.choice(outcomes, p=weights)

### Preprocess

Wrangle the user input data, before it is ready for variable engineering.

In [188]:
def preproc(raw_data_point):
    """Validate and repair a raw customer record before feature engineering.

    The record is copied; every field in METRIC_KEYS is coerced to
    ``np.float64`` (missing, non-numeric or negative values become NaN);
    every field in NON_METRIC_KEYS that is not a string becomes NaN; the
    day-of-week, hour-of-day, cuisine, vendor, chain and product counts are
    then made mutually consistent.

    Args:
        raw_data_point: mapping from the names in OG_LIST to raw values.

    Returns:
        The cleaned copy, extended with 'n_order', 'n_times_day',
        'n_days_week', 'n_cuisines' and 'total_amt'. When no cuisine has a
        positive amount, an error message string is returned instead.
    """
    # Make a copy of the data point
    data = raw_data_point.copy()

    # Enforce datatypes for metric elements: missing, non-numeric or negative -> NaN
    for key in METRIC_KEYS:
        try:
            value = np.float64(data[key])
            if value < 0:
                raise ValueError
            data[key] = value
        except (ValueError, TypeError, KeyError):
            data[key] = np.nan

    # Enforce string types for non-metric elements
    for key in NON_METRIC_KEYS:
        try:
            if not isinstance(data[key], str):
                data[key] = np.nan
        except KeyError:
            data[key] = np.nan

    # Sum the orders per day of week and per hour, filling missing values with 0
    n_week = 0
    for n in range(7):
        dow_key = f"DOW_{n}"
        if pd.isna(data[dow_key]):
            data[dow_key] = 0
        n_week += data[dow_key]

    n_day = 0
    for n in range(24):
        hr_key = f"HR_{n}"
        if pd.isna(data[hr_key]):
            data[hr_key] = 0
        n_day += data[hr_key]

    # Both sums count the same orders, so top up whichever side is short by
    # drawing the missing days/hours from the known likelihoods.
    diff = int(np.ceil(n_week - n_day))
    if diff < 0:
        # Fewer day-of-week orders than hourly orders: add the missing days
        for _ in range(-diff):
            data[throw_dice(TIME_LIKELYHOODS['DAY'])] += 1
        n_week -= diff
    elif diff > 0:
        # Fewer hourly orders than day-of-week orders: add the missing hours
        for _ in range(diff):
            data[throw_dice(TIME_LIKELYHOODS['HOUR'])] += 1
        n_day += diff

    # Finally, set n_order equal to the (now reconciled) sum
    n_order = n_week

    # Fill missing amounts in cuisines with 0
    for key in CUISINE_KEYS:
        if pd.isna(data[key]):
            data[key] = 0

    # Calculate number of cuisines ordered
    n_cuisines = int(sum(data[key] > 0 for key in CUISINE_KEYS))

    # Check if customer to segment has spent any money
    if n_cuisines == 0:
        return "Error: Invalid customer. Specify customer ammounts spent per cuisine."

    # A customer cannot have ordered more cuisines than orders: add one order
    # (one hour and one day) for every cuisine in excess.
    diff = int(np.ceil(n_cuisines - n_order))
    for _ in range(max(diff, 0)):
        # Assign another HR stochastically
        data[throw_dice(TIME_LIKELYHOODS['HOUR'])] += 1
        n_day += 1

        # Assign another DAY stochastically
        data[throw_dice(TIME_LIKELYHOODS['DAY'])] += 1
        n_order += 1
        n_week += 1

    # Sum total cuisines to obtain the total amount spent
    total_amt = sum(data[key] for key in CUISINE_KEYS)

    data['n_order'] = n_order
    data['n_times_day'] = n_day
    data['n_days_week'] = n_week
    data['n_cuisines'] = n_cuisines
    data['total_amt'] = total_amt

    # Assuming each vendor only serves one type of cuisine, which might introduce some bias.
    # NaN compares False against everything, so a missing value is tested explicitly.
    if pd.isna(data['n_vendor']) or data['n_vendor'] < n_cuisines:
        data['n_vendor'] = n_cuisines

    # Chain purchases: missing -> 0, and never more than the number of vendors.
    # (Negative values were already turned into NaN by the metric check above.)
    if pd.isna(data['n_chain']):
        data['n_chain'] = 0
    elif data['n_chain'] > data['n_vendor']:
        data['n_chain'] = data['n_vendor']

    # The number of products is at least equal to the number of orders made
    if pd.isna(data['n_product']) or data['n_product'] < data['n_order']:
        data['n_product'] = data['n_order']

    # Fill in low risk values with known dataset means.
    for key in ('first_order', 'last_order', 'cust_age'):
        if pd.isna(data[key]):
            data[key] = MEANS[key]

    return data

### Process

Process the data so that it follows the same data structure as the training data.

In [189]:
def process(preprocessed_data):
    """Derive the engineered features from a preprocessed customer record.

    Works on a copy of ``preprocessed_data`` and adds, in this order: the
    average amounts, customer lifetime and cadence features, per-day rates,
    a 'log_' column for every entry of LOGS, the age bucket, the 'regular'
    flag and the three behavioural flags (initialised to 0).
    """
    data = preprocessed_data.copy()

    def _per(numerator_key, denominator_key, digits=None):
        # numerator / denominator, or 0 when the denominator is not positive
        denominator = data[denominator_key]
        if not denominator > 0:
            return 0
        quotient = data[numerator_key] / denominator
        return quotient if digits is None else round(quotient, digits)

    # Average amount spent per product, per order and per vendor
    data['avg_amt_per_product'] = _per('total_amt', 'n_product')
    data['avg_amt_per_order'] = _per('total_amt', 'n_order')
    data['avg_amt_per_vendor'] = _per('total_amt', 'n_vendor')

    # Lifetime as customer and cadence between orders
    data['days_cust'] = data['last_order'] - data['first_order']
    data['avg_days_to_order'] = _per('days_cust', 'n_order')

    # Days until the next order is due, given the average cadence
    data['days_due'] = 90 - data['last_order'] + data['avg_days_to_order']

    # Share of orders placed with chain restaurants
    data['per_chain_order'] = _per('n_chain', 'n_order')

    # Amount, products and orders per day as customer (4 decimals)
    data['avg_amt_per_day'] = _per('total_amt', 'days_cust', digits=4)
    data['avg_product_per_day'] = _per('n_product', 'days_cust', digits=4)
    data['avg_order_per_day'] = _per('n_order', 'days_cust', digits=4)

    # log1p of every strictly positive LOGS column; 0 otherwise
    for col in LOGS:
        is_positive = col in data and data[col] > 0
        data[f"log_{col}"] = np.log1p(data[col]) if is_positive else 0

    # Age bucket: first upper bound the age falls below, else '65+'
    age_bucket = '65+'
    for upper_bound, label in (
        (25, '15-24'),
        (35, '25-34'),
        (45, '35-44'),
        (55, '45-54'),
        (65, '55-64'),
    ):
        if data['cust_age'] < upper_bound:
            age_bucket = label
            break
    data['age_bucket'] = age_bucket

    # Customers whose first and last order are more than one day apart
    data['regular'] = data['days_cust'] > 1

    # Placeholders for the feature-group flags
    for flag in ('foodie_flag', 'gluttonous_flag', 'loyal_flag'):
        data[flag] = 0

    return data

### Scaling

Scale the data so that it is on the same scale as the data the models were trained on and can be used for prediction.

In [190]:
def scale(processed_data):
    """Turn a processed record into a one-row, scaled DataFrame.

    The record becomes a DataFrame indexed by '007'; the SCALER_FEATURES
    columns are transformed with SCALER, the PCA_FEATURES columns (already
    scaled) are projected with the loaded PCA object, and the first two
    components are prepended as 'transaction_volume' and 'interaction_rate'.
    """
    # Work on a copy, as a single-row frame
    frame = pd.DataFrame(processed_data.copy(), index=['007'])

    # Columns that are carried over without scaling
    passthrough = [name for name in frame.columns if name not in SCALER_FEATURES]

    # Scale the SCALER_FEATURES and put the unscaled columns back next to them
    standardized = pd.DataFrame(
        SCALER.transform(frame[SCALER_FEATURES]),
        columns=SCALER_FEATURES,
        index=frame.index,
    )
    frame = pd.concat([standardized, frame[passthrough]], axis=1)

    # Project the PCA features; components are named PC0, PC1, ...
    components = PCA.transform(frame[PCA_FEATURES])
    component_names = [f'PC{i}' for i in range(components.shape[1])]
    pca_frame = pd.DataFrame(components, columns=component_names, index=frame.index)

    # Discard 'PC2' (if present) and give the first two components their names
    pca_frame = pca_frame.drop(columns='PC2', errors='ignore').rename(
        columns={'PC0': 'transaction_volume', 'PC1': 'interaction_rate'}
    )

    # PCA scores first, then everything else
    return pd.concat([pca_frame, frame], axis=1)


### Predicting

In [191]:
# RFM KMEANS
def spending_clustering(data_input):
    """Return SPENDING's predicted label(s) for the spending columns of ``data_input``."""
    spending_columns = ['total_amt', 'n_cuisines', 'n_vendor', 'n_product']
    return SPENDING.predict(data_input[spending_columns])

In [192]:
# Sparse Principal Components with Gaussian Mixture Models
def cuisine_clustering(data_input):
    """Predict the cuisine cluster of a scaled customer row and derive its city.

    Args:
        data_input: single-row DataFrame holding the log cuisine columns,
            'log_total_amt', 'log_avg_amt_per_product' and 'cust_region'.

    Returns:
        Tuple ``(labels, city)``. ``labels`` is the array returned by the
        clustering model's ``predict``. ``city`` is always a plain ``int``:
        the leading digit of 'cust_region' when the region is a string that
        starts with a digit, otherwise the city mapped to the predicted
        cluster in CUISINE_CLUSTER_TO_CITY_DICT.
    """
    cuisines_features = [
        'log_american', 'log_asian', 'log_beverages', 'log_cafe', 'log_chinese', 'log_desserts', 'log_healthy', 'log_indian',
        'log_italian', 'log_japanese', 'log_noodle_dishes', 'log_other', 'log_street_food_snacks', 'log_thai', 'log_chicken_dishes'
    ]
    columns_to_add = ['log_total_amt', 'log_avg_amt_per_product']

    # Select the cuisine factors
    cuisines_factors = data_input[cuisines_features].copy()

    # Load the SPCA model
    # NOTE(review): the module already loads SPCA and CUISINE from
    # '../dashboard/models/'; confirm whether these 'models/' copies are the
    # same artefacts before replacing the per-call loads with the globals.
    spca = joblib.load('models/spca.pkl')

    # Transform the dataframe, returns an array
    spca_array = spca.transform(cuisines_factors)

    # Cast as dataframe with Component names
    spca_df = pd.DataFrame(
        spca_array,
        columns=[f"Component_{i+1}" for i in range(2)],
        index=cuisines_factors.index
    )

    # Build the final clustering dataframe, by adding critical columns
    cuisines_df = pd.concat([
        spca_df,
        data_input[columns_to_add]
    ], axis=1)

    # Load the Gaussian Mixture Model
    cuisines_algorithm = joblib.load('models/cuisine_clustering.pkl')

    # Obtain labels for the input
    labels = cuisines_algorithm.predict(cuisines_df)

    # The city code is the first digit of the region code. Return a plain int
    # in every branch so callers never have to call int() on a Series.
    region = data_input['cust_region'].iloc[0]
    leading_char = region[:1] if isinstance(region, str) else ''
    if leading_char.isdigit():
        city = int(leading_char)
    else:
        # Region unknown: fall back to the city associated with the cluster
        city = CUISINE_CLUSTER_TO_CITY_DICT[labels[0]]

    return labels, city

In [193]:
# Non-Negative Matrix Factorization with Self-Organizing Maps, projected with KMeans
def time_clustering(data_input):
    """Assign a time-of-order cluster label to each row of ``data_input``.

    Steps, as written: undo the SCALER transformation, rescale the time
    columns with MINMAXSCALER, project them with NMF, append four rescaled
    profile columns, look up each row's winning MINISOM unit and return the
    label TIME predicts for that unit's weight vector.

    Args:
        data_input: DataFrame containing every column of SCALER_FEATURES,
            already transformed by SCALER.

    Returns:
        List with one label per row.
    """
    
    # Columns 29..52 followed by columns 22..28 of DATA.
    # Presumably HR_0..HR_23 then DOW_0..DOW_6, if DATA's columns follow
    # FULl_VAR_LIST's order — TODO confirm against the CSV.
    time_features = [
        *DATA.columns[29: 53].tolist(),
        *DATA.columns[22: 29].tolist()
    ]
    
    columns_to_add = ['total_amt', 'avg_amt_per_product', 'n_chain', 'n_cuisines']

    # Bring the SCALER_FEATURES back to their original scale; other columns are kept as they are
    data_input = pd.concat([
        pd.DataFrame(SCALER.inverse_transform(data_input[SCALER_FEATURES]), columns=SCALER_FEATURES, index=data_input.index),
        data_input.drop(columns=SCALER_FEATURES)
    ], axis=1)
    
    time_input = MINMAXSCALER.transform(data_input[time_features])

    # Snap values that are numerically zero to exactly 0.
    # NOTE(review): `.columns` only exists if MINMAXSCALER returns a DataFrame
    # (e.g. configured for pandas output); a plain ndarray would fail here — confirm.
    for col in time_input.columns:
        if np.isclose(time_input[col], 0):
            time_input[col] = 0
    
    # Factor loadings of the time profile, rounded to 3 decimals
    W = NMF.transform(time_input).round(decimals=3)
    W_df = pd.DataFrame(W, columns=[f"Factor_{i+1}" for i in range(W.shape[1])], index=data_input.index)
    
    # NOTE(review): fit_transform REFITS the shared MINMAXSCALER on this input's
    # four columns, so the module-level scaler no longer matches time_features
    # after the first call — confirm this is intended.
    data_input = MINMAXSCALER.fit_transform(data_input[columns_to_add])
    
    # Concatenate the transformed data with the additional columns
    time_df = pd.concat([W_df, data_input], axis=1)
    # (5 * 5) hard-codes the SOM grid size — assumes MINISOM was built as a 5x5 map
    weights_flat = MINISOM.get_weights().reshape((5 * 5), len(time_df.columns))
    # One TIME label per SOM unit, laid back out on the 5x5 grid
    labels = TIME.predict(weights_flat)
    kmeans_matrix = labels.reshape((5, 5))
    # Winning unit (grid coordinates) of every input row, then that unit's label
    bmu_index = np.array([MINISOM.winner(x) for x in time_df.values])
    labels = [kmeans_matrix[i[0]][i[1]] for i in bmu_index]
    
    return labels

In [194]:
# Spectral Clustering, projected with KMeans
def geography_clustering(data_input):
    """Assign a geography cluster label to the row indexed '007'.

    The row is one-hot encoded on 'cust_city', appended to a fixed sample of
    3000 rows of DATA, embedded through a Gower-distance / RBF spectral
    embedding, and labelled by the KMeans model loaded from
    'models/spectral_clustering.pkl'.

    Args:
        data_input: DataFrame with a row labelled '007' holding 'cust_city',
            'per_chain_order', 'log_total_amt', 'avg_amt_per_product' and
            'n_cuisines'.

    Returns:
        The integer label predicted for row '007'.
    """
    geography_features = [
        'per_chain_order', 'log_total_amt', 'avg_amt_per_product', 
        'n_cuisines', 'cust_city_2.0', 'cust_city_4.0', 'cust_city_8.0'
    ]
    
    # One-hot encode `cust_city`
    data_input = data_input.copy()
    if data_input.loc['007', 'cust_city'] == 2:
        data_input['cust_city_2.0'] = 1
        data_input['cust_city_4.0'] = 0
        data_input['cust_city_8.0'] = 0
    elif data_input.loc['007', 'cust_city'] == 4:
        data_input['cust_city_2.0'] = 0
        data_input['cust_city_4.0'] = 1
        data_input['cust_city_8.0'] = 0
    elif data_input.loc['007', 'cust_city'] == 8:
        data_input['cust_city_2.0'] = 0
        data_input['cust_city_4.0'] = 0
        data_input['cust_city_8.0'] = 1
    else:
       # NOTE(review): for any other city no dummy columns are created, so the
       # feature selection below raises KeyError unless they already exist.
       pass

    # Filter features for clustering
    geography_df = DATA[geography_features].copy()

    # Select only relevant features from `data_input` and set a unique index
    data_input = data_input[geography_features].copy()
    data_input.index = ['007']  # Assign a unique index for tracking    

    # Setting best-found Clustering hyperparameters
    n_clusters = 3
    rbf_param = 3.141542

    # Sampling the dataset for efficient computation (fixed seed: same 3000 rows every call)
    arr_spectral_df = geography_df.sample(n=3000, random_state=1)

    # The new point becomes the last row of the sample
    arr_spectral_df = pd.concat([arr_spectral_df, data_input], axis=0)
    
    spectral_array = arr_spectral_df.values

    # Calculating the Gower distance matrix
    gower_dist = gower_matrix(spectral_array)

    # Applying the RBF transform
    K = np.exp(-rbf_param * gower_dist)
    
    # Computing the similarity matrix and obtaining eigenpair solutions
    D = K.sum(axis=1)
    D = np.sqrt(1 / D)
    # NOTE(review): both factors broadcast D along the columns (D[np.newaxis, :]),
    # i.e. column j is scaled by D[j] twice. A symmetric normalisation would use
    # D[:, np.newaxis] for one of them. Confirm how the pickled KMeans was
    # trained before changing this, since the embedding must match.
    M = np.multiply(D[np.newaxis, :], np.multiply(K, D[np.newaxis, :]))
    
    U, Sigma, _ = svd(M, full_matrices=False, lapack_driver='gesvd')

    # Points map to the eigens
    Usubset = U[:, :n_clusters]

    # Apply KMeans to the spectral embedding (rows are normalised first)
    geography_algorithm = joblib.load('models/spectral_clustering.pkl')
    geography_labels = geography_algorithm.predict(normalize(Usubset))

    geography_df = pd.concat([
        arr_spectral_df,
        pd.Series(geography_labels, name='labels', index=arr_spectral_df.index)
    ], axis=1)

    # Extract the label for the row with index '007'
    geography_label = geography_df.loc['007']['labels'].astype('int')

    return geography_label

In [195]:
def predict_clusters(scaled_point):
    """Run the four clustering models on a scaled row and attach their labels.

    Returns a copy of ``scaled_point`` with 'cust_city' plus the
    'cuisine_label', 'spending_label', 'time_label' and 'geography_label'
    columns added.
    """
    data_input = scaled_point.copy()

    # The cuisine model also yields the city, which the later steps rely on
    cuisine_label, city = cuisine_clustering(data_input)
    data_input['cust_city'] = int(city)

    # Remaining models, evaluated in order: spending, time, geography
    cluster_labels = {
        'cuisine_label': cuisine_label,
        'spending_label': spending_clustering(data_input),
        'time_label': time_clustering(data_input),
        'geography_label': geography_clustering(data_input),
    }

    # Attach every label as its own column
    for column, label in cluster_labels.items():
        data_input[column] = label

    return data_input

In [196]:
# Predicts profiling variables, via mode for customers in same cust_city (most discriminating variable)

def predict_profile(predicted_point):
    """Fill missing profile fields with the mode among DATA rows of the same city.

    For every name in NON_METRIC_KEYS that is NaN in ``predicted_point``, the
    value is replaced by the first mode of that column among the rows of DATA
    whose 'cust_city' equals the point's 'cust_city'. ``predicted_point`` is
    modified in place and also returned.

    NOTE(review): the comparison and the ``if pd.isna(...)`` test behave as
    scalar checks only when ``predicted_point[...]`` yields scalars (e.g. a
    dict or a row Series); with the DataFrame returned by predict_clusters
    they would not — confirm the intended input type. It also assumes DATA
    has 'cust_city' and the NON_METRIC_KEYS columns — verify against the CSV.
    """
    # Reference customers living in the same city as the point
    data = DATA[DATA['cust_city'] == predicted_point['cust_city']]

    # For each feature in NON_METRIC_KEYS, check if it's NaN in predicted_point
    for feature in NON_METRIC_KEYS:
        if pd.isna(predicted_point[feature]):  # If the feature is NaN in predicted_point
            # Get the mode of that feature in the relevant data
            mode_value = data[feature].mode()[0]  # Get the first mode
            predicted_point[feature] = mode_value  # Fill NaN with mode value
    
    return predicted_point


## Segmenting

Finds the given customer's best segmentation.

In [197]:
def find_segmentation(data_point):
    """Run the full segmentation workflow on one raw customer record.

    Workflow: preproc -> process -> scale -> predict_clusters.

    Args:
        data_point: raw customer record, as accepted by preproc.

    Returns:
        The DataFrame produced by predict_clusters, or the error message
        string returned by preproc when the record is not a valid customer.
    """
    # Preprocess
    preproc_point = preproc(data_point)

    # preproc reports an invalid customer as a message string; hand it back
    # to the caller instead of feeding it to the next stages.
    if isinstance(preproc_point, str):
        return preproc_point

    # Process
    processed_point = process(preproc_point)
    # Scale
    scaled_point = scale(processed_point)
    # Predict
    return predict_clusters(scaled_point)

In [198]:
# Sample raw input, keyed by the human-readable labels (values as entered by a user).
example = {
    # profile
    'Region': '4660',
    'Age': '20',
    'Vendor Count': '2',
    'Product Count': '2',
    'Chain Restaurant Order Count': '2',
    'First Order Date': '2',
    'Last Order Date': '20',
    'Promotion': 'DELIVERY',
    'Payment Method': 'CARD',
    # cuisines
    'American': '20',
    'Asian': None,
    'Beverages': None,
    'Cafe': None,
    'Chicken Dishes': '20',
    'Chinese': '20',
    'Desserts': None,
    'Healthy': None,
    'Indian': None,
    'Italian': None,
    'Japanese': None,
    'Noodle Dishes': None,
    'Other Cuisines': None,
    'Street Food & Snacks': None,
    'Thai': None,
    # days of week
    'Sunday': None,
    'Monday': '20',
    'Tuesday': None,
    'Wednesday': None,
    'Thursday': '20',
    'Friday': None,
    'Saturday': None,
    # hours of day
    '12AM': None,
    '1AM': None,
    '2AM': None,
    '3AM': '20',
    '4AM': None,
    '5AM': None,
    '6AM': '20',
    '7AM': None,
    '8AM': None,
    '9AM': None,
    '10AM': None,
    '11AM': None,
    '12PM': None,
    '1PM': None,
    '2PM': None,
    '3PM': None,
    '4PM': None,
    '5PM': '20',
    '6PM': None,
    '7PM': None,
    '8PM': None,
    '9PM': None,
    '10PM': None,
    '11PM': None,
}

In [199]:
# Human-readable label -> internal column name (inverse of mapping_dict)
reversed_mapping = dict(zip(mapping_dict.values(), mapping_dict.keys()))

In [200]:
# The sample input re-keyed by internal column names
example_ = dict(
    (reversed_mapping[label], raw_value) for label, raw_value in example.items()
)