# **Initial Data Exploration**

# 1. Imports, Options and Ingestion

In [None]:
# Imports list

import math
import os
from itertools import combinations
from functools import partial

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker

from scipy import stats

from scipy.stats import yeojohnson
from scipy.stats import pearsonr, spearmanr
from scipy.stats import entropy as scipy_entropy

from sklearn import datasets
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

#from _discretization import Entropy
#from _discretization.MDLP import MDLP_Discretizer

In [None]:
# Style is important
sns.set(style="white")

# Ensuring pandas always prints all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 1000)

%config InlineBackend.figure_format = 'retina'

In [None]:
# Reading the csv data
# The file path is relative to the notebook's working directory;
# the 'customer_id' column becomes the index of the frame.

data = pd.read_csv('wrangled_data.csv', index_col='customer_id')

In [None]:
data.info() # Load OK -- sanity check of the columns and dtypes right after ingestion

#### Create lists of features for easy reference.

In [None]:
# Categorical variables
non_metric_features = ['cust_region', 'last_promo', 'pay_method']

# The remaining groups are taken by column POSITION, so they depend on the
# column order of the loaded csv staying exactly as it is.

# Cuisine features
cuisine_features = data.columns[9:24]

# Day of week variables
day_features = data.columns[24:31]

# Hour of day variables
hour_features = data.columns[31:55]

# Metric variables: every column that belongs to none of the groups above
metric_features = data.columns.drop(
    [*non_metric_features, *hour_features, *day_features, *cuisine_features]
).to_list()



# 2. Initial Exploration

Even though it is not the point of this part of the analysis to create variables, it must be said that the dataframe is missing some key variables that pertain to aggregate customer behaviour; we create them below from the available data.

In [None]:
# Total amount spent by customer on all types of cuisine
data['total_amt'] = data[cuisine_features].sum(axis=1)

# Number of orders made by the customer (sum over the day-of-week columns)
data['n_order'] = data[day_features].sum(axis=1)

# NOTE(review): the ratios below divide by n_product / n_order / n_vendor;
# a zero denominator yields inf (or NaN for 0/0) -- confirm the wrangled data has none.

# Amount spent on average per product
data['avg_amt_per_product'] = data['total_amt'] / data['n_product']

# Amount spent on average per order
data['avg_amt_per_order'] = data['total_amt'] / data['n_order']

# Amount spent on average per vendor
data['avg_amt_per_vendor'] = data['total_amt'] / data['n_vendor']

# Total days as customer
data['days_cust'] = data['last_order'] - data['first_order']

# Average days between orders
data['avg_days_to_order'] = data['days_cust'] / data['n_order']

# Days the customer is due, according to their average days between orders
# (90 is presumably the length of the observation window in days -- TODO confirm)
data['days_due'] = 90 - data['last_order'] + data['avg_days_to_order']

# Percentage of orders placed to restaurants that are part of a chain
data['per_chain_order'] = data['n_chain'] / data['n_order']

# And we add these features to the metric features list.
derived_metric_features = [
    'n_order',
    'per_chain_order',
    'total_amt',
    'avg_amt_per_order',
    'avg_amt_per_product',
    'avg_amt_per_vendor',
    'days_cust',
    'avg_days_to_order',
    'days_due',
]

# Only append names that are not registered yet, so re-running this cell
# does not leave duplicates in metric_features (and duplicated plots later on).
metric_features.extend(
    [feature for feature in derived_metric_features if feature not in metric_features]
)



### Looking at aggregates

We define some custom functions to improve our aggregations, and evaluate the outputs

In [None]:
# Helper functions for aggregation

def mode(x):
    """Return the first value reported by ``x.mode()``, or None when there is no mode."""
    modes = x.mode()  # computed once instead of twice
    return None if modes.empty else modes.iloc[0]


def _25(x):
    """Return the 25th percentile of ``x``."""
    return x.quantile(0.25)


def _75(x):
    """Return the 75th percentile of ``x``."""
    return x.quantile(0.75)


def _90(x):
    """Return the 90th percentile of ``x``."""
    return x.quantile(0.90)


def _95(x):
    """Return the 95th percentile of ``x``."""
    return x.quantile(0.95)


def _98(x):
    """Return the 98th percentile of ``x``."""
    return x.quantile(0.98)


# Metric aggregations 

# Aggregators applied to metric features: built-in pandas names plus the
# custom quantile/mode helpers (their function names become the output labels).
metric_functions = [
    'sum',
    'mean',
    'std',
    'var',
    'skew',
    'kurt',
    'min',
    _25,
    'median',
    _75,
    _90,
    _95,
    _98,
    'max',
    mode,
]

# Reduced set of aggregators used for the day-of-week and hour-of-day columns
time_functions = ['sum', 'mean', 'std', 'var']

# Categorical aggregators
# NOTE(review): 'unique' and 'freq' look like labels from describe() output;
# confirm they are accepted by .agg() before using this list.
categorical_functions = ['count', 'unique', mode, 'freq']

def get_aggregations(_data, _type, selected):
    """Aggregate the selected columns of ``_data`` and return one row per column.

    Parameters
    ----------
    _data : pandas.DataFrame
        Frame to aggregate.
    _type : list
        Aggregators (names or callables) applied to every selected column.
    selected : list-like of column labels
        Columns of ``_data`` to aggregate.

    Returns
    -------
    pandas.DataFrame
        Results rounded to 2 decimals and transposed, so that each selected
        column is a row and each aggregator is a column.
    """
    subset = _data[selected]
    # Build the mapping from the frame that was passed in, not from the
    # notebook-level `data` global, so the function works on any frame.
    agg_dict = {column: _type for column in subset.columns}

    return subset.agg(agg_dict).round(2).T



## Aggregations

### Metric features

We aggregate the metric features and dive deep into their nuances.

In [None]:
# Full set of metric aggregations for customer age
get_aggregations(data, metric_functions, ['cust_age'])

Right away we can see that we have a very young customer base, with a mode of 23, a median of 26, and a mean of approx. 28 years. As a result, its distribution is very skewed to the right and somewhat significantly leptokurtic, as can be confirmed by looking at its quantiles: in fact, 98% of customers are below the age of 47, while the oldest is 80.

In [None]:
# Metric aggregations for the order, product and vendor counts
get_aggregations(data, metric_functions, ['n_order', 'n_product', 'n_vendor'])

Most of this customer base showed itself loyal to a relatively small number of vendors, with half not placing orders from more than two vendors. This behaviour extends somewhat to the products purchased from said vendors, with the median number of products bought at 3. In this case, however, we see that the mean is higher than the median while the 95th percentile does not go beyond 18, indicating that we have extreme outliers, which the skewness and kurtosis appear to confirm.

Customers placed on average 4 orders during the quarter but, concerningly, half of the customer base made at most 3, amounting to one order per month. This distribution is also extremely right skewed and leptokurtic, meaning that a very small number of customers are responsible for a large proportion of orders.

In [None]:
# Metric aggregations for chain orders: raw count and share of all orders
get_aggregations(data, metric_functions, ['n_chain', 'per_chain_order'])

If we consider n_chain to be the count of purchases made in chain restaurants, i.e. a fraction of n_order, we see that most customers make roughly two thirds of their purchases from chain restaurants; this is confirmed, more or less, by the explicit calculation of per_chain_order, which measures the fraction of a customer's orders that were placed with chain restaurants.

In [None]:
# Metric aggregations for total spend and the per-order / per-product / per-vendor averages
get_aggregations(data, metric_functions, ['total_amt', 'avg_amt_per_order','avg_amt_per_product', 'avg_amt_per_vendor'])

Total amount spent shows that the business depends on high spenders, with its mean (38.43) being much higher than the median value, and extremely high values at the higher percentiles, along with monstrous variance and kurtosis (note the max amount of 1418.33). Of all the aggregate amounts, average per product is the most well behaved, with the maximum value being roughly three times the median. Curiously, the average amount per vendor differs from the average amount per order, indicating that there is some relation between higher spending consumers and specific vendors.

In [None]:
# Metric aggregations for first/last order day and the span between them
get_aggregations(data, metric_functions, ['first_order', 'last_order','days_cust'])

Moving on to first order, it shows that 50 percent of the customer base placed its first order during the first three weeks, while the next 25 percent made their first order in the three weeks that followed. Thus, we conclude that the remaining 25 percent of the current customer base was acquired in the last month and a half of operations (approximately 6 weeks), constituting a dramatic slowdown.

This can be cause for concern if we look at the information about the previous propensities for small numbers of customers to place large orders, as this makes the company hostage to a small number of cash cows, to which it is then forced to make concessions in exchange for loyalty - i.e. in the sense of the bargaining power of buyers in Porter's Five Forces, we can say that in such a scenario the company risks having its customers gain leverage over the business, reducing overall profit margins.

Looking at last order, we can assert that 75% of customers made their last purchase within the last 40 days, with 50% in the last 20; this is good news, as at the very least, it shows that the decreasing trend in customer acquisition is not accompanied by an increasing one in customers making their last purchases.


In [None]:
# Metric aggregations for the ordering cadence features
get_aggregations(data, metric_functions, ['avg_days_to_order','days_due'])

To help understand better this relation between first and last order, we inspect average days to order, which measures on average how many days passed between each of the customers order; we see that the median and mean are more or less in agreement, at 7 days, but there is great dispersion around this behaviour with high variance.

Finally, and oddly curious, is the mode of days_cust at 0, which is in total disagreement with the balance we noted between first and last order: under normal circumstances we would expect that, if those two quantities are in balance, days_cust ought to follow a uniform distribution with its mean roughly at day 45. The fact that it doesn't might imply that a significant portion of customers made one-time purchases, for specific reasons.

### Day features

Aggregating over days of week, highlights a few key points: 1. that there is a clear trend towards orders being placed on the weekends, 2. due to an increase in variance, we can also deduce that not all weekends are the same. 

In [None]:
# Sum, mean, std and var (time_functions) for each day-of-week column
get_aggregations(data, time_functions, day_features) 

### Hour features

Similarly, aggregating over the hours highlights a predictable concentration around lunch and dinner, with a gentle trough at the mid-afternoon mark, and a rather conspicuous point in the morning.

In [None]:
# Sum, mean, std and var (time_functions) for each hour-of-day column
get_aggregations(data, time_functions, hour_features) 

### Non-Metric features

Unfortunately, we don't gain much insight from the table below.

In [None]:
# Cast the categorical columns to 'category' dtype and describe them, one row per feature
data[non_metric_features].astype('category').describe(include='category').T 

## Histograms

No data scientist on Earth would consider not plotting these, so we do. For many features the distributions are EXTREMELY skewed, so, where indicated, we exclude the value 0 for the purposes of plotting.

### Metric_features

In [None]:
target = metric_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every feature to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Address the grid of axes as a flat, ordered sequence
axes = axes.flatten()

# One histogram per metric feature, with a KDE line drawn on top
for feature, ax in zip(target, axes):
    sns.histplot(data[feature], color='black', kde=True, ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

Our initial suspicions about the skewness of the data, are now in full display, as many of our features show clear right tail, sometimes with very sparse values.

More interestingly we note that:
- a. per_chain_order, that shows a sort of self-similar behaviour centered around approximately .5;
- b. almost one third of customers, placed a single order, because they were customers only for one day, which is visible in days_cust, and avg_days_to_order.
- c. the variables about average order and product show very consistent spikes, in such a way that it leads us to believe that these might not pertain to the same overall population. To begin speculating: a population of customers that chooses products and vendors based on very well defined prices implies either a subgroup with a very high sensitivity to product/price mix, or fraud.

##### **Correcting for one-time customers**

In [None]:
target = metric_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every feature to get its own panel
num_rows = -(-num_features // num_columns)

# Customers whose first and last order fall on different days
multi_day_customers = data[data['days_cust'] > 0]

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Address the grid of axes as a flat, ordered sequence
axes = axes.flatten()

# One histogram per metric feature, with a KDE line drawn on top
for feature, ax in zip(target, axes):
    sns.histplot(multi_day_customers[feature], color='black', kde=True, ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

When we correct for one time purchases, we see that certain distributions like avg_amt_per order and per product become more locally well behaved i.e. smoother. This leans into the idea that these customers are taking advantage of pricing, or product when they make their first purchase through the service, i.e. their need for the service might be driven by perception of advantage. We will test this later by checking price sensitivity, by comparing these customers with promotions.

### Day features

Below we dissect demand as a function of DOW, and go deeper into the issue with one-time customers.

In [None]:
target = day_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every day column to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One count plot per day-of-week column: how many customers have each order count
for feature, ax in zip(target, axes):
    sns.countplot(data=data, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


##### **Correcting for one-time customers**

The removal of one-time customers also helps in this visualization, although it is not yet completely clear from here which days customers prefer.

In [None]:
target = day_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every day column to get its own panel
num_rows = -(-num_features // num_columns)

# Customers whose first and last order fall on different days
multi_day_customers = data[data['days_cust'] > 0]

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One count plot per day-of-week column, restricted to those customers
for feature, ax in zip(target, axes):
    sns.countplot(data=multi_day_customers, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


#### At least one day

In [None]:
# For each day-of-week column keep only the customers with at least one
# order on that day (strictly positive values).
at_least_one_day = {
    day: data.loc[data[day] > 0, day]
    for day in day_features
}

In [None]:
target = at_least_one_day.keys()
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every day column to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Address the grid of axes as a flat, ordered sequence
axes = axes.flatten()

# One count plot per day, drawn from the per-day series of positive values
for feature, ax in zip(target, axes):
    sns.countplot(data=at_least_one_day, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

##### **Correcting for one-time customers**

Just as before, correcting for one-time customers improves interpretability, as it reduces the number of trivial '0's' in the plots. Both groups of histograms, however, point us to the idea that we can define our superfan customers as those that make purchases on 3 or more days of the week. Let's quickly analyze this variable.

In [None]:
# Customers whose first and last order fall on different days
multi_day_customers = data['days_cust'] > 0

# For each day-of-week column keep the positive values of those customers.
# Both conditions are combined into one mask built on `data` itself, instead of
# applying a mask from the full frame to an already filtered copy (which only
# works through implicit index realignment).
at_least_one_day = {
    day: data.loc[multi_day_customers & (data[day] > 0), day]
    for day in day_features
}

In [None]:
target = at_least_one_day.keys()
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every day column to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Address the grid of axes as a flat, ordered sequence
axes = axes.flatten()

# One count plot per day, drawn from the current contents of at_least_one_day
for feature, ax in zip(target, axes):
    sns.countplot(data=at_least_one_day, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

Once we describe the data in terms of at least one order per day, and only for customers that are not one-time, we see that customers that make more orders over the period are slightly associated with higher order counts on each day of the week.

#### Number of purchases in different days


In [None]:
# Flag, for every customer, the day-of-week columns with at least one order.
# Uses day_features (as the hour cell uses hour_features) rather than
# re-deriving the column names by hand.
mask = data[day_features] > 0

# Sum over the mask to get the number of distinct days of the week with purchases for each row
data.loc[:, 'n_days_week'] = mask.sum(axis=1)

# Updating the list of metric features (only once, so re-running the cell is harmless)
if 'n_days_week' not in metric_features:
    metric_features.append('n_days_week')

sns.countplot(data=data, x='n_days_week', color='black')


##### **Correcting for one-time customers**

And here at last when we aggregate the values we see the impact of one-time consumers. Regardless, we see that most customers that are not one-time customers have made orders in between 2 and 3 different days of the week.

In [None]:
# Plotting after the correction: only customers with more than one order.
# NOTE(review): this cell filters on n_order > 1, whereas the earlier
# "one-time customer" corrections filter on days_cust > 0 -- confirm which is intended.
sns.countplot(data=data[data['n_order'] > 1], x='n_days_week', color='black')


### Hour features

In [None]:
target = hour_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every hour column to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One count plot per hour-of-day column
for feature, ax in zip(target, axes):
    sns.countplot(data=data, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


##### **Correcting for one-time customers**

Just as before, correcting for one time customers improves interpretability, as it reduces the number of trivial '0's' in the plots.

In [None]:
target = hour_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every hour column to get its own panel
num_rows = -(-num_features // num_columns)

# Customers that placed more than one order
repeat_customers = data[data['n_order'] > 1]

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One count plot per hour-of-day column, restricted to those customers
for feature, ax in zip(target, axes):
    sns.countplot(data=repeat_customers, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


Most people did not make a purchase at most hours, however, the fact that the values ranging from 1 to 3 are more highly populated for certain hours, do imply again that there is a clear preference for orders to line up with meal hours. 

We need, however, to take into consideration that food needs to be prepared and delivered, and that customers might account for this when they place an order; thus the order placement in our records likely reflects this perceived lag. Orders begin as early as 10, which on its own could be understood as breakfast; but consider an order process initiated at 10:45, finalized and placed at 10:55 - which would fall into the 10H bracket - that then takes 45 minutes to reach the customer's door.

This means that the customer is having lunch between 11:40 and 12:00, which is a more habitual, albeit slightly early, hour for lunch.

#### At least one hour

In [None]:
# For each hour-of-day column keep only the customers with at least one
# order in that hour (strictly positive values).
at_least_one_hour = {
    hour: data.loc[data[hour] > 0, hour]
    for hour in hour_features
}

In [None]:
target = at_least_one_hour.keys()
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every hour column to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One count plot per hour, drawn from the per-hour series of positive values
for feature, ax in zip(target, axes):
    sns.countplot(data=at_least_one_hour, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


Most customers that have made more than one order at a particular time have done so at meal hours. This implies that our more regular customers are to be found in the subset of those that observe habit and plan ahead for their meal time. Curiously, HR_5 is the hour associated with the smallest relative frequency gap between first and second order, which implies that knowing that a customer placed an order at 5 in the morning gives us greater confidence that they will have done so more than once.

This makes intuitive sense, if we account for a. party goers, b. night shift workers.

##### **Correcting for one-time customers**

Just as before, correcting for one time customers improves interpretability, as it reduces the number of trivial '0's' in the plots.

In [None]:
# Customers whose first and last order fall on different days
multi_day_customers = data['days_cust'] > 0

# For each hour-of-day column keep the positive values of those customers.
# Both conditions are combined into one mask built on `data` itself, instead of
# applying a mask from the full frame to an already filtered copy (which only
# works through implicit index realignment).
at_least_one_hour = {
    hour: data.loc[multi_day_customers & (data[hour] > 0), hour]
    for hour in hour_features
}

In [None]:
target = at_least_one_hour.keys()
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every hour column to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))
axes = axes.flatten()

# One count plot per hour, drawn from the current contents of at_least_one_hour
for feature, ax in zip(target, axes):
    sns.countplot(data=at_least_one_hour, x=feature, color='black', ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


#### Number of purchases at different hours

In [None]:
# Flag, for every customer, the hour-of-day columns with at least one order
mask = data[hour_features] > 0

# Sum over the mask to get the number of distinct hours with purchases for each row
data.loc[:, 'n_times_day'] = mask.sum(axis=1)

# Updating the list of metric features (only once, so re-running the cell is harmless)
if 'n_times_day' not in metric_features:
    metric_features.append('n_times_day')

sns.countplot(data=data, x='n_times_day', color='black')

This plot is very tricky to interpret as we have identified that there are many customers that have made only one purchase, as such we would expect those customers to muddle the true values.

##### Correcting for one-time customers

In [None]:
# Count plot of distinct ordering hours, restricted to customers with more than one order
repeat_customers = data[data['n_order'] > 1]
ax = sns.countplot(data=repeat_customers, x='n_times_day', color='black')

# Label the axes the plot was drawn on, then render
ax.set(
    xlabel='Number of hours with Purchases',
    ylabel='Count',
    title='Histogram of hours with Purchases for Customers with More Than One Order',
)
plt.show()

When we net customers that made only one purchase, the distribution of this variable becomes much more apparent. <br> <br>
Customers tend to concentrate the orders that they make around 2 to 4 distinct hours. Ironically, this further reinforces the idea that we need to consider purging our main dataset of these values, at least for the purpose of understanding the average customer, if not all together. 

Strictly speaking, with the evidence thus collected, we have reason to believe these values likely represent either a. people that wanted to try out the service; b. people that were trying to take advantage of a one-time deal (on installation of the service, on a product, etc.), but that otherwise do not wish to continue using the service; or c. fraudsters creating multiple accounts for the purposes of b. Of course, there is the risk that we are removing customers that legitimately belong to the customer base, but without any further way of telling these situations apart, we feel it makes sense to put these aside.

Moreover, there are considerations regarding if these values are even worth considering as part of our clustering, as to be fair, it is trivial to build them as a group and just append them to our clusters, and in fact, we might just find that without them our algorithms that depend on distances might have an easier time with other groups.

### Wrangling the beast

At this point we are thoroughly convinced that one-time customers are contributing very negatively to our analysis. Rather than dropping them from our set, we will instead create a boolean flag for one-time customers and filter our sets that way.

In [None]:
# Flag stored on the frame for further use: True when the customer's first
# and last order are not on the same day.
is_regular = ~(data['days_cust'] == 0)
data['regular'] = is_regular

# Boolean slicer with the same information, for use in .loc selections
regulars = data['regular'] == 1

### Cuisine features

In [None]:
target = cuisine_features
num_features = len(target)
num_columns = 3
# Ceiling division: enough rows for every cuisine to get its own panel
num_rows = -(-num_features // num_columns)

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Address the grid of axes as a flat, ordered sequence
axes = axes.flatten()

# One histogram per cuisine for regular customers, with a KDE line drawn on top
for feature, ax in zip(target, axes):
    sns.histplot(data.loc[regulars, feature], color='black', kde=True, ax=ax)
    ax.set(title=f'{feature}', xlabel='Value', ylabel='Frequency')

# Remove the panels that are left over in the last row of the grid
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# Avoid overlapping labels, then render
plt.tight_layout()
plt.show()

We have no hope of interpreting this.

#### At least one cuisine

In [None]:
target = cuisine_features
num_features = len(target)
num_columns = 3
num_rows = (num_features + num_columns - 1) // num_columns

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Flatten the axes array for easier iteration if there's more than one row
axes = axes.flatten()

# Loop through each cuisine to plot histograms with Seaborn
for ax, feature in zip(axes, target):
    # `&` binds tighter than `>`, so the comparison must be parenthesised;
    # otherwise the expression is read as (regulars & data[feature]) > 0.
    spent_on_cuisine = regulars & (data[feature] > 0)
    sns.histplot(data.loc[spent_on_cuisine, feature], color='black', kde=True, ax=ax)  # kde adds a density line
    ax.set_title(f'{feature}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Hide any empty subplots if the number of features is not a multiple of the column count
for i in range(len(target), len(axes)):
    fig.delaxes(axes[i])

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()

This does in fact improve interpretability, but it is still a difficult endeavour. We can see some patterns at particular amounts spent, which might be evidence of a small number of products being purchased, with the total amount spent in each cuisine being merely a reflection of this multiplication operation.

So in limine if this logical abduction makes any sense, then what we see is that the histograms for which the distributions show what appear to be several modes, actually purports to customers strongly prefering a particular set of products within that specific type of cuisine. This then further implies that customers that opt for this type of cuisine, have a preference for this product at all levels of total aggregate spending.

#### Number of different cuisine purchases

In [None]:
# Create a mask where values are greater than zero (indicating an order)
mask = data[cuisine_features] > 0

# Use the mask to get, for each row, the list of cuisines that were ordered
data.loc[:, 'ordered_cuisines'] = mask.apply(lambda row: [cuisine for cuisine, ordered in row.items() if ordered], axis=1)

# Updating the non_metric_features list (only once, so re-running the cell is harmless)
if 'ordered_cuisines' not in non_metric_features:
    non_metric_features.append('ordered_cuisines')

# Number of distinct cuisines ordered per customer
data.loc[:, 'n_cuisines'] = mask.sum(axis=1)

# Updating the metric_features list (only once, so re-running the cell is harmless)
if 'n_cuisines' not in metric_features:
    metric_features.append('n_cuisines')

# Count plot of how many cuisines were ordered per regular customer
sns.countplot(data=data.loc[regulars, :], x='n_cuisines', color='black')


#### Order Cuisines by customer

In [None]:
# One row per (regular customer, ordered cuisine) pair
data_exploded = data.loc[regulars, :].explode('ordered_cuisines')

# Exploding an empty list yields NaN; drop those rows (customers with no cuisine orders)
data_exploded = data_exploded.dropna(subset=['ordered_cuisines'])

# Plot the prepared frame. The original re-exploded the full `data` here, which
# ignored both the regulars filter and the NaN removal done above.
sns.histplot(data=data_exploded, y='ordered_cuisines', color='black')

# Show plot
plt.xlabel("Frequency")
plt.ylabel("Cuisine")
plt.title("Frequency of Ordered Cuisines")
plt.show()


#### Total amount per cuisine

In [None]:
# Column-wise total of every cuisine feature over the regular customers
regular_cuisine_rows = data.loc[regulars, cuisine_features]
cuisine_order_counts = regular_cuisine_rows.sum().reset_index()
cuisine_order_counts.columns = ['cuisine', 'sum']

# Horizontal bar chart: one bar per cuisine, bar length is the summed value
sns.barplot(
    data=cuisine_order_counts,
    x='sum',
    y='cuisine',
    color='black',
)
plt.title("Total Ammount by Cuisine")
plt.ylabel("Cuisine")
plt.xlabel("Total Ammount")
plt.show()

In [None]:
# Column-wise total of every cuisine feature over the customers NOT in `regulars`
non_regular_cuisine_rows = data.loc[~regulars, cuisine_features]
cuisine_order_counts = non_regular_cuisine_rows.sum().reset_index()
cuisine_order_counts.columns = ['cuisine', 'sum']

# Horizontal bar chart: one bar per cuisine, bar length is the summed value
sns.barplot(
    data=cuisine_order_counts,
    x='sum',
    y='cuisine',
    color='black',
)
plt.title("Total Ammount by Cuisine")
plt.ylabel("Cuisine")
plt.xlabel("Total Ammount")
plt.show()

### Non-metric features

In [None]:
# Frequency bar charts for the first three non-metric features, regular customers only
target = non_metric_features[:3]
num_columns = 3
num_features = len(target)
num_rows = -(-num_features // num_columns)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Work with a flat sequence of panels regardless of the grid shape
axes = axes.flatten()

for position, column in enumerate(target):
    panel = axes[position]

    # Occurrences of each distinct value, as a two-column frame
    value_counts = data.loc[regulars, column].value_counts().reset_index()
    value_counts.columns = ['Value', 'Frequency']

    sns.barplot(data=value_counts, x='Value', y='Frequency', color='black', ax=panel)

    panel.set_title(f'{column}')
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')

# Drop the panels that received no feature
for spare_panel in axes[len(target):]:
    fig.delaxes(spare_panel)

# Adjust layout to prevent overlap, then render
plt.tight_layout()
plt.show()


In [None]:
# Frequency bar charts for the first three non-metric features, customers NOT in `regulars`
target = non_metric_features[:3]
num_columns = 3
num_features = len(target)
num_rows = -(-num_features // num_columns)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Work with a flat sequence of panels regardless of the grid shape
axes = axes.flatten()

for position, column in enumerate(target):
    panel = axes[position]

    # Occurrences of each distinct value, as a two-column frame
    value_counts = data.loc[~regulars, column].value_counts().reset_index()
    value_counts.columns = ['Value', 'Frequency']

    sns.barplot(data=value_counts, x='Value', y='Frequency', color='black', ax=panel)

    panel.set_title(f'{column}')
    panel.set_xlabel('Value')
    panel.set_ylabel('Frequency')

# Drop the panels that received no feature
for spare_panel in axes[len(target):]:
    fig.delaxes(spare_panel)

# Adjust layout to prevent overlap, then render
plt.tight_layout()
plt.show()


Pay methods used by one-time customers were highly irregular. It is also true that for these customers there was a much greater emphasis on promotions, and since their last promotion is their only promotion, we can be certain that this was the promotion used for the purchase. Lastly, regions 0, 1, 2 see a smaller number of one-time customers.

## Flagging outliers with the log transformation

Note below how the boxplots of our variables are in absolute disarray. Our aim is not so much to eliminate the outliers, as this is not the time for that, but rather to create flags based on certain types of outliers.

The log transformation offers a very robust way to find such intervals.


Note: the "regulars" condition carries over to this analysis, and will be repeated ad nauseam.

In [None]:
# One boxplot per column of `data` that is not listed in non_metric_features.
# NOTE(review): the rows are not filtered by `regulars` here — confirm that is intended.
target = data.drop(columns=non_metric_features).columns
num_columns = 4
num_features = len(target)
num_rows = -(-num_features // num_columns)  # ceiling division

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Work with a flat sequence of panels regardless of the grid shape
axes = axes.flatten()

# Draw each feature's boxplot on its own panel
for position, column in enumerate(target):
    panel = axes[position]
    sns.boxplot(y=data[column], color='#666666', ax=panel)
    panel.set_title(f'{column}')

# Drop the panels that received no feature
for spare_panel in axes[len(target):]:
    fig.delaxes(spare_panel)

# Adjust layout to prevent overlap, then render
plt.tight_layout()
plt.show()

### Transformations

Many of our variables have a true zero value. As a workaround, we call the log1p function which does introduce some bias to our distributions.

In [None]:
# Columns that are NOT log-transformed
excluded_columns = [
    'cust_age'
    , 'first_order'
    , 'last_order'
    , 'days_cust'
    , 'days_due'
    , 'avg_days_to_order'
    , 'per_chain_order'
    , 'cust_region'
    , 'last_promo'
    , 'pay_method'
    , 'ordered_cuisines'
    , 'n_cuisines'
    , 'regular'
] + hour_features.tolist() + day_features.tolist()

# Remaining columns are the transformation targets. Columns already prefixed
# with 'log_' are skipped so that re-running this cell does not produce
# 'log_log_*' columns.
targets = [
    col for col in data.drop(columns=excluded_columns).columns
    if not col.startswith('log_')
]

# Apply log1p to all targets at once; each result column is named 'log_<original>'
log_transformed = np.log1p(data[targets]).add_prefix('log_')

# List of log features to assist the exploration. It is taken AFTER the
# transformation; the original read it from the still-empty frame, so it was
# always an empty list.
log_features = log_transformed.columns.tolist()

# Attach the log columns to the original frame, replacing any copies left by a
# previous run of this cell
data = pd.concat([data.drop(columns=log_features, errors='ignore'), log_transformed], axis=1)

In [None]:
# Preview the first rows of `data` to check the newly concatenated columns
data.head()

#### Plotting the log-transformed features

In [None]:
# Histograms of every 'log_' column, restricted to regular customers with a
# positive value for that column.
target = data.filter(regex="^log_").columns

num_features = len(target)
num_columns = 3
num_rows = (num_features + num_columns - 1) // num_columns

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Flatten the axes array for easier iteration if there's more than one row
axes = axes.flatten()

n_tick_intervals = 5  # the x-axis is split into this many equal intervals

for ax, feature in zip(axes, target):
    # Axis range: from zero up to the feature's maximum over all rows
    lower_limit = 0
    upper_limit = data[feature].max()

    # The parentheses matter: `&` binds tighter than `>`, so the unparenthesised
    # `regulars & data[feature] > 0` is evaluated as `(regulars & data[feature]) > 0`.
    positive_regulars = regulars & (data[feature] > 0)
    sns.histplot(data.loc[positive_regulars, feature], color='black', kde=True, ax=ax)
    ax.set_title(f'{feature}')
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

    # Only rescale when the range is non-empty: a maximum of 0 (or NaN) would
    # otherwise give a zero/NaN tick step.
    if upper_limit > lower_limit:
        ax.set_xlim(lower_limit, upper_limit)

        # Integer multiples of the step give exactly n_tick_intervals + 1 ticks;
        # a float-step arange up to `upper_limit + step` can emit one tick too many.
        tick_interval = (upper_limit - lower_limit) / n_tick_intervals
        ax.set_xticks(lower_limit + tick_interval * np.arange(n_tick_intervals + 1))

# Hide any extra subplots if there are fewer features than axes
for i in range(len(target), len(axes)):
    fig.delaxes(axes[i])

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()

In truth, the discrete variables look a bit odd, but naturally the log compresses them the least, which is why these distributions accumulate more tightly packed intervals on the right tail. The boxplots below highlight the usefulness of this method.


In [None]:
# Boxplots of every 'log_' column, restricted to regular customers with a
# positive value for that column.
target = data.filter(regex="^log_").columns

num_features = len(target)
num_columns = 3
num_rows = (num_features + num_columns - 1) // num_columns

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Flatten the axes array for easier iteration if there's more than one row
axes = axes.flatten()

for ax, feature in zip(axes, target):
    # The parentheses matter: `&` binds tighter than `>`, so the unparenthesised
    # `regulars & data[feature] > 0` is evaluated as `(regulars & data[feature]) > 0`.
    positive_regulars = regulars & (data[feature] > 0)

    # Plot boxplot
    sns.boxplot(y=data.loc[positive_regulars, feature], color='black', ax=ax)
    ax.set_title(f'{feature}')
    ax.set_xlabel('Value')

    # Add finer tick marks
    ax.yaxis.set_major_locator(mticker.MaxNLocator(25))  # Control the number of major ticks

# Hide any extra subplots if there are fewer features than axes
for i in range(len(target), len(axes)):
    fig.delaxes(axes[i])

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()

Since we are not really removing outliers, we can be more creative in our approach: we will attribute flags based on the following above-IQR behaviours:

    - foodie - n_vendor, n_product, n_order, n_cuisines
        "Experiment with many vendors, order many products, place many orders, and try many cuisines"
        
    - gluttonous - avg_amt_per_order, total_amt, n_chain
        "Place large orders, spend a lot of money, mostly in chain restaurants"
        
    - loyal - avg_amt_per_vendor, any cuisine feature
        "Spend a lot on each vendor, and spend a lot on a type of cuisine"

### Creating Slices

In [None]:
# Map each flag group to the columns whose high values trigger that flag
feature_groups = dict(
    foodie=['n_vendor', 'n_product', 'n_order', 'n_cuisines'],
    gluttonous=['avg_amt_per_order', 'total_amt', 'n_chain'],
    loyal=['avg_amt_per_vendor', *cuisine_features.tolist()],
)

# Start every flag column at 0 (not flagged) for all customers
for flag_column in ('foodie_flag', 'gluttonous_flag', 'loyal_flag'):
    data[flag_column] = 0

# IQR-based outlier fences
def calculate_bounds(series):
    """Return the Tukey fences of ``series``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values whose quartiles are computed with ``Series.quantile``.

    Returns
    -------
    tuple
        ``(lower_bound, upper_bound)`` where ``lower_bound = Q1 - 1.5 * IQR``
        and ``upper_bound = Q3 + 1.5 * IQR``, with ``IQR = Q3 - Q1``.
    """
    first_quartile, third_quartile = (series.quantile(q) for q in (0.25, 0.75))
    whisker_length = 1.5 * (third_quartile - first_quartile)
    return first_quartile - whisker_length, third_quartile + whisker_length

# Assign flags for each feature group: a customer is flagged for a group as soon
# as ONE of the group's features lies above that feature's upper IQR bound.
for group, features in feature_groups.items():
    flag_column = f"{group}_flag"

    for feature in features:
        # 'n_cuisines' is excluded from the log transformation, so it is
        # compared on its raw scale; every other feature uses its 'log_' column.
        log_feature = feature if feature == 'n_cuisines' else f"log_{feature}"

        # Reference sample for the bounds: regular customers with a positive
        # value. The parentheses matter: `&` binds tighter than `>`, so the
        # unparenthesised `regulars & data[feature] > 0` is evaluated as
        # `(regulars & data[feature]) > 0`, which does not select positive values.
        reference_rows = regulars & (data[feature] > 0)
        upper_bound = calculate_bounds(data.loc[reference_rows, log_feature])[1]

        # Mark outliers; `|=` keeps flags already raised by an earlier feature
        data[flag_column] |= (data[log_feature] > upper_bound).astype(int)

# Display results
for group in ['foodie_flag', 'gluttonous_flag', 'loyal_flag']:
    print(f"Number of customers flagged as {group.split('_')[0]}:", data[group].sum())


In [None]:
# Regular customers that carry none of the three flags.
# The flag columns hold integers 0/1, and `~` on integers is bitwise NOT
# (~0 == -1, ~1 == -2) rather than logical negation, so "not flagged" is
# expressed as an explicit comparison with 0.
subset_df = data[
    (regulars) &
    (data['loyal_flag'] == 0) &
    (data['gluttonous_flag'] == 0) &
    (data['foodie_flag'] == 0)
]

# Plotting the histogram using the subset DataFrame, ensuring to use 'cust_region' for hue
sns.histplot(data=subset_df, x='avg_amt_per_product', color='black', hue='cust_region', kde=True)

plt.ylabel("Frequency")
plt.show()


### Checking the results

In [None]:
# Histogram of the amount spent per cuisine for regular, unflagged customers
# with a positive spend on that cuisine.
target = cuisine_features
num_features = len(target)
num_columns = 3
num_rows = (num_features + num_columns - 1) // num_columns

fig, axes = plt.subplots(nrows=num_rows, ncols=num_columns, figsize=(15, 5 * num_rows))

# Flatten the axes array for easier iteration if there's more than one row
axes = axes.flatten()

# Regular customers that carry none of the three flags. The flag columns hold
# integers 0/1 and `~` on integers is bitwise NOT (~0 == -1, ~1 == -2), so
# "not flagged" is expressed as an explicit comparison with 0.
unflagged_regulars = (
    regulars
    & (data['loyal_flag'] == 0)
    & (data['gluttonous_flag'] == 0)
    & (data['foodie_flag'] == 0)
)

for ax, feature in zip(axes, target):
    try:
        # `data[feature] > 0` is parenthesised because `&` binds tighter than `>`
        sns.histplot(data.loc[unflagged_regulars & (data[feature] > 0), feature], color='black', kde=True, ax=ax)  # kde adds a density line
        ax.set_title(f'{feature}')
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')
    except (ValueError, TypeError) as e:
        # Keep drawing the remaining panels, but report the failure instead of
        # silently leaving an unexplained empty panel
        ax.set_title(f'{feature}')
        print(f"Could not plot {feature}: {e}")

# Hide any empty subplots if there are fewer features than axes
for i in range(len(target), len(axes)):
    fig.delaxes(axes[i])

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()

##  Looking closely at one time customers

In [None]:
# Customers selected by `regulars` that carry none of the three flags.
# The flag columns hold integers 0/1 and `~` on integers is bitwise NOT
# (~0 == -1, ~1 == -2), so "not flagged" is an explicit comparison with 0.
# NOTE(review): the section heading mentions one-time customers, yet the filter
# keeps `regulars` (not `~regulars`) — confirm which cohort is intended.
subset_df = data[
    (regulars) &
    (data['loyal_flag'] == 0) &
    (data['gluttonous_flag'] == 0) &
    (data['foodie_flag'] == 0)
]

In [None]:
# Distribution of 'avg_amt_per_product' in subset_df, coloured by customer region
sns.histplot(
    subset_df,
    x='avg_amt_per_product',
    hue='cust_region',
    bins=90,
    kde=True,
)

Customer region ends up being an extremely good discriminator of this variable. Clearly there are average amounts associated with one-time purchases, and this habit is discriminated by customer region.

In [None]:
# Distribution of 'log_avg_amt_per_order' in subset_df, coloured by customer region
sns.histplot(
    subset_df,
    x='log_avg_amt_per_order',
    hue='cust_region',
    bins=90,
    kde=True,
)

In [None]:
# Distribution of 'log_total_amt' in subset_df, with the last-promotion groups stacked
sns.histplot(
    subset_df,
    x='log_total_amt',
    hue='last_promo',
    multiple='stack',
    bins=91,
    kde=True,
)


Talk about influencers

## Aggregations (again)

And finally, we feel it makes sense to check how the average customer has changed when we exclude the different anomalous values that we have identified.

In [None]:
# Regular customers that carry none of the three flags.
# The flag columns hold integers 0/1 and `~` on integers is bitwise NOT
# (~0 == -1, ~1 == -2), so "not flagged" is an explicit comparison with 0.
subset_df = data[
    (regulars) &
    (data['loyal_flag'] == 0) &
    (data['gluttonous_flag'] == 0) &
    (data['foodie_flag'] == 0)
]

In [None]:
# Aggregate the metric features over the filtered customers in subset_df.
# NOTE(review): get_aggregations and metric_functions are defined outside this
# part of the notebook — presumably the same helper used for the earlier
# aggregations; confirm its expected arguments there.
get_aggregations(subset_df, metric_functions, metric_features)

## Fin 

In [None]:
# Exports

# Move the index back into an ordinary column so it is written as data,
# then save the frame without the (now default) row index.
export_frame = data.reset_index(drop=False)
export_frame.to_csv('data_exploration.csv', index=False)