# Advanced hour features

In [None]:
# Builds per-customer "modal hour" features from the hour-of-day order counts:
# the two busiest hours, the order counts in those hours, the share of all
# orders they represent, and circular (radian) encodings of those hours.

# Step 1: Get the values of the hour columns as a NumPy array (one row per customer)
values = data[hour_features].values

# Step 2: Use argsort to get the column indices that would sort each row ascending
sorted_indices = np.argsort(values, axis=1)

# Step 3: Get the index of the largest and second-largest column for each row
max_col_indices = sorted_indices[:, -1]  # Last position holds the highest count
second_max_col_indices = sorted_indices[:, -2]  # Second to last holds the second-highest count

# Step 4: Fetch the second-highest count itself; a zero here means the customer
# has no second modal hour (every remaining column is zero as well)
second_max_values = np.take_along_axis(values, second_max_col_indices[:, None], axis=1).flatten()

# Step 5: Map the indices back to column names, or use None if second max is zero
data.loc[:, '_1_modal_hr'] = np.array(hour_features)[max_col_indices]
data.loc[:, '_2_modal_hr'] = np.where(second_max_values == 0, None, np.array(hour_features)[second_max_col_indices])

# Keep only the numeric part after the first underscore of the column name;
# anything that is not a string (the None placeholder) becomes NaN
data.loc[:, '_1_modal_hr'] = data['_1_modal_hr'].apply(lambda x: int(x.split('_')[1]) if isinstance(x, str) else np.nan)
data.loc[:, '_2_modal_hr'] = data['_2_modal_hr'].apply(lambda x: int(x.split('_')[1]) if isinstance(x, str) else np.nan)

# Step 6: Store actual values corresponding to mode_1 and mode_2
data.loc[:, 'n_order_1_modal_hr'] = values[np.arange(values.shape[0]), max_col_indices]
data.loc[:, 'n_order_2_modal_hr'] = np.where(second_max_values == 0, 0, values[np.arange(values.shape[0]), second_max_col_indices])

# Propensity to consume on hours that are modal
data['modal_hr_prop'] = (data.loc[:, 'n_order_1_modal_hr'] + data.loc[:, 'n_order_2_modal_hr']) / data['n_order']

# Parsing 1_mode: encode the hour as an angle on a 24-hour circle
data['_1_modal_hr_rad'] = (data['_1_modal_hr'] * (2 * np.pi / 24)).astype(float)

# Circular mean of the first modal hour across all customers
mean_sin = np.mean(np.sin(data['_1_modal_hr_rad']))
mean_cos = np.mean(np.cos(data['_1_modal_hr_rad']))

circular_mean = np.arctan2(mean_sin, mean_cos) % (2 * np.pi)

# 1 when the customer's angle is greater than the population circular mean, else 0
data['position_1_hr_mode'] = data['_1_modal_hr_rad'].apply(lambda x: 1 if (x - circular_mean) > 0 else 0)

# Parsing 2_mode
data['_2_modal_hr_rad'] = (data['_2_modal_hr'] * (2 * np.pi / 24)).astype(float)

# Circular mean of the second modal hour across all customers
mean_sin = np.mean(np.sin(data['_2_modal_hr_rad']))
mean_cos = np.mean(np.cos(data['_2_modal_hr_rad']))

circular_mean = np.arctan2(mean_sin, mean_cos) % (2 * np.pi)

data['position_2_hr_mode'] = data['_2_modal_hr_rad'].apply(lambda x: 1 if (x - circular_mean) > 0 else 0)

# Parsing the average of both modes: per-customer circular mean of the two
# modal angles. Vectorised; a customer without a second mode gets NaN, exactly
# as the row-wise version did.
data['mean_modal_hr_rad'] = np.arctan2(
    (np.sin(data['_1_modal_hr_rad']) + np.sin(data['_2_modal_hr_rad'])) / 2,  # mean sine
    (np.cos(data['_1_modal_hr_rad']) + np.cos(data['_2_modal_hr_rad'])) / 2   # mean cosine
) % (2 * np.pi)  # Ensure result is in [0, 2*pi)

# Population circular mean over both modes. The sum must be parenthesised
# before halving: without the parentheses only the second-mode term was halved,
# which weighted the first mode twice as heavily and shifted the angle.
mean_sin = (np.mean(np.sin(data['_1_modal_hr_rad'])) + np.mean(np.sin(data['_2_modal_hr_rad']))) / 2
mean_cos = (np.mean(np.cos(data['_1_modal_hr_rad'])) + np.mean(np.cos(data['_2_modal_hr_rad']))) / 2

circular_mean = np.arctan2(mean_sin, mean_cos) % (2 * np.pi)

data['position_mean_hr_mode'] = data['mean_modal_hr_rad'].apply(lambda x: 1 if (x - circular_mean) > 0 else 0)



# Advanced day features

In [None]:
# Builds per-customer "modal day" features from the day order counts: the two
# busiest days, the order counts on those days, the share of all orders they
# represent, and circular (radian) encodings of those days.

# Step 1: Get the values of the day columns as a NumPy array (one row per customer)
values = data[day_features].values

# Step 2: Use argsort to get the column indices that would sort each row ascending
sorted_indices = np.argsort(values, axis=1)

# Step 3: Get the index of the largest and second-largest column for each row
max_col_indices = sorted_indices[:, -1]  # Highest value column
second_max_col_indices = sorted_indices[:, -2]  # Second highest column

# Step 4: Fetch the second-highest count; zero means there is no second modal day
second_max_values = np.take_along_axis(values, second_max_col_indices[:, None], axis=1).flatten()

# Step 5: Map the indices back to column names, or use None if second max is zero
data.loc[:, '_1_modal_day'] = np.array(day_features)[max_col_indices]
data.loc[:, '_2_modal_day'] = np.where(second_max_values == 0, None, np.array(day_features)[second_max_col_indices])

# Keep only the numeric part after the first underscore of the column name;
# anything that is not a string (the None placeholder) becomes NaN
data.loc[:, '_1_modal_day'] = data['_1_modal_day'].apply(lambda x: int(x.split('_')[1]) if isinstance(x, str) else np.nan)
data.loc[:, '_2_modal_day'] = data['_2_modal_day'].apply(lambda x: int(x.split('_')[1]) if isinstance(x, str) else np.nan)

# Step 6: Store values for mode_1 and mode_2
data.loc[:, 'n_order_1_modal_day'] = values[np.arange(values.shape[0]), max_col_indices]
data.loc[:, 'n_order_2_modal_day'] = np.where(second_max_values == 0, 0, values[np.arange(values.shape[0]), second_max_col_indices])

# Propensity to consume on modal days
data['modal_day_prop'] = (data['n_order_1_modal_day'] + data['n_order_2_modal_day']) / data['n_order']

# Parsing 1st mode in radians. Days wrap around after one week, so the circle
# has 7 steps (the 24-step period belongs to the hour features).
# NOTE(review): assumes day_features holds the 7 day-of-week columns - confirm.
data['_1_modal_day_rad'] = (data['_1_modal_day'] * (2 * np.pi / 7)).astype(float)

# Mean of sine and cosine components for 1st modal day
mean_sin = np.mean(np.sin(data['_1_modal_day_rad']))
mean_cos = np.mean(np.cos(data['_1_modal_day_rad']))
circular_mean = np.arctan2(mean_sin, mean_cos) % (2 * np.pi)

# Compare the radian encoding (not the raw day number) with the radian mean,
# consistently with every other position_* feature
data['position_1_day_mode'] = data['_1_modal_day_rad'].apply(lambda x: 1 if (x - circular_mean) > 0 else 0)

# Parsing 2nd mode in radians
data['_2_modal_day_rad'] = (data['_2_modal_day'] * (2 * np.pi / 7)).astype(float)

# Mean of sine and cosine components for 2nd modal day
mean_sin = np.mean(np.sin(data['_2_modal_day_rad']))
mean_cos = np.mean(np.cos(data['_2_modal_day_rad']))
circular_mean = np.arctan2(mean_sin, mean_cos) % (2 * np.pi)

data['position_2_day_mode'] = data['_2_modal_day_rad'].apply(lambda x: 1 if (x - circular_mean) > 0 else 0)

# Parsing the average of both modes: per-customer circular mean of the two
# modal angles (NaN when the customer has no second modal day)
data['mean_modal_day_rad'] = np.arctan2(
    (np.sin(data['_1_modal_day_rad']) + np.sin(data['_2_modal_day_rad'])) / 2,  # mean sine
    (np.cos(data['_1_modal_day_rad']) + np.cos(data['_2_modal_day_rad'])) / 2   # mean cosine
) % (2 * np.pi)

# Population circular mean over both modes; the sum is parenthesised so that
# both terms are halved (previously only the second-mode term was)
mean_sin = (np.mean(np.sin(data['_1_modal_day_rad'])) + np.mean(np.sin(data['_2_modal_day_rad']))) / 2
mean_cos = (np.mean(np.cos(data['_1_modal_day_rad'])) + np.mean(np.cos(data['_2_modal_day_rad']))) / 2
circular_mean = np.arctan2(mean_sin, mean_cos) % (2 * np.pi)

data['position_mean_day_mode'] = data['mean_modal_day_rad'].apply(lambda x: 1 if (x - circular_mean) > 0 else 0)



# Asking better questions

So far, we have not done anything that could be called transforming data into knowledge; for that, we need to ask deeper, harder questions. Below is a reasonably comprehensive list of the questions we would like our data to answer, which we believe will allow us to understand the drivers of demand, get to know our customer base, and identify general trends in the industry.
<br>
1. **Who is the customer?**
    - How old is he/she?
    - How do they pay?
    - What do they eat?
    - How much do they spend?
    - When do they spend it?
    - Are they representative of the population?<br>
<br>
2. **Demand vis. categorical features?** <br><br>
3. **Demand and age?**
    - Can we construct groups based on age?
    - Do age groups capture different behaviours with respect to demand?
    - What about demand as a function of categorical variables?
    - How does propensity to spend vary with age, are there exceptions, are these exceptions representative?<br>
<br>
5. **Demand vis. hour, and vis. day of week?**
    - Does propensity for a cuisine type vary with HR and DOW values?
    - Are orders at different hours more or less sensitive to promotions?
    - What about different days of the week?<br>
<br>
6. **Customer Base evolution in the past 3 months?**
    - Are there any differences between the customer base from 30, 60 or 180 days to today?
    - Have promotional campaigns been effective?
    - Has the customer base grown or shrunk?
    - Has revenue increased or decreased?
    - How frequently do customers order on average?
    - How many times do customers order per week?<br>
<br>
8. **To chain or not to chain?**
    - Is there a relation between the ratio of chained restaurants and other aspects of demand?
    - Does a high ratio imply more demand?
    - Is it linked to a particular type of cuisine?
    - To a particular schedule?<br>
<br>
10. **What drives frequent customers?**
    - Who qualifies as a frequent customer? 
    - Which payment method is preferred by frequent customers?
    - Is the number of frequent customers increasing? <br>
<br>
11. **How does demand vary as a function of type of cuisine?**
    - Does cuisine vary by region?
    - By age group?
    - By customer propensity?
    - Is there such a thing as "Taco Tuesdays"?<br>
<br>   

# Final Graphs

## Lorenz Curve of accumulated sales as a function of sales percentiles 

In [None]:
# Lorenz curve of per-customer revenue, with the area under the curve shaded
# by each customer's share of purchases made in chained restaurants.

# Set Seaborn style to 'white'
sns.set(style="white")

# Sort customers by revenue (ascending), as a Lorenz curve requires
revenues = data['total_amt'].sort_values()
pct_chain = data['pct_chain'].loc[revenues.index]  # Same customers, in revenue order
num_customers = len(revenues)

# Calculate cumulative revenue
cumulative_revenue = revenues.cumsum()

# Calculate total revenue and cumulative revenue proportion
total_revenue = cumulative_revenue.iloc[-1]
cumulative_revenue_proportion = cumulative_revenue / total_revenue

# Calculate cumulative customer proportions
cumulative_customers_proportion = np.arange(1, num_customers + 1) / num_customers

# Prepare the Lorenz curve data
cumulative_customers_proportion = np.insert(cumulative_customers_proportion, 0, 0)  # Add (0, 0) as the start point
cumulative_revenue_proportion = np.insert(cumulative_revenue_proportion, 0, 0)  # Same as above

# Calculate Gini Coefficient.
# np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use whichever exists.
_trapezoid = getattr(np, 'trapezoid', None) or np.trapz
A = _trapezoid(cumulative_revenue_proportion, cumulative_customers_proportion)
Gini_coefficient = (0.5 - A) / 0.5  # Normalizing by the area under the line of equality

# Plot the Lorenz Curve with filled area based on 'pct_chain'
plt.figure(figsize=(10, 9))

# Colour scale shared by the filled segments and the colour bar
cmap = plt.get_cmap('Blues')
norm = plt.Normalize(vmin=pct_chain.min(), vmax=pct_chain.max())  # Normalized scale based on pct_chain values

# One colour per customer, kept in revenue order so that segment i is shaded
# with the pct_chain of the i-th customer. Sorting pct_chain again here (as was
# done before) detaches the colours from the customers and always paints a
# monotone gradient, whatever the real relation between revenue and pct_chain.
colors = cmap(norm(pct_chain.to_numpy()))

# Fill the area under the Lorenz curve, one customer-wide strip at a time
for i in range(num_customers):
    plt.fill_between(
        cumulative_customers_proportion[i:i + 2]
        , cumulative_revenue_proportion[i:i + 2]
        , color=colors[i]
        , alpha=0.9
    )

# Plotting the Lorenz curve
plt.plot(
        cumulative_customers_proportion
        , cumulative_revenue_proportion
        , color='Black'
        , linestyle='-.'
        , linewidth=2
        , label=f"Lorenz Curve (Gini: {Gini_coefficient:.2f})"
    )

# Ensure graph starts at 0 and ends at 1, for both x and y.
plt.xlim(0, 1)
plt.ylim(0, 1)

# Plotting the line of equality
plt.plot(
    [0, 1]
    , [0, 1]
    , color='crimson'
    , linestyle='--',
    label='Line of Equality'
)

# Setting title and labels for graph
plt.title('Cummulative Customer Revenue with % in chain purchase gradient', weight='bold',pad=20)
plt.xlabel('Cumulative Proportion of Customers')
plt.ylabel('Cumulative Proportion of Revenue')
plt.legend(loc='upper left')  # Set a specific location for the legend

# Create a color bar for the pct_chain gradient (same cmap and norm as the fill)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)

# Create a color bar with extra padding
cbar = plt.colorbar(sm, ax=plt.gca(), orientation='vertical', pad=0.05)
cbar.set_label('Percentile of % purchases in chained restaurant', labelpad=25)

# Set the ticks at specific data values (min, 25th percentile, etc.)
cbar.ax.set_yticks([
    pct_chain.min(),
    pct_chain.quantile(0.25),
    pct_chain.median(),
    pct_chain.quantile(0.55),
    pct_chain.quantile(0.59),
    pct_chain.max()
])

# Set the corresponding custom labels
cbar.ax.set_yticklabels([
    f"{round(pct_chain.min(), 2)}",
    f"{round(pct_chain.quantile(0.25), 2)}",
    f"{round(pct_chain.median(), 2)}",
    f"{round(pct_chain.quantile(0.55), 2)}",
    f"{round(pct_chain.quantile(0.59), 2)}",
    f"{round(pct_chain.max(), 2)}"
])


quantile_labels = ['0', '25', '50', '55', '59', '100'] # Add quantile labels on the left side of the colorbar
tick_positions = cbar.ax.get_yticks()  # Get the positions of the ticks

for i, label in enumerate(quantile_labels):
    cbar.ax.text(
        -0.1
        , tick_positions[i]
        , label
        , ha='right'
        , va='center'
        , fontstyle='italic'
        , fontsize=7
        , color='black',
        bbox=dict(facecolor='white', alpha=0, edgecolor='none', boxstyle='round,pad=0.5')
    ) # Set text labels on the left side of the gradient

plt.show()



If this Lorenz curve represented income inequality instead, it would rank 13th on the list (according to Wikipedia), i.e. at the level of the Democratic Republic of the Congo. We point out that almost 80% of total revenue comes from the top 40% of the customer base.

In a flicker of inspiration, we decided to plot the gradient of the percentage of chain purchases under the Lorenz curve of per-customer revenue, which incidentally, by the laws of calculus, tells us the exact amount spent on chained restaurants. More interestingly, it shows a clear threshold at the quantile we mentioned previously. Note that what we are seeing is that the top 40% of customers, who are responsible for that 80% of revenue, order all of their meals from chained restaurants.

This is too big a finding not to consider studying the populations resulting from this rule in isolation. We can deduce that, in some way, shape or form, the top 40% of consumers are those whose routine revolves around a predictable demand for the products of ABCDEats affiliates, while the other group represents a much less homogeneous population.

## Cumulative Sales as function of Age

In [None]:
from matplotlib.ticker import FuncFormatter

# Two stacked plots: (1) cumulative sales with customers ordered by age, with
# guide lines at the 80/90/95% age percentiles, and (2) a histogram of customer age.

# Set Seaborn style to 'white'
sns.set(style="white")

# Sort the data by age
sorted_data = data.sort_values('cust_age')
ages = sorted_data['cust_age'].values  # Ages in ascending order (x-axis)
sales = sorted_data['total_amt'].values  # Sales of the same customers, in the same order
num_customers = len(sales)

# Calculate cumulative sales
cumulative_sales = sales.cumsum()  # Running total of sales from the youngest customer upwards

# Create a figure with two subplots (one for the cumulative sales curve and one for the customer distribution)
plt.figure(figsize=(10, 14))

# First subplot: Plotting the accumulation curve
plt.subplot(2, 1, 1)  # Two rows, one column, first plot
plt.plot(
    ages,  # Use ages for x-axis
    cumulative_sales,  # Cumulative sales for the y-axis
    color='Black',
    linestyle='-',
    linewidth=2,
    label='Accumulation Curve'
)

# Plotting the line of equality: a straight line from (youngest age, 0) to (oldest age, total sales)
plt.plot(
    [ages.min(), ages.max()],  # X values for the line of equality
    [0, cumulative_sales[-1]],  # Y values from 0 to total cumulative sales
    color='crimson',
    linestyle='--',
    label='Line of Equality'
)

# Total sales amount
total_sales = cumulative_sales[-1]

# Add horizontal guide lines at every 5% of the total sales
for i in range(1, 21):  # 1 to 20 for 5% increments
    plt.plot([ages.min(), ages.max()], [i * 0.05 * total_sales, i * 0.05 * total_sales], color='black', linestyle='--', linewidth=0.7, alpha=0.5)

# Add vertical lines at the 80%, 90%, and 95% age percentiles
percentiles = [0.80, 0.90, 0.95]
age_percentile_values = []  # List to store age values for percentiles
cumulative_sales_values = []  # List to store cumulative sales values for percentiles

for p in percentiles:
    age_at_percentile = np.percentile(ages, p * 100)  # Age at the p-th percentile of the age values
    age_percentile_values.append(age_at_percentile)  # Store the value

    # Cumulative sales reached after the youngest p-fraction of customers
    sales_at_percentile_index = int(num_customers * p)  # Number of customers in that fraction
    y_value = cumulative_sales[sales_at_percentile_index - 1]  # -1 converts the count to a 0-based position

    # Plot vertical line that ends at the intersection point
    plt.plot([age_at_percentile, age_at_percentile], [0, y_value], color='blue', linestyle='--', linewidth=1)

    # Plot horizontal line that ends at the intersection point
    plt.plot([ages.min(), age_at_percentile], [y_value, y_value], color='blue', linestyle='--', linewidth=1)

    # Store cumulative sales for this percentile
    cumulative_sales_values.append(y_value)

# Add the total sales (100th percentile) to the cumulative sales values
cumulative_sales_values.append(total_sales)

# Set tick labels for the respective age percentiles on the x-axis
xticks_labels = [
    f'Min: {int(ages.min())}',  # Add min age label
    f'80%: {int(age_percentile_values[0])}',
    f'90%: {int(age_percentile_values[1])}',
    f'95%: {int(age_percentile_values[2])}',
    f'Max: {int(ages.max())}'  # Add max age label
]

plt.xticks(
    [ages.min()] + list(age_percentile_values) + [ages.max()],  # Tick positions: min age, the three percentiles, max age
    xticks_labels,
    rotation=45, ha='right', fontsize=11, color='black'  # Rotate and set color for better visibility
)

# Format y-ticks for cumulative sales in Euros (thousands separator, no decimals)
def euro_formatter(x, pos):
    return f'€{int(x):,}'

# Place y-ticks at the 80/90/95/100% cumulative sales values with explicit labels
plt.yticks(cumulative_sales_values, [f'80%: €{int(cumulative_sales_values[0])}',
                                     f'90%: €{int(cumulative_sales_values[1])}',
                                     f'95%: €{int(cumulative_sales_values[2])}',
                                     f'100%: €{int(cumulative_sales_values[3])}'],
           fontsize=11, color='black')  # Set font size and color for y-ticks

# NOTE(review): installing a formatter after plt.yticks(..., labels) appears to
# replace the explicit "80%: ..." labels above with euro_formatter output -
# confirm which of the two label styles is intended.
plt.gca().yaxis.set_major_formatter(FuncFormatter(euro_formatter))

# Adjust the position and rotation of y-tick labels
for label in plt.gca().get_yticklabels():
    label.set_rotation(45)  # Rotate y-tick labels to 45 degrees
    label.set_y(label.get_position()[1] - 0.02)  # Pad down slightly

# Setting title and labels for the first plot
plt.title('Cumulative Sales as a Function of Age', weight='bold', pad=20, fontsize=20)
plt.xlabel('Age', fontsize=14, fontweight='bold', labelpad=-27)  # Bold x-label; negative pad pulls it towards the axis
plt.ylabel('Cumulative Sales', fontsize=14, fontweight='bold', labelpad=-35)  # Bold y-label; negative pad pulls it towards the axis
plt.ylim(0, total_sales)  # y-axis runs from 0 to exactly the total sales
plt.xlim(min(ages), max(ages))

plt.legend(loc='lower right')  # Set a specific location for the legend to bottom right

# Second subplot: Customer distribution by age
plt.subplot(2, 1, 2)  # Second plot (2 rows, 1 column, second plot)

# Plot histogram to show customer distribution by age
plt.hist(ages, bins=20, color='grey', alpha=0.7)  # Use a histogram to show the distribution

# Set title and labels for the second plot
plt.title('Customer Distribution by Age', weight='bold', fontsize=16)
plt.xlabel('Age', fontsize=14, fontweight='bold', labelpad=-5)  # Bold x-label with padding
plt.ylabel('Number of Customers', fontsize=14, fontweight='bold')

# Adjust layout to ensure the two plots fit nicely without overlap
plt.tight_layout()

# Show both plots
plt.show()


## Cumulative Sales as a Function of Customer Region

In [None]:
# Two stacked plots: (1) cumulative sales across regions ordered from the
# highest-selling region down, and (2) a bar chart of sales per region.

# Set Seaborn style to 'white'
sns.set(style="white")

# Ensure 'cust_region' is treated as an ordered categorical
data['cust_region'] = pd.Categorical(data['cust_region']).as_ordered()

# Group by customer region and sum up sales for each region
grouped_data = data.groupby('cust_region', observed=True)['total_amt'].sum().reset_index()

# Sort the grouped data by total sales (descending order for highest sales first)
sorted_data = grouped_data.sort_values('total_amt', ascending=False)

# Get mapped region labels
# NOTE(review): the labels come from get_mapping in whatever order it yields
# them, while region_sales is ordered by descending sales - verify that both
# orders agree, otherwise the points and bars are attached to the wrong region.
mapped_region_labels = list(get_mapping(_dict=_region_dict))  # Get region labels from mapping
region_sales = sorted_data['total_amt'].values  # Total sales for each region

# Calculate cumulative sales by region
cumulative_sales = region_sales.cumsum()  # Cumulative sales per region

# Ensure the plot starts at 0
cumulative_sales = np.insert(cumulative_sales, 0, 0)  # Insert 0 at the start of the cumulative sales

# Calculate cumulative percentages
total_sales = region_sales.sum()  # Total sales for percentage calculations
cumulative_percentages = (cumulative_sales / total_sales) * 100  # Cumulative percentage calculations

# Plotting the accumulation curve by region with dots
plt.figure(figsize=(10, 12))

# Create a scatter plot for cumulative sales
plt.subplot(2, 1, 1)  # First subplot for the scatter plot
plt.scatter(
    mapped_region_labels,  # Use region names for x-axis
    cumulative_sales[1:],  # Cumulative sales for the y-axis (exclude the first 0)
    color='Black',
    label='Cumulative Sales Points',
    s=100  # Size of the dots
)

# Optional: Add a line connecting the dots
plt.plot(
    mapped_region_labels,  # Use region names for x-axis
    cumulative_sales[1:],  # Cumulative sales for the y-axis (exclude the first 0)
    color='gray',  # Line color
    linestyle='--',  # Line style
    linewidth=1,  # Line width
    alpha=0.5  # Line transparency
)

# Add cumulative percentage annotations to the scatter plot
for i, cum_percent in enumerate(cumulative_percentages[1:]):  # Skip the first value (0)
    plt.annotate(f"{cum_percent:.1f}%",
                 (mapped_region_labels[i], cumulative_sales[i + 1]),  # i + 1 skips the leading 0
                 textcoords="offset points",
                 xytext=(0, -15),  # Move text down 15 points
                 ha='center',
                 fontsize=10)

# Add labels and title
plt.title('Cumulative Sales by Region', weight='bold', fontsize=18)
plt.xlabel('Region', fontsize=14)
plt.ylabel('Accumulated Sales (€)', fontsize=14)
plt.xticks(rotation=45)

# Format y-axis ticks as currency amounts
from matplotlib.ticker import FuncFormatter

def dollar_formatter(x, _):
    # The axis labels of this figure are in euros, so the ticks use the euro
    # sign as well. The function name is kept so that any other cell calling
    # it keeps working.
    return f'€{x:,.0f}'

plt.gca().yaxis.set_major_formatter(FuncFormatter(dollar_formatter))

# Ensure y-axis starts at 0
plt.ylim(bottom=0)

# Legend for the scatter plot
plt.legend(loc='lower right')

plt.tight_layout()  # Adjust layout to prevent overlap

# Create a bar plot beneath the scatter plot
ax2 = plt.subplot(2, 1, 2)  # Second subplot for the bar plot
ax2.bar(mapped_region_labels, region_sales, color='grey', alpha=0.6, label='Sales by Region')
ax2.set_ylabel('Sales (€)', fontsize=14)

# Add annotations for proportional sales on the bar plot
for i, sale in enumerate(region_sales):
    # Calculate the proportional sales as a percentage of total sales
    proportional_sales = (sale / total_sales) * 100
    ax2.annotate(f"{proportional_sales:.1f}%",
                 (mapped_region_labels[i], sale),
                 textcoords="offset points",
                 xytext=(0, 5),  # Move text up by 5 points
                 ha='center',
                 fontsize=10)

# Set the legend for the bar plot
ax2.legend(loc='upper right')

plt.title('Sales Distribution by Region', weight='bold', fontsize=18)  # Title for the bar plot
plt.xticks(rotation=45)

# Adjust the height of the bottom plot
plt.subplots_adjust(hspace=0.3)  # Increase space between plots if needed

plt.tight_layout()  # Adjust layout to ensure everything fits
plt.show()  # Show all plots


# Cross-Entropy Discretization Iterator

### Calculate

In [None]:
def _get_bins(dataframe, target, binner, random_state=None):
    """Discretize ``dataframe[target]`` with MDLP cut points supervised by ``binner``.

    An ``MDLP_Discretizer`` is fitted on a random 80% sample of the rows, using
    ``target`` as the single feature to discretize and ``binner`` as the label
    column. Its cut points are rounded to whole numbers, de-duplicated and
    wrapped between the fixed outer edges 0 and 9999. The resulting bin index
    is written to ``dataframe`` in a new column named ``f'{target}_bin_{binner}'``.

    Parameters:
        dataframe (DataFrame): Data to read from; it is modified in place.
        target (str): Name of the numeric column to discretize.
        binner (str): Name of the column passed to the discretizer as the label.
        random_state: Forwarded to ``train_test_split``. The default ``None``
            keeps the previous behaviour (a different sample, and therefore
            possibly different bins, on every call); pass an int for
            reproducible bins.

    Returns:
        tuple: ``(new_column_name, (target, bin_edges))``.

    Raises:
        Exception: Whatever the discretizer or ``pd.cut`` raises is propagated
            to the caller. An earlier version caught every error and retried
            the same deterministic computation in an endless loop, blocking
            on ``input()``.
    """
    # Fixed outer edges of the binning
    lowest_edge, highest_edge = 0, 9999

    # Feature matrix holds just the target column; labels come from the binner column
    X, y = dataframe[[target]].values, dataframe[[binner]].values
    numeric_features = np.arange(X.shape[1])  # This feature will be discretized

    # Fit on the training part only; the held-out part is not used here
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.20, random_state=random_state
    )

    # Initialize discretizer object and fit to training data
    discretizer = MDLP_Discretizer(features=numeric_features)
    discretizer.fit(X_train, y_train)

    # Round the cut points and drop duplicates. Cuts that land on or outside
    # the outer edges are discarded, because pd.cut rejects repeated or
    # non-increasing edges.
    rounded_cuts = np.round([float(x) for x in discretizer._cuts[0]])
    inner_edges = sorted(
        {float(cut) for cut in rounded_cuts if lowest_edge < cut < highest_edge}
    )
    _bins = [lowest_edge] + inner_edges + [highest_edge]

    new_column = f'{target}_bin_{binner}'
    dataframe[new_column] = pd.cut(
        dataframe[target],
        bins=_bins,
        labels=False,
        include_lowest=True,
        right=True,
    )

    return (new_column, (target, _bins))



In [None]:
def iter_bins(data, target_feature, feature_lists):
    """
    Run `_get_bins` once per feature and collect the binning specifications.

    Every feature found in the sub-lists of `feature_lists` is passed, in
    order, to `_get_bins` together with `data` and `target_feature`. A feature
    whose processing raises is reported with `print` and skipped.

    Parameters:
        data (DataFrame): The dataset to process.
        target_feature (str): The target feature for binning.
        feature_lists (list): A list of lists, where each sub-list contains features to process.

    Returns:
        dict: Maps each key returned by `_get_bins` to the value returned with it.
    """
    bin_one_feature = partial(_get_bins, data, target_feature)
    collected = {}

    # Walk the nested lists as one flat stream, preserving their order
    every_feature = (name for group in feature_lists for name in group)

    for name in every_feature:
        try:
            column, spec = bin_one_feature(name)
            collected[column] = spec
        except Exception as e:
            print(f"Error processing feature '{name}': {e}")

    return collected


def set_dicts(data, extende_r, extende_d):
    """
    Apply stored binning specifications to `data` and register the new columns.

    Parameters:
        data (DataFrame): Dataset that receives one new column per entry; modified in place.
        extende_r (dict): Maps a new column name to a pair whose first item is the
            source column name and whose second item is the list of bin edges.
        extende_d (list): List that is extended in place with the keys of `extende_r`.

    Returns:
        None: the result of `list.extend`.
    """
    for new_column in extende_r:
        spec = extende_r[new_column]
        source_column = spec[0]
        bin_edges = spec[1]
        data[new_column] = pd.cut(
            data[source_column],
            bins=bin_edges,
            labels=False,
            include_lowest=True,
            right=True,
        )

    # list.extend works in place and returns None, which is what gets returned
    outcome = extende_d.extend(extende_r.keys())
    return outcome

# Polynomial Correlator

In [None]:
def get_polynomial(data, metric_features, selected_target, degree=2):
    """
    Generates polynomial features from the given metric features, calculates their correlations
    with the target variable, and identifies the features with the highest absolute Pearson and
    Spearman correlations.

    Args:
    data (pd.DataFrame): The input DataFrame containing the features and target.
    metric_features (list): List of metric features/columns.
    selected_target (str): The target variable/column to correlate with.
    degree (int): The degree of the polynomial features to generate. Default is 2.

    Returns:
    tuple: Feature with the highest Pearson correlation and its value,
           Feature with the highest Spearman correlation and its value.
           Both values are absolute correlations.

    Raises:
    ValueError: If no feature is left after excluding the target, or if no
                polynomial feature has a defined (non-NaN) correlation.
    """

    # Drop any features that have the target in their name (this includes the
    # target column itself). The operands of the previous check were reversed:
    # it dropped features whose name was contained in the target's name.
    features = [feature for feature in metric_features if selected_target not in feature]
    if not features:
        raise ValueError(
            f"No features left to correlate with '{selected_target}' after excluding the target."
        )

    # Generate polynomial features. The bias column is left out on purpose: it
    # is constant, so its correlation is NaN, and because it was the first
    # entry, max() kept that NaN as the "best" result for every call.
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly = poly.fit_transform(data[features])

    # Create a DataFrame for polynomial features
    poly_features = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(features))

    # Plain array, so the comparison is positional and independent of data's index
    target_values = data[selected_target].to_numpy()

    # Calculate absolute correlations with the target variable
    correlations = {}
    for col in poly_features.columns:
        column_values = poly_features[col].to_numpy()
        pearson_corr, _ = pearsonr(column_values, target_values)
        spearman_corr, _ = spearmanr(column_values, target_values)
        correlations[col] = {
            'pearson': abs(pearson_corr),
            'spearman': abs(spearman_corr)
        }

    def _strongest(kind):
        # Pick the feature with the largest defined correlation of this kind;
        # NaN scores (e.g. from constant columns) are ignored because NaN
        # breaks the ordering max() relies on.
        defined = {
            name: scores[kind]
            for name, scores in correlations.items()
            if not np.isnan(scores[kind])
        }
        if not defined:
            raise ValueError(
                f"No polynomial feature has a defined {kind} correlation with '{selected_target}'."
            )
        best_name = max(defined, key=defined.get)
        return best_name, defined[best_name]

    max_pearson_feature, max_pearson_corr = _strongest('pearson')
    max_spearman_feature, max_spearman_corr = _strongest('spearman')

    return (max_pearson_feature, max_pearson_corr), (max_spearman_feature, max_spearman_corr)

