In [30]:
import datetime
import itertools
import numpy as np
import pandas as pd
import pickle as pkl

from classes import (
    Agent,
    Action,
    Customer,
    Environment,
    Instance,
    InstanceData,
    Order,
    #RLSimulation,
    Slot,
    WTPPredictor,
    WTPGPPredictor,
)
from classes.funcs import get_wtp
from typing import Any, Dict, List, Optional, Tuple, Union
from pandera.typing import Series
from functools import partial
from decimal import Decimal

from dice_cf_demo import initialize_dice, find_cf

In [31]:
DF = pd.DataFrame

def load_wtp_predictor()->WTPPredictor:
    wtp_predictor = WTPGPPredictor(
            model_path="models/gp/runFinalTests_Results_trust-ai-wtp-gp-7ls79_iter1.json"
    )
    wtp_predictor.load()
    return wtp_predictor

def get_representative_instance_data(
    instance_data: InstanceData,
    num_booking_days: int = 3,
    num_orders: int = 20,
    num_slots: int = 5,
    area_id: Union[int, None] = 140,
) -> InstanceData:
    """Downsizes the input instance to consider a representative instance with `num_booking_days`
    each with `num_orders` orders incoming per day that can book any of the same `num_slots`"""

    # Read each DataFrame in Instance
    order_df = instance_data.orders
    customer_df = instance_data.customers
    slot_df = instance_data.slots

    # Filter area_id
    if area_id is not None:
        order_df = order_df.loc[:, [area_id], :, :]

    # Filter num_booking_days
    if num_booking_days < len(order_df.orderdate.unique()):
        start_timestamp = order_df.orderinstant.min()
        start_timestamp = pd.DatetimeIndex([start_timestamp]).normalize()[
            0
        ]  # set time to 00:00:00
        end_timestamp = start_timestamp + datetime.timedelta(days=num_booking_days)

        order_df = order_df[
            lambda x: (x.orderinstant >= start_timestamp)
            & (x.orderinstant < end_timestamp)
        ]

    # Sample orders
    orders = order_df.reset_index()[
        "shippingnumber orderdate".split()
    ].drop_duplicates()

    def sample_orders(group:Tuple, num_orders: int)->np.ndarray:
        if len(group)>=num_orders:
            return (sample := group.sample(n=num_orders)['shippingnumber'].unique())
        
        return group['shippingnumber'].values

    order_list = (orders
        .groupby('orderdate')
        .apply(sample_orders, num_orders)
        .explode()
        .reset_index(drop=True)
    )

    # Filter DataFrames
    order_df = order_df.loc[order_list, :, :, :]
    customer_df = customer_df.loc[order_df.uqicustomerid.unique(), :]
    slot_list = (
        slot_df[
            lambda x: (x.is_open) & (x.index.get_level_values("areaid").isin([area_id]))
        ]
        .groupby("slot_time")
        .sum("occupation")
        .occupation.sort_values(ascending=False)
        .iloc[:num_slots]
        .index.values
    )
    slot_df = slot_df.loc[area_id, :, slot_list].sort_index()

    return InstanceData(order_df, customer_df, slot_df, instance_data.order_cp7s)


def update_instance_data_capacity(
    instance: InstanceData, capacity_type: str = "max"
) -> InstanceData:
    """
    Adjusts the capacity of the slots based on the
    slots offered and several orders coming at specific instants.

    - Rationale: Equally divide the no. of customers that see the slot by
    the minimum/mean/maximum no. of slots competing with the slot under analysis
    """

    # read order and slot information
    order_df = instance.orders
    slot_df = instance.slots

    # join relevant information to see how many customers see each slot
    df = order_df["orderinstant".split()].join(
        slot_df["capacity occupation opening_time cutoff".split()]
    )

    # assuming infinite capacity, determine which slots the customer can see
    df = df.assign(
        is_open=lambda x: np.where(
            (x.orderinstant <= x.cutoff) & (x.orderinstant >= x.opening_time), 1, 0
        )
    )[lambda x: x.is_open == 1].drop(columns="is_open")

    # compute the number of customers passing by each slot
    df_customers = df.groupby(df.reset_index("shippingnumber").index.names).agg(
        nr_passing_customers=("capacity", "count")
    )

    # compute the number of alternative slots each customer would see
    ## computing this value means we know how many competing slots there were at each point in time
    # (each point being the arrival of a customer)
    df_slots = df.groupby("shippingnumber").agg(
        nr_available_slots=("capacity", "count")
    )

    # statistical measures on the distribution of competing slots for each individual slot over time
    df_dist_alt_slots = (
        df.join(df_slots)
        .reset_index("shippingnumber", drop=True)
        .set_index("orderinstant", append=True)["nr_available_slots".split()]
        .groupby(slot_df.index.names)
        .agg(
            min=("nr_available_slots", "min"),
            mean=("nr_available_slots", "mean"),
            max=("nr_available_slots", "max"),
            stdev=("nr_available_slots", "std"),
        )
    )

    # compute the possible capacity measures
    df_dist_alt_slots = df_dist_alt_slots.join(df_customers).assign(
        max_capacity=lambda x: x.nr_passing_customers / x["min"],
        mean_capacity=lambda x: x.nr_passing_customers / x["mean"],
        min_capacity=lambda x: x.nr_passing_customers / x["max"],
    )
    df_dist_alt_slots = df_dist_alt_slots[f"{capacity_type}_capacity".split()].assign(
        capacity=lambda x: np.ceil(x[f"{capacity_type}_capacity"]).astype(int)
    )

    slot_df = slot_df.drop(columns="capacity").join(df_dist_alt_slots[["capacity"]])
    slot_df["capacity"] = slot_df["capacity"].fillna(
        0
    )  # for those slots that won't open in the simulation
    slot_df["capacity"] += slot_df["occupation"]

    return InstanceData(
        instance.orders,
        instance.customers,
        slot_df.copy(),
        instance.order_cp7s,
    )


def generate_realistic_instance(
    instance_data: InstanceData, abt_shape: str = "l"
) -> Instance:
    """
    Creates an instance based on sampled data.
    """

    slot_df = prepare_slot_df(instance_data.slots, instance_data.orders)

    _slot_time_limits = get_slot_time_limits(slot_df)
    abt_curves = get_abt_curves(abt_shape, slot_df, _slot_time_limits)
    slots = get_slot_series(slot_df, abt_curves)

    customers = get_customer_series(instance_data.customers)
    orders = get_order_series(instance_data.orders, slots)

    return Instance(orders, customers, slots, instance_data.order_cp7s)


def get_order_series(order_df: DF, slots: Series[Slot]) -> Series[Order]:
    order_dfs = {
        shippingnumber: table.droplevel("shippingnumber")
        for shippingnumber, table in order_df.groupby("shippingnumber")
    }

    return (
        order_df.reset_index()[
            "shippingnumber uqicustomerid orderinstant areaid total customer_lat customer_long".split()
        ]
        .drop_duplicates()
        .rename(
            columns={
                "shippingnumber": "order_id",
                "uqicustomerid": "customer_id",
                "orderinstant": "timestamp",
                "total": "basket_value",
            }
        )
        .assign(
            order_df=lambda x: (x.order_id.T).map(order_dfs),
            # An order can only see slots reserved for its area
            allowed_slots=lambda x: x.apply(
                lambda y: slots.loc[pd.Series(y.areaid).astype(int), :, :].index.values,
                axis=1,
            ).T,
            coordinates=lambda x: tuple(zip(x.customer_lat, x.customer_long)),
        )
        .drop(columns=["areaid", "customer_lat", "customer_long"])
        .assign(
            # wtp_predictor=lambda x: np.repeat(wtp_pred, x.shape[0]),
            # cts_predictor=lambda x: np.repeat(cts_pred, x.shape[0]),
            # panel_generator=lambda x: np.repeat(panel_generator, x.shape[0]),
            _order=lambda x: x.to_dict(orient="records"),
            order=lambda x: x._order.apply(lambda y: Order(**y)),
        )
        .set_index("order_id")
        .order
    )


def get_target(cutoff, current_time, xs, ys, granularity="h"):
    return np.around(
        -np.interp((cutoff - current_time) / np.timedelta64(1, granularity), xs, ys),
        decimals=3,
    )


def get_customer_series(customer_df: DF) -> Series[Customer]:
    customer_dfs = {
        uqicustomerid: table.droplevel("uqicustomerid")
        for uqicustomerid, table in customer_df.groupby("uqicustomerid")
    }

    return (
        pd.Series(customer_df.index.get_level_values("uqicustomerid").unique())
        .to_frame()
        .assign(
            customer_df=lambda x: (x.uqicustomerid.T).map(customer_dfs),
            _customer=lambda x: x.to_dict(orient="records"),
            customer=lambda x: x._customer.apply(lambda y: Customer(**y)),
        )
        .set_index("uqicustomerid")
        .customer
    )


def get_slot_series(slot_df: DF, abt_curves: DF) -> Series[Slot]:
    slots = slot_df.join(abt_curves)

    # abt curve interpolation function
    return slots.assign(
        abt=lambda x: x.apply(
            lambda y: partial(
                get_target,
                cutoff=y.cutoff,
                xs=y.abt_curve["time_to_cutoff"],
                ys=y.abt_curve["target_capacity"],
                granularity="m",
            ),
            axis=1,
        ),
        _slot=lambda x: x.to_dict(orient="records"),
        slot=lambda x: x._slot.apply(lambda y: Slot(**y)),
    )["slot"]


def get_abt_curves(abt_shape: str, slot_df: DF, slots: Series[Slot]) -> Series:
    # abt curves - set of time instants
    nr_points = 40
    abt_curves = {
        slot: np.linspace(
            slots.loc[slot].last_time_to_cutoff,
            slots.loc[slot].first_time_to_cutoff,
            nr_points,
            dtype=int,
        )
        for slot in slots.index
    }

    # add points to plot the curve
    abt_curves = (
        pd.DataFrame.from_dict(abt_curves, orient="index")
        .stack()
        .rename_axis("slot obs".split(), axis=0)
        .rename("time_to_cutoff")
        .sort_index()
    )
    abt_curves = pd.DataFrame(abt_curves).join(
        pd.DataFrame(
            slot_df["capacity occupation".split()].values,
            index=pd.Index(slot_df.index.values, name="slot"),
            columns="capacity occupation".split(),
        )
    )
    abt_curves = abt_curves.join(
        pd.DataFrame(
            slots["first_time_to_cutoff last_time_to_cutoff".split()].values,
            index=pd.Index(slot_df.index.values, name="slot"),
            columns="first_time_to_cutoff last_time_to_cutoff".split(),
        )
    )

    if abt_shape == "l":
        # Linear - l
        # The abt follows a linear function: f(x) = ax + b
        # The desired occupation when x = tf (last time open in the simulation) is 1, i.e., f(tf) = 1.
        abt_curves = abt_curves.assign(
            a_factor=0,
            b_factor=lambda x: ((x.occupation / x.capacity) - 1)
            / (x.first_time_to_cutoff - x.last_time_to_cutoff),
            c_factor=1,
        )
    elif abt_shape == "qd":
        # Quadratic facing downwards - qd
        # The abt curve follows a quadratic behavior: f(x) = ax2 + bx + c
        # Let's assume f(0) = 1, local optimum of the function occurs at the cutoff f'(0) = 0
        abt_curves = abt_curves.assign(
            a_factor=lambda x: (
                (x.occupation / x.capacity - 1)
                / ((x.first_time_to_cutoff - x.last_time_to_cutoff) ** 2)
            ),
            b_factor=0,
            c_factor=1,
        )
    elif abt_shape == "qu":
        # Quadratic facing upwards - qu
        # The abt curve follows a quadratic behavior: f(x) = ax2 + bx + c
        # Let's assume f(0) = 1 and the local optimum of the function occurs at the initial instant f'(ti) = Oi/C
        abt_curves = abt_curves.assign(
            # f'(ti) = Oi/C
            a_factor=lambda x: (x.occupation / x.capacity - 1)
            / (-((x.first_time_to_cutoff - x.last_time_to_cutoff) ** 2)),
            # f'(ti) = Oi/C
            b_factor=lambda x: (2 / (x.first_time_to_cutoff - x.last_time_to_cutoff))
            * (x.occupation / x.capacity - 1),
            # from f(0) = 1
            c_factor=1,
        )

    abt_curves["target_capacity"] = (
        abt_curves.reset_index("obs")
        .assign(
            target_capacity=lambda x: (
                x.a_factor * (x.time_to_cutoff.astype(int) - x.last_time_to_cutoff) ** 2
            )
            + (x.b_factor * (x.time_to_cutoff.astype(int) - x.last_time_to_cutoff))
            + x.c_factor
        )["target_capacity"]
        .values
    )

    abt_curves = (
        abt_curves["time_to_cutoff target_capacity".split()]
        .reset_index()
        .set_index("slot time_to_cutoff".split())
        .drop(columns="obs")
    )

    abt_curves = {
        slot: table.droplevel("slot")
        .reset_index()["time_to_cutoff target_capacity".split()]
        .to_dict(orient="list")
        for slot, table in abt_curves.groupby("slot")
    }
    abt_curves = {
        slot: {col_name: np.array(col_values) for col_name, col_values in col.items()}
        for slot, col in abt_curves.items()
    }

    # data to instantiate slot class
    abt_curves = (
        pd.Series(abt_curves).to_frame().sort_index().rename(columns={0: "abt_curve"})
    )
    abt_curves.index.set_names(list(slot_df.index.names), inplace=True)
    return abt_curves


def get_slot_time_limits(slot_df: DF) -> DF:
    # for slots already open, we will consider the initial instant to be the simulation start
    # for slots that remain open past the cutoff, we will consider the last instant to be the simulation end
    return (
        slot_df["opening_time simulation_start cutoff simulation_end".split()]
        .assign(
            first_time_to_cutoff=lambda x: (
                (x.cutoff - np.maximum(x.opening_time, x.simulation_start))
                / np.timedelta64(1, "m")
            ).astype(int),
            last_time_to_cutoff=lambda x: (
                (x.cutoff - np.minimum(x.cutoff, x.simulation_end))
                / np.timedelta64(1, "m")
            ).astype(int),
        )
        .drop(columns="simulation_start simulation_end".split())
    )


def prepare_slot_df(slot_df: DF, order_df: DF) -> DF:
    slot_df = slot_df.drop(columns="alt")

    # determine the start and end of the simulation
    slot_df["simulation_start"] = order_df.orderinstant.min()
    slot_df["simulation_end"] = order_df.orderinstant.max()
    return slot_df


def get_instance():
    with open('instances/instance_midyear.pkl', 'rb') as f:
        _inst_data = pkl.load(f)

    instance_data = InstanceData(
        _inst_data["order_df"],
        _inst_data["customer_df"],
        _inst_data["slot_df"],
        _inst_data["order_cp7s"],
    )

    # Get a toy instance
    instance_data = get_representative_instance_data(instance_data)

    # To adjust the slot capacities according to instance features
    instance_data = update_instance_data_capacity(instance_data, capacity_type="max")

    # Generate instance
    return generate_realistic_instance(instance_data, abt_shape="l")


def initialize_time_slot_centroids(instance: Instance) -> DF:
    # Initialize time slot centroids beforehand
    slot_centroids = instance.slots.apply(
        lambda x: instance.order_cp7s.loc[x.assigned_orders][
            ["customer_lat", "customer_long"]
        ]
        if x.assigned_orders
        else instance.order_cp7s.iloc[[0]][["store_lat", "store_long"]].rename(
            columns={"store_lat": "customer_lat", "store_long": "customer_long"}
        )
    )
    nr_nodes = instance.slots.apply(
        lambda x: instance.order_cp7s.loc[x.assigned_orders]
    ).apply(lambda x: x.count())[["store_lat"]]
    nr_nodes.columns = ["nr_nodes"]
    slot_centroids = slot_centroids.apply(lambda x: x.mean()).join(nr_nodes)
    slot_centroids.columns = list(
        map(lambda s: s.replace("customer", "centroid"), slot_centroids.columns.values)
    )
    slot_centroids["nr_nodes"] = slot_centroids["nr_nodes"].astype(int)

    return slot_centroids

In [32]:
# Load instance
instance = get_instance()

# Load willigness-to-pay model
wtp_predictor = load_wtp_predictor()


"""
    INPUTS: DEFINE CUSTOMER TO ANALYZE
"""
# Select an order to analyze
order_id = np.random.choice(instance.orders.index) # a random order
order: Order = instance.orders.loc[order_id]

# Get corresponding customer
customer: Customer = instance.customers.loc[order.customer_id]

# Get the slots that the customer is seeing
current_timestamp = order.timestamp
instance.slots.apply(lambda x: x.check_open(current_timestamp))
slots = instance.slots.loc[order.allowed_slots].loc[
    lambda x: x.apply(lambda y: y.is_open) == 1
]

# Filter order_df for available time slots
order.order_df = order.order_df.loc[slots.index]


In [33]:
# Check list of columns in instance
print(instance.orders.head())

order_id
63465965_001    timestamp=Timestamp('2017-03-27 20:23:45') cus...
63465998_001    timestamp=Timestamp('2017-03-27 20:43:56') cus...
63466055_001    timestamp=Timestamp('2017-03-27 21:12:54') cus...
63466090_001    timestamp=Timestamp('2017-03-27 21:27:54') cus...
63471258_001    timestamp=Timestamp('2017-03-27 22:33:02') cus...
Name: order, dtype: object


In [34]:
print(instance.customers.head())

uqicustomerid
4098823F-A73F-4A2A-9DF6-734C456C68D6    customer_df=                freq_slot_dow  pre...
AE1E1189-357C-4968-9AE4-F053CA493195    customer_df=                freq_slot_dow  pre...
30BE5FE6-77C6-4664-8387-690CF8E94127    customer_df=                freq_slot_dow  pre...
79538D38-220E-42CF-A50B-0ACC55C83055    customer_df=                freq_slot_dow  pre...
3EB313DC-EC4A-4D97-8CB6-DE74CA32FEB8    customer_df=                freq_slot_dow  pre...
Name: customer, dtype: object


In [35]:
print(instance.slots.head())

areaid  deliverydate  slot_time    
140     2017-03-28    14:00 - 16:30    capacity=27 opening_time=Timestamp('2017-03-21...
                      16:00 - 18:30    capacity=25 opening_time=Timestamp('2017-03-21...
                      18:30 - 20:30    capacity=42 opening_time=Timestamp('2017-03-21...
                      20:00 - 22:30    capacity=55 opening_time=Timestamp('2017-03-21...
        2017-03-29    12:00 - 14:30    capacity=9 opening_time=Timestamp('2017-03-22 ...
Name: slot, dtype: object


In [36]:
print(instance.order_cp7s.head())

                store_lat  store_long  customer_lat  customer_long
shippingnumber                                                    
63465790_001     38.76405   -9.175984     38.685747      -9.311800
63465807_001     38.76405   -9.175984     38.719918      -9.133701
63465811_001     38.76405   -9.175984     38.758992      -9.255120
63465804_001     38.76405   -9.175984     38.746839      -9.104824
63465809_001     38.76405   -9.175984     38.736926      -9.147807


In [37]:
"""
    DECISION VARIABLES
"""
# Customer features / slot features

## This unveils the features that impact the wtp
print(wtp_predictor.symb_expr)

# If you notice, only exact_selection_customer_perc and slot_start impact the probability
# the remaining features are a consequence of the price panel presented

(((x.exact_selection_customer_perc*x.max_cost)-aq(x.slot_start,-7645.436000))+((x.q1_cost*x.rank_cost)*(x.median_cost-x.slotcost)))


In [38]:
"""
    PANELS
"""
# Solution space hyperparameters
MIN_PRICE = 2
MAX_PRICE = 9
DELTA = 1
MAX_SOLUTION_SPACE = 1000

price_points = np.arange(MIN_PRICE, MAX_PRICE, DELTA)

slot_ids = slots.index
full_space_size = len(price_points) ** len(slot_ids)  # P x S

def full_action_space(price_points: np.ndarray, slot_index: Any) -> List[Action]:
        return [
            Action(
                panel=(
                    prices := pd.DataFrame(
                        np.array(panel),
                        index=slot_index,
                        columns="slotcost".split(),
                    )
                ),
                price_level=prices.slotcost.mean(),
                assymetry_level=prices.slotcost.skew(),
                nr_options=prices.slotcost.count(),
            )
            for panel in itertools.product(price_points, repeat=len(slot_index))
        ]

def sample_action_space(price_points: np.ndarray, slot_index: Any, max_solution_space:int) -> List[Action]:
    return [
        Action(
            panel=(
                prices := pd.DataFrame(
                    np.random.choice(a=price_points, size=len(slot_index)),
                    index=slot_index,
                    columns="slotcost".split(),
                )
            ),
            price_level=prices.slotcost.mean(),
            assymetry_level=prices.slotcost.skew(),
            nr_options=prices.slotcost.count(),
        )
        for _ in np.arange(max_solution_space)
    ]

In [39]:

action_space = pd.Series(
        full_action_space(price_points, slot_ids)
        if (full_space_size <= MAX_SOLUTION_SPACE)
        else sample_action_space(price_points, slot_ids, MAX_SOLUTION_SPACE)
    )

# You can evaluate one panel (at random)
action = action_space.loc[lambda x: np.random.choice(x.index)]
panel = np.array([action.panel.values.reshape(-1)])

# Display customer features
print("customer features ", customer.customer_df.head())
print("list of columns ", customer.customer_df.columns)
print("Shape of panel", panel.shape)


customer features                  freq_slot_dow  preferred_slot_dow  \
alt                                                 
3-18:30 - 20:30          0.15                   0   
4-20:00 - 22:30          0.43                   1   
5-20:00 - 22:30          0.13                   0   
6-16:00 - 18:30          0.00                   0   
6-10:00 - 12:30          0.00                   0   

                partial_selection_customer_perc exact_selection_customer_perc  \
alt                                                                             
3-18:30 - 20:30                          0.0000                        0.0000   
4-20:00 - 22:30                          0.1250                        0.0625   
5-20:00 - 22:30                          0.0000                        0.0000   
6-16:00 - 18:30                          0.0000                        0.0000   
6-10:00 - 12:30                          0.0000                        0.0000   

                 slot_start  slot_end  np

In [40]:
_, selection_probability, walkaway_probability = get_wtp(
            wtp_predictor=wtp_predictor,
            order=order,
            panels=panel,
            customer=customer,
            slots=slots,
        )

# Provides the selection_probability of each time slot option
#print("selection probability", selection_probability)
print("the shape of probability distribution ", selection_probability.shape)
print("the sum of selection probabilities ", sum(selection_probability[0]))
print("selected time slot ", slots.index[np.argmax(selection_probability)])
print("probability of max selection probability ", np.max(selection_probability))

# Provides the walkaway probability of each time slot option
print("walkaway proability ", walkaway_probability)


the shape of probability distribution  (1, 23)
the sum of selection probabilities  1.0
selected time slot  (140, datetime.date(2017, 4, 4), '14:00 - 16:30')
probability of max selection probability  0.14766840586545774
walkaway proability  [0.01605898]


In [41]:
customer.customer_df.iloc[0]

freq_slot_dow                                     0.15
preferred_slot_dow                                   0
partial_selection_customer_perc                 0.0000
exact_selection_customer_perc                   0.0000
slot_start                                        4710
slot_end                                          4710
npurchases                                          16
orderdate                                   2017-03-28
deliverydate                       2017-03-29 00:00:00
expanding_freq_weekend_delivery                   0.18
previous_weekend_delivery                            0
expanding_avg_days_to_delivery                    1.75
freq_sun_selections                               0.18
freq_mon_selections                               0.00
freq_tue_selections                               0.12
freq_wed_selections                               0.12
freq_thu_selections                               0.43
freq_fri_selections                               0.12
freq_sat_s

In [42]:
# You can also evaluate several panels at once, you just need to ensure that you
# provide a matrix nr_panels x nr_available_slots
panels = action_space.apply(lambda x: x.panel.slotcost).values
print("how many panels ", panels.shape)

_, selection_probability, walkaway_probability = get_wtp(
            wtp_predictor=wtp_predictor,
            order=order,
            panels=panels,
            customer=customer,
            slots=slots,
        )

# Each row contains the selection probabily distribution of each panel
print("For multiple panels ", selection_probability)
print("probabilities shape ", selection_probability.shape)

# Find selected time slot for every panel
selected_slots = np.argmax(selection_probability, axis=1)
print("selected time slot for each panel ", selected_slots)
# selected slot with datetime
print("selected time slot with datetime ", slots.index[selected_slots])

# Each element corresponds to the walkaway probability of each panel
print(walkaway_probability)

how many panels  (1000, 23)
For multiple panels  [[9.72201121e-02 9.64901298e-02 8.45888929e-06 ... 9.50587525e-06
  1.17824132e-01 1.17142824e-01]
 [1.17782621e-01 7.62498950e-02 1.25196660e-01 ... 9.64867072e-06
  1.40723382e-01 9.64867072e-06]
 [9.78181771e-02 4.07576176e-05 2.08157166e-03 ... 1.22543170e-05
  1.72728141e-01 1.26503394e-01]
 ...
 [1.21269753e-01 1.46301543e-02 8.70284079e-04 ... 1.37001908e-04
  1.45920185e-01 1.39043172e-03]
 [1.06929430e-01 1.06432219e-01 1.05931854e-01 ... 6.88864289e-06
  1.51793788e-02 7.11126296e-02]
 [1.15708774e-01 6.60287783e-03 1.10671256e-01 ... 8.23731929e-06
  1.03473072e-02 1.01952217e-02]]
probabilities shape  (1000, 23)
selected time slot for each panel  [21 21 21 21 21  0 21  9 21  2 22  3 21 22  1  0  6  3  3 22  2 21  0 22
  7  3 21  1 21 22 22 21 22 22  2  2  0  0  2 22  1 22 21 21 21  1  0  0
 21 21  1 22  0 22 22  2  3  0 22 22 22  2 21  6  4 22 21 22 21 21  0 22
 21 21  0 21 22  3 22  0 21 22 21 22 22 22 21  9 22 21  3 22  7  

In [43]:
"""
    EXACT_SELECTION_CUSTOMER_PERC AND SLOT_START
"""

# to change this features, you need to change the order_df or customer_df themselves

# slot_start - evaluate the impact of antecipating 1 hour slot 12:00 - 14:30 of day 2017-03-30
order.order_df.loc[(140, datetime.date(2017, 3, 30), '12:00 - 14:30'),'slot_start'] -= 60

# exact_selection_customer_perc - assume customer chose slot  16:00 - 18:30 of the 6 days ahead in 5% more of the cases
customer.customer_df.loc[('6-16:00 - 18:30'), 'exact_selection_customer_perc'] += Decimal(0.05)

# and then you would proceed to see how the selection probabilities changed

In [44]:
class GPModelPipeline:
    def __init__(self, X_scaler, y_scaler, estimator):
        self.X_scaler = X_scaler
        self.y_scaler = y_scaler
        self.estimator = estimator

    def fit(self, X, y):
        X_scaled = self.X_scaler.fit_transform(X)
        y_scaled = self.y_scaler.fit_transform(y[:, np.newaxis])
        self.estimator.fit(X_scaled, y_scaled)

    def predict(self, X):
        X_scaled = self.X_scaler.transform(X)
        y_scaled = self.estimator.predict(X_scaled)
        y = self.y_scaler.inverse_transform(y_scaled[:, np.newaxis])
        return y.squeeze()

In [45]:
from classes.predictors.wtp_predictors import _build_slot_vars_df
from classes.predictors.wtp_predictors import sigmoid, aq, plog
import json
import math

class RetailPipeline:
    def __init__(self, wtp_predictor: WTPPredictor, order_df: Order, panels: np.ndarray, customer_df: Customer, slots: Series[Slot],
                 model_path="models/gp/runFinalTests_Results_trust-ai-wtp-gp-7ls79_iter1.json"):
        self.wtp_predictor = wtp_predictor
        self.order_df = order_df
        self.panels = panels
        self.customer_df = customer_df
        self.slots = slots
        self.model_path = model_path

        with open(self.model_path) as f:
            model_info = json.load(f)

        self.symb_expr = model_info["model"]
        self.predictors = model_info["gp_predictors"]
        self.walkaway_expr = model_info["walkaway_expr"]

        conv_dict = {
            f"x{i}": "x." + self.predictors[i] for i in range(len(self.predictors))
        }
        for key in range(len(self.predictors) - 1, -1, -1):
            self.symb_expr = self.symb_expr.replace(f"x{key}", conv_dict[f"x{key}"])
            

    def create_model_input(self):
        # GP needs both customer and order data
        df = self.order_df.set_index(self.customer_df.index.names, append=True).join(
            self.customer_df[
                np.array(set(self.customer_df.columns).difference(set(self.order_df.columns)))
            ]
        )

        # Multiply information for each panel and calculate vars
        slot_vars_df = (
            _build_slot_vars_df(panels, slots)
            .drop(columns="avg std".split())
            .rename(
                columns={
                    "min": "min_cost",
                    "q1": "q1_cost",
                    "median": "median_cost",
                    "q3": "q3_cost",
                    "max": "max_cost",
                    "iqr": "iqr_cost",
                    "cv": "cv_cost",
                    "rank": "rank_cost",
                }
            )
        )

        # Compile information
        self.input = slot_vars_df.join(df.drop(columns=slot_vars_df.columns))
        return self.input

    def predict_walkaway(self, selection_probability: np.ndarray) -> np.ndarray:
        '''EXISTED BEFORE: Walkaway probability which called inside predict'''
        return eval(
            self.walkaway_expr,
            globals(),
            {"selection_probability": selection_probability.max(axis=1)},
        )

    def predict_walkaway_df(self, input_panel) -> np.ndarray:
        '''ADDED: Take input panel and return walkaway probability #TODO: Decide at which step vectorization is happening '''
        selection_prob = (
            input_panel[self.predictors]
            .astype(float)
            .assign(
                predict=lambda x: eval(
                    self.symb_expr,
                    None,
                    {"x": x, "math.e": math.e, "aq": aq, "exp": np.exp, "plog": plog},
                )
            )["predict"]
            .values
        )

        selection_prob = np.vectorize(sigmoid)(selection_prob)
        selection_prob = selection_prob.reshape(
            -1, self.panels.shape[-1], self.panels.shape[0]
        ).transpose(0, 2, 1)[0]
        walkaway_probability = self.predict_walkaway(selection_prob)

        return walkaway_probability

    def predict(self, input_slot):
        '''ADDED: Predict individual time slot selection probability'''
        input_slot_float = input_slot[self.predictors].astype(float)
        selection_prob = eval(self.symb_expr, 
                              None, 
                              {"x": input_slot_float, "math.e": math.e, "aq": aq, "exp": np.exp, "plog": plog})
        selection_prob = np.vectorize(sigmoid)(selection_prob)
        return selection_prob

    def predict_old(self, input_panel) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        ''' Original prediction output format '''
        print("Input dataframe shape ", input_panel.shape)
        print("Input dataframe used in predictor shape ", input_panel[self.predictors].shape)
        selection_prob = (
            input_panel[self.predictors]
            .astype(float)
            .assign(
                predict=lambda x: eval(
                    self.symb_expr,
                    None,
                    {"x": x, "math.e": math.e, "aq": aq, "exp": np.exp, "plog": plog},
                )
            )["predict"]
            .values
        )
        print("selection prob shapes ", selection_prob.shape)
        ## Apply logistic regression transformation
        selection_prob = np.vectorize(sigmoid)(selection_prob)
        print("selection prob vectorized shapes ", selection_prob.shape)
        selection_prob = selection_prob.reshape(
            -1, self.panels.shape[-1], self.panels.shape[0]
        ).transpose(0, 2, 1)
        print("selection prob reshaped shapes ", selection_prob.shape)
        selection_prob = selection_prob[0]
        print("selection prob reshaped shapes ", selection_prob.shape)

        walkaway_probability = self.predict_walkaway(selection_prob)

        return (
            selection_prob,
            selection_prob / (selection_prob.sum(axis=1))[:, None],
            walkaway_probability,
        )


In [46]:
# input, let's take one random panel
panel = np.array([action.panel.values.reshape(-1)])

In [47]:
retail_model = RetailPipeline(wtp_predictor, order.order_df, panel, customer.customer_df, slots)

In [55]:
panel

array([[7, 4, 4, 7, 7, 8, 3, 5, 3, 5, 5, 7, 5, 6, 3, 5, 7, 8, 8, 3, 2, 4,
        7]])

In [48]:
retail_model.predictors

['slotcost',
 'slot_start',
 'exact_selection_customer_perc',
 'rank_cost',
 'median_cost',
 'partial_selection_customer_perc',
 'expanding_avg_days_to_delivery',
 'days_since_first_purchase',
 'q1_cost',
 'max_cost',
 'min_cost',
 'slot_width']

In [49]:
input_df = retail_model.create_model_input()
input_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,min_cost,q1_cost,median_cost,q3_cost,max_cost,iqr_cost,cv_cost,slotcost,rank_cost,storepostalcode,...,freq_sun_selections,freq_wed_selections,expanding_avg_slot_price,freq_thu_selections,partial_selection_customer_perc,freq_mon_selections,exact_selection_customer_perc,expanding_max_slot_price,deliverydate,first_online_purchase
areaid,deliverydate,slot_time,panel,alt,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
140,2017-03-29,12:00 - 14:30,0,3-12:00 - 14:30,2.0,3.0,4.0,6.5,8.0,3.5,0.455951,2.0,0.108696,1600528,...,0.18,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0
140,2017-03-29,12:00 - 14:30,1,3-12:00 - 14:30,2.0,4.0,5.0,8.0,8.0,4.0,0.348454,2.0,0.065217,1600528,...,0.18,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0
140,2017-03-29,12:00 - 14:30,2,3-12:00 - 14:30,2.0,3.5,4.0,7.0,8.0,3.5,0.38849,4.0,0.413043,1600528,...,0.18,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0
140,2017-03-29,12:00 - 14:30,3,3-12:00 - 14:30,2.0,4.0,6.0,7.0,8.0,3.0,0.333631,3.0,0.152174,1600528,...,0.18,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0
140,2017-03-29,12:00 - 14:30,4,3-12:00 - 14:30,2.0,3.0,4.0,5.0,8.0,2.0,0.356371,2.0,0.086957,1600528,...,0.18,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0


In [21]:
retail_model.predict_individual(slot_interest)

AttributeError: 'RetailPipeline' object has no attribute 'predict_individual'

In [50]:
input_df["Output"] = input_df.apply(lambda x: retail_model.predict(x), axis=1)

In [51]:
input_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,min_cost,q1_cost,median_cost,q3_cost,max_cost,iqr_cost,cv_cost,slotcost,rank_cost,storepostalcode,...,freq_wed_selections,expanding_avg_slot_price,freq_thu_selections,partial_selection_customer_perc,freq_mon_selections,exact_selection_customer_perc,expanding_max_slot_price,deliverydate,first_online_purchase,Output
areaid,deliverydate,slot_time,panel,alt,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
140,2017-03-29,12:00 - 14:30,0,3-12:00 - 14:30,2.0,3.0,4.0,6.5,8.0,3.5,0.455951,2.0,0.108696,1600528,...,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0,0.5217689621449022
140,2017-03-29,12:00 - 14:30,1,3-12:00 - 14:30,2.0,4.0,5.0,8.0,8.0,4.0,0.348454,2.0,0.065217,1600528,...,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0,0.554177888125391
140,2017-03-29,12:00 - 14:30,2,3-12:00 - 14:30,2.0,3.5,4.0,7.0,8.0,3.5,0.38849,4.0,0.413043,1600528,...,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0,0.3623814156409999
140,2017-03-29,12:00 - 14:30,3,3-12:00 - 14:30,2.0,4.0,6.0,7.0,8.0,3.0,0.333631,3.0,0.152174,1600528,...,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0,0.7792057696731906
140,2017-03-29,12:00 - 14:30,4,3-12:00 - 14:30,2.0,3.0,4.0,5.0,8.0,2.0,0.356371,2.0,0.086957,1600528,...,0.12,6.516919,0.43,0.0625,0.0,0.0,7.55,2017-03-29 00:00:00,0,0.4891757283238321


In [52]:
input_df['Output'] = input_df['Output'].astype(float)
input_max_value_index = input_df['Output'].idxmax()
print(input_df.loc[input_max_value_index])

min_cost                                         2.0
q1_cost                                          4.0
median_cost                                      7.0
q3_cost                                          8.0
max_cost                                         8.0
                                        ...         
exact_selection_customer_perc                      0
expanding_max_slot_price                        7.55
deliverydate                     2017-04-04 00:00:00
first_online_purchase                              0
Output                                      0.963786
Name: (140, 2017-04-04, 14:00 - 16:30, 693, 2-14:00 - 16:30), Length: 81, dtype: object


In [54]:
list(input_df.columns)

['min_cost',
 'q1_cost',
 'median_cost',
 'q3_cost',
 'max_cost',
 'iqr_cost',
 'cv_cost',
 'slotcost',
 'rank_cost',
 'storepostalcode',
 'uqicustomerid',
 'orderdate',
 'order_dow',
 'discountpercent',
 'administrativapercent',
 'alimentarpercent',
 'bazarpercent',
 'casapercent',
 'electronicspercent',
 'foodbakerypercent',
 'frescospercent',
 'textilpercent',
 'wellspercent',
 'nsku',
 'total',
 'total_requestedqty',
 'orderperiod',
 'slot_dow',
 'slot_start',
 'slot_end',
 'slot_start_w',
 'slot_end_w',
 'slot_width',
 'orderday_type_group',
 'slotchosen',
 'weekend_delivery',
 'npurchases',
 'previous_requested_amount',
 'expading_sucessful_picking_amount',
 'expading_substitution_picking_amount',
 'expanding_requested_amount',
 'expanding_sucessful_picking',
 'expading_substitution_picking',
 'previous_sucessful_picking_amount',
 'previous_substitution_picking_amount',
 'total_purchases',
 'days_since_last_order',
 'days_since_first_purchase',
 'days_between_purchase',
 'total_l

In [53]:
print(input_df.dtypes)

min_cost                         float64
q1_cost                          float64
median_cost                      float64
q3_cost                          float64
max_cost                         float64
                                  ...   
exact_selection_customer_perc     object
expanding_max_slot_price         float64
deliverydate                      object
first_online_purchase              Int32
Output                           float64
Length: 81, dtype: object


In [24]:
retail_input_predictor = input_df[retail_model.predictors]
retail_input_predictor["Output"] = input_df["Output"]
retail_input_predictor = retail_input_predictor.astype(float)
list_cont_features = list(retail_input_predictor.drop(columns="Output").columns)
list_cont_features

KeyError: 'Output'

In [None]:
import dice_ml

# make dice data object
d = dice_ml.Data(dataframe=retail_input_predictor, continuous_features=list_cont_features, outcome_name='Output')

# provide the trained ML model to dice's model object
backend = 'sklearn'
m = dice_ml.Model(model=retail_model, backend=backend, model_type='regressor')

# initiate dice
exp_random = dice_ml.Dice(d, m, method="random")

In [27]:
X_test.head()

NameError: name 'X_test' is not defined

In [25]:
X_test = retail_input_predictor.drop(columns="Output")
query_instances = X_test[4:5]
query_instances

KeyError: "['Output'] not found in axis"

In [None]:
dice_exp_random = exp_random.generate_counterfactuals(query_instances,
                                                      total_CFs=2,
                                                      desired_range=[0,1],
                                                               )
dice_exp_random.visualize_as_dataframe(show_only_changes=True)

100%|██████████| 1/1 [00:00<00:00,  7.43it/s]

Query instance (original outcome : 0)





Unnamed: 0,slotcost,slot_start,exact_selection_customer_perc,rank_cost,median_cost,partial_selection_customer_perc,expanding_avg_days_to_delivery,days_since_first_purchase,q1_cost,max_cost,min_cost,slot_width,Output
0,6.0,4440.0,0.0384,0.68,5.0,0.0384,3.15,176.0,4.0,8.0,2.0,150.0,0.0



Diverse Counterfactual set (new outcome: [0, 1])


Unnamed: 0,slotcost,slot_start,exact_selection_customer_perc,rank_cost,median_cost,partial_selection_customer_perc,expanding_avg_days_to_delivery,days_since_first_purchase,q1_cost,max_cost,min_cost,slot_width,Output
0,-,-,0.0384,0.68,-,0.0384,3.15,-,-,-,-,132.0,0.0477186329662799
1,-,-,0.0384,0.3,-,0.1,3.15,-,-,-,-,-,0.1864053606986999


In [None]:
retail_model.predict(query_instances)

array([0.04771863])

In [26]:
retail_model.predict_walkaway(input_df)

  {"selection_probability": selection_probability.max(axis=1)},


areaid  deliverydate  slot_time      panel  alt            
140     2017-03-28    14:00 - 16:30  0      2-14:00 - 16:30    0.0
                                     1      2-14:00 - 16:30    0.0
                                     2      2-14:00 - 16:30    0.0
                                     3      2-14:00 - 16:30    0.0
                                     4      2-14:00 - 16:30    0.0
                                                              ... 
        2017-04-02    12:00 - 14:30  995    0-12:00 - 14:30    0.0
                                     996    0-12:00 - 14:30    0.0
                                     997    0-12:00 - 14:30    0.0
                                     998    0-12:00 - 14:30    0.0
                                     999    0-12:00 - 14:30    0.0
Length: 25000, dtype: float64