In [1]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [2]:
import ast
import json
import glob
import logging
import pandas as pd
import awswrangler as wr
import matplotlib.pyplot as plt
import seaborn as sns
from general_functions.datetime_helper import transform_date_to_timestamp_milliseconds
from general_functions.return_account_ids import return_account_ids
from general_functions.constants import return_api_url
from general_functions.call_api_with_account_id import call_api_with_accountId, send_to_innkeepr_api_paginated

In [3]:
# Run configuration: which customer, time window, audience and goal are analysed.
customer = "More"
# Start and end of the analysed window; both days are included by pd.date_range further down.
from_date = '2026-01-01'
to_date = '2026-01-14'
audience = "68e246b9120f694f533b9fc1" #Innkeepr - Clear Whey - Existing Purchaser - Seed for history, because no conversion lag or exclusion
# Conversion names sent as the "name" filter of the conversions query.
goal_name = ["checkout_completed"]#,"signed_up"] #"checkout_completed"
# Directory prefix under which the history and conversion CSV caches are stored.
path_save = "SprintStories/EN-2944-bucket-size-rule/"
# Category label -> product names belonging to it.
# Presets for other customers/categories are kept below, commented out.
product_categories = {
    # Rosental
    # "Slow Aging": ["Slow Aging"],
    # ESN
    # "Bundle": ["Bundles"],
    # "Brand": ["Brand"],
    # "Designer Whey": ["Designer Whey"],
    # "Isoclear": ["Iso Clear"]
    # More
    # "Matcha":["More Protein Iced Matcha Latte", "Pimp your Iced Matcha Bundle"],
    # "Iced Coffee":['Protein Iced Coffee', 'Pimp your Iced Coffee Bundle'] ,
    # "Zerup": ['Zerup - Zero Sirup', 'Barista Zerup', 'Pick and Mix Zerup Bundle'],
    "Bars": ['More Protein Bar', 'More Vegan Protein Bar']
}
# Resolve the API base URL and the account id that belongs to the configured customer.
url = return_api_url()
print(f"url = {url}")
matching_account_ids = [acc["id"] for acc in return_account_ids() if acc["name"] == customer]
if not matching_account_ids:
    # Fail with an explicit message instead of an opaque IndexError on the [0] lookup.
    raise ValueError(f"No account found for customer '{customer}'")
# If several accounts share the name, the first one is used.
account_id = matching_account_ids[0]

# Load targeting history

In [4]:
from os import path


# Load the targeting history for the audience: from the local CSV cache when it
# exists, otherwise from the daily parquet files on S3 (and then cache it).
path_history = f"{path_save}history_{customer}_{from_date}_{to_date}.csv"
try:
    history = pd.read_csv(path_history)
except FileNotFoundError:
    print("load history")
    daily_histories = []
    # Walk the window newest day first so that drop_duplicates (keep="first")
    # retains the most recent row of every session. The previous index
    # expression `-idate+1` raised IndexError for a one-day window and visited
    # the first two days before the newest ones.
    for day in pd.date_range(from_date, to_date)[::-1]:
        date = day.strftime("%Y%m%d")
        path_to_s3_history = f"s3://{account_id}/targeting.history/{date}/{audience}.parquet"
        try:
            daily_histories.append(wr.s3.read_parquet(path_to_s3_history))
        except wr.exceptions.NoFilesFound:
            # Days without a history file are skipped; the account id is masked in the log.
            print(f"no history for {path_to_s3_history.replace(account_id,'*')}")
    if not daily_histories:
        raise AttributeError("Empty history")
    # Concatenate once instead of growing the frame inside the loop.
    history = pd.concat(daily_histories).drop_duplicates(subset=["session"])
    if history.shape[0] > 0:
        # index=False: otherwise the cache gains an "Unnamed: 0" column when read back.
        history.to_csv(path_history, index=False)
    else:
        raise AttributeError("Empty history")
history

In [5]:
# Refresh the local cache. index=False keeps the row index out of the file, so
# reading the cache back does not add another "Unnamed: 0" column on every run.
history.to_csv(path_history, index=False)

In [6]:
# Summary statistics of the conversion probability per treatment group.
history.groupby("treatment")["conv_prob"].describe()

In [7]:
# Number of history rows that have no conversion probability.
history["conv_prob"].isnull().sum()

# Load Conversions

In [8]:
# Load conversions: reuse every cached CSV of this customer, then query the API
# only for the days of the window that are not present in the cache yet.
list_dates = pd.date_range(from_date, to_date).strftime("%Y-%m-%d")
path_conversions = f"{path_save}conversions_{customer}_{from_date}_{to_date}.csv"
print("load conversions")
conversion_files = glob.glob(f"{path_save}conversions_{customer}*.csv")
conversions = pd.DataFrame(columns=["created"])
if len(conversion_files) > 0:
    print(f"load {len(conversion_files)} conversion files")
    for file in conversion_files:
        print(file)
        cached = pd.read_csv(file)
        # Drop index columns left behind by earlier to_csv calls; their values
        # differ between files and would stop drop_duplicates from matching
        # otherwise identical rows.
        cached = cached.loc[:, ~cached.columns.str.startswith("Unnamed:")]
        conversions = pd.concat([conversions, cached])
    conversions = conversions.drop_duplicates()
conversions["date"] = pd.to_datetime(conversions["created"]).dt.date
conversions["date"] = pd.to_datetime(conversions["date"]).dt.strftime("%Y-%m-%d")

fetched_frames = []
for date in list_dates:
    check_existing_date = (conversions["date"] == date).sum()
    print(check_existing_date, date)
    if check_existing_date > 0:
        print(f"Date already exists: {date}")
        continue
    # Every day is queried up to the following day. Previously the last day of
    # the window used end_date == date, i.e. a zero-width interval.
    # NOTE(review): assumes transform_date_to_timestamp_milliseconds returns the
    # start of the given day - confirm.
    end_date = (pd.Timestamp(date) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
    print(f"Query conversions from {date} to {end_date}")
    content = {
        "created": {
            "$gte": transform_date_to_timestamp_milliseconds(date),
            "$lte": transform_date_to_timestamp_milliseconds(end_date),
        },
        "name": goal_name,
    }
    temp = send_to_innkeepr_api_paginated(
        f"{url}/conversions/query",
        account_id,
        content,
        logging
    )
    fetched_frames.append(pd.json_normalize(temp))
# Concatenate once instead of growing the frame inside the loop. Freshly
# fetched rows carry no "date" value, so deferring this does not change the
# per-day cache check above.
conversions = pd.concat([conversions, *fetched_frames])
# index=False keeps the row index out of the cache file.
conversions.to_csv(path_conversions, index=False)

# Preprocess Products

In [9]:
def _extract_product_names(products):
    """Return the product names contained in one `properties.products` value.

    Rows read from the CSV cache hold the product list as a string, whereas rows
    fetched from the API in this session already hold a Python list; both are
    accepted. Missing values (NaN/None) yield an empty list. A string that is
    not a valid Python literal still raises, as before.
    """
    if isinstance(products, str):
        products = ast.literal_eval(products)
    if not isinstance(products, (list, tuple)):
        return []
    return [entry["name"] for entry in products if isinstance(entry, dict) and "name" in entry]


# One row per purchased product: collect the names, then explode the list column.
# NOTE: re-running this cell explodes the already exploded frame again.
conversions["properties.products.name"] = conversions["properties.products"].apply(_extract_product_names)
conversions = conversions.explode("properties.products.name")
conversions[["created","sessionId","properties.products","properties.products.name"]]

## Get Products by Dictionary Keys

## Analyze Products

In [10]:
# Keep only the columns needed for the product analysis.
use_columns = ["created","sessionId","anonymousId","properties.products.name","properties.revenue"]
# .copy() makes the subset an independent frame, so the assignment below does not
# write into a slice of `conversions` (SettingWithCopyWarning / chained assignment).
conversions_filtered = conversions[use_columns].copy()
# Product names become plain strings; missing names turn into the string "nan".
conversions_filtered["properties.products.name"] = conversions_filtered["properties.products.name"].astype(str)

In [11]:
def _categorize_product(product_name):
    """Return the first category of `product_categories` matching `product_name`.

    A category matches when its label or any of its listed product names occurs
    in the product name. Previously only the label was checked, so the product
    lists were ignored (e.g. "Bars" never matched "More Protein Bar").
    Products without a match are labelled "other".
    """
    for category, product_names in product_categories.items():
        if category in product_name or any(name in product_name for name in product_names):
            return category
    return "other"


conversions_filtered = conversions_filtered.assign(
    product_category=conversions_filtered["properties.products.name"].apply(_categorize_product)
)
# Attach the predicted conversion probability of the converting session.
# Columns from an earlier run of this cell are dropped first, so re-running it
# does not produce conv_prob_x / conv_prob_y duplicates.
conversions_filtered = pd.merge(
    conversions_filtered.drop(columns=["session", "conv_prob"], errors="ignore"),
    history[["session", "conv_prob"]],
    left_on="sessionId",
    right_on="session",
    how="left"
)
conversions_filtered["product_category"].value_counts()

## Stats: Conversion Probability

In [12]:
# Summary statistics of the conversion probability per product category.
conversions_filtered.groupby("product_category")["conv_prob"].describe()

In [13]:
# Distribution of the conversion probability per product category.
ax = sns.boxenplot(data=conversions_filtered, x="product_category", y="conv_prob")
ax.grid(True)

In [14]:
# Bar chart of the conversion probability per product category.
ax = sns.barplot(data=conversions_filtered, x="product_category", y="conv_prob")
ax.grid(True)
ax.set_title("Average conversion probability per product category")

## Test Suggestion of Claude

In [15]:
# Make sure conv_prob is a float column before distribution statistics are computed on it.
conversions_filtered["conv_prob"] = conversions_filtered["conv_prob"].astype(float)

In [16]:
import numpy as np
from scipy import stats

def adjust_thresholds_by_distribution(probs, base_percentiles=(90, 80, 70)):
    """
    Adjust percentile thresholds based on probability distribution characteristics.

    - Skewness shifts all three tiers by the same amount: skewness * 2,
      limited to ±5 percentile points.
    - Variance widens the gaps between tiers: variance * 10, limited to 0..5
      points, is subtracted once from growth and twice from volume.

    Missing values in `probs` are ignored for both statistics. When a statistic
    cannot be computed (no valid values, or NaN skewness as for constant data)
    its adjustment is 0, so the base percentiles are returned unchanged.

    Parameters
    ----------
    probs : array-like of float
        Conversion probabilities; may contain NaN/None.
    base_percentiles : tuple of 3 numbers
        Base percentiles for (premium, growth, volume).

    Returns
    -------
    dict
        Keys 'premium', 'growth', 'volume' with the adjusted percentiles as
        floats, each limited to the valid percentile range 0..100.
    """
    values = np.asarray(probs, dtype=float)
    # Drop missing values once so skewness and variance use the same sample.
    values = values[~np.isnan(values)]
    if values.size:
        skewness = float(stats.skew(values))
        variance = float(np.var(values))
    else:
        skewness = variance = float("nan")

    # Base percentiles
    premium_pct, growth_pct, volume_pct = base_percentiles

    # Skewness adjustment (±5 percentile points max)
    skew_adjustment = float(np.clip(skewness * 2, -5, 5)) if np.isfinite(skewness) else 0.0
    print(f"skewness = {skewness}")
    print(f"skew_adjustment = {skew_adjustment}")

    # Variance adjustment - high variance = wider tiers
    var_adjustment = float(np.clip(variance * 10, 0, 5)) if np.isfinite(variance) else 0.0
    print(f"variance = {variance}")
    print(f"var_adjustment = {var_adjustment}")

    adjusted = {
        'premium': premium_pct + skew_adjustment,
        'growth': growth_pct + skew_adjustment - var_adjustment,
        'volume': volume_pct + skew_adjustment - var_adjustment * 2
    }

    # Percentiles outside 0..100 would make np.percentile / np.nanpercentile raise.
    return {tier: float(np.clip(pct, 0, 100)) for tier, pct in adjusted.items()}

def create_audience_segments(df):
    """
    Compute conversion-probability cutoffs for every product category.

    For each distinct value of df['product_category'] the percentile thresholds
    are taken from adjust_thresholds_by_distribution and converted into
    probability cutoffs with np.nanpercentile on that category's 'conv_prob'.

    Returns a dict: category -> {"premium_cutoff", "growth_cutoff",
    "volume_cutoff", "thresholds"}.
    """
    segments = {}

    for product in df['product_category'].unique():
        print(product)
        category_probs = df.loc[df['product_category'] == product, 'conv_prob']

        # Percentiles adapted to this category's distribution
        thresholds = adjust_thresholds_by_distribution(category_probs)
        print(f"thresholds = {thresholds}")

        # Translate each percentile into the probability value at that percentile
        tier_cutoffs = {
            cutoff_key: np.nanpercentile(category_probs, thresholds[tier])
            for cutoff_key, tier in (
                ("premium_cutoff", "premium"),
                ("growth_cutoff", "growth"),
                ("volume_cutoff", "volume"),
            )
        }
        tier_cutoffs["thresholds"] = thresholds
        segments[product] = tier_cutoffs
    return segments

In [17]:
# Compute the per-category cutoffs from the observed conversions and print them as JSON.
audience_threshold = create_audience_segments(conversions_filtered)
print(json.dumps(audience_threshold, indent=4))

## Apply Thresholds

In [18]:
def apply_threshold(data, thresholds):
    """
    Add one "bucket_<product>" column per entry of `thresholds` to `data`.

    Each row is labelled from its "conv_prob" and the product's cutoffs:
    "premium" (>= premium_cutoff), "growth" (>= growth_cutoff and below
    premium_cutoff), "volume" (>= volume_cutoff and below growth_cutoff),
    "other" (below volume_cutoff or missing conv_prob). Rows matching none of
    the rules keep None. The rules are applied in that order, a later rule
    overriding an earlier one.

    `data` is modified in place and also returned.
    """
    for product, cutoffs in thresholds.items():
        premium = cutoffs["premium_cutoff"]
        growth = cutoffs["growth_cutoff"]
        volume = cutoffs["volume_cutoff"]
        prob = data["conv_prob"]

        labels = np.full(len(data), None, dtype=object)
        labels[np.asarray(prob >= premium)] = "premium"
        labels[np.asarray((prob >= growth) & (prob < premium))] = "growth"
        labels[np.asarray((prob >= volume) & (prob < growth))] = "volume"
        labels[np.asarray(prob < volume)] = "other"
        labels[np.asarray(prob.isnull())] = "other"

        data[f"bucket_{product}"] = labels
    return data
# Label every conversion with a bucket per product category (adds bucket_<category> columns).
conversions_filtered = apply_threshold(conversions_filtered, audience_threshold)
conversions_filtered

In [19]:
# Conversion probability per bucket, one panel per product category.
bucket_names = list(audience_threshold.keys())
n_cols = 2
# Up to four categories keep the 2x2 layout; more categories add rows instead of
# failing with "num must be an integer with 1 <= num <= 4".
n_rows = max(2, -(-len(bucket_names) // n_cols))
fig = plt.figure(figsize=(10, 2.5 * n_rows))
for ibucket, bucket in enumerate(bucket_names):
    ax = fig.add_subplot(n_rows, n_cols, ibucket + 1)
    sns.boxenplot(data=conversions_filtered, x=f"bucket_{bucket}", y="conv_prob", ax=ax)
    ax.set_title(f"{bucket}")
    ax.grid(True)
    ax.set_ylim(0, 1)
# Lay out once, after all panels exist.
fig.tight_layout()
plt.show()

# Modify Data to Test a More Skewed Approach

In [20]:
# Build a manipulated copy in which the "Matcha" probabilities are scaled down.
# A seeded generator keeps the experiment reproducible across runs.
rng = np.random.default_rng(42)
conversion_manipulated = conversions_filtered.copy()
is_matcha = conversions_filtered["product_category"] == "Matcha"
# One random factor per row: a single shared factor only rescales the values and
# leaves the skewness of the distribution unchanged.
scale_factors = rng.uniform(0.1, 0.8, size=len(conversions_filtered))
conversion_manipulated["conv_prob"] = np.where(
    is_matcha,
    conversions_filtered["conv_prob"] * scale_factors,
    conversions_filtered["conv_prob"]
)
conversion_manipulated.groupby("product_category")["conv_prob"].describe()

In [21]:
# Recompute the per-category cutoffs on the manipulated probabilities and print them as JSON.
audience_threshold_manipulated = create_audience_segments(conversion_manipulated)
print(json.dumps(audience_threshold_manipulated, indent=4))

In [22]:
# Re-label the manipulated conversions with the cutoffs derived from the manipulated data.
conversion_manipulated = apply_threshold(conversion_manipulated, audience_threshold_manipulated)
conversion_manipulated

In [23]:
# Conversion probability per bucket for the manipulated data, one panel per product category.
# The panels follow the thresholds computed from the manipulated data, which
# are the ones that produced the bucket columns of conversion_manipulated.
bucket_names = list(audience_threshold_manipulated.keys())
n_cols = 2
# Up to four categories keep the 2x2 layout; more categories add rows instead of
# failing with "num must be an integer with 1 <= num <= 4".
n_rows = max(2, -(-len(bucket_names) // n_cols))
fig = plt.figure(figsize=(10, 2.5 * n_rows))
for ibucket, bucket in enumerate(bucket_names):
    ax = fig.add_subplot(n_rows, n_cols, ibucket + 1)
    sns.boxenplot(data=conversion_manipulated, x=f"bucket_{bucket}", y="conv_prob", ax=ax)
    ax.set_title(f"{bucket} manipulated")
    ax.grid(True)
    ax.set_ylim(0, 1)
# Lay out once, after all panels exist.
fig.tight_layout()
plt.show()