In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("/Users/levankikadze/Desktop/pandas/Bolt Food/cherningUsers/Data/UserOrdersTotal_past1yr.csv", index_col=0)

# Sort by order date descending
df.sort_values(by='Order Created Date', ascending=False, inplace=True)

# Get latest order per user
latest_order_dates = df.drop_duplicates(subset='User ID', keep='first')

# Ensure 'Order Created Date' is datetime
latest_order_dates['Order Created Date'] = pd.to_datetime(latest_order_dates['Order Created Date'], errors='coerce')

# Calculate days since last order
latest_order_dates['days_since_last_order'] = (today - latest_order_dates['Order Created Date']).dt.days

# Define bins and labels
bins = [0, 30, 60, 90, 120, np.inf]
labels = ['0-30 days', '31-60 days', '61-90 days', '91-120 days', '120+ days']

# Categorize users by recency
latest_order_dates.loc[:, 'order_recency'] = pd.cut(latest_order_dates['days_since_last_order'], bins=bins, labels=labels, right=True)

# List of campaign columns (excluding summary columns)
campaign_cols = [
    "Bolt Plus Campaign Discount Eur",
    "AM Campaign Discount, €",
    "MLC Churn Campaigns Discount, €",
    "MLC ELC Campaigns Discount, €",
    "Other Campaigns Discount, €",
    "Provider Campaigns Discount, €",
    "Store Campaign Discount, €",
    "ULC Activation Campaigns Discount, €",
    "ULC Engagement Campaigns Discount, €",
    "Cost-Share Delivery Fee Campaigns Spend by Bolt, €",
    "Cost-Share Menu Campaigns Spend by Bolt, €",
    "Delivery Fee Campaigns Discount, €",
    "Liquidity Campaigns Discount, €",
    "Marketing Campaigns Discount, €",
    "Menu Campaigns Discount, €"
]

# Clean campaign columns and create a column for which campaign was used
for col in campaign_cols:
    latest_order_dates.loc[:, col] = latest_order_dates[col].replace(r'[€,\s]', '', regex=True).replace('', '0').astype(float)

# Find which campaign was used for each user (first nonzero campaign column)
def get_campaign(row):
    for col in campaign_cols:
        if row[col] > 0:
            return col
    return 'No Campaign'

latest_order_dates.loc[:, 'used_campaign'] = latest_order_dates.apply(get_campaign, axis=1)

# Count users by recency and campaign
campaign_counts = latest_order_dates.groupby(['order_recency', 'used_campaign']).size().unstack(fill_value=0)

print(campaign_counts)

# Visualisation: stacked horizontal bar chart for top 8 campaigns + "No Campaign"
top_campaigns = campaign_counts.sum().sort_values(ascending=False).head(8).index.tolist()
if 'No Campaign' not in top_campaigns:
    top_campaigns.append('No Campaign')
campaign_counts_plot = campaign_counts[top_campaigns]

campaign_counts_plot.plot(kind='barh', stacked=True, colormap='tab20')
plt.title('User Last Order Recency and Campaign Used')
plt.xlabel('Number of Users')
plt.ylabel('Order Recency')
plt.legend(title='Campaign Used', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  latest_order_dates['Order Created Date'] = pd.to_datetime(latest_order_dates['Order Created Date'], errors='coerce')


NameError: name 'today' is not defined