In [1]:
"""
Synthetic multichannel marketing dataset – multi-touch friendly
---------------------------------------------------------------
Adds repeat visits so ≥ 25 % of converting users have ≥ 2 clicks
inside the 60-day look-back window (good for Shapley / Markov).
"""

import numpy as np
import pandas as pd
from faker import Faker
from datetime import datetime, timedelta
from uuid import uuid4
import random

# ───────────────────────── 1. CONFIG ─────────────────────────
# Take ONE snapshot of "today at midnight" and derive both window bounds from
# it. Calling datetime.today() separately for each bound could straddle
# midnight and make the two dates refer to different calendar days.
_TODAY_MIDNIGHT = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
START_DATE = _TODAY_MIDNIGHT - timedelta(days=730)   # first simulated day
END_DATE   = _TODAY_MIDNIGHT - timedelta(days=1)     # last simulated day (yesterday)

CHANNELS          = ['Google', 'Meta', 'LinkedIn', 'Organic']
# Per-channel baselines; both dicts are keyed by the names in CHANNELS.
BASE_SPEND        = {'Google': 8000, 'Meta': 6000, 'LinkedIn': 3000, 'Organic': 0}
BASE_CPC          = {'Google': 2.00, 'Meta': 1.50, 'LinkedIn': 3.00, 'Organic': 0}
CLICKS_TARGET     = 2000                       # baseline mean total clicks per day
SIGNUP_RATE       = 0.30                       # click → lead probability
CUSTOMER_RATE     = 0.20                       # lead → customer probability
MEAN_ORDERS_PER_C = 1.2                        # orders per customer
LOOKBACK_DAYS     = 60

DEVICES           = ['Desktop', 'Mobile', 'Tablet']
REGIONS           = ['Europe', 'North America', 'Asia', 'South America']
VARIANTS          = ['A', 'B']                 # fixed per user

# Inputs of seasonality_factor(): peak months with their relative uplift,
# and the growth rate compounded per 365 days since START_DATE.
SEASON_PEAK       = {'month': [11, 12], 'uplift': 0.40}
YOY_TREND         = 0.10
NOISE_SD          = 0.20                       # std-dev of the daily N(1, sd) noise multiplier

HOLIDAYS          = ['01-01', '07-04', '12-25', '12-31']   # 'MM-DD' strings
PROMO_DAYS_PCT    = 0.04                       # total probability that a day has a promo
PROMO_LEVELS      = [0, 10, 20]

REVISIT_PROB      = 0.30        # ← new: probability a click reuses an existing user_id

# Seed every random source the generator draws from so reruns are repeatable.
# Faker draws from its own generator, which np.random.seed / random.seed do
# not touch, so it is seeded explicitly through the Faker.seed() class method.
# NOTE: click ids still come from uuid4() and the date window is anchored on
# the current day, so those parts of the output differ between runs.
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# ───────────────────────── 2. HELPERS ────────────────────────
def seasonality_factor(d):
    """Return the demand multiplier for calendar day ``d``.

    The result is the product of two terms:

    * a seasonal boost of ``1 + SEASON_PEAK['uplift']`` when ``d.month`` is
      one of ``SEASON_PEAK['month']`` (otherwise ``1``), and
    * a trend of ``(1 + YOY_TREND)`` compounded per 365 days elapsed between
      ``START_DATE`` and ``d``.
    """
    years_elapsed = (d - START_DATE).days / 365
    growth = (1 + YOY_TREND) ** years_elapsed
    in_peak_month = d.month in SEASON_PEAK['month']
    seasonal_boost = SEASON_PEAK['uplift'] if in_peak_month else 0
    return (1 + seasonal_boost) * growth

# Sticky A/B assignment: the first draw for a user is cached and reused.
variant_of = {}
def get_variant(u):
    """Return the variant of user ``u``, drawing one from VARIANTS on first sight."""
    try:
        return variant_of[u]
    except KeyError:
        # Unknown user: draw once, remember it, and hand the same value back.
        assigned = variant_of[u] = np.random.choice(VARIANTS)
        return assigned

# ───────────────────────── 3. STORAGE ────────────────────────
# Row buffers, one independent list per output table.
marketing_rows = []
lead_rows = []
order_rows = []
ext_rows = []

users = []                     # list of existing user_ids
user_lead_done = set()         # to keep max one lead per user

# Running sequence numbers for generated ids; both start at 1.
lead_counter = 1
order_counter = 1

comp_index = 100               # starting level of the competitor index

# ───────────────────────── 4. GENERATION ─────────────────────
# One pass per calendar day: record the external variables, size the day's
# click volume, then emit the clicks (plus any resulting lead and orders)
# channel by channel.
for d in pd.date_range(START_DATE, END_DATE, freq='D'):

    # 4-A  external vars
    holiday_flag   = int(d.strftime('%m-%d') in HOLIDAYS)
    # First level takes the remaining probability; the other two levels split
    # PROMO_DAYS_PCT evenly.
    promo_discount = np.random.choice(
        PROMO_LEVELS,
        p=[1 - PROMO_DAYS_PCT, PROMO_DAYS_PCT / 2, PROMO_DAYS_PCT / 2]
    )
    comp_index += np.random.normal(0, 1)       # random walk, one step per day
    ext_rows.append([d.date(), holiday_flag, promo_discount, round(comp_index, 2)])

    # 4-B  spend & clicks per channel
    day_factor = seasonality_factor(d) * np.random.normal(1, NOISE_SD)
    # day_factor multiplies every channel's spend by the same amount, so it
    # cancels out of the spend shares computed below. Apply it to the click
    # volume as well; otherwise seasonality, trend and noise never reach any
    # output table. Clamp at 0 because the normal noise term can go negative.
    expected_clicks = CLICKS_TARGET * max(day_factor, 0.0)
    target_clicks   = max(1, int(np.random.poisson(expected_clicks)))

    spend_total, channel_spend, channel_clicks = 0, {}, {}
    for ch in CHANNELS:
        spend = BASE_SPEND[ch] * day_factor if BASE_SPEND[ch] else 0
        channel_spend[ch] = spend
        spend_total += spend
    for ch in CHANNELS:
        # Clicks are split in proportion to spend; equal split if nothing was spent.
        share = channel_spend[ch] / spend_total if spend_total else 1 / len(CHANNELS)
        channel_clicks[ch] = int(target_clicks * share)

    # 4-C  generate clicks
    for ch in CHANNELS:
        # `or 1` guarantees at least one click per channel per day, including
        # channels whose spend share is zero.
        clicks_ch = channel_clicks[ch] or 1
        # Cost per click is drawn once per channel per day (±5 % noise).
        cpc       = BASE_CPC[ch] * np.random.normal(1, 0.05) if BASE_CPC[ch] else 0
        cost_per  = cpc if ch != 'Organic' else 0

        for _ in range(clicks_ch):

            # pick new or returning user
            if users and np.random.rand() < REVISIT_PROB:
                user_id = random.choice(users)
            else:
                user_id = f"user_{len(users)+1}"
                users.append(user_id)

            click_id = str(uuid4())
            ts       = fake.date_time_between_dates(
                          datetime_start=d,
                          datetime_end=d + timedelta(days=1)
                       )
            variant  = get_variant(user_id)
            device   = np.random.choice(DEVICES, p=[0.5, 0.4, 0.1])
            region   = np.random.choice(REGIONS, p=[0.5, 0.2, 0.2, 0.1])

            marketing_rows.append([
                click_id, user_id, ch, variant, device, region, ts, round(cost_per, 2)
            ])

            # 4-D  lead logic -- one lead per user
            if user_id not in user_lead_done and np.random.rand() < SIGNUP_RATE:
                lead_id     = f"lead_{lead_counter}"; lead_counter += 1
                # The lead follows the converting click by 1..1439 minutes.
                lead_ts     = ts + timedelta(minutes=np.random.randint(1, 1440))
                is_customer = np.random.rand() < CUSTOMER_RATE
                lead_rows.append([lead_id, user_id, click_id, lead_ts, int(is_customer)])
                user_lead_done.add(user_id)

                # 4-E  order(s) per customer
                if is_customer:
                    # Every customer places at least one order.
                    n_orders = max(1, int(np.random.poisson(MEAN_ORDERS_PER_C)))
                    for _ in range(n_orders):
                        order_id = f"ord_{order_counter}"; order_counter += 1
                        # Orders land 0..LOOKBACK_DAYS-1 whole days after the lead.
                        order_ts = lead_ts + timedelta(days=np.random.randint(0, LOOKBACK_DAYS))
                        revenue  = round(np.random.uniform(50, 200), 2)
                        order_rows.append([order_id, lead_id, order_ts, revenue])

# ───────────────────────── 5. SAVE CSVs ──────────────────────
# (file name, row buffer, column names) for each output table, in write order.
_csv_tables = [
    (
        'marketing_clicks.csv',
        marketing_rows,
        ['click_id', 'user_id', 'channel', 'variant',
         'device_type', 'region', 'click_ts', 'ad_cost'],
    ),
    (
        'leads.csv',
        lead_rows,
        ['lead_id', 'user_id', 'tracking_parameter', 'lead_ts', 'is_customer'],
    ),
    (
        'orders.csv',
        order_rows,
        ['order_id', 'lead_id', 'order_ts', 'revenue'],
    ),
    (
        'external_vars.csv',
        ext_rows,
        ['date', 'holiday_flag', 'promo_discount', 'competitor_index'],
    ),
]

# Each buffer becomes a DataFrame and is written without the index column.
for _csv_name, _csv_rows, _csv_columns in _csv_tables:
    _frame = pd.DataFrame(_csv_rows, columns=_csv_columns)
    _frame.to_csv(_csv_name, index=False)

print("Multi-touch dataset written: marketing_clicks.csv  leads.csv  orders.csv  external_vars.csv")


Multi-touch dataset written: marketing_clicks.csv  leads.csv  orders.csv  external_vars.csv
