In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker and seed every random source for reproducibility.
# NumPy drives the categorical draws, funnel outcomes, revenue and ad cost;
# Faker keeps its own generator (used for the clicked_at timestamps), so
# seeding NumPy alone leaves those timestamps different on every run.
fake = Faker()
Faker.seed(42)
np.random.seed(42)
# NOTE: clicked_at is drawn from a window that ends at 'now', so its
# absolute values presumably still shift with the time of the run even
# when seeded -- confirm if fully fixed timestamps are required.

# Parameters
n_users = 10000  # number of simulated users (one output row each)
channels = ['Google', 'Meta', 'LinkedIn', 'Organic']
variants = ['A', 'B']
device_types = ['Desktop', 'Mobile', 'Tablet']
regions = ['Europe', 'North America', 'Asia', 'South America']

# Conversion rates. Each one is conditional on the previous funnel stage:
# a user can only sign up after viewing the landing page, and can only
# purchase after signing up.
view_rate = 0.85      # P(views landing page)
signup_rate = 0.30    # P(signs up | viewed landing page)
purchase_rate = 0.20  # P(purchases | signed up)

# Simulate the funnel one user at a time. Every user gets random attributes,
# then walks the funnel view -> signup -> purchase, where each stage is only
# attempted if the previous one succeeded.
# The order of the random draws below is kept fixed so that a seeded run
# produces the same sequence of values.
channel_weights = [0.4, 0.3, 0.2, 0.1]  # e.g. 40% chance of Google
device_weights = [0.5, 0.4, 0.1]
region_weights = [0.5, 0.2, 0.2, 0.1]

users = []
for user_number in range(1, n_users + 1):
    user_id = f"user_{user_number}"

    # User attributes (variant is a uniform pick between the variants)
    channel = np.random.choice(channels, p=channel_weights)
    variant = np.random.choice(variants)
    device = np.random.choice(device_types, p=device_weights)
    region = np.random.choice(regions, p=region_weights)
    clicked_at = fake.date_time_between(start_date='-30d', end_date='now')

    # Funnel logic: a later stage draws a random number only when the
    # earlier stage was reached.
    viewed_lp = np.random.rand() < view_rate

    signed_up = False
    if viewed_lp:
        signed_up = np.random.rand() < signup_rate

    purchased = False
    if signed_up:
        purchased = np.random.rand() < purchase_rate

    # Revenue only exists for purchasers
    if purchased:
        revenue = np.round(np.random.uniform(50, 200), 2)
    else:
        revenue = 0.0

    # Ad cost applies to every channel except Organic
    if channel != 'Organic':
        ad_cost = np.round(np.random.uniform(0.5, 5.0), 2)
    else:
        ad_cost = 0.0

    # Funnel flags are stored as 0/1 integers
    row = [
        user_id,
        channel,
        variant,
        clicked_at,
        device,
        region,
        int(viewed_lp),
        int(signed_up),
        int(purchased),
        revenue,
        ad_cost,
    ]
    users.append(row)

# Turn the simulated rows into a DataFrame. Column names are matched to the
# row values by position, so their order must mirror the order used when
# each row was built.
columns = (
    ['user_id', 'channel', 'variant', 'clicked_at', 'device_type', 'region']
    + ['viewed_lp', 'signed_up', 'purchased']
    + ['revenue', 'ad_cost']
)
df = pd.DataFrame(data=users, columns=columns)

# Write the data to CSV without the DataFrame index column
output_path = "funnel_data.csv"
df.to_csv(output_path, index=False)

# The resulting file is the marketing funnel dataset that the rest of this
# project uses for further analysis.


✅ Data saved as funnel_data.csv
