In [17]:
import random
import uuid
from datetime import datetime, timedelta
from random import uniform, randint

import numpy as np
import pandas as pd

In [None]:
# Helper function to generate random dates
def random_date(start, end):
    """Pick a random day between ``start`` and ``end``, both endpoints included.

    The result is ``start`` shifted forward by a whole number of days drawn
    from NumPy's global random state, so it keeps ``start``'s time of day.
    """
    span_days = (end - start).days
    # randint's upper bound is exclusive; the +1 keeps ``end`` reachable
    offset = np.random.randint(0, span_days + 1)
    return start + timedelta(days=offset)

# Simulation window used by the generators below; both bounds are inclusive
START_DATE = datetime(year=2024, month=1, day=1)
END_DATE = datetime(year=2024, month=6, day=30)

In [None]:
# 1. Generate Users
def generate_users(num_users, start_date=None, end_date=None):
    """Create a synthetic user table with one row per signup.

    Args:
        num_users: Number of users to generate. Zero yields an empty frame
            that still carries every column.
        start_date: First possible signup day (datetime, inclusive).
            Defaults to the notebook-level ``START_DATE``.
        end_date: Last possible signup day (datetime, inclusive).
            Defaults to the notebook-level ``END_DATE``.

    Returns:
        pandas.DataFrame with columns ``user_id`` (random UUID4 string),
        ``signup_date`` (``datetime.date``), ``channel``, ``country`` and
        ``device``.
    """
    start_date = START_DATE if start_date is None else start_date
    end_date = END_DATE if end_date is None else end_date

    columns = ['user_id', 'signup_date', 'channel', 'country', 'device']
    channels = ['Google Ads', 'Facebook Ads', 'Instagram', 'Organic', 'Referral']
    countries = ['USA', 'UK', 'Canada', 'Germany', 'Australia']
    devices = ['iOS', 'Android']

    users = []
    for _ in range(num_users):
        signup_date = random_date(start_date, end_date)
        user = {
            'user_id': str(uuid.uuid4()),
            'signup_date': signup_date.date(),
            'channel': np.random.choice(channels, p=[0.3,0.25,0.15,0.2,0.1]),
            'country': np.random.choice(countries, p=[0.4,0.2,0.15,0.15,0.1]),
            'device': np.random.choice(devices, p=[0.6, 0.4])
        }
        users.append(user)
    # Passing the schema explicitly keeps the columns (and the CSV header)
    # when no rows were generated; a bare DataFrame([]) would have none.
    return pd.DataFrame(users, columns=columns)


In [None]:
# 2. Generate Transactions
def generate_transactions(users_df, end_date=None):
    """Simulate purchases for every user in ``users_df``.

    Each user gets a Poisson(0.5) number of transactions, dated between
    their signup day and ``end_date`` (both inclusive).

    Args:
        users_df: DataFrame with at least ``user_id`` and ``signup_date``
            columns; ``signup_date`` must hold ``datetime.date`` values.
        end_date: Last possible transaction day (datetime). Defaults to the
            notebook-level ``END_DATE``.

    Returns:
        pandas.DataFrame with columns ``transaction_id``, ``user_id``,
        ``transaction_date``, ``amount`` and ``type``. The columns are present
        even when no transaction was generated.
    """
    end_date = END_DATE if end_date is None else end_date
    columns = ['transaction_id', 'user_id', 'transaction_date', 'amount', 'type']

    transactions = []
    # itertuples avoids building a Series per row, which iterrows does
    for user in users_df.itertuples(index=False):
        # random_date works on datetimes, so lift the stored date to midnight
        signup_date = datetime.combine(user.signup_date, datetime.min.time())
        num_transactions = np.random.poisson(lam=0.5)
        for _ in range(num_transactions):
            transaction_date = random_date(signup_date, end_date)
            transaction = {
                'transaction_id': str(uuid.uuid4()),
                'user_id': user.user_id,
                'transaction_date': transaction_date.date(),
                'amount': np.random.choice([9.99, 19.99, 29.99], p=[0.5,0.3,0.2]),
                'type': np.random.choice(['Subscription', 'In-app purchase'], p=[0.7,0.3])
            }
            transactions.append(transaction)
    # Explicit schema: with few users it is likely that nobody transacts, and a
    # column-less frame would break lookups such as transactions_df["user_id"].
    return pd.DataFrame(transactions, columns=columns)

In [None]:
# 3. Generate Costs
def generate_costs(start_date=None, end_date=None):
    """Build a daily ad-spend table for the paid acquisition channels.

    One row is produced per (day, channel) pair, with a cost drawn uniformly
    between 500 and 2500 and rounded to two decimals.

    Args:
        start_date: First day of the cost table (inclusive). Defaults to the
            notebook-level ``START_DATE``.
        end_date: Last day of the cost table (inclusive). Defaults to the
            notebook-level ``END_DATE``.

    Returns:
        pandas.DataFrame with columns ``date`` (``datetime.date``),
        ``channel`` and ``cost``. An empty window yields an empty frame that
        still has these columns.
    """
    start_date = START_DATE if start_date is None else start_date
    end_date = END_DATE if end_date is None else end_date

    dates = pd.date_range(start_date, end_date, freq='D')
    channels = ['Google Ads', 'Facebook Ads', 'Instagram']

    costs = [
        {
            'date': day.date(),
            'channel': channel,
            'cost': round(uniform(500, 2500), 2)
        }
        for day in dates
        for channel in channels
    ]
    return pd.DataFrame(costs, columns=['date', 'channel', 'cost'])

In [None]:
# 4. Generate User Engagement
def generate_engagement(users_df, end_date=None):
    """Simulate daily app activity for every user in ``users_df``.

    Each user is active on a Poisson(15) number of distinct days, capped at
    the number of days between signup and ``end_date`` (both inclusive). On
    every active day they log between 1 and 5 habits.

    Args:
        users_df: DataFrame with at least ``user_id`` and ``signup_date``
            columns; ``signup_date`` must hold ``datetime.date`` values.
        end_date: Last possible activity day (datetime). Defaults to the
            notebook-level ``END_DATE``.

    Returns:
        pandas.DataFrame with columns ``user_id``, ``activity_date``
        (``datetime.date``) and ``habits_logged``. The columns are present
        even when no activity was generated.
    """
    end_date = END_DATE if end_date is None else end_date
    columns = ['user_id', 'activity_date', 'habits_logged']

    engagements = []
    # itertuples avoids building a Series per row, which iterrows does
    for user in users_df.itertuples(index=False):
        signup_date = datetime.combine(user.signup_date, datetime.min.time())
        activity_days = np.random.poisson(lam=15)
        possible_dates = pd.date_range(signup_date, end_date)
        # Sampling without replacement cannot draw more days than exist
        num_days = min(activity_days, len(possible_dates))
        activity_dates = np.random.choice(possible_dates, num_days, replace=False)
        for date in activity_dates:
            engagement = {
                'user_id': user.user_id,
                # np.random.choice returns numpy datetimes; convert to date
                'activity_date': pd.Timestamp(date).date(),
                'habits_logged': randint(1, 5)
            }
            engagements.append(engagement)
    # Explicit schema so an empty result still has its columns and CSV header
    return pd.DataFrame(engagements, columns=columns)

In [None]:
# Generate all datasets
NUM_USERS = 100000
SEED = 42

# Seed both random sources the generators draw from (NumPy's global state and
# the stdlib `random` module) so a rerun reproduces the same dates, amounts
# and counts. IDs come from uuid.uuid4(), which these seeds do not control,
# so user/transaction IDs still differ between runs.
np.random.seed(SEED)
random.seed(SEED)

users_df = generate_users(NUM_USERS)
transactions_df = generate_transactions(users_df)
costs_df = generate_costs()
engagement_df = generate_engagement(users_df)

In [None]:
# Write each dataset to its own CSV in the working directory, without the index
csv_outputs = [
    ('users.csv', users_df),
    ('transactions.csv', transactions_df),
    ('costs.csv', costs_df),
    ('engagement.csv', engagement_df),
]
for csv_name, dataset in csv_outputs:
    dataset.to_csv(csv_name, index=False)


In [20]:
# Number of distinct users with at least one transaction
transactions_df["user_id"].nunique(dropna=True)

39631