In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

In [2]:
# Seed both random sources used by this notebook (NumPy's global generator
# and the stdlib `random` module) with one shared value so a fresh
# Restart & Run All regenerates the same synthetic data.
SEED = 28
np.random.seed(SEED)
random.seed(SEED)

# User

In [3]:
# Size of the synthetic user base and the first possible install day.
num_users = 20000  # 20,000 users
start_date = datetime(2025, 1, 1)

# Acquisition channels and their sampling probabilities (parallel lists).
channels = ['TikTok_Ads', 'Baidu_SEM', 'Organic_Search', 'AppStore', 'Friend_Referral']
weights = [0.3, 0.2, 0.2, 0.2, 0.1]

# Each user installs on a uniformly drawn day offset in 0..30 (inclusive)
# from start_date.
install_offsets = [random.randint(0, 30) for _ in range(num_users)]
install_dates = [start_date + timedelta(days=offset) for offset in install_offsets]

# One row per user: sequential ids starting at 10000, an install date, and a
# channel sampled according to `weights`.
data = {
    'user_id': range(10000, 10000 + num_users),
    'install_date': install_dates,
    'channel': np.random.choice(channels, num_users, p=weights),
}
df_user = pd.DataFrame(data)

In [4]:
# Propagation / referral: give every Friend_Referral user an inviter drawn
# from the users that precede them in install-date order.
df_user = df_user.sort_values('install_date').reset_index(drop=True)

# Channels a referral user is reassigned to when nobody is available to
# have invited them.
fallback_channels = ['TikTok_Ads', 'Baidu_SEM', 'Organic_Search', 'AppStore']

inviter_ids = []
existing_user_pool = []  # grows by one user per processed row

# Snapshot the columns up front so the in-loop channel reassignment cannot
# influence what the loop reads.
ordered_user_ids = df_user['user_id'].tolist()
ordered_channels = df_user['channel'].tolist()

# After reset_index the row label equals the row position, so the
# enumerate() counter can be used directly with .at.
for position, (current_user, current_channel) in enumerate(zip(ordered_user_ids, ordered_channels)):
    is_referral = current_channel == 'Friend_Referral'
    if is_referral and existing_user_pool:
        inviter_ids.append(random.choice(existing_user_pool))
    else:
        inviter_ids.append(None)
        if is_referral:
            # Nobody could have invited this user yet: move them to a
            # randomly chosen non-referral channel.
            df_user.at[position, 'channel'] = random.choice(fallback_channels)
    existing_user_pool.append(current_user)

# Nullable integer dtype keeps inviter ids integral while allowing missing values.
df_user['inviter_id'] = inviter_ids
df_user['inviter_id'] = df_user['inviter_id'].astype('Int64')

In [5]:
# Activation
# A user counts as effectively activated when they view product detail pages
# more than 3 times after registering; channel quality differs, so each
# channel has its own activation probability (baseline 0.5).
activation_probs = {
    'TikTok_Ads': 0.5,
    'Baidu_SEM': 0.4,   # ad channel with lower-quality traffic
    'Organic_Search': 0.5,
    'AppStore': 0.5,
    'Friend_Referral': 0.8 # friend referrals are higher quality
}


def draw_activation(channel_name):
    """Return 1 if a single uniform draw falls below the channel's activation probability, else 0."""
    if random.random() < activation_probs[channel_name]:
        return 1
    return 0


# One independent draw per user, in row order.
df_user['activated'] = df_user['channel'].map(draw_activation)



In [6]:
# Inspect the column names of the user table built so far.
df_user.columns

Index(['user_id', 'install_date', 'channel', 'inviter_id', 'activated'], dtype='object')

In [7]:
# Preview the first rows of the user table.
df_user.head()

Unnamed: 0,user_id,install_date,channel,inviter_id,activated
0,13090,2025-01-01,Organic_Search,,0
1,12447,2025-01-01,Organic_Search,,0
2,26889,2025-01-01,Friend_Referral,13090.0,1
3,22179,2025-01-01,Baidu_SEM,,1
4,22183,2025-01-01,AppStore,,1


# Order

In [8]:
# Order generation: activated users may become buyers, and each buyer places
# between one and four orders within 60 days of installing.
orders = []
order_id_counter = 1

# Per-channel probability that an *activated* user makes at least one purchase.
# The original note cited an overall purchase conversion of 2.26%, with
# purchase probability = conversion rate / activation rate.
# NOTE(review): the values below do not obviously reproduce a 2.26% overall
# conversion -- confirm which figure is intended.
purchase_probs = {
    'TikTok_Ads': 0.05,
    'Baidu_SEM': 0.05,
    'Organic_Search': 0.12,
    'AppStore': 0.12,
    'Friend_Referral': 0.30,
}

# Orders per buyer: 1 order 80%, 2 orders 15%, 3 or more 5% (3: 4%, 4: 1%).
order_count_choices = [1, 2, 3, 4]
order_count_weights = [0.8, 0.15, 0.04, 0.01]

for row in df_user.itertuples(index=False):
    # Only activated users are eligible; no random draw happens for the others.
    if row.activated != 1:
        continue
    if random.random() >= purchase_probs[row.channel]:
        continue

    user_id = row.user_id
    install_date = row.install_date
    num_orders = np.random.choice(order_count_choices, p=order_count_weights)

    for _ in range(num_orders):
        # Log-normal order amount: the median is exp(3.5) ~= 33.1, while the
        # mean is exp(3.5 + 1.1**2 / 2) ~= 60.6.
        amount = round(np.random.lognormal(mean=3.5, sigma=1.1), 2)
        amount = max(amount, 0.5)  # minimum spend is 0.5 yuan
        # Order lands 0..60 days after install, at a whole hour 0..23.
        days_after_install = random.randint(0, 60)
        hour_of_day = random.randint(0, 23)
        order_time = install_date + timedelta(days=days_after_install, hours=hour_of_day)
        orders.append([order_id_counter, user_id, order_time, amount])
        order_id_counter += 1

df_orders = pd.DataFrame(orders, columns=['order_id', 'user_id', 'order_time', 'amount'])

In [9]:
# Preview the first rows of the generated orders table.
df_orders.head()

Unnamed: 0,order_id,user_id,order_time,amount
0,1,26889,2025-02-06 03:00:00,64.05
1,2,22179,2025-01-29 15:00:00,20.66
2,3,11419,2025-02-05 03:00:00,59.0
3,4,11413,2025-03-02 21:00:00,10.99
4,5,16904,2025-02-24 15:00:00,49.17


# Login Log

In [10]:
# Login log: a set of unique (user_id, login_date) pairs, so a user has at
# most one record per calendar day.
login_records = set()

# Purchases: every order implies a login on the order's calendar day.
for buyer_id, ordered_at in zip(df_orders['user_id'], df_orders['order_time']):
    login_records.add((buyer_id, ordered_at.date()))

# Login probability for day offsets 1..60 after install; it decays
# exponentially and is roughly 0.04 by day 60.
daily_login_probs = [0.9 * (0.95 ** day_offset) for day_offset in range(1, 61)]

# Everyday activity.
for row in df_user.itertuples(index=False):
    user_id = row.user_id
    install_date = row.install_date.date()
    # Every user logs in on the day they install.
    login_records.add((user_id, install_date))

    if row.activated == 1:
        # Activated users get one independent draw per day for the 60 days
        # after install (the original note said 30 days, but the loop covers
        # offsets 1..60).
        for day_offset, prob in enumerate(daily_login_probs, start=1):
            if random.random() < prob:
                login_date = install_date + timedelta(days=day_offset)
                login_records.add((user_id, login_date))
    elif random.random() < 0.05:
        # Non-activated users have only a small chance of returning the next day.
        login_date = install_date + timedelta(days=1)
        login_records.add((user_id, login_date))

# Set iteration order is arbitrary, so sort to get a stable table.
df_logins = (
    pd.DataFrame(list(login_records), columns=['user_id', 'login_date'])
    .sort_values(['user_id', 'login_date'])
    .reset_index(drop=True)
)

In [11]:
# Preview the first rows of the sorted login table.
df_logins.head()

Unnamed: 0,user_id,login_date
0,10000,2025-01-04
1,10000,2025-01-05
2,10001,2025-01-24
3,10002,2025-01-05
4,10002,2025-01-06


In [17]:
# Sanity check: number of rows in the user table.
len(df_user)

20000

In [18]:
# Sanity check: number of rows in the orders table.
len(df_orders)

1537

In [16]:
# Sanity check: number of unique (user, day) login records.
len(df_logins)

188025

In [12]:
# Write the three generated tables to CSV files in the working directory,
# omitting the DataFrame row index.
csv_exports = (
    ('user_data.csv', df_user),
    ('order_data.csv', df_orders),
    ('login_data.csv', df_logins),
)
for csv_path, frame in csv_exports:
    frame.to_csv(csv_path, index=False)