In [None]:
import pandas as pd
import random
import numpy as np
from faker import Faker
from datetime import datetime

# Faker instance created with the zh_CN locale, plus fixed seeds for the
# stdlib and NumPy random generators.
fake = Faker("zh_CN")
random.seed(42)
np.random.seed(42)

# Number of mock users to generate and the accumulator for their records.
n = 500
rows = []

# Reference lists that the categorical fields are drawn from.
cities = [
    "北京", "上海", "广州", "深圳", "杭州",
    "成都", "武汉", "西安", "南京", "厦门",
    "青岛", "天津", "长沙", "重庆", "苏州",
    "宁波", "郑州", "福州", "合肥", "大连",
]
occupations = [
    "程序员", "教师", "医生", "律师", "自由职业", "公务员", "企业管理",
    "金融分析师", "市场营销", "制造业工人", "科研人员", "个体商户", "学生",
    "退休人员",
]
educations = ["高中", "大专", "本科", "硕士", "博士"]
maritals = ["未婚", "已婚", "离异", "丧偶"]
risk_pref = ["保守", "稳健", "平衡", "积极", "激进"]
health_status = ["优", "良", "一般", "差"]

# Spending categories that each user's monthly outflow is split across.
# Built once up front: the list is identical for every user.
consumption_keys = [
    "餐饮","衣物","住房","交通","娱乐","教育培训","医疗保健","健身运动","旅行度假",
    "数字产品","宠物","图书影音","美容护肤","线上购物","线下购物","奢侈品",
    "家庭日用品","母婴","绿色环保","慈善捐赠"
]

# Generate one record per user and append it to `rows`.
# The order of the random draws below is significant: with the fixed seeds it
# determines every generated value, so do not reorder them casually.
for i in range(n):
    # Age: 18-65, picked from weighted age bands.
    age_group = random.choices(
        population=[(18,25),(26,35),(36,45),(46,55),(56,65)],
        weights=[0.15,0.3,0.25,0.2,0.1]
    )[0]
    age = random.randint(age_group[0], age_group[1])

    # Income and spending.
    # Salary is normally distributed around an age-dependent mean and then
    # clamped to [2000, 60000].
    salary = round(np.random.normal(loc=8000 + age*200, scale=3000),2)
    salary = max(2000, min(salary, 60000))
    business_income = round(random.uniform(0, 20000 if age>25 else 5000),2)
    passive_income = round(random.uniform(0, 10000 if age>30 else 2000),2)
    # Round the sum so float-addition noise does not leak into the CSV.
    inflow = round(salary + business_income + passive_income, 2)
    # Outflow is 60%-110% of inflow, so savings_rate can be negative.
    outflow = round(inflow * random.uniform(0.6, 1.1), 2)
    savings_rate = round((inflow - outflow)/inflow, 2)

    # Spending breakdown: Dirichlet weights sum to 1, so the per-category
    # amounts add up to approximately `outflow` (up to rounding).
    base_weights = np.random.dirichlet(np.ones(len(consumption_keys)),size=1)[0]
    consumption_detail = {k: round(outflow*w,2) for k,w in zip(consumption_keys, base_weights)}

    # Assets and liabilities (upper bounds grow with age).
    asset_liquid = round(random.uniform(0, 300000 + age*10000), 2)
    asset_fixed = round(random.uniform(0, 1000000 + age*50000), 2)
    stock_value = round(random.uniform(0, 500000), 2)
    real_estate = round(random.uniform(0, 3000000 if age>30 else 500000), 2)
    total_asset = round(asset_liquid + asset_fixed + stock_value + real_estate, 2)

    debt_credit = round(random.uniform(0, 50000 + age*2000), 2)
    debt_mortgage = round(random.uniform(0, 1500000 if age>28 else 200000), 2)
    debt_other = round(random.uniform(0, 200000), 2)
    total_debt = round(debt_credit + debt_mortgage + debt_other, 2)
    net_asset = round(total_asset - total_debt, 2)
    debt_ratio = round(total_debt/total_asset, 2) if total_asset>0 else 0

    # Social security and insurance.
    social_security = round(random.uniform(0, 200000 if age>22 else 0), 2)
    pension_years = random.randint(0, max(0, age-22))
    housing_fund = round(random.uniform(0, 150000 if age>25 else 0), 2)
    insurance_premium = round(random.uniform(0, 20000 + age*300), 2)
    insurance_coverage = round(random.uniform(0, 2000000), 2)

    # User goals.
    # NOTE(review): retire_target is drawn independently of `age`, so it can be
    # lower than the user's current age -- confirm whether that is intended.
    retire_target = random.randint(50,70)
    pension_goal = round(random.uniform(500000, 5000000), 2)
    exp_return_min = round(random.uniform(0.01, 0.06), 3)
    exp_return_max = round(random.uniform(0.06, 0.12), 3)

    # Platform interaction.
    platform_visits = random.randint(0, 30 if age<50 else 20)
    adoption_rate = round(random.uniform(0.1,0.9), 2)
    qa_count = random.randint(0, 40 if age<40 else 20)
    personalization_count = random.randint(0, 15)
    feedback_pos = round(random.uniform(0.3, 1.0), 2)
    trust_score = round(random.uniform(30, 95), 2)

    row = {
        "用户ID": f"U{i+1:06d}",
        "年龄": age,
        "性别": random.choice(["男","女"]),
        "所在城市": random.choice(cities),
        "职业": random.choice(occupations),
        "教育程度": random.choice(educations),
        "婚姻状况": random.choice(maritals),
        "月工资收入": salary,
        "经营性收入": business_income,
        "被动收入": passive_income,
        "月总流入": inflow,
        "月总流出": outflow,
        "储蓄率": savings_rate,
        "活期存款": asset_liquid,
        "理财产品": asset_fixed,
        "股票基金": stock_value,
        "房产估值": real_estate,
        "总资产": total_asset,
        "净资产": net_asset,
        "信用卡欠款": debt_credit,
        "房贷余额": debt_mortgage,
        "其他贷款": debt_other,
        "总负债": total_debt,
        "负债率": debt_ratio,
        "养老金账户余额": social_security,
        "缴纳年限": pension_years,
        "住房公积金余额": housing_fund,
        "商业保险年缴": insurance_premium,
        "保险保额": insurance_coverage,
        "风险偏好": random.choice(risk_pref),
        "计划退休年龄": retire_target,
        "目标养老金": pension_goal,
        "期望收益率下限": exp_return_min,
        "期望收益率上限": exp_return_max,
        "健康状况": random.choice(health_status),
        "平台月访问次数": platform_visits,
        "策略采纳率": adoption_rate,
        "交互问答次数": qa_count,
        "个性化设置次数": personalization_count,
        "反馈积极度": feedback_pos,
        "信任评分": trust_score,
    }
    # Append one "<category>消费" column per spending category.
    row.update({f"{k}消费": v for k, v in consumption_detail.items()})

    rows.append(row)

# Build the DataFrame from the collected records and export it as CSV
# (utf-8-sig writes a UTF-8 byte-order mark at the start of the file).
df = pd.DataFrame(rows)
df.to_csv("pension_mock_500.csv", index=False, encoding="utf-8-sig")
# Report the actual row count instead of a hard-coded 500 so the message stays
# truthful if the number of generated rows changes. The file name is kept
# as-is because the plotting cells read it by this exact name.
print(f"✅ 已生成 pension_mock_500.csv ({len(df)}条完整模拟数据)")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load the generated data
df = pd.read_csv("pension_mock_500.csv")

plt.style.use("ggplot")

# 2. Age distribution
plt.figure(figsize=(8,4))
df["年龄"].hist(bins=15, color="steelblue", edgecolor="black")
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Count")
plt.show()

# 3. Income and cash-flow box plot
# DataFrame.plot opens a figure of its own unless it is handed an axes, which
# previously left the figure created here blank. Create the figure once at the
# size the box plot was actually drawn with and draw into its axes.
plt.figure(figsize=(8,5))
df[["月工资收入","月总流入","月总流出"]].plot(kind="box", ax=plt.gca())
plt.title("Income & Cash Flow Boxplot")
plt.ylabel("RMB")
plt.show()

# 4. Savings-rate distribution
plt.figure(figsize=(8,4))
df["储蓄率"].plot(kind="hist", bins=20, color="green", edgecolor="black")
plt.title("Savings Rate Distribution")
plt.xlabel("Savings Rate")
plt.show()

# 5. Platform interaction
plt.figure(figsize=(8,4))
df["平台月访问次数"].plot(kind="hist", bins=20, color="orange", edgecolor="black")
plt.title("Platform Visits Distribution")
plt.xlabel("Monthly Visits")
plt.show()

# 6. Ten spending categories with the highest mean amount
# Spending columns are those whose name contains "消费" but not "总".
consumption_cols = [c for c in df.columns if "消费" in c and "总" not in c]
mean_consumption = df[consumption_cols].mean().sort_values(ascending=False).head(10)
plt.figure(figsize=(10,5))
mean_consumption.plot(kind="bar", color="purple")
plt.title("Top 10 Average Consumption Categories")
plt.ylabel("Average Monthly Expense (RMB)")
plt.show()

# 7. Total inflow vs age scatter
plt.figure(figsize=(8,4))
plt.scatter(df["年龄"], df["月总流入"], alpha=0.6, color="teal")
plt.title("Total Inflow vs Age")
plt.xlabel("Age")
plt.ylabel("Monthly Inflow (RMB)")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact, IntSlider, Dropdown

# 1. Load the mock data
df = pd.read_csv("pension_mock_500.csv")

# ======= Basic preprocessing =======
# Bucket ages into left-closed bands: [18, 25), [25, 35), ..., [65, 80).
labels = ["18-24", "25-34", "35-44", "45-54", "55-64", "65+"]
bins = [18, 25, 35, 45, 55, 65, 80]
df["年龄段"] = pd.cut(df["年龄"], bins, right=False, labels=labels)

# Collect the spending columns: names containing "消费" but not "总".
# Iterating a DataFrame yields its column labels.
consumption_cols = [name for name in df if "消费" in name and "总" not in name]

# ======= Visualization 1: inflow distribution for a selected age band =======
def show_income_by_age(age_group):
    """Show a 20-bin histogram of 月总流入 for rows whose 年龄段 equals `age_group`."""
    in_band = df["年龄段"] == age_group
    histogram = px.histogram(
        df[in_band],
        x="月总流入",
        nbins=20,
        title=f"💰月总流入分布（年龄段：{age_group}）",
        color_discrete_sequence=["steelblue"],
    )
    histogram.show()

# Dropdown of the age-band labels; each selection re-runs the plot function.
band_picker = Dropdown(options=labels, description="年龄段")
interact(show_income_by_age, age_group=band_picker)

# ======= Visualization 2: savings rate vs trust score (bubble chart) =======
# The generated CSV stores the trust score in the "信任评分" column; there is no
# "平台信任评分" column, so referencing that name failed on a missing column.
# Bubble size is the monthly visit count.
fig = px.scatter(df, x="储蓄率", y="信任评分",
                 size="平台月访问次数", color="年龄段",
                 hover_name="用户ID",
                 title="💡储蓄率 vs 平台信任度（气泡大小=月访问次数）")
fig.show()

# ======= Visualization 3: top spending categories, count chosen interactively =======
def show_top_consumption(top_n=10):
    """Show a bar chart of the `top_n` spending columns with the highest mean."""
    category_means = df[consumption_cols].mean()
    top_means = category_means.sort_values(ascending=False).head(top_n)
    chart = px.bar(
        top_means,
        x=top_means.index,
        y=top_means.values,
        color=top_means.index,
        title=f"🍽️平均消费Top{top_n}",
    )
    chart.update_layout(xaxis_title="消费类别", yaxis_title="平均月消费(RMB)")
    chart.show()

# Slider from 5 to 20 categories, starting at 10.
top_n_slider = IntSlider(value=10, min=5, max=20)
interact(show_top_consumption, top_n=top_n_slider)

# ======= Visualization 4: simulated 12-month visit trend for one user =======
# Pick a single user (fixed random_state) and simulate 12 monthly visit counts
# as Poisson draws whose mean is that user's recorded monthly visit count.
user_sample = df.sample(1, random_state=42).iloc[0]
# Use a dedicated seeded generator so the simulated series is identical on
# every run, matching the fixed seed already used for the user selection.
visit_rng = np.random.default_rng(42)
time_series = pd.DataFrame({
    "月份": range(1,13),
    "访问次数": visit_rng.poisson(lam=float(user_sample["平台月访问次数"]), size=12)
})
fig = px.line(time_series, x="月份", y="访问次数",
              title=f"📈用户 {user_sample['用户ID']} 平台访问次数趋势")
fig.show()

# ======= Visualization 5: age / inflow / outflow 3-D bubble chart =======
# Marker sizes must be non-negative, but 储蓄率 is negative whenever a user's
# outflow exceeds their inflow. Shift the savings rate so its minimum maps to a
# small positive size: ordering is preserved and every point stays visible.
# The real savings rate is still shown on hover.
size_floor = 0.05
bubble_df = df.assign(
    **{"气泡大小": df["储蓄率"] - df["储蓄率"].min() + size_floor}
)
# The trust score column in the CSV is "信任评分" (there is no "平台信任评分").
fig = px.scatter_3d(bubble_df,
    x="年龄", y="月总流入", z="月总流出",
    color="年龄段", size="气泡大小",
    title="🔵 年龄-收入-支出 三维关系",
    hover_data=["储蓄率", "信任评分"]
)
fig.show()
