# 随机数据生成（对user表和user_movie_rating表）

In [1]:
import pandas as pd
import numpy as np
import random
import datetime as dt

In [2]:
# -----------------------------
# Parameters: adjust as needed
# -----------------------------
N_USERS = 100  # number of simulated users
# Minimum / maximum number of ratings generated per user.
MIN_RATINGS_PER_USER, MAX_RATINGS_PER_USER = 30, 60

# Registration dates fall between REG_START and REG_END (upper bound <= 2020-12-31).
REG_START = dt.date(2005, 1, 1)
REG_END = dt.date(2020, 12, 31)
# Upper bound for rating timestamps; tied to REG_END so ratings stay before 2021.
CUTOFF_DATE = REG_END

# Candidate cities; the trailing None entry represents a missing city.
CITIES = [
    "北京", "上海", "广州", "深圳", "杭州",
    "成都", "武汉", "重庆", "南京", "西安",
    None,
]

# Seed both RNGs used in this notebook so runs are repeatable (optional).
SEED = 2025
random.seed(SEED)
np.random.seed(SEED)

## 1. 读取movie_rating作为影片池

In [3]:
# Load the movie table; its rows form the pool that simulated ratings draw from.
mr = pd.read_csv("movie_rating.csv")

# Parallel arrays: position i in both refers to the same row of `mr`.
movie_ids, movie_scores = mr["movie_id"].values, mr["score"].values

# Sampling weights proportional to rating_count, so popular movies are drawn
# more often. Missing counts are treated as 0, and 1 is added to every count
# so that no movie ends up with a zero weight.
raw_weights = mr["rating_count"].fillna(0).values + 1
weights = raw_weights / raw_weights.sum()

## 2. 生成user表数据

In [4]:
# Width of the registration window in days; randint below includes both ends,
# so REG_START and REG_END themselves can be drawn.
reg_total_days = (REG_END - REG_START).days
user_rows = []

for uid in range(1, N_USERS + 1):
    # Draw order matters for seeded reproducibility: date offset first, city second.
    reg_date = REG_START + dt.timedelta(days=random.randint(0, reg_total_days))
    picked_city = random.choice(CITIES)

    user_rows.append({
        "user_id": uid,
        "nickname": f"user{uid:03d}",
        "register_dt": reg_date.isoformat(),  # YYYY-MM-DD
        # A None pick is stored as an empty string.
        "city": "" if picked_city is None else picked_city
    })

user_df = pd.DataFrame(user_rows)

## 3. 生成user_movie_rating表数据

In [6]:
# Build the simulated user_movie_rating rows.
#
# For every user in user_df:
#   * draw how many movies they rate (MIN_RATINGS_PER_USER..MAX_RATINGS_PER_USER),
#   * pick that many distinct movies, weighted by `weights`,
#   * score each one around the movie's average score, clipped to [1, 10],
#   * stamp it with a random time between the user's registration date and
#     CUTOFF_DATE.
#
# The order of random draws is kept stable so a seeded run stays reproducible.
rating_rows = []
rating_id = 1

# Column order of the output table; passed explicitly so that an empty result
# still carries the expected schema.
rating_columns = ["rating_id", "user_id", "movie_id", "rating_score", "rating_time"]

# Distinct movies that the weighted draw can actually return. A user cannot
# rate more movies than this: without the cap below, the rejection loop would
# spin forever once every reachable movie had been used.
n_pickable_movies = int(np.unique(movie_ids[weights > 0]).size)

for u in user_df.itertuples(index=False):
    uid = int(u.user_id)
    reg_date = dt.date.fromisoformat(u.register_dt)

    # Number of movies this user rates, capped at the size of the pool.
    requested_ratings = random.randint(MIN_RATINGS_PER_USER, MAX_RATINGS_PER_USER)
    if requested_ratings > 0 and n_pickable_movies == 0:
        raise ValueError(
            "movie pool is empty: cannot generate ratings without any movies"
        )
    n_ratings = min(requested_ratings, n_pickable_movies)

    # Rating dates fall in [start_date, CUTOFF_DATE]. This window depends only
    # on the user, so it is computed once per user rather than once per rating.
    # min() keeps the window valid if a registration date is after the cutoff.
    start_date = min(reg_date, CUTOFF_DATE)
    delta_days = (CUTOFF_DATE - start_date).days

    # Guarantees at most one rating per (user, movie) pair.
    used_movie_ids = set()

    for _ in range(n_ratings):
        # Weighted draw with rejection: retry until an unrated movie comes up.
        # Terminates because n_ratings never exceeds the reachable pool size.
        while True:
            idx = np.random.choice(len(movie_ids), p=weights)
            mid = int(movie_ids[idx])
            if mid not in used_movie_ids:
                break
        used_movie_ids.add(mid)
        avg_score = float(movie_scores[idx])

        # Score: normal noise (sd 1.0) around the movie's average, rounded to
        # an integer and clipped to [1, 10].
        score = np.random.normal(loc=avg_score, scale=1.0)
        score_int = max(1, min(10, int(round(score))))

        # Random date inside the window (both ends included), then a random
        # time of day.
        rating_date = start_date + dt.timedelta(days=random.randint(0, delta_days))
        hour = random.randint(0, 23)
        minute = random.randint(0, 59)
        second = random.randint(0, 59)
        rating_dt = dt.datetime.combine(
            rating_date, dt.time(hour, minute, second)
        )

        rating_rows.append({
            "rating_id": rating_id,
            "user_id": uid,
            "movie_id": mid,
            "rating_score": score_int,
            "rating_time": rating_dt.strftime("%Y-%m-%d %H:%M:%S")
        })
        rating_id += 1

ratings_df = pd.DataFrame(rating_rows, columns=rating_columns)

## 4. 导出为 CSV，供 MySQL LOAD DATA 使用

In [9]:
# Write both simulated tables to CSV, leaving out the pandas index column.
for frame, csv_path in (
    (user_df, "user_sim.csv"),
    (ratings_df, "user_movie_rating_sim.csv"),
):
    frame.to_csv(csv_path, index=False)

In [10]:
# Row counts of the two exported tables.
n_user_rows = len(user_df)
n_rating_rows = len(ratings_df)
print("user_sim.csv 行数:", n_user_rows)
print("user_movie_rating_sim.csv 行数:", n_rating_rows)

# Plain-text preview of the first five rows of each table.
user_preview = user_df.head()
print("\nuser 表前 5 行：")
print(user_preview)

ratings_preview = ratings_df.head()
print("\nuser_movie_rating 表前 5 行：")
print(ratings_preview)

user_sim.csv 行数: 100
user_movie_rating_sim.csv 行数: 4277

user 表前 5 行：
   user_id nickname register_dt city
0        1  user001  2017-07-06   上海
1        2  user002  2019-06-27   重庆
2        3  user003  2008-11-20   南京
3        4  user004  2005-01-04   成都
4        5  user005  2013-06-05   西安

user_movie_rating 表前 5 行：
   rating_id  user_id  movie_id  rating_score          rating_time
0          1        1   1292000             7  2018-07-16 08:22:53
1          2        1   2353023             8  2020-12-14 01:54:52
2          3        1   6869412             8  2017-07-20 18:07:51
3          4        1   2005723             8  2020-10-29 19:21:48
4          5        1  26366496             8  2019-03-06 07:43:16
