# 原始数据清理及SQL数据导入准备

In [2]:
import pandas as pd
import json

--> 检验数据格式

In [3]:
import os, pathlib

# Sanity-check the raw dump before parsing it: report whether it exists, how
# big it is and where the notebook is running, then peek at its first bytes.
path = "douban_movies_2021.json"

file_found = os.path.exists(path)
print(f"exists: {file_found}")

file_size = os.path.getsize(path)
print(f"size: {file_size} bytes")

working_dir = pathlib.Path().resolve()
print(f"cwd: {working_dir}")

# Binary mode on purpose: shows the raw UTF-8 bytes without decoding them.
with open(path, mode="rb") as f:
    head = f.read(200)
print(head)

exists: True
size: 389951705 bytes
cwd: /Users/zhaolong/Documents/<freshgrad/SQL/project
b'[\n  {\n    "movie_id": 1292052,\n    "title": "\xe8\x82\x96\xe7\x94\xb3\xe5\x85\x8b\xe7\x9a\x84\xe6\x95\x91\xe8\xb5\x8e",\n    "original_title": "The Shawshank Redemption",\n    "aka": [\n      "[\'\xe6\x9c\x88\xe9\xbb\x91\xe9\xab\x98\xe9\xa3\x9e(\xe6\xb8\xaf)\'",\n      "\'\xe5\x88\xba\xe6\xbf\x801995(\xe5\x8f\xb0)\'",\n      "\'\xe5\x9c\xb0\xe7\x8b\xb1'


In [4]:
# Parse the entire JSON dump into a single in-memory DataFrame.
raw_df = pd.read_json(path_or_buf='douban_movies_2021.json')

In [5]:
# Scalar, one-value-per-movie columns that are copied as-is into movie_df.
movie_cols = [
    'movie_id',
    'title',
    'original_title',
    'year',
    'mainland_pubdate',
    'summary',
    'record_time',
]
# Independent copy so later edits to movie_df never touch raw_df.
movie_df = raw_df.loc[:, movie_cols].copy()

In [6]:
def _star_count(details, star):
    """Return the vote count stored under ``star`` in ``details`` as an int.

    A missing key, ``None`` or NaN all count as 0, matching the default the
    caller already applies to absent keys.
    """
    value = details.get(star)
    if value is None or pd.isna(value):
        return 0
    return int(value)


def extract_rating(r):
    """Flatten one ``rating`` cell into a Series of scalar rating columns.

    Parameters
    ----------
    r : dict or missing value
        The cell is read with ``r['score']``, ``r['count']`` and
        ``r['raw']['details']`` (a mapping keyed by ``'1'``..``'5'``).
        Anything that is not a dict (``None``, NaN, a list, a string, ...)
        is treated as "no rating".

    Returns
    -------
    pandas.Series
        Keys ``score``, ``rating_count`` and ``star_1_count`` ..
        ``star_5_count``. Every value is ``None`` when there is no rating;
        otherwise star counts are ints, with 0 for absent or null entries.
    """
    # A type check instead of pd.isna(): pd.isna() returns an array for
    # list-like cells (ambiguous in an `if`), and non-dict scalars such as
    # strings would fall through to `.get` and fail.
    if not isinstance(r, dict):
        return pd.Series({
            'score': None,
            'rating_count': None,
            'star_1_count': None,
            'star_2_count': None,
            'star_3_count': None,
            'star_4_count': None,
            'star_5_count': None,
        })

    # `or {}` covers both a missing key and an explicit null at each level.
    raw = r.get('raw') or {}
    details = raw.get('details') or {}

    return pd.Series({
        'score': r.get('score'),
        'rating_count': r.get('count'),
        'star_1_count': _star_count(details, '1'),
        'star_2_count': _star_count(details, '2'),
        'star_3_count': _star_count(details, '3'),
        'star_4_count': _star_count(details, '4'),
        'star_5_count': _star_count(details, '5'),
    })

# One row of flattened rating fields per movie, in raw_df's row order.
rating_df = raw_df['rating'].apply(extract_rating)

# Engagement counters carried over unchanged from the raw data.
engagement_cols = [
    'collect_count',
    'wish_count',
    'comments_count',
    'reviews_count',
]

# movie_id first, then the rating fields, then the engagement counters.
movie_rating_df = pd.concat(
    [raw_df['movie_id'], rating_df, raw_df[engagement_cols]],
    axis=1,
)

In [7]:
# Expand each movie's genre list into one row per (movie_id, genre), drop
# movies with no genre, trim surrounding whitespace and discard blank names.
# NOTE(review): the raw dump's `aka` values show list-repr residue (leading
# "['" and quotes) — confirm `genres` entries are clean names before loading.
genre_exploded = (
    raw_df.loc[:, ['movie_id', 'genres']]
      .explode('genres')
      .dropna(subset=['genres'])
      .assign(genres=lambda df: df['genres'].astype(str).str.strip())
      .loc[lambda df: df['genres'] != '']
)

# Genre dimension table: distinct names in order of first appearance, with a
# 1-based surrogate key.
genre_names = genre_exploded['genres'].drop_duplicates().reset_index(drop=True)
genre_dim = genre_names.rename('genre_name').to_frame()
genre_dim['genre_id'] = genre_dim.index + 1

# movie_genre bridge table: swap each genre name for its key and keep one row
# per distinct (movie_id, genre_id) pair.
movie_genre_df = pd.merge(
    genre_exploded,
    genre_dim,
    left_on='genres',
    right_on='genre_name',
)
movie_genre_df = movie_genre_df.loc[:, ['movie_id', 'genre_id']].drop_duplicates()