# 百度地图地铁数据处理


In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
import sys
# Extend the import path three directories up so the project packages below
# can be imported (presumably where `secure` and `k_libs` live — confirm).
sys.path.append("../../..")
from secure.db_account import SubwayPrd
from k_libs.db_query import DBOperate

# Shared database helper; later cells call DBO.read_sql and DBO.df_to_sql.
DBO = DBOperate(SubwayPrd)

## 字段表


In [3]:
# Line-level fields: raw source key, processed column name, and description.
line_fields = [
    {'raw': raw_key, 'processed': processed_name, 'info': description}
    for raw_key, processed_name, description in (
        ('lid', 'line_name_full', '线路全名'),
        ('lb', 'line_name', '线路简称'),
        ('n', 'num_stations', '线路站点数'),
        ('loop', 'is_loop', '是否环线'),
        ('lbx', 'label_x', '线路标签X坐标'),
        ('lby', 'label_y', '线路标签Y坐标'),
        ('lbr', 'label_rotation', '线路标签旋转角度'),
        ('lc', 'line_color', '线路颜色'),
        ('uid', 'line_uid', '线路唯一ID'),
        ('uid2', 'line_uid2', '线路备用唯一ID'),
    )
]

In [4]:
# Station-level fields: raw source key, processed column name, and description.
station_fields = [
    {'raw': raw_key, 'processed': processed_name, 'info': description}
    for raw_key, processed_name, description in (
        ('sid', 'station_name', '站点名称'),
        ('lb', 'station_label', '站点标签'),
        ('x', 'x', '站点X坐标'),
        ('y', 'y', '站点Y坐标'),
        ('rx', 'label_offset_x', '站点标签X偏移'),
        ('ry', 'label_offset_y', '站点标签Y偏移'),
        ('st', 'is_station', '是否为车站'),
        ('ex', 'is_exchange', '是否换乘点'),
        ('iu', 'is_use', '是否在用,true表示该站已开通并运营'),
        ('rc', 'is_rail_construction', '是否为规划/在建站点'),
        ('slb', 'show_label', '是否显示标签'),
        ('ln', 'lines', '所属线路'),
        ('uid', 'station_uid', '站点唯一ID'),
        ('px', 'proj_x', '站点投影坐标X'),
        ('py', 'proj_y', '站点投影坐标Y'),
    )
]


## 原始数据
ODS 层（Operational Data Store，操作数据存储层）

存放业务系统的原始数据，基本不做加工。

特点：与业务库字段保持一致，保证数据的完整性与可追溯性。

作用：承接源系统，作为数据仓库的“原材料”。

In [5]:
# Read only the rows whose crawler_id equals the largest crawler_id in ods_subway_baidu.
sql = """SELECT * FROM ods_subway_baidu WHERE crawler_id=(SELECT MAX(crawler_id) FROM ods_subway_baidu);"""
df_raw = DBO.read_sql(sql)

In [6]:
# Show column names, non-null counts and dtypes of the raw data.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15923 entries, 0 to 15922
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   line_lid      15923 non-null  object
 1   line_lb       15923 non-null  object
 2   line_slb      15923 non-null  object
 3   line_n        15923 non-null  object
 4   line_loop     15923 non-null  object
 5   line_lbx      15923 non-null  object
 6   line_lby      15923 non-null  object
 7   line_lbr      15923 non-null  object
 8   line_lc       15923 non-null  object
 9   line_uid      15923 non-null  object
 10  line_uid2     15923 non-null  object
 11  st_sid        15923 non-null  object
 12  st_lb         15923 non-null  object
 13  st_x          15923 non-null  object
 14  st_y          15923 non-null  object
 15  st_rx         15923 non-null  object
 16  st_ry         15923 non-null  object
 17  st_st         15923 non-null  object
 18  st_ex         15923 non-null  object
 19  st_i

In [7]:
# Display the raw DataFrame.
df_raw

Unnamed: 0,line_lid,line_lb,line_slb,line_n,line_loop,line_lbx,line_lby,line_lbr,line_lc,line_uid,...,st_int,st_uid,st_px,st_py,city_id,city_name,city_name_e,crawler_id,crawler_date,uid
0,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,291c5802f26a751cbca240d9,12935140.04,4825694.5,131,北京,beijing,250928,2025-09-28,1312509280
1,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,ad28546df35285eb851541d9,12937624.6,4825645.68,131,北京,beijing,250928,2025-09-28,1312509281
2,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,a047555503a5bc5cbc4842d9,12940185.58,4825661.64,131,北京,beijing,250928,2025-09-28,1312509282
3,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,a27fa5a23a128501177643d9,12942086.4,4825707.12,131,北京,beijing,250928,2025-09-28,1312509283
4,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,53889c15034f2e3f58bbbcde,12944444.84,4825755.15,131,北京,beijing,250928,2025-09-28,1312509284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15918,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,,,,51314,马德里,madeli,250928,2025-09-28,51314250928445
15919,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,,,,51314,马德里,madeli,250928,2025-09-28,51314250928446
15920,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,5660ab357cf66173901fc9b7,-417722.43,4872240.68,51314,马德里,madeli,250928,2025-09-28,51314250928447
15921,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,d79642c2e73da856eedfcab7,-417875.2,4871682.71,51314,马德里,madeli,250928,2025-09-28,51314250928448


## DWD 
DWD 层（Data Warehouse Detail，明细数据层）  
在 ODS 的基础上进行清洗、规范化，保留明细粒度的数据。  
特点：字段标准化（统一命名、数据类型），做一些维度退化或拆分。  
作用：保证数据“可用”，是最常被下游加工使用的一层。  

In [8]:
def dwd_subway_bd_to_sql(df_raw):
    """
    Turn the raw Baidu subway rows into DWD-layer data and store them in the database.

    Steps: rename the raw columns, put them in a fixed order, encode the
    'True'/'False' string flags as 1/0, rewrite the line color so it starts
    with '#', recompute st_is_station from st_name, then write the result to
    the dwd_subway_baidu table (replacing it) through the module-level DBO.

    df_raw: raw ODS DataFrame
    Returns the DWD DataFrame.
    """
    column_aliases = {
        'line_lid': 'line_name_full',
        'line_lb': 'line_name',
        'line_slb': 'line_show_label',
        'line_loop': 'line_is_loop',
        'line_lbx': 'line_label_x',
        'line_lby': 'line_label_y',
        'line_lbr': 'line_label_rotation',
        'line_lc': 'line_color',
        'st_sid': 'st_name',
        'st_lb': 'st_label',
        'st_rx': 'st_label_offset_x',
        'st_ry': 'st_label_offset_y',
        'st_st': 'st_is_station',
        'st_ex': 'st_is_exchange',
        'st_iu': 'st_is_use',
        'st_rc': 'st_is_rail_construction',
        'st_slb': 'st_show_label',
        'st_ln': 'st_lines',
        'st_px': 'st_proj_x',
        'st_py': 'st_proj_y',
    }

    # Output column order: city, line, station, then crawl bookkeeping.
    city_columns = ['city_id', 'city_name', 'city_name_e']
    line_columns = [
        'line_name_full', 'line_name', 'line_show_label', 'line_n', 'line_is_loop',
        'line_label_x', 'line_label_y', 'line_label_rotation', 'line_color',
        'line_uid', 'line_uid2',
    ]
    station_columns = [
        'st_name', 'st_label', 'st_x', 'st_y', 'st_label_offset_x', 'st_label_offset_y',
        'st_is_station', 'st_is_exchange', 'st_is_use', 'st_is_rail_construction',
        'st_show_label', 'st_lines', 'st_int', 'st_uid', 'st_proj_x', 'st_proj_y',
    ]
    crawl_columns = ['crawler_id', 'crawler_date', 'uid']
    ordered_columns = city_columns + line_columns + station_columns + crawl_columns

    # Columns holding 'True'/'False' strings, and their numeric encoding.
    flag_columns = [
        'line_show_label', 'line_is_loop', 'st_is_station', 'st_is_exchange',
        'st_is_use', 'st_is_rail_construction', 'st_show_label',
    ]
    flag_codes = {'True': 1, 'False': 0}

    def _to_hash_color(raw_color):
        # Values already starting with '#' pass through; otherwise the first
        # two characters are dropped and '#' is put in front.
        if raw_color.startswith('#'):
            return raw_color
        return f"#{raw_color[2:]}"

    def _has_station_name(station_name):
        # A row counts as a station when its name is a non-empty string.
        return 1 if len(station_name) > 0 else 0

    renamed = df_raw.rename(columns=column_aliases)
    df_dwd = renamed[ordered_columns].copy()
    for flag in flag_columns:
        df_dwd[flag] = df_dwd[flag].map(flag_codes)
    df_dwd['line_color'] = df_dwd['line_color'].apply(_to_hash_color)
    df_dwd['st_is_station'] = df_dwd['st_name'].apply(_has_station_name)

    # Persist to the database
    DBO.df_to_sql(df_dwd, "dwd_subway_baidu", if_exists="replace")
    print("Data saved to dwd_subway_baidu table.")
    return df_dwd

# Build the DWD data (this also writes the dwd_subway_baidu table) and inspect its schema.
dwd_subway = dwd_subway_bd_to_sql(df_raw)
dwd_subway.info()

Data saved to dwd_subway_baidu table.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15923 entries, 0 to 15922
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city_id                  15923 non-null  object 
 1   city_name                15923 non-null  object 
 2   city_name_e              15923 non-null  object 
 3   line_name_full           15923 non-null  object 
 4   line_name                15923 non-null  object 
 5   line_show_label          15923 non-null  int64  
 6   line_n                   15923 non-null  object 
 7   line_is_loop             15923 non-null  int64  
 8   line_label_x             15923 non-null  object 
 9   line_label_y             15923 non-null  object 
 10  line_label_rotation      15923 non-null  object 
 11  line_color               15923 non-null  object 
 12  line_uid                 15923 non-null  object 
 13  line_uid2                15923 non-nul

## DWS
DWS 层（Data Warehouse Summary，汇总数据层）

在 DWD 基础上，按业务主题和常用维度做聚合统计。

特点：以主题域为核心（如用户、订单、交易），形成宽表或统计指标。

作用：减少重复计算，支撑公共分析需求。

1. 城市表 dws_subway_bd_city
2. 线路表 dws_subway_bd_line
3. 车站表 dws_subway_bd_st

### 城市表

In [29]:
# Load the city lookup data from the local city_info.json file.
with open("city_info.json", "r", encoding="utf-8") as f:
    city_info = json.load(f)
def get_city_infos_dws(df, city_info):
    """
    Build the city information table.

    df: DWD-layer data (dwd_subway)
    city_info: dict loaded from city_info.json; its 'cities' entry is used
    Returns one row per distinct (city_id, city_name, city_name_e) with a
    1-based city_order and the province/country columns joined by city_name.
    """
    column_renames = {
        'province_chinese': 'province',
        'province_pinyin_english': 'province_e',
        'country_chinese': 'country',
        'country_english': 'country_e',
    }
    lookup = pd.DataFrame.from_dict(city_info['cities'])

    # Distinct cities in order of first appearance; city_order follows that order.
    cities = (
        df.loc[:, ['city_id', 'city_name', 'city_name_e']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    cities = cities.assign(city_order=cities.index + 1)

    return (
        cities.merge(lookup, how='left', on='city_name')
        .drop(columns=['english_pinyin_name'])
        .rename(columns=column_renames)
    )
# Build the city table from the DWD data and display it.
dwd_subway_city = get_city_infos_dws(dwd_subway, city_info)
dwd_subway_city

Unnamed: 0,city_id,city_name,city_name_e,city_order,province,province_e,country,country_e
0,131,北京,beijing,1,北京市,Beijing Shi,中国,China
1,289,上海,shanghai,2,上海市,Shanghai Shi,中国,China
2,257,广州,guangzhou,3,广东省,Guangdong Sheng,中国,China
3,340,深圳,shenzhen,4,广东省,Guangdong Sheng,中国,China
4,132,重庆,chongqing,5,重庆市,Chongqing Shi,中国,China
...,...,...,...,...,...,...,...,...
74,65531,莫斯科,mosike,75,莫斯科州,Moscow Oblast,俄罗斯,Russia
75,52390,鹿特丹,lutedan,76,南荷兰省,South Holland Province,荷兰,Netherlands
76,48552,伊斯坦布尔,yisitanbuer,77,伊斯坦布尔省,Istanbul Province,土耳其,Turkey
77,51271,巴塞罗那,basailuona,78,加泰罗尼亚自治区,Catalonia Autonomous Community,西班牙,Spain


### 线路表
* line_uid有空值，不可用
* line_name_full 暂无重复值，可以用
* line_name 有重复值，需处理

#### 支线名称清洗

In [42]:
def clear_line_branch(df):
    """
    Derive a branch name for every line; line_name_full must be unique within a city.

    df: dwd_subway
    Returns one row per distinct line with line_name_count, line_name_full_count,
    line_name_branch and a per-city 1-based line_order added.
    Raises ValueError when a line_name_full occurs more than once in a city.
    """
    line_columns = [
        'city_id', 'city_name', 'line_name_full', 'line_name', 'line_is_loop',
        'line_label_x', 'line_label_y', 'line_label_rotation', 'line_color',
    ]
    lines = df[line_columns].drop_duplicates().reset_index(drop=True)

    # How often each short name / full name occurs within its city.
    by_short_name = lines.groupby(['city_name', 'line_name'])['line_name']
    lines['line_name_count'] = by_short_name.transform('count')
    by_full_name = lines.groupby(['city_name', 'line_name_full'])['line_name_full']
    lines['line_name_full_count'] = by_full_name.transform('count')

    if lines['line_name_full_count'].gt(1).any():
        # Stop: the full name is expected to identify a line uniquely.
        raise ValueError("存在line_name_full相同的线路，非唯一，需要定位数据，做进一步处理")

    # Fall back to the full name where the short name is ambiguous,
    # then strip the "地铁" marker from the result.
    ambiguous = lines['line_name_count'] > 1
    branch_names = lines['line_name'].mask(ambiguous, lines['line_name_full'])
    lines['line_name_branch'] = branch_names.str.replace('地铁', '', regex=False)

    # Number the lines of each city in row order.
    lines['line_order'] = lines.groupby('city_id').cumcount() + 1
    return lines

# Build the per-line table with branch names and display it.
df_line_branch = clear_line_branch(dwd_subway)
df_line_branch

Unnamed: 0,city_id,city_name,line_name_full,line_name,line_is_loop,line_label_x,line_label_y,line_label_rotation,line_color,line_name_count,line_name_full_count,line_name_branch,line_order
0,131,北京,地铁1号线八通线,1号线八通线,0,-498.9,139.1,0,#c03935,1,1,1号线八通线,1
1,131,北京,地铁2号线,2号线,1,-175.3,-4.9,0,#005f98,1,1,2号线,2
2,131,北京,地铁3号线,3号线,0,450,-55,0,#942413,1,1,3号线,3
3,131,北京,地铁4号线大兴线,4号线大兴线,0,-499.3,-224,0,#008e9c,1,1,4号线大兴线,4
4,131,北京,地铁5号线,5号线,0,18,-353.9,0,#a6217f,1,1,5号线,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,51314,马德里,ML1,ML1,0,-70,-960,0,#5086c2,1,1,ML1,13
629,51314,马德里,ML2,ML2,0,-780,-280,0,#ea4d81,1,1,ML2,14
630,51314,马德里,ML3,ML3,0,-1800,295.2,0,#ff6642,1,1,ML3,15
631,51314,马德里,ML42,ML4,0,-110,1600,0,#77b838,2,1,ML42,16


#### 主线名称清洗
1. 计算line_name与上一行相同字数，作为same_prefix_len列
2. 筛选出same_prefix_len>2的行，以及每一行的上一行
3. 将['city_id', 'city_name', 'line_name_full', 'line_name', 'line_name_branch']保存为新的df
4. 对比subway_line_fixed.json文件，为其中尚未收录的线路手工添加"line_name_main"字段，并将这些记录追加到subway_line_fixed.json文件末尾。

In [44]:
def _shared_prefix_len(current, previous):
    """Return how many leading characters the two strings have in common."""
    shared = 0
    for c1, c2 in zip(current, previous):
        if c1 != c2:
            break
        shared += 1
    return shared


def clear_line_main(df_line_branch, subway_line_fixed):
    """
    Report branch lines that still need a manually assigned main-line name.

    Within each city, a line whose line_name shares more than 2 leading
    characters with the previous row's line_name is treated as a candidate,
    together with that previous row. Candidates whose
    (city_id, city_name, line_name_full, line_name) is not yet present in
    subway_line_fixed are printed so they can be given a "line_name_main"
    by hand and appended to subway_line_fixed.json.

    df_line_branch: per-line data with branch names
    subway_line_fixed: list of dicts loaded from subway_line_fixed.json
    Returns None; the result is the printed report.
    """
    fields = ['city_id', 'city_name', 'line_name_full', 'line_name']

    # Row positions per city, in order of first appearance. Positions (not
    # index labels) are used so the logic does not depend on the index.
    positions_by_city = {}
    for pos, city in enumerate(df_line_branch['city_name']):
        if pd.isna(city):
            continue
        positions_by_city.setdefault(city, []).append(pos)

    line_names = df_line_branch['line_name'].tolist()

    # Dict used as an insertion-ordered set of candidate row positions.
    candidate_positions = {}
    for positions in positions_by_city.values():
        # The first row of a city has no predecessor and is never a match itself.
        for prev_pos, pos in zip(positions, positions[1:]):
            if _shared_prefix_len(line_names[pos], line_names[prev_pos]) > 2:
                candidate_positions[prev_pos] = None
                candidate_positions[pos] = None

    # Selecting by position keeps the columns even when nothing matched, so
    # an empty result no longer fails on the column selection below.
    candidates = df_line_branch.iloc[list(candidate_positions)]
    candidate_records = (
        candidates[fields + ['line_name_branch']]
        .drop_duplicates()
        .to_dict(orient='records')
    )

    # Keys of the lines that already have a manual entry.
    fixed_set = {
        tuple(line[field] for field in fields)
        for line in subway_line_fixed
    }

    # The rows printed below need a "line_name_main" added by hand and must
    # then be appended to subway_line_fixed.json.
    print("以下线路需要手动添加主线名称line_name_main，并复制到subway_line_fixed.json文件最后")
    for item in candidate_records:
        key = tuple(item[field] for field in fields)
        if key not in fixed_set:
            print(item)

# Load the manually maintained line entries and print the lines that are still missing from them.
with open("subway_line_fixed.json", "r", encoding="utf-8") as f:
    subway_line_fixed = json.load(f)
clear_line_main(df_line_branch, subway_line_fixed)

以下线路需要手动添加主线名称line_name_main，并复制到subway_line_fixed.json文件最后


#### 合并线路数据

In [47]:
# Re-read the manual entries (the file may have been edited after the check above).
with open("subway_line_fixed.json", "r", encoding="utf-8") as f:
    subway_line_fixed = json.load(f)
subway_line_fixed_df = pd.DataFrame.from_dict(subway_line_fixed)
# Attach line_name_main to every line by (city_id, line_name_full).
# NOTE(review): duplicate keys in subway_line_fixed.json would duplicate line rows here — confirm the file is unique per key.
dwd_subway_bd_line = df_line_branch.merge(subway_line_fixed_df[['city_id', 'line_name_full', 'line_name_main']], how='left', on=['city_id', 'line_name_full']) 
# Lines without a manual entry use their branch name as the main-line name.
dwd_subway_bd_line['line_name_main'] = dwd_subway_bd_line['line_name_main'].fillna(dwd_subway_bd_line['line_name_branch'])
dwd_subway_bd_line


Unnamed: 0,city_id,city_name,line_name_full,line_name,line_is_loop,line_label_x,line_label_y,line_label_rotation,line_color,line_name_count,line_name_full_count,line_name_branch,line_order,line_name_main
0,131,北京,地铁1号线八通线,1号线八通线,0,-498.9,139.1,0,#c03935,1,1,1号线八通线,1,1号线八通线
1,131,北京,地铁2号线,2号线,1,-175.3,-4.9,0,#005f98,1,1,2号线,2,2号线
2,131,北京,地铁3号线,3号线,0,450,-55,0,#942413,1,1,3号线,3,3号线
3,131,北京,地铁4号线大兴线,4号线大兴线,0,-499.3,-224,0,#008e9c,1,1,4号线大兴线,4,4号线大兴线
4,131,北京,地铁5号线,5号线,0,18,-353.9,0,#a6217f,1,1,5号线,5,5号线
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,51314,马德里,ML1,ML1,0,-70,-960,0,#5086c2,1,1,ML1,13,ML1
629,51314,马德里,ML2,ML2,0,-780,-280,0,#ea4d81,1,1,ML2,14,ML2
630,51314,马德里,ML3,ML3,0,-1800,295.2,0,#ff6642,1,1,ML3,15,ML3
631,51314,马德里,ML42,ML4,0,-110,1600,0,#77b838,2,1,ML42,16,ML4


### 车站表
车站唯一id问题，存在同一个城市有同名车站，比如杭州，有2个奥体中心，1个在杭州，1个在绍兴，需要使用sid（st_name）作为城市唯一标识

In [22]:
def get_dwd_subway_st(df, df_line, is_all_st=True):
    """
    Build the station table with per-line ordering and next-station links.

    Rows are either real stations or virtual ones (rows without a name).

    df: dwd_subway data
    df_line: dwd_subway_bd_line data; supplies line_order, line_name_branch
        and line_name_main per (city_id, line_name_full)
    is_all_st: True keeps every row, False keeps only rows with st_is_station == 1
    Returns the input rows extended with the line columns, st_order,
    st_id_virtual, target_st_id_virtual and target_st_name / target_st_x /
    target_st_y of the following station.
    Raises ValueError when a line in df has no matching row in df_line.
    """
    line_keys = ['city_id', 'line_name_full']

    df = df.copy() if is_all_st else df[df['st_is_station'] == 1].copy()
    print(f"Total stations: {len(df)}")

    # Attach the line order and names
    res_df = df.merge(
        df_line[line_keys + ['line_order', 'line_name_branch', 'line_name_main']],
        how='left', on=line_keys,
    )
    print(f"Total stations after merge line info: {len(res_df)}")

    # A left merge never drops rows, so a changed length can only mean that
    # df_line holds the same (city_id, line_name_full) more than once.
    if len(res_df) != len(df):
        print(f"WARNING! Rows added by duplicated line keys in df_line: {len(res_df) - len(df)}")

    # Lines missing from df_line show up as a null line_order; fail with a
    # clear message instead of a format error further down.
    unmatched = res_df['line_order'].isna()
    if unmatched.any():
        unmatched_lines = res_df.loc[unmatched, line_keys].drop_duplicates().to_dict(orient='records')
        raise ValueError(f"No line info found in df_line for: {unmatched_lines}")

    # Position of each station within its line (1-based, in row order)
    res_df['st_order'] = res_df.groupby(line_keys).cumcount() + 1

    # Station id: <city_id>_<line_order, 3 digits>_<st_order, 3 digits>
    res_df['st_id_virtual'] = (
        res_df['city_id'].astype(str)
        + '_' + res_df['line_order'].astype(int).astype(str).str.zfill(3)
        + '_' + res_df['st_order'].astype(str).str.zfill(3)
    )

    # Rows with a missing or empty st_name use the station id as their name
    name_missing = res_df['st_name'].isna() | (res_df['st_name'] == '')
    res_df['st_name'] = res_df['st_name'].mask(name_missing, res_df['st_id_virtual'])

    # Id of the next station on the same line (null for the last one)
    res_df['target_st_id_virtual'] = res_df.groupby(line_keys)['st_id_virtual'].shift(-1)

    # On loop lines (line_is_loop == 1) the last station links back to the first
    loop_keys = res_df.loc[res_df['line_is_loop'] == 1, line_keys].drop_duplicates()
    for _, row in loop_keys.iterrows():
        mask = (res_df['city_id'] == row['city_id']) & (res_df['line_name_full'] == row['line_name_full'])
        idx = res_df.loc[mask].index

        if len(idx) > 1:  # a loop needs at least two stations
            first_idx = idx[0]
            last_idx = idx[-1]
            res_df.loc[last_idx, 'target_st_id_virtual'] = res_df.loc[first_idx, 'st_id_virtual']

    # Look up name and coordinates of the next station by its id
    target_lookup = res_df[['st_id_virtual', 'st_name', 'st_x', 'st_y']].rename(columns={
        'st_id_virtual': 'target_st_id_virtual',
        'st_name': 'target_st_name',
        'st_x': 'target_st_x',
        'st_y': 'target_st_y'
    })
    res_df = res_df.merge(target_lookup, how='left', on='target_st_id_virtual')
    return res_df

#### 全量车站表
含虚拟车站，用于线条更柔和

In [23]:
# Station table including every row (is_all_st=True).
dwd_subway_st_all =  get_dwd_subway_st(dwd_subway, dwd_subway_bd_line, is_all_st=True)

Total stations: 15923
Total stations after merge line info: 15923


#### 真实车站表
真实存在的， 即st_is_station == 1

In [24]:
# Station table restricted to rows with st_is_station == 1 (is_all_st=False).
dwd_subway_st_real =  get_dwd_subway_st(dwd_subway, dwd_subway_bd_line, is_all_st=False)


Total stations: 13015
Total stations after merge line info: 13015


## ADS
ADS 层（Application Data Store，应用数据层）

面向应用和报表的最终数据，通常是宽表、指标表。

特点：与具体应用、报表或接口一一对应。

作用：满足业务方“即取即用”，保证查询效率。

1. city_infos 城市信息表
2. city_stats 城市维度统计指标
3. city_line_links 线路间换乘站数量， 不需要lineStyle数据
4. city_line_data

### 城市信息

#### dict

In [30]:
# Rebuild the city table used as input for the ADS-layer outputs and display it.
dwd_subway_city = get_city_infos_dws(dwd_subway, city_info)
dwd_subway_city

Unnamed: 0,city_id,city_name,city_name_e,city_order,province,province_e,country,country_e
0,131,北京,beijing,1,北京市,Beijing Shi,中国,China
1,289,上海,shanghai,2,上海市,Shanghai Shi,中国,China
2,257,广州,guangzhou,3,广东省,Guangdong Sheng,中国,China
3,340,深圳,shenzhen,4,广东省,Guangdong Sheng,中国,China
4,132,重庆,chongqing,5,重庆市,Chongqing Shi,中国,China
...,...,...,...,...,...,...,...,...
74,65531,莫斯科,mosike,75,莫斯科州,Moscow Oblast,俄罗斯,Russia
75,52390,鹿特丹,lutedan,76,南荷兰省,South Holland Province,荷兰,Netherlands
76,48552,伊斯坦布尔,yisitanbuer,77,伊斯坦布尔省,Istanbul Province,土耳其,Turkey
77,51271,巴塞罗那,basailuona,78,加泰罗尼亚自治区,Catalonia Autonomous Community,西班牙,Spain


In [31]:
def get_city_infos_dict(df):
    """
    Assemble the city lookup structures from the city table.

    df: dwd_subway_city data
    Returns a dict with:
      city_id_list / city_name_list: the columns in row order
      country_list: countries ordered by number of cities, largest first
      country_city_dict: country -> list of its city_id values
      city_dict: city_id -> record with the city's attributes
    """
    record_keys = (
        'city_id', 'city_name', 'city_name_e',
        'province', 'province_e',
        'country', 'country_e',
        'city_order',
    )

    # Countries ranked by how many cities they contain.
    cities_per_country = df.groupby('country').size()
    ranked_countries = cities_per_country.sort_values(ascending=False)

    members_by_country = {
        country: members['city_id'].tolist()
        for country, members in df.groupby('country')
    }
    records_by_city = {
        record['city_id']: {key: record[key] for key in record_keys}
        for _, record in df.iterrows()
    }

    return {
        'city_id_list': df['city_id'].tolist(),
        'city_name_list': df['city_name'].tolist(),
        'country_list': ranked_countries.index.tolist(),
        'country_city_dict': members_by_country,
        'city_dict': records_by_city,
    }

# Build the city lookup dict and display it.
city_infos = get_city_infos_dict(dwd_subway_city)
city_infos

{'city_id_list': ['131',
  '289',
  '257',
  '340',
  '132',
  '332',
  '150',
  '315',
  '75',
  '58',
  '179',
  '218',
  '158',
  '224',
  '167',
  '53',
  '233',
  '104',
  '138',
  '48',
  '268',
  '180',
  '317',
  '178',
  '236',
  '163',
  '300',
  '119',
  '261',
  '127',
  '194',
  '92',
  '146',
  '2912',
  '9002',
  '9019',
  '161',
  '242',
  '122',
  '313',
  '323',
  '155',
  '348',
  '197',
  '288',
  '36',
  '316',
  '321',
  '176',
  '333',
  '153',
  '293',
  '129',
  '244',
  '189',
  '274',
  '2911',
  '20001',
  '20508',
  '26001',
  '26019',
  '26022',
  '26033',
  '26041',
  '30001',
  '30004',
  '30005',
  '30007',
  '30016',
  '49872',
  '39816',
  '53009',
  '39817',
  '60732',
  '65531',
  '52390',
  '48552',
  '51271',
  '51314'],
 'city_name_list': ['北京',
  '上海',
  '广州',
  '深圳',
  '重庆',
  '天津',
  '石家庄',
  '南京',
  '成都',
  '沈阳',
  '杭州',
  '武汉',
  '长沙',
  '苏州',
  '大连',
  '长春',
  '西安',
  '昆明',
  '佛山',
  '哈尔滨',
  '郑州',
  '宁波',
  '无锡',
  '温州',
  '青岛',
  '南昌',
  

#### 保存本地json

In [32]:
# Rebuild the city lookup dict and write it to json_data/city_infos.json
# (non-ASCII characters are kept as-is, indented by 4 spaces).
city_infos = get_city_infos_dict(dwd_subway_city)
with open("json_data/city_infos.json", "w", encoding="utf-8") as f:
    json.dump(city_infos, f, ensure_ascii=False, indent=4)

### 城市指标


#### df

In [33]:
def get_city_stats_ads(df_st, df_city):
    """
    Compute per-city subway metrics: line counts, station count, transfer-station count and share.

    df_st: dwd_subway_st_real (the real-station table is used for city metrics)
    df_city: dwd_subway_city data, supplies city_order
    Returns one row per city, sorted by city_order.
    """
    group_keys = ['city_id', 'city_name', 'city_name_e']

    def _unique_transfer_names(names):
        # Count distinct station names among the group's rows flagged as exchange.
        is_transfer = df_st.loc[names.index, 'st_is_exchange'] == 1
        return names[is_transfer].nunique()

    aggregations = {
        'line_name_main_count': ('line_name_main', 'nunique'),
        'line_name_full_count': ('line_name_full', 'nunique'),
        'station_count': ('st_name', 'nunique'),
        'transfer_station_count': ('st_name', _unique_transfer_names),
    }
    stats = df_st.groupby(group_keys).agg(**aggregations).reset_index()

    # Share of transfer stations, rounded to 4 decimals (missing values become 0)
    share = stats['transfer_station_count'] / stats['station_count']
    stats['transfer_station_ratio'] = share.fillna(0).round(4)

    # Attach the display order and sort by it
    order_lookup = df_city[['city_id', 'city_order']]
    stats = stats.merge(order_lookup, how='left', on='city_id')
    stats = stats.sort_values('city_order').reset_index(drop=True)

    stats['city_name'] = stats['city_name'].str.replace('特别行政区', '')
    return stats

# Compute the city metrics from the real-station table and display them.
ads_subway_city_stats = get_city_stats_ads(dwd_subway_st_real, dwd_subway_city)
ads_subway_city_stats

Unnamed: 0,city_id,city_name,city_name_e,line_name_main_count,line_name_full_count,station_count,transfer_station_count,transfer_station_ratio,city_order
0,131,北京,beijing,27,28,414,99,0.2391,1
1,289,上海,shanghai,22,25,420,92,0.2190,2
2,257,广州,guangzhou,28,32,453,100,0.2208,3
3,340,深圳,shenzhen,17,19,354,68,0.1921,4
4,132,重庆,chongqing,14,16,282,46,0.1631,5
...,...,...,...,...,...,...,...,...,...
74,65531,莫斯科,mosike,12,14,181,52,0.2873,75
75,52390,鹿特丹,lutedan,5,5,62,32,0.5161,76
76,48552,伊斯坦布尔,yisitanbuer,14,14,159,15,0.0943,77
77,51271,巴塞罗那,basailuona,19,20,196,45,0.2296,78


#### dict

In [34]:
def get_city_stats_dict(df_city_stats):
    """
    Convert the city metrics table into a dict keyed by city_id.

    df_city_stats: ads_subway_city_stats data
    Returns city_id -> record; counts and city_order are converted to int and
    the ratio to float.
    """
    text_keys = ('city_id', 'city_name', 'city_name_e')
    count_keys = (
        'line_name_main_count', 'line_name_full_count',
        'station_count', 'transfer_station_count',
    )

    stats_by_city = {}
    for _, record in df_city_stats.iterrows():
        entry = {key: record[key] for key in text_keys}
        entry.update((key, int(record[key])) for key in count_keys)
        entry['transfer_station_ratio'] = float(record['transfer_station_ratio'])
        entry['city_order'] = int(record['city_order'])
        stats_by_city[record['city_id']] = entry
    return stats_by_city
# Convert the city metrics to a dict keyed by city_id and display it.
city_stats_dict = get_city_stats_dict(ads_subway_city_stats)
city_stats_dict

{'131': {'city_id': '131',
  'city_name': '北京',
  'city_name_e': 'beijing',
  'line_name_main_count': 27,
  'line_name_full_count': 28,
  'station_count': 414,
  'transfer_station_count': 99,
  'transfer_station_ratio': 0.2391,
  'city_order': 1},
 '289': {'city_id': '289',
  'city_name': '上海',
  'city_name_e': 'shanghai',
  'line_name_main_count': 22,
  'line_name_full_count': 25,
  'station_count': 420,
  'transfer_station_count': 92,
  'transfer_station_ratio': 0.219,
  'city_order': 2},
 '257': {'city_id': '257',
  'city_name': '广州',
  'city_name_e': 'guangzhou',
  'line_name_main_count': 28,
  'line_name_full_count': 32,
  'station_count': 453,
  'transfer_station_count': 100,
  'transfer_station_ratio': 0.2208,
  'city_order': 3},
 '340': {'city_id': '340',
  'city_name': '深圳',
  'city_name_e': 'shenzhen',
  'line_name_main_count': 17,
  'line_name_full_count': 19,
  'station_count': 354,
  'transfer_station_count': 68,
  'transfer_station_ratio': 0.1921,
  'city_order': 4},
 '13

#### 保存json

In [35]:
# Write the city metrics dict to json_data/city_stats.json
# (non-ASCII characters are kept as-is, indented by 4 spaces).
with open("json_data/city_stats.json", "w", encoding="utf-8") as f:
    json.dump(city_stats_dict, f, ensure_ascii=False, indent=4)

### 线路间换乘数据