# 百度地图地铁数据处理


In [130]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict

import networkx as nx

In [2]:
import sys
sys.path.append("../../..")
from secure.db_account import SubwayPrd
from k_libs.db_query import DBOperate

DBO = DBOperate(SubwayPrd)

## 字段表


In [3]:
# Line-level field dictionary: each entry pairs a short raw key ('raw') with
# the descriptive name used after processing ('processed') and a Chinese
# description of the field ('info').
# NOTE(review): the ODS table shown further down exposes these keys as
# `line_<raw>` columns (e.g. line_lid) — confirm before using this list to
# drive a rename.
line_fields = [
    {'raw': 'lid',
    'processed': 'line_name_full',
    'info': '线路全名'},
    {'raw': 'lb',
    'processed': 'line_name',
    'info': '线路简称'},
    {'raw': 'n',
    'processed': 'num_stations',
    'info': '线路站点数'},
    {'raw': 'loop',
    'processed': 'is_loop',
    'info': '是否环线'},
    {'raw': 'lbx',
    'processed': 'label_x',
    'info': '线路标签X坐标'},
    {'raw': 'lby',
    'processed': 'label_y',
    'info': '线路标签Y坐标'},
    {'raw': 'lbr',
    'processed': 'label_rotation',
    'info': '线路标签旋转角度'},
    {'raw': 'lc',
    'processed': 'line_color',
    'info': '线路颜色'},
    {'raw': 'uid',
    'processed': 'line_uid',
    'info': '线路唯一ID'},
    {'raw': 'uid2',
    'processed': 'line_uid2',
    'info': '线路备用唯一ID'}
]

In [4]:
# Station-level field dictionary: each entry pairs a short raw key ('raw')
# with the descriptive name used after processing ('processed') and a Chinese
# description of the field ('info').
# NOTE(review): the ODS table shown further down exposes these keys as
# `st_<raw>` columns (e.g. st_sid) — confirm before using this list to drive
# a rename.
station_fields = [
    {'raw': 'sid',
    'processed': 'station_name',
    'info': '站点名称'},
    {'raw': 'lb',
    'processed': 'station_label',
    'info': '站点标签'},
    {'raw': 'x',
    'processed': 'x',
    'info': '站点X坐标'},
    {'raw': 'y',
    'processed': 'y',
    'info': '站点Y坐标'},
    {'raw': 'rx',
    'processed': 'label_offset_x',
    'info': '站点标签X偏移'},
    {'raw': 'ry',
    'processed': 'label_offset_y',
    'info': '站点标签Y偏移'},
    {'raw': 'st',
    'processed': 'is_station',
    'info': '是否为车站'},
    {'raw': 'ex',
    'processed': 'is_exchange',
    'info': '是否换乘点'},
    {'raw': 'iu',
    'processed': 'is_use',
    'info': '是否在用,true表示该站已开通并运营'},
    {'raw': 'rc',
    'processed': 'is_rail_construction',
    'info': '是否为规划/在建站点'},
    {'raw': 'slb',
    'processed': 'show_label',
    'info': '是否显示标签'},
    {'raw': 'ln',
    'processed': 'lines',
    'info': '所属线路'},
    {'raw': 'uid',
    'processed': 'station_uid',
    'info': '站点唯一ID'},
    {'raw': 'px',
    'processed': 'proj_x',
    'info': '站点投影坐标X'},
    {'raw': 'py',
    'processed': 'proj_y',
    'info': '站点投影坐标Y'}
    ]


## 原始数据
ODS 层（Operational Data Store，操作数据存储层）

存放业务系统的原始数据，基本不做加工。

特点：与业务库字段保持一致，保证数据的完整性与可追溯性。

作用：承接源系统，作为数据仓库的“原材料”。

In [5]:
sql = """SELECT * FROM ods_subway_baidu WHERE crawler_id=(SELECT MAX(crawler_id) FROM ods_subway_baidu);"""
df_raw = DBO.read_sql(sql)

In [6]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15923 entries, 0 to 15922
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   line_lid      15923 non-null  object
 1   line_lb       15923 non-null  object
 2   line_slb      15923 non-null  object
 3   line_n        15923 non-null  object
 4   line_loop     15923 non-null  object
 5   line_lbx      15923 non-null  object
 6   line_lby      15923 non-null  object
 7   line_lbr      15923 non-null  object
 8   line_lc       15923 non-null  object
 9   line_uid      15923 non-null  object
 10  line_uid2     15923 non-null  object
 11  st_sid        15923 non-null  object
 12  st_lb         15923 non-null  object
 13  st_x          15923 non-null  object
 14  st_y          15923 non-null  object
 15  st_rx         15923 non-null  object
 16  st_ry         15923 non-null  object
 17  st_st         15923 non-null  object
 18  st_ex         15923 non-null  object
 19  st_i

In [7]:
df_raw

Unnamed: 0,line_lid,line_lb,line_slb,line_n,line_loop,line_lbx,line_lby,line_lbr,line_lc,line_uid,...,st_int,st_uid,st_px,st_py,city_id,city_name,city_name_e,crawler_id,crawler_date,uid
0,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,291c5802f26a751cbca240d9,12935140.04,4825694.5,131,北京,beijing,250928,2025-09-28,1312509280
1,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,ad28546df35285eb851541d9,12937624.6,4825645.68,131,北京,beijing,250928,2025-09-28,1312509281
2,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,a047555503a5bc5cbc4842d9,12940185.58,4825661.64,131,北京,beijing,250928,2025-09-28,1312509282
3,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,a27fa5a23a128501177643d9,12942086.4,4825707.12,131,北京,beijing,250928,2025-09-28,1312509283
4,地铁1号线八通线,1号线八通线,False,23,False,-498.9,139.1,0,0xc03935,bce557d6f7fadd4ea5da39b7,...,3,53889c15034f2e3f58bbbcde,12944444.84,4825755.15,131,北京,beijing,250928,2025-09-28,1312509284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15918,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,,,,51314,马德里,madeli,250928,2025-09-28,51314250928445
15919,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,,,,51314,马德里,madeli,250928,2025-09-28,51314250928446
15920,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,5660ab357cf66173901fc9b7,-417722.43,4872240.68,51314,马德里,madeli,250928,2025-09-28,51314250928447
15921,ML4,ML4,False,16,False,-110,1600,0,0x77b838,a1bf283eb66e3574c9c3dbb7,...,2,d79642c2e73da856eedfcab7,-417875.2,4871682.71,51314,马德里,madeli,250928,2025-09-28,51314250928448


## DWD 
DWD 层（Data Warehouse Detail，明细数据层）  
在 ODS 的基础上进行清洗、规范化，保留明细粒度的数据。  
特点：字段标准化（统一命名、数据类型），做一些维度退化或拆分。  
作用：保证数据“可用”，是最常被下游加工使用的一层。  

In [8]:
def dwd_subway_bd_to_sql(df_raw):
    """
    Turn the raw Baidu subway rows into the DWD-layer table and store it.

    Steps: rename the raw columns to descriptive names, put them in a fixed
    order, convert the true/false flag columns to 1/0, normalise the line
    colour to a '#'-prefixed hex string and derive st_is_station from whether
    st_name is non-empty. The result is written to the "dwd_subway_baidu"
    table through the module-level DBO helper (if_exists="replace") and is
    also returned.

    df_raw: raw frame (the notebook passes the rows read from
        ods_subway_baidu); it is not modified.
    Returns: the processed DataFrame.
    Raises: KeyError if a column listed in field_order is missing.
    """

    def _to_hex_color(value):
        # '0xc03935' -> '#c03935'; values already starting with '#' are kept.
        # Only a real '0x' prefix is stripped, so a bare 'c03935' is no longer
        # truncated to '#3935'. Non-string values (e.g. None) pass through.
        if not isinstance(value, str) or value.startswith('#'):
            return value
        if value[:2].lower() == '0x':
            return f"#{value[2:]}"
        return f"#{value}"

    # Rename raw columns to descriptive names.
    df_dwd = df_raw.rename(columns={
        'line_lid': 'line_name_full',
        'line_lb': 'line_name',
        'line_slb': 'line_show_label',
        'line_loop': 'line_is_loop',
        'line_lbx': 'line_label_x',
        'line_lby': 'line_label_y',
        'line_lbr': 'line_label_rotation',
        'line_lc': 'line_color',
        'st_sid': 'st_name',
        'st_lb': 'st_label',
        'st_rx': 'st_label_offset_x',
        'st_ry': 'st_label_offset_y',
        'st_st': 'st_is_station',
        'st_ex': 'st_is_exchange',
        'st_iu': 'st_is_use',
        'st_rc': 'st_is_rail_construction',
        'st_slb': 'st_show_label',
        'st_ln': 'st_lines',
        'st_px': 'st_proj_x',
        'st_py': 'st_proj_y'}
    )
    # Column order of the stored table.
    field_order = [
        'city_id', 'city_name', 'city_name_e',
        # line information
        'line_name_full', 'line_name', 'line_show_label', 'line_n', 'line_is_loop',
        'line_label_x', 'line_label_y', 'line_label_rotation', 'line_color',
        'line_uid', 'line_uid2',
        # station information
        'st_name', 'st_label', 'st_x', 'st_y', 'st_label_offset_x', 'st_label_offset_y',
        'st_is_station', 'st_is_exchange', 'st_is_use', 'st_is_rail_construction',
        'st_show_label', 'st_lines', 'st_int', 'st_uid', 'st_proj_x', 'st_proj_y',
        # crawl information
        'crawler_id', 'crawler_date', 'uid'
    ]
    # .copy() gives an independent frame so the column assignments below do
    # not trigger chained-assignment warnings.
    df_dwd = df_dwd[field_order].copy()

    # Flag columns stored as text. st_is_station is intentionally not listed:
    # it is recomputed from st_name below, so mapping it first was wasted work.
    bool_fields = [
        'line_show_label', 'line_is_loop', 'st_is_exchange', 'st_is_use',
        'st_is_rail_construction', 'st_show_label'
    ]
    # Compared case-insensitively so 'True', 'true', True and '1' all map to 1.
    # Anything else becomes NaN, exactly as an unmapped value did before.
    flag_values = {'true': 1, 'false': 0, '1': 1, '0': 0}
    for field in bool_fields:
        df_dwd[field] = (
            df_dwd[field].astype(str).str.strip().str.lower().map(flag_values)
        )

    # Line colour as a '#'-prefixed hex string.
    df_dwd['line_color'] = df_dwd['line_color'].apply(_to_hex_color)

    # A row is a real station when its name is non-empty; a missing name
    # (None/NaN) counts as "not a station" instead of raising.
    st_names = df_dwd['st_name']
    has_name = st_names.notna() & (st_names.astype(str).str.len() > 0)
    df_dwd['st_is_station'] = has_name.astype('int64')

    # Persist to the database.
    DBO.df_to_sql(df_dwd, "dwd_subway_baidu", if_exists="replace")
    print("Data saved to dwd_subway_baidu table.")
    return df_dwd

dwd_subway = dwd_subway_bd_to_sql(df_raw)
dwd_subway.info()

Data saved to dwd_subway_baidu table.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15923 entries, 0 to 15922
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city_id                  15923 non-null  object 
 1   city_name                15923 non-null  object 
 2   city_name_e              15923 non-null  object 
 3   line_name_full           15923 non-null  object 
 4   line_name                15923 non-null  object 
 5   line_show_label          15923 non-null  int64  
 6   line_n                   15923 non-null  object 
 7   line_is_loop             15923 non-null  int64  
 8   line_label_x             15923 non-null  object 
 9   line_label_y             15923 non-null  object 
 10  line_label_rotation      15923 non-null  object 
 11  line_color               15923 non-null  object 
 12  line_uid                 15923 non-null  object 
 13  line_uid2                15923 non-nul

## DWS
DWS 层（Data Warehouse Summary，汇总数据层）

在 DWD 基础上，按业务主题和常用维度做聚合统计。

特点：以主题域为核心（如用户、订单、交易），形成宽表或统计指标。

作用：减少重复计算，支撑公共分析需求。

1. 城市表 dws_subway_bd_city
2. 线路表 dws_subway_bd_line
3. 车站表 dws_subway_bd_st

### 城市表

In [140]:
with open("city_info.json", "r", encoding="utf-8") as f:
    city_info = json.load(f)
def get_city_infos_dws(df, city_info):
    """
    Build the city table: one row per city with its line count, rank and
    descriptive attributes.

    df: DWD-layer data (dwd_subway)
    city_info: dict loaded from city_info.json; its 'cities' entry is
        joined on city_name
    Returns: DataFrame with city_id, city_name, line_count, city_order
        (1-based rank by line_count, descending) plus the renamed
        city_info columns.
    """
    column_aliases = {
        'english_pinyin_name': 'city_name_e',
        'province_chinese': 'province',
        'province_pinyin_english': 'province_e',
        'country_chinese': 'country',
        'country_english': 'country_e',
    }
    extra_info = pd.DataFrame.from_dict(city_info['cities'])

    # Number of distinct full line names per city.
    line_counts = (
        df.groupby(['city_id', 'city_name'])['line_name_full']
        .nunique()
        .reset_index(name='line_count')
    )

    # Rank the cities: most lines first, rank starts at 1.
    ranked = line_counts.sort_values(by='line_count', ascending=False)
    ranked = ranked.reset_index(drop=True)
    ranked['city_order'] = ranked.index + 1

    enriched = ranked.merge(extra_info, how='left', on='city_name')
    return enriched.rename(columns=column_aliases)
dwd_subway_city = get_city_infos_dws(dwd_subway, city_info)
dwd_subway_city

Unnamed: 0,city_id,city_name,line_count,city_order,city_name_e,province,province_e,country,country_e
0,257,广州,32,1,Guangzhou,广东省,Guangdong Sheng,中国,China
1,60732,纽约,32,2,New York,纽约州,New York State,美国,United States
2,30016,首尔,30,3,Seoul,首尔特别市,Seoul Special City,韩国,South Korea
3,131,北京,28,4,Beijing,北京市,Beijing Shi,中国,China
4,289,上海,25,5,Shanghai,上海市,Shanghai Shi,中国,China
...,...,...,...,...,...,...,...,...,...
74,244,台州,1,75,Taizhou,浙江省,Zhejiang Sheng,中国,China
75,189,滁州,1,76,Chuzhou,安徽省,Anhui Sheng,中国,China
76,30007,光州,1,77,Gwangju,光州广域市,Gwangju Metropolitan City,韩国,South Korea
77,313,湘潭,1,78,Xiangtan,湖南省,Hunan Sheng,中国,China


### 线路表
* line_uid有空值，不可用
* line_name_full 暂无重复值，可以用
* line_name 有重复值，需处理

#### 支线名称清洗

In [42]:
def clear_line_branch(df):
    """
    Derive a per-city unique branch name for every line.

    The short line_name is used as the branch name unless several lines of
    the same city share it; those lines fall back to line_name_full. The
    text '地铁' is removed from the branch name, and line_order numbers the
    lines of each city in order of appearance, starting at 1.

    df: dwd_subway
    Raises: ValueError when line_name_full is not unique within a city.
    """
    line_columns = [
        'city_id', 'city_name', 'line_name_full', 'line_name', 'line_is_loop',
        'line_label_x', 'line_label_y', 'line_label_rotation', 'line_color',
    ]
    lines = df[line_columns].drop_duplicates().reset_index(drop=True)

    # How often each short / full name occurs within its city.
    short_names = lines.groupby(['city_name', 'line_name'])['line_name']
    lines['line_name_count'] = short_names.transform('count')
    full_names = lines.groupby(['city_name', 'line_name_full'])['line_name_full']
    lines['line_name_full_count'] = full_names.transform('count')

    # The full name is the fallback identifier, so it has to be unique.
    if lines['line_name_full_count'].max() > 1:
        raise ValueError("存在line_name_full相同的线路，非唯一，需要定位数据，做进一步处理")

    # Short name by default, full name where the short one is shared.
    shared_short_name = lines['line_name_count'] > 1
    branch = lines['line_name'].where(~shared_short_name, lines['line_name_full'])
    lines['line_name_branch'] = branch.str.replace('地铁', '', regex=False)

    # Position of the line within its city (order of appearance).
    lines['line_order'] = lines.groupby('city_id').cumcount() + 1
    return lines

df_line_branch = clear_line_branch(dwd_subway)
df_line_branch

Unnamed: 0,city_id,city_name,line_name_full,line_name,line_is_loop,line_label_x,line_label_y,line_label_rotation,line_color,line_name_count,line_name_full_count,line_name_branch,line_order
0,131,北京,地铁1号线八通线,1号线八通线,0,-498.9,139.1,0,#c03935,1,1,1号线八通线,1
1,131,北京,地铁2号线,2号线,1,-175.3,-4.9,0,#005f98,1,1,2号线,2
2,131,北京,地铁3号线,3号线,0,450,-55,0,#942413,1,1,3号线,3
3,131,北京,地铁4号线大兴线,4号线大兴线,0,-499.3,-224,0,#008e9c,1,1,4号线大兴线,4
4,131,北京,地铁5号线,5号线,0,18,-353.9,0,#a6217f,1,1,5号线,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,51314,马德里,ML1,ML1,0,-70,-960,0,#5086c2,1,1,ML1,13
629,51314,马德里,ML2,ML2,0,-780,-280,0,#ea4d81,1,1,ML2,14
630,51314,马德里,ML3,ML3,0,-1800,295.2,0,#ff6642,1,1,ML3,15
631,51314,马德里,ML42,ML4,0,-110,1600,0,#77b838,2,1,ML42,16


#### 主线名称清洗
1. 计算line_name与上一行相同字数，作为same_prefix_len列
2. 筛选出same_prefix_len>2的行，以及每一行的上一行
3. 将['city_id', 'city_name', 'line_name_full', 'line_name', 'line_name_branch']保存为新的df
4. 对比subway_line_fixed.json文件，将新增的手工添加"line_name_main"字段，复制到subway_line_fixed.json文件最后。

In [44]:
def clear_line_main(df_line_branch, subway_line_fixed):
    """
    Print the lines that still need a hand-written main-line name.

    Within each city, every line_name is compared with the line_name of the
    previous row. When the two share more than 2 leading characters, both
    rows are treated as candidates for "branches of one main line". Every
    candidate whose (city_id, city_name, line_name_full, line_name) is not
    already present in subway_line_fixed is printed so that it can be given
    a "line_name_main" and appended to subway_line_fixed.json by hand.

    df_line_branch: branch-level line table (output of clear_line_branch)
    subway_line_fixed: list of dicts loaded from subway_line_fixed.json
    Returns: None (the result is the printed output).
    Raises: KeyError if an entry of subway_line_fixed lacks one of the four
        key fields.
    """
    df_line_main = df_line_branch.copy()
    df_line_main['same_prefix_len'] = 0

    # Index labels of candidate rows, in pairs (previous row, current row).
    candidate_idx = []
    # sort=False keeps cities in order of first appearance.
    for _, city_lines in df_line_main.groupby('city_name', sort=False):
        prev_idx, prev_name = None, ""
        for idx, line_name in city_lines['line_name'].items():
            # The first line of a city is compared with "" and so scores 0.
            shared = _common_prefix_len(line_name, prev_name)
            df_line_main.at[idx, 'same_prefix_len'] = shared
            if prev_idx is not None and shared > 2:
                candidate_idx.extend([prev_idx, idx])
            prev_idx, prev_name = idx, line_name

    # Selecting by label keeps the columns even when nothing qualified; the
    # previous empty pd.DataFrame() accumulator raised KeyError on the
    # column selection below in that case.
    df_line_main_filtered = (
        df_line_main.loc[candidate_idx].drop_duplicates().reset_index(drop=True)
    )

    df_line_main_filtered_dict = df_line_main_filtered[
        ['city_id', 'city_name', 'line_name_full', 'line_name', 'line_name_branch']
    ].to_dict(orient='records')

    # Candidates are matched against the fixed file on these four fields.
    fields = ['city_id', 'city_name', 'line_name_full', 'line_name']
    fixed_set = {
        tuple(line[field] for field in fields)
        for line in subway_line_fixed
    }

    # The rows printed below need a manual "line_name_main" and must then be
    # appended to the end of subway_line_fixed.json.
    print("以下线路需要手动添加主线名称line_name_main，并复制到subway_line_fixed.json文件最后")
    for item in df_line_main_filtered_dict:
        key = tuple(item[field] for field in fields)
        if key not in fixed_set:
            print(item)


def _common_prefix_len(left, right):
    """Return the number of leading characters shared by left and right."""
    shared = 0
    for char_left, char_right in zip(left, right):
        if char_left != char_right:
            break
        shared += 1
    return shared

with open("subway_line_fixed.json", "r", encoding="utf-8") as f:
    subway_line_fixed = json.load(f)
clear_line_main(df_line_branch, subway_line_fixed)

以下线路需要手动添加主线名称line_name_main，并复制到subway_line_fixed.json文件最后


#### 合并线路数据

In [47]:
# Reload the manually maintained main-line names and attach them to the
# branch-level line table.
with open("subway_line_fixed.json", "r", encoding="utf-8") as f:
    subway_line_fixed = json.load(f)
subway_line_fixed_df = pd.DataFrame.from_dict(subway_line_fixed)
# Left join on (city_id, line_name_full): lines that are absent from the JSON
# file get NaN in line_name_main.
dwd_subway_bd_line = df_line_branch.merge(subway_line_fixed_df[['city_id', 'line_name_full', 'line_name_main']], how='left', on=['city_id', 'line_name_full']) 
# Fill missing line_name_main values with line_name_branch.
dwd_subway_bd_line['line_name_main'] = dwd_subway_bd_line['line_name_main'].fillna(dwd_subway_bd_line['line_name_branch'])
dwd_subway_bd_line


Unnamed: 0,city_id,city_name,line_name_full,line_name,line_is_loop,line_label_x,line_label_y,line_label_rotation,line_color,line_name_count,line_name_full_count,line_name_branch,line_order,line_name_main
0,131,北京,地铁1号线八通线,1号线八通线,0,-498.9,139.1,0,#c03935,1,1,1号线八通线,1,1号线八通线
1,131,北京,地铁2号线,2号线,1,-175.3,-4.9,0,#005f98,1,1,2号线,2,2号线
2,131,北京,地铁3号线,3号线,0,450,-55,0,#942413,1,1,3号线,3,3号线
3,131,北京,地铁4号线大兴线,4号线大兴线,0,-499.3,-224,0,#008e9c,1,1,4号线大兴线,4,4号线大兴线
4,131,北京,地铁5号线,5号线,0,18,-353.9,0,#a6217f,1,1,5号线,5,5号线
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,51314,马德里,ML1,ML1,0,-70,-960,0,#5086c2,1,1,ML1,13,ML1
629,51314,马德里,ML2,ML2,0,-780,-280,0,#ea4d81,1,1,ML2,14,ML2
630,51314,马德里,ML3,ML3,0,-1800,295.2,0,#ff6642,1,1,ML3,15,ML3
631,51314,马德里,ML42,ML4,0,-110,1600,0,#77b838,2,1,ML42,16,ML4


### 车站表
车站唯一id问题，存在同一个城市有同名车站，比如杭州，有2个奥体中心，1个在杭州，1个在绍兴，需要使用sid（st_name）作为城市唯一标识

In [72]:
def get_dwd_subway_st(df, df_line, is_all_st=True):
    """
    Build the station table with per-line station order and next-station links.

    Each row receives the line attributes from df_line, its position on the
    line (st_order), an id "<city_id>_<line_order:03d>_<st_order:03d>"
    (st_id_virtual) and the id, name and x/y of the following row on the same
    line (target_*). On loop lines the last row points back to the first.
    Rows with an empty st_name get their st_id_virtual as name.

    df: dwd_subway data
    df_line: dwd_subway_bd_line data
    is_all_st: True - keep every row (real and virtual stations),
        False - keep only rows with st_is_station == 1
    Returns: the enriched DataFrame.
    Raises: ValueError when a line of df has no line_order in df_line.
    """
    line_key = ['city_id', 'line_name_full']
    df = df.copy() if is_all_st else df[df['st_is_station'] == 1].copy()
    print(f"Total stations: {len(df)}")

    # Attach line order and the cleaned line names.
    res_df = df.merge(
        df_line[line_key + ['line_order', 'line_name_branch', 'line_name_main']],
        how='left', on=line_key,
    )
    print(f"Total stations after merge line info: {len(res_df)}")
    # A left merge never drops rows, so a changed count can only mean that
    # df_line holds the same (city_id, line_name_full) key more than once.
    if len(res_df) != len(df):
        print(f"WARNING! Duplicate line keys in df_line added {len(res_df) - len(df)} rows during merge")
    # Unmatched lines show up as NaN line_order; fail here with the offending
    # keys instead of the cryptic ':03d' format error further down.
    unmatched = res_df['line_order'].isna()
    if unmatched.any():
        missing = res_df.loc[unmatched, line_key].drop_duplicates().to_dict(orient='records')
        raise ValueError(f"Lines without line_order in df_line: {missing}")

    # Station position within its line, starting at 1.
    res_df['st_order'] = res_df.groupby(line_key).cumcount() + 1
    # Station id, built column-wise instead of with a per-row apply.
    line_part = res_df['line_order'].astype('int64').map('{:03d}'.format).astype(str)
    st_part = res_df['st_order'].map('{:03d}'.format).astype(str)
    res_df['st_id_virtual'] = res_df['city_id'].astype(str) + '_' + line_part + '_' + st_part

    # Rows without a name (empty string or missing) are named after their id.
    res_df['st_name'] = res_df['st_name'].replace('', np.nan)
    res_df['st_name'] = res_df['st_name'].fillna(res_df['st_id_virtual'])

    # Id of the next row on the same line; missing for the last row of a line.
    res_df['target_st_id_virtual'] = res_df.groupby(line_key)['st_id_virtual'].shift(-1)

    # On loop lines (line_is_loop == 1) the last row links back to the first.
    loop_keys = res_df.loc[res_df['line_is_loop'] == 1, line_key].drop_duplicates()
    for _, row in loop_keys.iterrows():
        mask = (res_df['city_id'] == row['city_id']) & (res_df['line_name_full'] == row['line_name_full'])
        idx = res_df.loc[mask].index
        if len(idx) > 1:  # a loop needs at least two rows
            res_df.loc[idx[-1], 'target_st_id_virtual'] = res_df.loc[idx[0], 'st_id_virtual']

    # Coordinates arrive as text; values that cannot be parsed become NaN.
    cols_float = ['st_x', 'st_y', 'st_label_offset_x', 'st_label_offset_y', 'st_proj_x', 'st_proj_y', 'line_label_x', 'line_label_y']
    for col in cols_float:
        res_df[col] = pd.to_numeric(res_df[col], errors='coerce')

    # Look up name and coordinates of the target row via a self-join.
    targets = res_df[['st_id_virtual', 'st_name', 'st_x', 'st_y']].rename(columns={
        'st_id_virtual': 'target_st_id_virtual',
        'st_name': 'target_st_name',
        'st_x': 'target_st_x',
        'st_y': 'target_st_y'
    })
    res_df = res_df.merge(targets, how='left', on='target_st_id_virtual')
    return res_df

#### 全量车站表
含虚拟车站，用于让绘制出的线路更平滑

In [73]:
dwd_subway_st_all =  get_dwd_subway_st(dwd_subway, dwd_subway_bd_line, is_all_st=True)

Total stations: 15923
Total stations after merge line info: 15923


#### 真实车站表
真实存在的， 即st_is_station == 1

In [108]:
dwd_subway_st_real =  get_dwd_subway_st(dwd_subway, dwd_subway_bd_line, is_all_st=False)


Total stations: 13015
Total stations after merge line info: 13015


In [109]:
dwd_subway_st_real

Unnamed: 0,city_id,city_name,city_name_e,line_name_full,line_name,line_show_label,line_n,line_is_loop,line_label_x,line_label_y,...,uid,line_order,line_name_branch,line_name_main,st_order,st_id_virtual,target_st_id_virtual,target_st_name,target_st_x,target_st_y
0,131,北京,beijing,地铁1号线八通线,1号线八通线,0,23,0,-498.9,139.1,...,1312509280,1,1号线八通线,1号线八通线,1,131_001_001,131_001_002,八角游乐园,-608.2,170.6
1,131,北京,beijing,地铁1号线八通线,1号线八通线,0,23,0,-498.9,139.1,...,1312509281,1,1号线八通线,1号线八通线,2,131_001_002,131_001_003,八宝山,-549.2,170.6
2,131,北京,beijing,地铁1号线八通线,1号线八通线,0,23,0,-498.9,139.1,...,1312509282,1,1号线八通线,1号线八通线,3,131_001_003,131_001_004,玉泉路,-501.8,170.6
3,131,北京,beijing,地铁1号线八通线,1号线八通线,0,23,0,-498.9,139.1,...,1312509283,1,1号线八通线,1号线八通线,4,131_001_004,131_001_005,五棵松,-453.8,170.6
4,131,北京,beijing,地铁1号线八通线,1号线八通线,0,23,0,-498.9,139.1,...,1312509284,1,1号线八通线,1号线八通线,5,131_001_005,131_001_006,万寿路,-409.2,170.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13010,51314,马德里,madeli,ML4,ML4,0,16,0,-110.0,1600.0,...,51314250928440,17,ML4,ML4,12,51314_017_012,51314_017_013,Poligono Industrial Ciudad de Parla,-261.4,1434.8
13011,51314,马德里,madeli,ML4,ML4,0,16,0,-110.0,1600.0,...,51314250928443,17,ML4,ML4,13,51314_017_013,51314_017_014,Jaime I Sur,-320.0,1494.8
13012,51314,马德里,madeli,ML4,ML4,0,16,0,-110.0,1600.0,...,51314250928447,17,ML4,ML4,14,51314_017_014,51314_017_015,Estrella Polar Sur,-320.0,1556.1
13013,51314,马德里,madeli,ML4,ML4,0,16,0,-110.0,1600.0,...,51314250928448,17,ML4,ML4,15,51314_017_015,51314_017_016,Venus Sur,-320.0,1616.8


## ADS
ADS 层（Application Data Store，应用数据层）

面向应用和报表的最终数据，通常是宽表、指标表。

特点：与具体应用、报表或接口一一对应。

作用：满足业务方“即取即用”，保证查询效率。

1. city_infos 城市信息表
2. city_stats 城市维度统计指标
3. city_line_links 线路间换乘站数量， 不需要lineStyle数据
4. city_line_data

### 城市信息

#### dict

In [30]:
dwd_subway_city = get_city_infos_dws(dwd_subway, city_info)
dwd_subway_city

Unnamed: 0,city_id,city_name,city_name_e,city_order,province,province_e,country,country_e
0,131,北京,beijing,1,北京市,Beijing Shi,中国,China
1,289,上海,shanghai,2,上海市,Shanghai Shi,中国,China
2,257,广州,guangzhou,3,广东省,Guangdong Sheng,中国,China
3,340,深圳,shenzhen,4,广东省,Guangdong Sheng,中国,China
4,132,重庆,chongqing,5,重庆市,Chongqing Shi,中国,China
...,...,...,...,...,...,...,...,...
74,65531,莫斯科,mosike,75,莫斯科州,Moscow Oblast,俄罗斯,Russia
75,52390,鹿特丹,lutedan,76,南荷兰省,South Holland Province,荷兰,Netherlands
76,48552,伊斯坦布尔,yisitanbuer,77,伊斯坦布尔省,Istanbul Province,土耳其,Turkey
77,51271,巴塞罗那,basailuona,78,加泰罗尼亚自治区,Catalonia Autonomous Community,西班牙,Spain


In [31]:
def get_city_infos_dict(df):
    """
    Assemble the city lookup structures written to city_infos.json.

    df: dwd_subway_city data
    Returns: dict with
        city_id_list / city_name_list - the columns as lists, in row order
        country_list - countries ordered by number of cities, descending
        country_city_dict - country -> list of its city_ids
        city_dict - city_id -> attributes of that city
    """
    record_fields = (
        'city_id', 'city_name', 'city_name_e', 'province', 'province_e',
        'country', 'country_e', 'city_order',
    )
    by_country = df.groupby('country')
    cities_per_country = by_country.size().sort_values(ascending=False)

    return {
        'city_id_list': df['city_id'].tolist(),
        'city_name_list': df['city_name'].tolist(),
        'country_list': cities_per_country.index.tolist(),
        'country_city_dict': {
            country: members['city_id'].tolist()
            for country, members in by_country
        },
        'city_dict': {
            record['city_id']: {field: record[field] for field in record_fields}
            for _, record in df.iterrows()
        },
    }

city_infos = get_city_infos_dict(dwd_subway_city)
city_infos

{'city_id_list': ['131',
  '289',
  '257',
  '340',
  '132',
  '332',
  '150',
  '315',
  '75',
  '58',
  '179',
  '218',
  '158',
  '224',
  '167',
  '53',
  '233',
  '104',
  '138',
  '48',
  '268',
  '180',
  '317',
  '178',
  '236',
  '163',
  '300',
  '119',
  '261',
  '127',
  '194',
  '92',
  '146',
  '2912',
  '9002',
  '9019',
  '161',
  '242',
  '122',
  '313',
  '323',
  '155',
  '348',
  '197',
  '288',
  '36',
  '316',
  '321',
  '176',
  '333',
  '153',
  '293',
  '129',
  '244',
  '189',
  '274',
  '2911',
  '20001',
  '20508',
  '26001',
  '26019',
  '26022',
  '26033',
  '26041',
  '30001',
  '30004',
  '30005',
  '30007',
  '30016',
  '49872',
  '39816',
  '53009',
  '39817',
  '60732',
  '65531',
  '52390',
  '48552',
  '51271',
  '51314'],
 'city_name_list': ['北京',
  '上海',
  '广州',
  '深圳',
  '重庆',
  '天津',
  '石家庄',
  '南京',
  '成都',
  '沈阳',
  '杭州',
  '武汉',
  '长沙',
  '苏州',
  '大连',
  '长春',
  '西安',
  '昆明',
  '佛山',
  '哈尔滨',
  '郑州',
  '宁波',
  '无锡',
  '温州',
  '青岛',
  '南昌',
  

#### 保存本地json

In [32]:
# city_infos
city_infos = get_city_infos_dict(dwd_subway_city)
with open("json_data/city_infos.json", "w", encoding="utf-8") as f:
    json.dump(city_infos, f, ensure_ascii=False, indent=4)

### 城市指标


#### df

In [143]:
def get_city_stats_ads(df_st, df_city):
    """
    Per-city subway indicators: number of main lines, number of full line
    names, number of stations, number of transfer stations and their share.

    df_st: dwd_subway_st_real (real stations are used for the indicators)
    df_city: dwd_subway_city data; supplies city_order and city_name_e
    Returns: one row per city, sorted by city_order.
    """
    def _transfer_station_names(names):
        # Distinct station names of the group whose row is flagged as transfer.
        flags = df_st.loc[names.index, 'st_is_exchange']
        return names[flags == 1].nunique()

    stats = (
        df_st.groupby(['city_id', 'city_name'])
        .agg(
            line_name_main_count=('line_name_main', 'nunique'),
            line_name_full_count=('line_name_full', 'nunique'),
            station_count=('st_name', 'nunique'),
            transfer_station_count=('st_name', _transfer_station_names),
        )
        .reset_index()
    )

    # Share of transfer stations, rounded to 4 decimals (missing -> 0).
    share = stats['transfer_station_count'] / stats['station_count']
    stats['transfer_station_ratio'] = share
    stats['transfer_station_ratio'] = stats['transfer_station_ratio'].fillna(0).round(4)

    city_attrs = df_city[['city_id', 'city_order', 'city_name_e']]
    stats = stats.merge(city_attrs, how='left', on='city_id')
    stats = stats.sort_values('city_order').reset_index(drop=True)
    stats['city_name'] = stats['city_name'].str.replace('特别行政区', '')
    return stats

ads_subway_city_stats = get_city_stats_ads(dwd_subway_st_real, dwd_subway_city)
ads_subway_city_stats

Unnamed: 0,city_id,city_name,line_name_main_count,line_name_full_count,station_count,transfer_station_count,transfer_station_ratio,city_order,city_name_e
0,257,广州,28,32,453,100,0.2208,1,Guangzhou
1,60732,纽约,26,32,463,212,0.4579,2,New York
2,30016,首尔,23,30,626,107,0.1709,3,Seoul
3,131,北京,27,28,414,99,0.2391,4,Beijing
4,289,上海,22,25,420,92,0.2190,5,Shanghai
...,...,...,...,...,...,...,...,...,...
74,244,台州,1,1,15,0,0.0000,75,Taizhou
75,189,滁州,1,1,10,0,0.0000,76,Chuzhou
76,30007,光州,1,1,20,0,0.0000,77,Gwangju
77,313,湘潭,1,1,33,7,0.2121,78,Xiangtan


#### dict

In [34]:
def get_city_stats_dict(df_city_stats):
    """
    Convert the city indicator table into a dict keyed by city_id.

    Counts and the rank are cast to int and the ratio to float; the
    remaining fields are copied unchanged.

    df_city_stats: ads_subway_city_stats data
    """
    # (field, cast) in output order; None means "copy unchanged".
    field_casts = (
        ('city_id', None),
        ('city_name', None),
        ('city_name_e', None),
        ('line_name_main_count', int),
        ('line_name_full_count', int),
        ('station_count', int),
        ('transfer_station_count', int),
        ('transfer_station_ratio', float),
        ('city_order', int),
    )
    stats_by_city = {}
    for _, record in df_city_stats.iterrows():
        entry = {}
        for field, cast in field_casts:
            value = record[field]
            entry[field] = value if cast is None else cast(value)
        stats_by_city[record['city_id']] = entry
    return stats_by_city
city_stats_dict = get_city_stats_dict(ads_subway_city_stats)
city_stats_dict

{'131': {'city_id': '131',
  'city_name': '北京',
  'city_name_e': 'beijing',
  'line_name_main_count': 27,
  'line_name_full_count': 28,
  'station_count': 414,
  'transfer_station_count': 99,
  'transfer_station_ratio': 0.2391,
  'city_order': 1},
 '289': {'city_id': '289',
  'city_name': '上海',
  'city_name_e': 'shanghai',
  'line_name_main_count': 22,
  'line_name_full_count': 25,
  'station_count': 420,
  'transfer_station_count': 92,
  'transfer_station_ratio': 0.219,
  'city_order': 2},
 '257': {'city_id': '257',
  'city_name': '广州',
  'city_name_e': 'guangzhou',
  'line_name_main_count': 28,
  'line_name_full_count': 32,
  'station_count': 453,
  'transfer_station_count': 100,
  'transfer_station_ratio': 0.2208,
  'city_order': 3},
 '340': {'city_id': '340',
  'city_name': '深圳',
  'city_name_e': 'shenzhen',
  'line_name_main_count': 17,
  'line_name_full_count': 19,
  'station_count': 354,
  'transfer_station_count': 68,
  'transfer_station_ratio': 0.1921,
  'city_order': 4},
 '13

#### 保存json

In [35]:
with open("json_data/city_stats.json", "w", encoding="utf-8") as f:
    json.dump(city_stats_dict, f, ensure_ascii=False, indent=4)

### 线路数据

#### df

In [87]:
# Aspect ratio (width / height) of the network formed by the given lines
def get_chart_ratio(df_city, line_list):
    selected = df_city.loc[df_city['line_name_branch'].isin(line_list)]
    xs, ys = selected['st_x'], selected['st_y']
    width = xs.max() - xs.min()
    height = ys.max() - ys.min()
    # A flat network (zero height) falls back to a ratio of 1.
    if height == 0:
        return round(1, 2)
    return round(width / height, 2)

# Number of distinct station names on the given lines
def get_line_transfer_lines_st_count(df_city, line_list):
    on_selected_lines = df_city['line_name_branch'].isin(line_list)
    return df_city.loc[on_selected_lines, 'st_name'].nunique()

# st_id_virtual values of the named stations on one line
def get_line_st_ids(df_city, line_name, st_name_list):
    on_line = df_city['line_name_branch'] == line_name
    wanted = df_city['st_name'].isin(st_name_list)
    return df_city.loc[on_line & wanted, 'st_id_virtual'].unique().tolist()

In [98]:
def get_line_data_ads(df_st):
    """
    Build the per-line ADS table, one row per line, city by city.

    For every line the result holds its station count, its transfer stations
    (names and ids), the lines reachable through those transfer stations,
    the aspect ratio and station count of that reachable network, and the
    city-wide line and station counts.

    df_st: dwd_subway_st_real data
    Returns: DataFrame with the lines of each city sorted by line_order.
    """
    city_ids = df_st['city_id'].unique().tolist()
    res_df = pd.DataFrame()
    for city_id in city_ids:
        df_city = df_st[df_st['city_id'] == city_id]
        # One row per line. transfer_st_count sums the st_is_exchange flag over
        # the line's rows, while st_count counts distinct station names.
        df_city_line = df_city.groupby(['city_id', 'city_name', 'line_name_full', 'line_name', 'line_name_branch', 'line_name_main', 'line_color']).agg(
            line_order = ('line_order', 'first'),
            st_count = ('st_name', 'nunique'),
            transfer_st_count = ('st_is_exchange', 'sum'),
            transfer_sts = ('st_name', lambda x: x[df_city.loc[x.index, 'st_is_exchange'] == 1].unique().tolist(),)
            ).reset_index()
        # Ids of the transfer stations on this line (transfer_sts_ids)
        df_city_line['transfer_sts_ids'] = df_city_line.apply(
            lambda row: get_line_st_ids(df_city, row['line_name_branch'], row['transfer_sts']), axis=1
            )
        # Branch names of the city's lines in line_order; used as the sort key below
        line_sorted = df_city_line[['line_name_branch', 'line_order']].drop_duplicates().sort_values('line_order')['line_name_branch'].tolist()
        # Every line that contains one of this line's transfer stations
        # (matched by station name); this includes the line itself
        df_city_line['transfer_lines_all'] = df_city_line['transfer_sts'].apply(
            lambda sts: df_city[df_city['st_name'].isin(sts)]['line_name_branch'].unique().tolist()
            )
        # Remove duplicates
        df_city_line['transfer_lines_all'] = df_city_line['transfer_lines_all'].apply(lambda x: list(set(x)))
        # Restore a deterministic order: sort by position in line_sorted
        df_city_line['transfer_lines_all'] = df_city_line['transfer_lines_all'].apply(lambda x: sorted(x, key=lambda y: line_sorted.index(y)) if isinstance(x, list) else x)
        # Same list without the line itself
        df_city_line['transfer_lines'] = df_city_line.apply(lambda row: [line for line in row['transfer_lines_all'] if line != row['line_name_branch']], axis=1)
        df_city_line['transfer_lines_count'] = df_city_line['transfer_lines'].apply(len)

        # Aspect ratio of the reachable network; lines without transfer
        # stations yield NaN from get_chart_ratio and are set to 1.0
        df_city_line['chart_ratio'] = df_city_line['transfer_lines_all'].apply(lambda x: get_chart_ratio(df_city, x)).fillna(1.0)

        # Number of lines in the city
        df_city_line['city_line_count'] = len(df_city_line)
        # Number of distinct station names in the city
        df_city_line['city_st_count'] = df_city['st_name'].nunique()
        # Number of distinct station names across all lines in transfer_lines_all
        df_city_line['transfer_lines_all_count'] = df_city_line['transfer_lines_all'].apply(lambda x: get_line_transfer_lines_st_count(df_city, x))
        
        # Sort the city's lines by line_order
        df_city_line = df_city_line.sort_values('line_order')
        # NOTE(review): concatenating inside the loop re-copies the accumulated
        # frame on every iteration; collecting the frames and concatenating
        # once would avoid that — confirm before changing.
        res_df = pd.concat([res_df, df_city_line], ignore_index=True)
    return res_df

In [99]:
ads_subway_line_data = get_line_data_ads(dwd_subway_st_real)
ads_subway_line_data

Unnamed: 0,city_id,city_name,line_name_full,line_name,line_name_branch,line_name_main,line_color,line_order,st_count,transfer_st_count,transfer_sts,transfer_sts_ids,transfer_lines_all,transfer_lines,transfer_lines_count,chart_ratio,city_line_count,city_st_count,transfer_lines_all_count
0,131,北京,地铁1号线八通线,1号线八通线,1号线八通线,1号线八通线,#c03935,1,35,12,"[公主坟, 军事博物馆, 木樨地, 复兴门, 西单, 王府井, 东单, 建国门, 国贸, 大...","[131_001_007, 131_001_008, 131_001_009, 131_00...","[1号线八通线, 2号线, 4号线大兴线, 5号线, 7号线, 8号线, 9号线, 10号线...","[2号线, 4号线大兴线, 5号线, 7号线, 8号线, 9号线, 10号线, 14号线, ...",9,1.24,28,414,252
1,131,北京,地铁2号线,2号线,2号线,2号线,#005f98,2,18,13,"[西直门, 积水潭, 鼓楼大街, 雍和宫, 东直门, 东四十条, 朝阳门, 建国门, 崇文门...","[131_002_001, 131_002_002, 131_002_003, 131_00...","[1号线八通线, 2号线, 3号线, 4号线大兴线, 5号线, 6号线, 8号线, 13号线...","[1号线八通线, 3号线, 4号线大兴线, 5号线, 6号线, 8号线, 13号线, 19号...",9,1.50,28,414,195
2,131,北京,地铁3号线,3号线,3号线,3号线,#942413,3,10,5,"[东四十条, 工人体育场, 团结湖, 朝阳公园, 东坝北]","[131_003_001, 131_003_002, 131_003_003, 131_00...","[2号线, 3号线, 10号线, 12号线, 14号线, 17号线(未来科学城北-工人体育场)]","[2号线, 10号线, 12号线, 14号线, 17号线(未来科学城北-工人体育场)]",5,1.39,28,414,124
3,131,北京,地铁4号线大兴线,4号线大兴线,4号线大兴线,4号线大兴线,#008e9c,4,35,12,"[西苑, 海淀黄庄, 人民大学, 国家图书馆, 西直门, 平安里, 西单, 宣武门, 菜市口...","[131_004_003, 131_004_007, 131_004_008, 131_00...","[1号线八通线, 2号线, 4号线大兴线, 6号线, 7号线, 9号线, 10号线, 12号...","[1号线八通线, 2号线, 6号线, 7号线, 9号线, 10号线, 12号线, 13号线,...",11,1.53,28,414,264
4,131,北京,地铁5号线,5号线,5号线,5号线,#a6217f,5,23,12,"[立水桥, 大屯路东, 惠新西街南口, 和平西桥, 雍和宫, 北新桥, 东四, 东单, 崇文...","[131_005_004, 131_005_007, 131_005_009, 131_00...","[1号线八通线, 2号线, 5号线, 6号线, 7号线, 10号线, 12号线, 13号线,...","[1号线八通线, 2号线, 6号线, 7号线, 10号线, 12号线, 13号线, 14号线...",11,1.73,28,414,253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,51314,马德里,ML1,ML1,ML1,ML1,#5086c2,13,9,2,"[Pinar de Chamartin, Las Tablas]","[51314_013_001, 51314_013_009]","[Line 1, Line 4, Line 10, ML1]","[Line 1, Line 4, Line 10]",3,0.57,17,294,88
629,51314,马德里,ML2,ML2,ML2,ML2,#ea4d81,14,13,1,[Colonia Jardin],[51314_014_001],"[Line 10, ML2, ML3]","[Line 10, ML3]",2,1.03,17,294,58
630,51314,马德里,ML3,ML3,ML3,ML3,#ff6642,15,16,1,[Colonia Jardin],[51314_015_001],"[Line 10, ML2, ML3]","[Line 10, ML2]",2,1.03,17,294,58
631,51314,马德里,ML42,ML4,ML42,ML4,#77b838,16,15,0,[],[],[],[],0,1.00,17,294,0


#### dict

In [100]:
def get_line_data_dict(df_line_data):
    """
    Build a per-city dictionary of line-level data.

    df_line_data: the ads_subway_line_data table, one row per line.

    Returns a defaultdict keyed by city_id. Each value carries the city's
    id, name, line count and station count (read from the city's first
    row) plus 'lines_data', a list holding one dict per line row.
    """
    # Output keys in the exact order they must appear in each per-line
    # dict, paired with the cast applied to the cell (None = copy as-is).
    column_casts = (
        ('line_name_full', None),
        ('line_name', None),
        ('line_name_branch', None),
        ('line_name_main', None),
        ('line_color', None),
        ('line_order', int),
        ('st_count', int),
        ('transfer_st_count', int),
        ('transfer_sts', None),
        ('transfer_sts_ids', None),
        ('transfer_lines_all', None),
        ('transfer_lines_all_count', int),
        ('transfer_lines', None),
        ('transfer_lines_count', int),
        ('chart_ratio', float),
    )

    def line_record(row):
        # Convert one line row into a dict of plain Python values.
        record = {}
        for key, cast in column_casts:
            cell = row[key]
            record[key] = cell if cast is None else cast(cell)
        return record

    res = defaultdict(dict)
    for city_id in df_line_data['city_id'].unique().tolist():
        city_rows = df_line_data[df_line_data['city_id'] == city_id]
        city_entry = res[city_id]
        city_entry['city_id'] = city_id
        # City-level columns repeat on every row, so the first row is used.
        city_entry['city_name'] = city_rows['city_name'].iloc[0]
        city_entry['city_line_count'] = int(city_rows['city_line_count'].iloc[0])
        city_entry['city_st_count'] = int(city_rows['city_st_count'].iloc[0])
        city_entry['lines_data'] = [line_record(row) for _, row in city_rows.iterrows()]
    return res
# Build the per-city line dictionary from ads_subway_line_data and display it.
line_data_dict = get_line_data_dict(ads_subway_line_data)
line_data_dict

defaultdict(dict,
            {'131': {'city_id': '131',
              'city_name': '北京',
              'city_line_count': 28,
              'city_st_count': 414,
              'lines_data': [{'line_name_full': '地铁1号线八通线',
                'line_name': '1号线八通线',
                'line_name_branch': '1号线八通线',
                'line_name_main': '1号线八通线',
                'line_color': '#c03935',
                'line_order': 1,
                'st_count': 35,
                'transfer_st_count': 12,
                'transfer_sts': ['公主坟',
                 '军事博物馆',
                 '木樨地',
                 '复兴门',
                 '西单',
                 '王府井',
                 '东单',
                 '建国门',
                 '国贸',
                 '大望路',
                 '花庄',
                 '环球度假区'],
                'transfer_sts_ids': ['131_001_007',
                 '131_001_008',
                 '131_001_009',
                 '131_001_011',
                 '131_001_012',
                

#### 保存json

In [101]:
# Write line_data_dict to json_data/line_data.json as UTF-8 text;
# ensure_ascii=False keeps non-ASCII characters unescaped in the file.
with open("json_data/line_data.json", "w", encoding="utf-8") as f:
    json.dump(line_data_dict, f, ensure_ascii=False, indent=4)

### 线路间换乘站数量矩阵

#### df

In [114]:
def get_line_transfer_matrix(df_city):
    """
    Build the line-by-line matrix of shared transfer-station counts for one city.

    df_city: station rows of a single city. The columns read are
        'st_is_exchange', 'line_order', 'line_name_branch',
        'line_name_main' and 'st_name'.

    Returns a DataFrame whose first column 'line_name_branch' lists the
    lines (ordered by 'line_order'), followed by one column per line.
    Cell (a, b) is the number of transfer stations that lines a and b
    share; the diagonal stays 0.
    """
    # Only rows flagged as exchange stations take part in the count.
    exchange = df_city[df_city['st_is_exchange'] == 1]
    # All line names, ordered by line_order.
    lines = exchange.sort_values(by='line_order')['line_name_branch'].unique()

    # A station counts as a transfer station only when it appears on
    # more than one line.
    lines_per_station = exchange.groupby('st_name')['line_name_branch'].nunique()
    transfer_station_set = set(lines_per_station[lines_per_station > 1].index)

    # Per-line lookups are computed once here instead of re-filtering the
    # DataFrame for every (line1, line2) pair.
    stations_by_line = {}
    main_by_line = {}
    for line in lines:
        line_rows = exchange[exchange['line_name_branch'] == line]
        stations_by_line[line] = set(line_rows['st_name']) & transfer_station_set
        main_by_line[line] = line_rows['line_name_main'].iloc[0]

    line_transfer = pd.DataFrame(0, index=lines, columns=lines)

    # The count is symmetric, so each unordered pair is evaluated once and
    # written to both cells.
    line_names = list(lines)
    for pos, line1 in enumerate(line_names):
        for line2 in line_names[pos + 1:]:
            num_common = len(stations_by_line[line1] & stations_by_line[line2])
            # Branches of the same main line are capped at one shared station.
            if main_by_line[line1] == main_by_line[line2] and num_common > 1:
                num_common = 1
            line_transfer.loc[line1, line2] = num_common
            line_transfer.loc[line2, line1] = num_common

    # Move the index into a leading column named line_name_branch.
    line_transfer = line_transfer.reset_index().rename(columns={'index': 'line_name_branch'})

    return line_transfer

In [117]:
# Pick the station rows of city_id '75' (Chengdu in the output below) as a
# sample city, then display them.
df_city = dwd_subway_st_real[dwd_subway_st_real['city_id'] == '75']
df_city

Unnamed: 0,city_id,city_name,city_name_e,line_name_full,line_name,line_show_label,line_n,line_is_loop,line_label_x,line_label_y,...,uid,line_order,line_name_branch,line_name_main,st_order,st_id_virtual,target_st_id_virtual,target_st_name,target_st_x,target_st_y
3036,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509280,1,1号线(五根松-韦家碾),1号线,1,75_001_001,75_001_002,升仙湖,60.0,-860.0
3037,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509281,1,1号线(五根松-韦家碾),1号线,2,75_001_002,75_001_003,火车北站,-70.0,-720.0
3038,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509282,1,1号线(五根松-韦家碾),1号线,3,75_001_003,75_001_004,人民北路,-120.0,-610.0
3039,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509286,1,1号线(五根松-韦家碾),1号线,4,75_001_004,75_001_005,文殊院,-120.0,-540.0
3040,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509287,1,1号线(五根松-韦家碾),1号线,5,75_001_005,75_001_006,骡马市,-120.0,-390.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928659,18,S3线(资阳线),S3线(资阳线),2,75_018_002,75_018_003,幸福大道,870.0,1340.0
3499,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928661,18,S3线(资阳线),S3线(资阳线),3,75_018_003,75_018_004,苌弘广场,950.0,1340.0
3500,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928662,18,S3线(资阳线),S3线(资阳线),4,75_018_004,75_018_005,宝台,1030.0,1340.0
3501,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928663,18,S3线(资阳线),S3线(资阳线),5,75_018_005,75_018_006,资阳北站,1110.0,1340.0


In [119]:
# Compute the transfer-station matrix for the sample city and display it.
line_transfer_matrix = get_line_transfer_matrix(df_city)
line_transfer_matrix

Unnamed: 0,line_name_branch,1号线(五根松-韦家碾),1号线(科学城-韦家碾),2号线,3号线,4号线,5号线,6号线,7号线,8号线,9号线,10号线,17号线,18号线,19号线,27号线,有轨电车蓉2线(成都西站-郫县西站),有轨电车蓉2线(新业路-仁和),S3线(资阳线)
0,1号线(五根松-韦家碾),0,1,1,1,1,0,1,2,1,1,0,0,3,0,1,0,0,0
1,1号线(科学城-韦家碾),1,0,1,1,1,0,2,2,1,1,0,0,5,0,1,0,0,0
2,2号线,1,1,0,1,1,1,2,2,1,0,0,1,0,0,1,0,1,0
3,3号线,1,1,1,0,1,1,1,2,0,1,4,1,0,1,0,0,0,0
4,4号线,1,1,1,1,0,1,1,2,1,1,0,0,0,1,0,1,0,0
5,5号线,0,0,1,1,1,0,2,2,1,1,1,1,0,1,1,0,0,0
6,6号线,1,2,2,1,1,2,0,2,1,1,0,0,1,1,1,1,0,0
7,7号线,2,2,2,2,2,2,2,0,2,0,1,2,1,0,0,0,0,0
8,8号线,1,1,1,0,1,1,1,2,0,1,0,0,0,1,0,0,0,0
9,9号线,1,1,0,1,1,1,1,0,1,0,1,1,1,0,0,1,0,0


#### dict

In [122]:
def get_line_transfer_dict(df_st_real):
    """
    Collect the line-to-line transfer data of every city.

    df_st_real: the dwd_subway_st_real table.

    Returns a defaultdict keyed by city_id. Each value holds the city's id
    and name, 'line_links' (a list of {'source', 'target', 'value'} dicts)
    and 'line_transfer_matrix' ({'data': [[source, target, value], ...],
    'lines': [...]}). Only non-zero matrix cells are emitted.
    """
    res = defaultdict(dict)
    for city_id in df_st_real['city_id'].unique().tolist():
        df_city = df_st_real[df_st_real['city_id'] == city_id]
        matrix = get_line_transfer_matrix(df_city).set_index('line_name_branch')
        sources = matrix.index.tolist()
        targets = matrix.columns.tolist()

        # Walk the matrix row by row and keep every non-zero cell as a
        # (source line, target line, count) triple.
        triples = [
            (source, target, int(matrix.iloc[r, c]))
            for r, source in enumerate(sources)
            for c, target in enumerate(targets)
            if matrix.iloc[r, c] != 0
        ]

        res[city_id] = {
            'city_id': city_id,
            'city_name': df_city['city_name'].iloc[0],
            'line_links': [
                {'source': source, 'target': target, 'value': value}
                for source, target, value in triples
            ],
            'line_transfer_matrix': {
                'data': [[source, target, value] for source, target, value in triples],
                'lines': sources
            }
        }
    return res
# Preview the transfer dictionary built from the full dwd_subway_st_real table.
get_line_transfer_dict(dwd_subway_st_real)

defaultdict(dict,
            {'131': {'city_id': '131',
              'city_name': '北京',
              'line_links': [{'source': '1号线八通线', 'target': '2号线', 'value': 2},
               {'source': '1号线八通线', 'target': '4号线大兴线', 'value': 1},
               {'source': '1号线八通线', 'target': '5号线', 'value': 1},
               {'source': '1号线八通线', 'target': '7号线', 'value': 2},
               {'source': '1号线八通线', 'target': '8号线', 'value': 1},
               {'source': '1号线八通线', 'target': '9号线', 'value': 1},
               {'source': '1号线八通线', 'target': '10号线', 'value': 2},
               {'source': '1号线八通线', 'target': '14号线', 'value': 1},
               {'source': '1号线八通线', 'target': '16号线', 'value': 1},
               {'source': '2号线', 'target': '1号线八通线', 'value': 2},
               {'source': '2号线', 'target': '3号线', 'value': 1},
               {'source': '2号线', 'target': '4号线大兴线', 'value': 2},
               {'source': '2号线', 'target': '5号线', 'value': 2},
               {'source': '2号线', 'targ

#### 保存json

In [123]:
# Build the transfer dictionary for all cities and write it to
# json_data/line_transfer.json as UTF-8 text with non-ASCII characters unescaped.
line_transfer_dict = get_line_transfer_dict(dwd_subway_st_real)
with open("json_data/line_transfer.json", "w", encoding="utf-8") as f:
    json.dump(line_transfer_dict, f, ensure_ascii=False, indent=4)

### 换乘数量topN

In [124]:
# Display the sample city's station rows again for reference.
df_city

Unnamed: 0,city_id,city_name,city_name_e,line_name_full,line_name,line_show_label,line_n,line_is_loop,line_label_x,line_label_y,...,uid,line_order,line_name_branch,line_name_main,st_order,st_id_virtual,target_st_id_virtual,target_st_name,target_st_x,target_st_y
3036,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509280,1,1号线(五根松-韦家碾),1号线,1,75_001_001,75_001_002,升仙湖,60.0,-860.0
3037,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509281,1,1号线(五根松-韦家碾),1号线,2,75_001_002,75_001_003,火车北站,-70.0,-720.0
3038,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509282,1,1号线(五根松-韦家碾),1号线,3,75_001_003,75_001_004,人民北路,-120.0,-610.0
3039,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509286,1,1号线(五根松-韦家碾),1号线,4,75_001_004,75_001_005,文殊院,-120.0,-540.0
3040,75,成都,chengdu,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),0,16,0,-85.0,420.0,...,752509287,1,1号线(五根松-韦家碾),1号线,5,75_001_005,75_001_006,骡马市,-120.0,-390.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3498,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928659,18,S3线(资阳线),S3线(资阳线),2,75_018_002,75_018_003,幸福大道,870.0,1340.0
3499,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928661,18,S3线(资阳线),S3线(资阳线),3,75_018_003,75_018_004,苌弘广场,950.0,1340.0
3500,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928662,18,S3线(资阳线),S3线(资阳线),4,75_018_004,75_018_005,宝台,1030.0,1340.0
3501,75,成都,chengdu,地铁S3线(资阳线),地铁S3线(资阳线),0,,0,900.0,1380.0,...,75250928663,18,S3线(资阳线),S3线(资阳线),5,75_018_005,75_018_006,资阳北站,1110.0,1340.0


In [127]:
# Display the line-level table that get_line_data_ads returns for the sample city.
get_line_data_ads(df_city)

Unnamed: 0,city_id,city_name,line_name_full,line_name,line_name_branch,line_name_main,line_color,line_order,st_count,transfer_st_count,transfer_sts,transfer_sts_ids,transfer_lines_all,transfer_lines,transfer_lines_count,chart_ratio,city_line_count,city_st_count,transfer_lines_all_count
0,75,成都,地铁1号线(五根松-韦家碾),1号线(五根松-韦家碾),1号线(五根松-韦家碾),1号线,#6B3FB0,1,24,11,"[韦家碾, 火车北站, 人民北路, 骡马市, 天府广场, 省体育馆, 倪家桥, 火车南站, ...","[75_001_001, 75_001_003, 75_001_004, 75_001_00...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 6号...","[1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 6号线, 7号线, 8号线, 9...",10,0.99,18,366,259
1,75,成都,地铁1号线(科学城-韦家碾),1号线(科学城-韦家碾),1号线(科学城-韦家碾),1号线,#6B3FB0,2,33,13,"[韦家碾, 火车北站, 人民北路, 骡马市, 天府广场, 省体育馆, 倪家桥, 火车南站, ...","[75_002_001, 75_002_003, 75_002_004, 75_002_00...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 6号...","[1号线(五根松-韦家碾), 2号线, 3号线, 4号线, 6号线, 7号线, 8号线, 9...",10,0.99,18,366,259
2,75,成都,地铁2号线,2号线,2号线,2号线,#F28A6C,3,32,11,"[犀浦, 天河路, 羊犀立交, 一品天下, 中医大省医院, 人民公园, 天府广场, 春熙路,...","[75_003_001, 75_003_002, 75_003_008, 75_003_00...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 3号线, 4号线, 5号线, 6号...",11,0.99,18,366,305
3,75,成都,地铁3号线,3号线,3号线,3号线,#CF5287,4,37,12,"[双流西站, 龙桥路, 武青南路, 太平园, 红牌楼, 高升桥, 省体育馆, 春熙路, 市二...","[75_004_001, 75_004_007, 75_004_009, 75_004_01...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 4号线, 5号线, 6号...",11,0.99,18,366,286
4,75,成都,地铁4号线,4号线,4号线,4号线,#48C387,5,30,9,"[凤溪河, 成都西站, 文化宫, 中医大省医院, 骡马市, 市二医院, 玉双路, 双桥路, ...","[75_005_003, 75_005_012, 75_005_014, 75_005_01...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 5号线, 6号...",11,0.99,18,366,303
5,75,成都,地铁5号线,5号线,5号线,5号线,#AA6CA1,6,41,11,"[洞子口, 北站西二路, 西北桥, 中医大省医院, 省骨科医院, 高升桥, 九兴大道, 神仙...","[75_006_013, 75_006_017, 75_006_018, 75_006_02...","[2号线, 3号线, 4号线, 5号线, 6号线, 7号线, 8号线, 9号线, 10号线,...","[2号线, 3号线, 4号线, 6号线, 7号线, 8号线, 9号线, 10号线, 17号线...",11,0.99,18,366,302
6,75,成都,地铁6号线,6号线,6号线,6号线,#CB9764,7,56,15,"[望丛祠, 犀浦, 金府, 西南交大, 西北桥, 人民北路, 前锋路, 玉双路, 牛王庙, ...","[75_007_001, 75_007_009, 75_007_016, 75_007_01...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...",13,0.99,18,366,323
7,75,成都,地铁7号线内环,7号线,7号线,7号线,#94DEE5,8,31,16,"[高朋大道, 神仙树, 火车南站, 琉璃场, 成都东客站, 槐树店, 理工大学, 二仙桥, ...","[75_008_001, 75_008_002, 75_008_003, 75_008_00...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 3号线, 4号线, 5号...",11,0.99,18,366,296
8,75,成都,地铁8号线,8号线,8号线,8号线,#B5D340,9,32,9,"[理工大学, 双桥路, 东大路, 东光, 倪家桥, 九兴大道, 高朋大道, 三元, 龙港]","[75_009_008, 75_009_012, 75_009_013, 75_009_01...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 4号线, 5号线, 6号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 2号线, 4号线, 5号线, 6号...",9,1.03,18,366,255
9,75,成都,地铁9号线,9号线,9号线,9号线,#EFC34C,10,13,8,"[成都西站, 机投桥, 武青南路, 华兴, 三元, 锦城大道, 孵化园, 金融城东]","[75_010_002, 75_010_004, 75_010_005, 75_010_00...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 3号线, 4号线, 5号线, 6号...","[1号线(五根松-韦家碾), 1号线(科学城-韦家碾), 3号线, 4号线, 5号线, 6号...",11,0.99,18,366,281


In [135]:
def get_transer_stats(df_st_real):
    """
    Compute per-city transfer rankings.

    df_st_real: the dwd_subway_st_real table.

    For every city the result contains:
    1. 'line_transfer_line_count': the 10 lines with the most transfer lines.
    2. 'line_transfer_st_count': the 10 lines with the most transfer stations.
    3. 'st_transfer_lines_count': the 10 stations with the most adjacent
       stations, with each station's number of lines among exchange rows.

    Returns a defaultdict keyed by city_id.
    """
    top_n = 10
    # (output key, column to rank by); the second ranking sorts the frame
    # already ordered by the first one, so the order of this tuple matters.
    rankings = (
        ('line_transfer_line_count', 'transfer_lines_count'),
        ('line_transfer_st_count', 'transfer_st_count'),
    )

    res = defaultdict(dict)
    for city_id in df_st_real['city_id'].unique().tolist():
        df_city = df_st_real[df_st_real['city_id'] == city_id]
        line_df = get_line_data_ads(df_city)

        city_stats = defaultdict(dict)
        city_stats['city_id'] = line_df['city_id'].iloc[0]
        city_stats['city_name'] = line_df['city_name'].iloc[0]

        # Line-level transfer rankings.
        line_df = line_df[['line_name_branch', 'transfer_lines_count', 'transfer_st_count']]
        for out_key, sort_col in rankings:
            line_df = line_df.sort_values(by=sort_col, ascending=False).reset_index(drop=True)
            city_stats[out_key] = {
                'lines': line_df['line_name_branch'].tolist()[:top_n],
                'line_counts': line_df['transfer_lines_count'].tolist()[:top_n],
                'st_counts': line_df['transfer_st_count'].tolist()[:top_n]
            }

        # Station-level ranking: degree in the undirected graph whose edges
        # are the (st_name, target_st_name) pairs of the city.
        edges = df_city[['st_name', 'target_st_name']].dropna()
        graph = nx.from_pandas_edgelist(
            edges, source='st_name', target='target_st_name', create_using=nx.Graph()
        )
        degree_by_station = dict(graph.degree())
        busiest = sorted(
            degree_by_station.items(), key=lambda item: item[1], reverse=True
        )[:top_n]

        # Number of distinct lines per station, counted over exchange rows only.
        exchange_rows = df_city[df_city['st_is_exchange'] == 1]
        lines_per_station = exchange_rows.groupby('st_name')['line_name_branch'].nunique().to_dict()

        city_stats['st_transfer_lines_count'] = {
            'st_names': [name for name, _ in busiest],
            'transfer_lines_counts': [lines_per_station.get(name, 0) for name, _ in busiest],
            'degrees': [degree for _, degree in busiest]
        }
        res[city_id] = city_stats

    return res

In [136]:
# Compute the transfer rankings for all cities and write them to
# json_data/transfer_stats.json as UTF-8 text with non-ASCII characters unescaped.
transer_stats = get_transer_stats(dwd_subway_st_real)
with open("json_data/transfer_stats.json", "w", encoding="utf-8") as f:
    json.dump(transer_stats, f, ensure_ascii=False, indent=4)
