In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

In [3]:
def create_temporal_data(n_days=365, n_hours_per_day=24, n_lines=4):
    """
    创建时间序列数据，包括天气、节假日、活动等时间相关特征
    实际使用时请注意自行替换这里！！！

    参数:
    - n_days: 天数
    - n_hours_per_day: 每天的小时数
    - n_lines: 地铁线路数量

    返回:
    - 包含时间特征的DataFrame
    """
    # 创建日期范围
    start_date = datetime(2022, 1, 1)
    dates = [start_date + timedelta(hours=h) for h in range(n_days * n_hours_per_day)]

    # 创建DataFrame
    time_df = pd.DataFrame({
        'datetime': dates,
        'hour': [d.hour for d in dates],
        'day': [d.day for d in dates],
        'month': [d.month for d in dates],
        'day_of_week': [d.weekday() for d in dates],  # 0-6，周一至周日
        'is_weekend': [1 if d.weekday() >= 5 else 0 for d in dates],
        'is_rush_hour': [1 if h in [7, 8, 9, 17, 18, 19] else 0 for h in [d.hour for d in dates]]  # 高峰时段
    })

    # 添加温度数据（摄氏度）- 有季节性变化和每日波动
    base_temp = np.array([
        -5, -3, 0, 10, 18, 25, 28, 26, 20, 12, 5, -2  # 每月基础温度
    ])

    monthly_temp = np.array([base_temp[month - 1] for month in time_df['month']])
    daily_variation = 5 * np.sin(np.pi * time_df['hour'] / 12)  # 日温差约10°C
    random_variation = np.random.normal(0, 2, len(time_df))  # 随机波动

    time_df['temperature'] = monthly_temp + daily_variation + random_variation

    # 添加降水概率（根据月份和温度）
    base_precip_prob = np.array([
        0.2, 0.2, 0.3, 0.4, 0.3, 0.2, 0.1, 0.1, 0.2, 0.3, 0.3, 0.2  # 每月基础降水概率
    ])

    monthly_precip_prob = np.array([base_precip_prob[month - 1] for month in time_df['month']])
    temp_effect = -0.01 * np.maximum(0, time_df['temperature'])  # 温度越高，降水概率越低

    time_df['precipitation_prob'] = np.clip(monthly_precip_prob + temp_effect + np.random.normal(0, 0.05, len(time_df)),
                                            0, 1)

    # 添加实际降水量（毫米）
    precip_occurrence = np.random.binomial(1, time_df['precipitation_prob'])
    precip_amount = np.random.exponential(5, len(time_df)) * precip_occurrence
    time_df['precipitation'] = precip_amount

    # 添加空气质量指数 (AQI)
    base_aqi = np.array([
        100, 95, 85, 75, 70, 65, 70, 75, 80, 85, 90, 95  # 每月基础AQI
    ])

    monthly_aqi = np.array([base_aqi[month - 1] for month in time_df['month']])
    # 雨天空气质量更好，温度高时空气质量可能更差
    weather_effect = -20 * np.clip(time_df['precipitation'], 0, 1) + 0.5 * np.maximum(0, time_df['temperature'])

    time_df['aqi'] = np.clip(monthly_aqi + weather_effect + np.random.normal(0, 10, len(time_df)), 0, 300)

    # 标记节假日（部分）
    holidays = [
        (1, 1),  # 元旦
        (2, 1),  # 春节（简单搞搞了~）
        (4, 5),  # 清明
        (5, 1),  # 劳动节
        (10, 1),  # 国庆节
    ]

    time_df['is_holiday'] = 0
    for month, day in holidays:
        holiday_mask = (time_df['month'] == month) & (time_df['day'] == day)
        time_df.loc[holiday_mask, 'is_holiday'] = 1

    # 为节假日前后添加工作日调整和假期效应
    for i, (month, day) in enumerate(holidays):
        # 找到假日的索引
        holiday_indices = time_df[
            (time_df['month'] == month) &
            (time_df['day'] == day)
            ].index

        for idx in holiday_indices:
            # 假期效应持续3天
            for j in range(1, 72):  # 3天 * 24小时
                if idx + j < len(time_df):
                    time_df.loc[idx + j, 'is_holiday'] = max(0, 1 - j / 72)

            # 假期前效应
            for j in range(1, 24):  # 假期前1天
                if idx - j >= 0:
                    time_df.loc[idx - j, 'is_holiday'] = max(0, 0.5 - j / 48)

    # 添加特殊活动（如音乐会、体育赛事等）
    time_df['special_event'] = 0

    # 随机生成一些特殊活动，主要在晚上和周末
    n_events = n_days // 10  # 平均每10天一个特殊活动

    for _ in range(n_events):
        # 这里我选的随机日期，偏向于周末
        random_idx = np.random.choice(
            time_df.index,
            p=np.array(
                [3 if h in [18, 19, 20] and w >= 5 else 1 for h, w in zip(time_df['hour'], time_df['day_of_week'])]) /
              sum([3 if h in [18, 19, 20] and w >= 5 else 1 for h, w in zip(time_df['hour'], time_df['day_of_week'])])
        )

        # 特殊活动通常持续3-5小时
        event_duration = np.random.randint(3, 6)

        for i in range(event_duration):
            if random_idx + i < len(time_df):
                time_df.loc[random_idx + i, 'special_event'] = 1

    # 添加维护计划信息 - 每条线路单独维护
    for line in range(1, n_lines + 1):
        time_df[f'maintenance_line_{line}'] = 0

        # 每条线路每月进行一次维护
        for month in range(1, 13):
            # 选择一个工作日的午夜进行维护
            maintenance_candidates = time_df[
                (time_df['month'] == month) &
                (time_df['day_of_week'] < 5) &  # 工作日
                (time_df['hour'] >= 0) & (time_df['hour'] < 5)  # 午夜至凌晨5点
                ]

            if len(maintenance_candidates) > 0:
                # 随机选择一个时间点
                maintenance_idx = np.random.choice(maintenance_candidates.index)

                # 维护通常持续3-5小时
                maintenance_duration = np.random.randint(3, 6)

                for i in range(maintenance_duration):
                    if maintenance_idx + i < len(time_df):
                        time_df.loc[maintenance_idx + i, f'maintenance_line_{line}'] = 1

    # 添加历史故障记录 - 每条线路单独记录
    for line in range(1, n_lines + 1):
        time_df[f'failures_line_{line}'] = 0

        # 随机生成一些故障事件，故障通常在高峰期或极端天气时更容易发生
        n_failures = n_days // 30  # 平均每30天一次故障

        for _ in range(n_failures):
            # 故障更可能在高峰期或极端天气条件下发生
            failure_weights = (time_df['is_rush_hour'] * 2 +
                               np.maximum(0, (time_df['temperature'] - 30) / 10) +  # 高温
                               np.maximum(0, (0 - time_df['temperature']) / 10) +  # 低温
                               np.clip(time_df['precipitation'], 0, 1) * 3)  # 降水

            failure_weights = failure_weights / sum(failure_weights)

            # 选择一个时间点
            failure_idx = np.random.choice(time_df.index, p=failure_weights)

            # 故障通常持续1-3小时
            failure_duration = np.random.randint(1, 4)

            for i in range(failure_duration):
                if failure_idx + i < len(time_df):
                    time_df.loc[failure_idx + i, f'failures_line_{line}'] = 1

    # 添加客流量变化 - 每条线路有些微差异
    for line in range(1, n_lines + 1):
        # 基础客流模式 - 工作日的早晚高峰
        time_df[f'passenger_flow_line_{line}'] = 0

        for idx, row in time_df.iterrows():
            base_flow = 0
            # 工作日模式
            if row['day_of_week'] < 5:  # 周一至周五
                if 7 <= row['hour'] <= 9:  # 早高峰
                    base_flow = 0.8 + 0.2 * np.sin((row['hour'] - 7) * np.pi / 2)
                elif 17 <= row['hour'] <= 19:  # 晚高峰
                    base_flow = 0.9 + 0.1 * np.sin((row['hour'] - 17) * np.pi / 2)
                elif 10 <= row['hour'] <= 16:  # 工作时段
                    base_flow = 0.4
                else:  # 夜间
                    base_flow = 0.1 * np.exp(-(row['hour'] - 20) ** 2 / 20) if row['hour'] >= 20 else 0.1
            else:  # 周末模式
                if 10 <= row['hour'] <= 18:  # 白天活动
                    base_flow = 0.5 + 0.2 * np.sin((row['hour'] - 10) * np.pi / 8)
                else:  # 其他时段
                    base_flow = 0.2

            # 添加假期效应
            if row['is_holiday'] > 0:
                base_flow *= 0.7  # 假期客流减少

            # 添加天气效应
            if row['precipitation'] > 1:  # 下雨
                base_flow *= 0.9  # 雨天客流减少

            # 添加特殊活动效应
            if row['special_event'] > 0:
                base_flow *= 1.3  # 特殊活动客流增加

            # 每条线路有些微差异
            line_variation = 1 + 0.1 * np.sin(line * np.pi / 4)  # 线路间的差异

            time_df.loc[idx, f'passenger_flow_line_{line}'] = base_flow * line_variation

    return time_df

In [4]:
def build_lstm_model(input_shape, output_units=64):
    """
    构建LSTM模型用于提取时间特征
    自行调参训练

    参数:
    - input_shape: 输入数据的形状，例如(sequence_length, n_features)
    - output_units: 输出特征的维度

    返回:
    - LSTM特征提取模型
    """
    # 定义输入
    inputs = Input(shape=input_shape)

    # 第一个LSTM层
    x = LSTM(128, return_sequences=True)(inputs)
    x = Dropout(0.2)(x)

    # 第二个LSTM层
    x = LSTM(64)(x)
    x = Dropout(0.2)(x)

    # 最终特征向量
    outputs = Dense(output_units, activation='relu', name='temporal_features')(x)

    # 创建模型
    model = Model(inputs=inputs, outputs=outputs, name='lstm_time_feature_extractor')

    return model

In [5]:
def prepare_sequences(time_df, sequence_length=24, n_lines=4):
    """
    将时间数据准备为序列用于LSTM模型

    参数:
    - time_df: 包含时间特征的DataFrame
    - sequence_length: 序列长度（过去多少个时间点）
    - n_lines: 线路数量

    返回:
    - 特征序列
    """
    # 选择要使用的基础特征
    base_features = [
        'hour', 'day_of_week', 'is_weekend', 'is_rush_hour',
        'temperature', 'precipitation_prob', 'precipitation',
        'aqi', 'is_holiday', 'special_event'
    ]

    # 添加线路特定特征
    line_features = []
    for line in range(1, n_lines + 1):
        line_features.extend([
            f'maintenance_line_{line}',
            f'failures_line_{line}',
            f'passenger_flow_line_{line}'
        ])

    # 合并所有特征
    features = base_features + line_features

    # 归一化/标准化特征
    normalized_df = time_df.copy()
    normalized_df['hour'] = normalized_df['hour'] / 23  # 0-23 -> 0-1
    normalized_df['day_of_week'] = normalized_df['day_of_week'] / 6  # 0-6 -> 0-1
    normalized_df['temperature'] = (normalized_df['temperature'] - (-10)) / 40  # 假设温度范围 -10 到 30
    normalized_df['aqi'] = normalized_df['aqi'] / 300  # 0-300 -> 0-1

    # 转换为numpy数组
    feature_data = normalized_df[features].values

    # 创建序列
    sequences = []
    for i in range(len(feature_data) - sequence_length):
        sequences.append(feature_data[i:i + sequence_length])

    return np.array(sequences)

In [6]:
def extract_temporal_features(sequences, output_units=64):
    """
    使用LSTM模型提取时间特征
    实际中注意先训练模型获得参数，这里直接用的随机初始化设定的参数

    参数:
    - sequences: 时间序列特征
    - output_units: 输出特征的维度

    返回:
    - 时间特征向量
    """
    # 构建LSTM模型
    sequence_length, n_features = sequences.shape[1], sequences.shape[2]
    lstm_model = build_lstm_model((sequence_length, n_features), output_units)

    # 提取特征
    features = lstm_model.predict(sequences)

    return features

In [7]:
def plot_temporal_data(time_df, n_lines=4):
    """
    可视化时间数据的一些关键特征

    参数:
    - time_df: 包含时间特征的DataFrame
    - n_lines: 线路数量
    """
    # 选择一个较短的时间段进行可视化（例如前7天）
    plot_df = time_df[:7 * 24].copy()
    plot_df['date'] = plot_df['datetime'].dt.date
    plot_df['hour_of_day'] = plot_df['datetime'].dt.hour

    # 创建图表
    fig, axes = plt.subplots(4, 1, figsize=(14, 16))

    # 温度和降水
    ax1 = axes[0]
    ax1.plot(plot_df['datetime'], plot_df['temperature'], 'r-', label='Temperature')
    ax1.set_ylabel('Temperature (°C)', color='r')
    ax1.tick_params(axis='y', labelcolor='r')

    ax1_twin = ax1.twinx()
    ax1_twin.plot(plot_df['datetime'], plot_df['precipitation'], 'b-', label='Precipitation')
    ax1_twin.set_ylabel('Precipitation (mm)', color='b')
    ax1_twin.tick_params(axis='y', labelcolor='b')
    ax1.set_title('Temperature and Precipitation')

    # 客流量 - 所有线路
    ax2 = axes[1]
    for line in range(1, n_lines + 1):
        ax2.plot(plot_df['datetime'], plot_df[f'passenger_flow_line_{line}'],
                 label=f'Line {line}')

    ax2.set_title('Passenger Flow by Line')
    ax2.legend()

    # 维护计划
    ax3 = axes[2]
    for line in range(1, n_lines + 1):
        if plot_df[f'maintenance_line_{line}'].sum() > 0:  # 只绘制有维护的线路
            ax3.scatter(plot_df[plot_df[f'maintenance_line_{line}'] > 0]['datetime'],
                        [line] * plot_df[f'maintenance_line_{line}'].sum(),
                        label=f'Line {line}', s=50)

    ax3.set_yticks(range(1, n_lines + 1))
    ax3.set_title('Maintenance Schedule by Line')
    ax3.set_ylabel('Line Number')

    # 故障记录
    ax4 = axes[3]
    for line in range(1, n_lines + 1):
        if plot_df[f'failures_line_{line}'].sum() > 0:  # 只绘制有故障的线路
            ax4.scatter(plot_df[plot_df[f'failures_line_{line}'] > 0]['datetime'],
                        [line] * plot_df[f'failures_line_{line}'].sum(),
                        label=f'Line {line}', marker='x', s=100, color='red')

    ax4.set_yticks(range(1, n_lines + 1))
    ax4.set_title('Failures by Line')
    ax4.set_ylabel('Line Number')

    plt.tight_layout()
    plt.show()


In [9]:
def main():
    # 创建时间序列数据
    n_lines = 4  # 与地铁网络数据一致
    time_df = create_temporal_data(n_days=90, n_lines=n_lines)  # 使用90天的数据作为示例

    print("时间序列数据形状:", time_df.shape)
    print("时间序列数据列:", time_df.columns.tolist())
    print("\n前5行时间数据:")
    print(time_df[['datetime', 'hour', 'day_of_week', 'temperature', 'precipitation', 'is_holiday']].head())

    # 准备序列数据
    sequence_length = 24  # 使用过去24小时的数据预测
    sequences = prepare_sequences(time_df, sequence_length, n_lines)

    print("\n时间序列特征形状:", sequences.shape)
    print(
        f"共有 {sequences.shape[0]} 个序列，每个序列长度为 {sequences.shape[1]}，每个时间点有 {sequences.shape[2]} 个特征")

    # 提取时间特征
    temporal_features = extract_temporal_features(sequences)

    print("\n提取的时间特征形状:", temporal_features.shape)
    print("时间特征示例（第一个样本的前10个特征）:", temporal_features[0, :10])

    # 显示LSTM模型摘要
    sequence_length, n_features = sequences.shape[1], sequences.shape[2]
    model = build_lstm_model((sequence_length, n_features))
    model.summary()

    # 可视化部分数据
    # plot_temporal_data(time_df, n_lines)  # 实际运行时可以启用

    # print("\n改进的多线路时间特征提取的优势:")
    # print("1. 为每条线路单独捕获维护计划、故障历史和客流量变化")
    # print("2. 支持对换乘站和多线路经过的站点进行更精细的分析")
    # print("3. 可以捕获不同线路之间的相互影响和关联性")
    # print("4. 特征提取能力更强，可以处理复杂的时间模式和线路特异性")
    # print("5. 结构化的时间特征适合与空间特征融合进行综合分析")

In [10]:
if __name__ == "__main__":
    main()

  time_df.loc[idx + j, 'is_holiday'] = max(0, 1 - j / 72)
  time_df.loc[idx, f'passenger_flow_line_{line}'] = base_flow * line_variation
  time_df.loc[idx, f'passenger_flow_line_{line}'] = base_flow * line_variation
  time_df.loc[idx, f'passenger_flow_line_{line}'] = base_flow * line_variation
  time_df.loc[idx, f'passenger_flow_line_{line}'] = base_flow * line_variation


时间序列数据形状: (2160, 25)
时间序列数据列: ['datetime', 'hour', 'day', 'month', 'day_of_week', 'is_weekend', 'is_rush_hour', 'temperature', 'precipitation_prob', 'precipitation', 'aqi', 'is_holiday', 'special_event', 'maintenance_line_1', 'maintenance_line_2', 'maintenance_line_3', 'maintenance_line_4', 'failures_line_1', 'failures_line_2', 'failures_line_3', 'failures_line_4', 'passenger_flow_line_1', 'passenger_flow_line_2', 'passenger_flow_line_3', 'passenger_flow_line_4']

前5行时间数据:
             datetime  hour  day_of_week  temperature  precipitation  \
0 2022-01-01 00:00:00     0            5    -3.146372       0.000000   
1 2022-01-01 01:00:00     1            5    -3.533521       0.000000   
2 2022-01-01 02:00:00     2            5    -3.352901       0.000000   
3 2022-01-01 03:00:00     3            5    -0.527194       0.000000   
4 2022-01-01 04:00:00     4            5    -1.239951       8.827457   

   is_holiday  
0    0.020833  
1    0.041667  
2    0.062500  
3    0.083333  
4    0.10