In [5]:
import pandas as pd
df_T = pd.read_csv('./data/weather/heat.csv')
df_BSU = pd.read_csv('./data/processed_bsu/combined_peak_non_peak.csv')
df_v = pd.read_csv('./data/variables/variables_constant.csv')
df_panel = df_T.merge(df_BSU, on='ID', how='inner').merge(df_v, on='ID', how='inner')
df_panel[['non_peak_4_wd', 'non_peak_4_we', 'non_peak_7_wd', 'non_peak_7_we']] /= 7
df_panel[['peak_7_we', 'peak_4_wd', 'peak_4_we', 'peak_7_wd']] /= 6
df_panel.columns


Index(['ID', '4_peak_mean', '4_non_mean', '7_peak_mean', '7_non_mean',
       'non_peak_4_wd', 'peak_4_wd', 'non_peak_4_we', 'peak_4_we',
       'non_peak_7_wd', 'peak_7_wd', 'non_peak_7_we', 'peak_7_we',
       'bus distance', 'POI diversity', 'NDVI', 'road density',
       'building density', 'BVI', 'SVI', 'GVI', 'VNMI', 'VHI',
       'metro distance', 'slope', 'AQI'],
      dtype='object')

In [9]:
columns_to_analyze = ['POI diversity',
    '4_peak_mean', '4_non_mean', '7_peak_mean', '7_non_mean',
    'non_peak_4_wd', 'peak_4_wd', 'non_peak_4_we', 'peak_4_we',
    'non_peak_7_wd', 'peak_7_wd', 'non_peak_7_we', 'peak_7_we'
]

# 计算均值和标准差
stats_df = df_panel[columns_to_analyze].agg(['mean', 'std'])

# 打印结果
print(stats_df)

      POI diversity  4_peak_mean  4_non_mean  7_peak_mean  7_non_mean  \
mean       0.671678    28.369484   57.046532    63.689565   72.653198   
std        0.629833     2.188049    2.469380     4.959599    4.406289   

      non_peak_4_wd  peak_4_wd  non_peak_4_we  peak_4_we  non_peak_7_wd  \
mean      15.683505  35.410046      24.009637  30.562393      10.329242   
std       20.629254  46.412591      29.974163  39.129642      13.408453   

      peak_7_wd  non_peak_7_we  peak_7_we  
mean  24.639040      10.918400  16.185897  
std   32.101697      14.098195  20.556886  


In [11]:
# 定义要分配的基础列
common_cols = [
    'bus distance', 'POI diversity', 'NDVI', 'road density',
    'building density', 'BVI', 'SVI', 'GVI', 'VNMI', 'VHI',
    'metro distance', 'slope', 'AQI'
]

# 定义每个子 DataFrame 的组合
df_list = [
    df_panel[['4_peak_mean', 'peak_4_wd'] + common_cols].rename(
        columns={'4_peak_mean': 'Tmrt', 'peak_4_wd': 'bike-sharing usage'}
    ),
    df_panel[['4_peak_mean', 'peak_4_we'] + common_cols].rename(
        columns={'4_peak_mean': 'Tmrt', 'peak_4_we': 'bike-sharing usage'}
    ),
    df_panel[['7_peak_mean', 'peak_7_wd'] + common_cols].rename(
        columns={'7_peak_mean': 'Tmrt', 'peak_7_wd': 'bike-sharing usage'}
    ),
    df_panel[['7_peak_mean', 'peak_7_we'] + common_cols].rename(
        columns={'7_peak_mean': 'Tmrt', 'peak_7_we': 'bike-sharing usage'}
    ),
    df_panel[['4_non_mean', 'non_peak_4_we'] + common_cols].rename(
        columns={'4_non_mean': 'Tmrt', 'non_peak_4_we': 'bike-sharing usage'}
    ),
    df_panel[['4_non_mean', 'non_peak_4_wd'] + common_cols].rename(
        columns={'4_non_mean': 'Tmrt', 'non_peak_4_wd': 'bike-sharing usage'}
    ),
    df_panel[['7_non_mean', 'non_peak_7_wd'] + common_cols].rename(
        columns={'7_non_mean': 'Tmrt', 'non_peak_7_wd': 'bike-sharing usage'}
    ),
    df_panel[['7_non_mean', 'non_peak_7_we'] + common_cols].rename(
        columns={'7_non_mean': 'Tmrt', 'non_peak_7_we': 'bike-sharing usage'}
    )
]

# 可通过 df_list[0], df_list[1], ... 访问各个子 DataFrame


In [15]:
# 定义导出文件名（使用对应列名进行命名）
file_names = [
    'peak_4_wd.csv', 'peak_4_we.csv',
    'peak_7_wd.csv', 'peak_7_we.csv',
    'non_peak_4_we.csv', 'non_peak_4_wd.csv',
    'non_peak_7_wd.csv', 'non_peak_7_we.csv'
]

# 导出为 CSV 文件
for i, df in enumerate(df_list):
    df.to_csv(f'./data/variables/{file_names[i]}', index=False)


In [23]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

# 定义目录路径
plot_dir = './data/panel/'

# 获取所有 CSV 文件
csv_files = [f for f in os.listdir(plot_dir) if f.endswith('.csv')]

# 初始化结果字典
results = {
    'Model': ['OLS', '', 'GBR', '', 'MLP', '', 'XGB', ''],
}

# 循环读取 CSV 文件并进行建模
for file in csv_files:
    file_path = os.path.join(plot_dir, file)
    print(f"Processing file: {file_path}")

    # 加载数据
    dfj = pd.read_csv(file_path)

    # 删除 'bike-sharing usage' 为0的行
    dfj = dfj[dfj['bike-sharing usage'] > 0]

    # 计算 log_bike-sharing usage 列
    dfj['log_bike-sharing usage'] = np.log(dfj['bike-sharing usage'])

    # 定义自变量
    X = dfj[['Tmrt','POI diversity','road density', 'building density', 'bus distance', 'metro distance', 'BVI', 'SVI', 'GVI', 
             'VNMI', 'VHI', 'slope','NDVI' ,'AQI']]

    # 定义因变量
    y = dfj['log_bike-sharing usage']

    # 标准化自变量，并确保索引对齐
    scaler = StandardScaler()
    X_standardized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # 重置索引，保持自变量和因变量索引一致
    y = y.reset_index(drop=True)
    X_standardized = X_standardized.reset_index(drop=True)

    # 1. OLS回归
    X_with_const = add_constant(X_standardized)
    ols_model = sm.OLS(y, X_with_const).fit()
    ols_pred = ols_model.predict(X_with_const)
    ols_r2 = r2_score(y, ols_pred)
    ols_mse = mean_squared_error(y, ols_pred)

    # 2. MLP回归
    mlp_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
    mlp_model.fit(X_standardized, y)
    mlp_pred = mlp_model.predict(X_standardized)
    mlp_r2 = r2_score(y, mlp_pred)
    mlp_mse = mean_squared_error(y, mlp_pred)

    # 3. GBR回归
    gbr_model = GradientBoostingRegressor(n_estimators=20, random_state=42)
    gbr_model.fit(X_standardized, y)
    gbr_pred = gbr_model.predict(X_standardized)
    gbr_r2 = r2_score(y, gbr_pred)
    gbr_mse = mean_squared_error(y, gbr_pred)

    # 4. XGBoost回归
    xgb_model = XGBRegressor(n_estimators=20, random_state=42)
    xgb_model.fit(X_standardized, y)
    xgb_pred = xgb_model.predict(X_standardized)
    xgb_r2 = r2_score(y, xgb_pred)
    xgb_mse = mean_squared_error(y, xgb_pred)

    # 将结果存入结果字典
    results[file] = [
        ols_r2, ols_mse,
        gbr_r2, gbr_mse,
        mlp_r2, mlp_mse,
        xgb_r2, xgb_mse
    ]

# 将结果转换为 DataFrame
results_df = pd.DataFrame(results)

# # 显示结果
# import ace_tools as tools
# tools.display_dataframe_to_user(name="Model Results", dataframe=results_df)




Processing file: ./data/panel/peak_7_wd.csv
Processing file: ./data/panel/peak_7_we.csv
Processing file: ./data/panel/non_peak_4_we.csv
Processing file: ./data/panel/non_peak_4_wd.csv
Processing file: ./data/panel/peak_4_we.csv
Processing file: ./data/panel/peak_4_wd.csv
Processing file: ./data/panel/non_peak_7_wd.csv
Processing file: ./data/panel/non_peak_7_we.csv


In [27]:
results_df.to_csv('./data/panel/model.csv',index=False)