In [2]:
import pandas as pd
import numpy as np
# Reproducibly draw 20 distinct seeds from range(10000) and show the one in use.
np.random.seed(0)  # fixed global seed so the same 20 values come out on every run
random_seeds = np.random.choice(10000, replace=False, size=20)
n = 19  # position inside random_seeds of the seed selected for this run
# Echo, in order: the full seed list, the chosen index, and the chosen seed.
for shown in (random_seeds, n, random_seeds[n]):
    print(shown)

[9394  898 2398 5906 2343 8225 5506 6451 2670 3497 1087 1819 2308 6084
 3724 3184 6387 3728 2702 7883]
19
7883


In [12]:
import geopandas as gpd
# Path to the input grid shapefile.
# NOTE: despite the variable name `grid_folder`, this is the path of a single
# .shp file, not a folder.
grid_folder = r'/content/city2016_lst_ratio_grid_480m_bcr_bhv_ndvi_svf_ev_distbp_distmt_distwb_wr_xy.shp'
# Read the shapefile with geopandas and print its text representation.
data = gpd.read_file(grid_folder)
print(data)


                                               geometry
0     POLYGON ((180636.136 449880.582, 180633.582 45...
1     POLYGON ((180633.582 450360.745, 180631.028 45...
2     POLYGON ((180631.028 450840.909, 180628.474 45...
3     POLYGON ((180628.474 451321.073, 180625.919 45...
4     POLYGON ((181116.3 449883.135, 181113.746 4503...
...                                                 ...
1856  POLYGON ((215208.568 450064.457, 215206.014 45...
1857  POLYGON ((215206.014 450544.639, 215203.46 451...
1858  POLYGON ((215203.46 451024.82, 215200.905 4515...
1859  POLYGON ((215200.905 451505.002, 215198.35 451...
1860  POLYGON ((215198.35 451985.184, 215195.795 452...

[1861 rows x 1 columns]


In [1]:
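# This cell fits a gradient-boosted regression tree model (GradientBoostingRegressor, GBDT) to the geographic grid data, tunes it with a randomized hyper-parameter search, and records feature importances plus train/test R2 and RMSE. Note: PartialDependenceDisplay is imported below, but this cell does not produce any partial dependence plots (PDP).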
import geopandas as gpd
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import PartialDependenceDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from scipy.stats import randint, uniform, loguniform
import joblib
joblib.parallel.BatchCompletionCallBack = lambda *args, **kwargs: None

# Directory that is scanned for input shapefiles. All outputs are written to a
# "Machine Learning" sub-folder inside it.
grid_folder = r'/content'
# The output folder path does not depend on any loop variable, so build it once.
folder = os.path.join(grid_folder, r'Machine Learning')
#####################################################################
# Draw 20 distinct candidate seeds for the train/test split.
np.random.seed(0)  # fixed so the seed list is reproducible
random_seeds = np.random.choice(10000, size=20, replace=False)
print(random_seeds)
n = 19  # index into random_seeds of the single seed used by this run
print(n)
print(random_seeds[n])

for year in [2016]:
    target_vars = [f'nor_{year}', f'ext_{year}', f'hr_{year}']
    # Order matters: it fixes the column order of X, which is what pairs each
    # name with its entry in feature_importances_ further down.
    explanatory_vars = ['BCR', 'BHV',  'SVF', 'NDVI', 'EV', 'WR', 'Dist_W', 'Dist_P', 'Dist_M','X','Y']
    required_cols = target_vars + explanatory_vars

    # Accumulated result rows (written to Excel after all targets are fitted).
    all_results = []    # one row per (target, seed, feature)
    r2_comparison = []  # one row per (target, seed)

    # Search space for RandomizedSearchCV. scipy's uniform(loc, scale) samples
    # from [loc, loc + scale]; randint(low, high) excludes `high`.
    param_dist = {
        'n_estimators': [4168],                      # fixed
        'learning_rate': loguniform(0.002, 0.355),   # [0.002, 0.355], log-uniform
        'subsample': uniform(0.545, 0.413),          # [0.545, 0.958]
        'max_depth': randint(5, 14),                 # integers 5..13
        'min_samples_split': [2],                    # fixed
        'max_features': uniform(0.335, 0.581),       # [0.335, 0.916]
    }

    # === Main loop: one pass per matching shapefile ===
    for filename in os.listdir(grid_folder):
        if not filename.endswith(rf'city{year}_lst_ratio_grid_480m_bcr_bhv_ndvi_svf_ev_distbp_distmt_distwb_wr_xy.shp'):
            continue

        input_path = os.path.join(grid_folder, filename)
        # Grid size is the first 3-5 digit run followed by "m" in the file name.
        match = re.search(r'(\d{3,5})m', filename)
        grid_size = match.group(1)

        gdf = gpd.read_file(input_path)

        # Fail early with a readable message instead of the opaque KeyError that
        # dropna(subset=...) raises when attribute columns are absent.
        # NOTE(review): a shapefile that loads with only `geometry` may be
        # missing its attribute sidecar file - confirm the upload is complete.
        missing_cols = [col for col in required_cols if col not in gdf.columns]
        if missing_cols:
            raise KeyError(f"{filename} is missing required columns: {missing_cols}")

        # Treat +/-inf as missing, then drop rows lacking any modelled column.
        gdf_clean = gdf.replace([np.inf, -np.inf], np.nan).dropna(subset=required_cols)

        os.makedirs(folder, exist_ok=True)

        for target in target_vars:
            X = gdf_clean[explanatory_vars]
            y = gdf_clean[target]

            # Only the seed at index n is used here; the list form keeps the
            # loop ready for iterating over several seeds.
            for r in [random_seeds[n]]:
                # 80/20 train/test split controlled by the selected seed.
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=r)
                gbdt = GradientBoostingRegressor(random_state=0)
                cv = RepeatedKFold(n_splits=5, n_repeats=20, random_state=0)

                search = RandomizedSearchCV(
                    estimator=gbdt,
                    param_distributions=param_dist,
                    n_iter= 200,
                    scoring='r2',
                    cv=cv,  # repeated k-fold cross validation on the training split
                    verbose=3,
                    n_jobs=1,
                    random_state=0
                )

                search.fit(X_train, y_train)

                # Save a checkpoint first (CSV, append mode). The file name uses
                # the seed *index* n, so re-running the same index appends rows.
                checkpoint_path = os.path.join(folder, f"{year}_{target}_seed{n}_checkpoint.csv")
                write_header = not os.path.exists(checkpoint_path)
                df_cv = pd.DataFrame(search.cv_results_)
                df_cv.to_csv(checkpoint_path, mode='a', header=write_header, index=False)
                print(f"[SAVE] checkpoint -> {checkpoint_path}")

                # Then save the complete CV results as an Excel workbook.
                cv_path = os.path.join(folder, f"{n}_{r}_GBDT_{target}_cv_results.xlsx")
                try:
                    df_cv.to_excel(cv_path, index=False)
                    print(f"[SAVE] CV results -> {cv_path}")
                except Exception as e:
                    # Log which file failed, then re-raise so the run stops.
                    print(f"[ERROR] Save CV results failed: {cv_path} | {e}")
                    raise

                # Evaluate the refit best model on the train and held-out test splits.
                best_model = search.best_estimator_
                y_train_pred = best_model.predict(X_train)
                y_test_pred = best_model.predict(X_test)

                # r2_score on the stored predictions gives the same value as
                # best_model.score(...) without predicting a second time.
                r2_train = r2_score(y_train, y_train_pred)
                r2_test = r2_score(y_test, y_test_pred)

                rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
                rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

                print(f" {filename} | {target} 最佳参数: {search.best_params_} | R²_train={r2_train:.3f} | R²_test={r2_test:.3f}")

                # One row per feature, each carrying the model-level metrics
                # and the winning hyper-parameters.
                for var, importance in zip(explanatory_vars, best_model.feature_importances_):
                    all_results.append({
                        'GridSize': grid_size,
                        'Target': target,
                        'Feature': var,
                        'Random seed':r,
                        'FeatureImportance_TrainModel': round(importance, 4),
                        'Train_R2': round(r2_train, 4),
                        'Train_RMSE': round(rmse_train, 4),
                        'Test_R2': round(r2_test, 4),
                        'Test_RMSE': round(rmse_test, 4),
                        **search.best_params_
                    })

                # Exactly one comparison row per (target, seed). This used to sit
                # inside the feature loop and produced one duplicate per feature.
                r2_comparison.append({
                    'GridSize': grid_size,
                    'Target': target,
                    'Random seed':r,
                    'Train_R2': round(r2_train, 4),
                    'Test_R2': round(r2_test, 4),
                    'Train_RMSE': round(rmse_train, 4),
                    'Test_RMSE': round(rmse_test, 4)
                })

        # Save the accumulated results once all targets have been fitted.
        # NOTE: {target} and {r} in the file names are the values left over from
        # the last loop iteration, although the workbooks hold rows for every
        # target; the names are kept unchanged for compatibility.
        df_all = pd.DataFrame(all_results)
        df_all.to_excel(os.path.join(folder, f'{year} GBDT_Random_Search_Results_{target}_{n}_{r}.xlsx'), index=False)
        df_r2 = pd.DataFrame(r2_comparison)
        df_r2 = df_r2.sort_values(['Target', 'GridSize'])
        df_r2.to_excel(os.path.join(folder, f'{year} R2_Comparison_Train_vs_Test_{target}_{n}_{r}.xlsx'), index=False)

[9394  898 2398 5906 2343 8225 5506 6451 2670 3497 1087 1819 2308 6084
 3724 3184 6387 3728 2702 7883]
19
7883
