In [2]:
import shap
import sklearn
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the dataset and order its rows by the "year" column
df = pd.read_csv("datas.csv").sort_values(by="year")

In [None]:
# Filling in missing data
def fillna_with_rolling_mean(series, window=7):
    """Replace missing entries of ``series`` with a centered rolling mean.

    Parameters
    ----------
    series : pandas.Series
        Column to impute.
    window : int, default 7
        Width of the centered rolling window.

    Returns
    -------
    pandas.Series
        ``series`` with each missing entry replaced by the rolling mean at
        that position; entries that are already present are left unchanged.
    """
    # min_periods=1 lets the mean be computed from a single observation
    local_mean = series.rolling(window=window, min_periods=1, center=True).mean()
    return series.fillna(local_mean)
# Impute every column, then print the number of missing values left per column
df = df.apply(fillna_with_rolling_mean)
print(df.isna().sum())

In [15]:
# Predictor columns used by every model below
features = [
    'FAP', 'SAP', 'GAP', 'CAP', 'UAP',
    'TEM', 'PRE', 'PET', 'PDSI',
    'DSSR', 'ELE', 'SLO', 'AOD',
    'PM10', 'PM2.5', 'POP', 'NLT',
]

# Feature matrix X and regression target y
X, y = df[features], df["kNDVI"]

# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Data Normalization
# The scaler is fitted on the training split only and then reused for the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames so the feature names are kept
X_train_scaler = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaler = pd.DataFrame(X_test_scaled, columns=X_test.columns)

for scaled_frame in (X_train_scaler, X_test_scaler):
    print(scaled_frame.head())

In [None]:
# Calculate R² and Adjusted R²
def r_squared(y_true, y_pred, X=None):
    """Return ``(r2, adjusted_r2)`` for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    X : object with a 2-D ``shape``, optional
        Feature matrix the predictions were made from; only its column
        count is used. When omitted, the column count of the notebook-level
        ``X_train`` is used instead.

    Returns
    -------
    tuple of (float, float)
        R² and adjusted R². The adjusted value is NaN when
        ``n - p - 1 <= 0``, because the correction is undefined there.
    """
    r2 = r2_score(y_true, y_pred)
    n = len(y_true)
    p = (X_train if X is None else X).shape[1]

    # Residual degrees of freedom; the adjustment divides by this quantity
    dof = n - p - 1
    if dof <= 0:
        return r2, float("nan")

    adjusted_r2 = 1 - (1 - r2) * (n - 1) / dof
    return r2, adjusted_r2

# Calculate RMSE
def rmse(y_true, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    return sqrt(mse)

In [None]:
# Linear
# Ordinary least squares fitted on the min-max scaled features
model_lr = LinearRegression()
model_lr.fit(X_train_scaler, y_train)
y_train_pred_lr, y_test_pred_lr = (
    model_lr.predict(split) for split in (X_train_scaler, X_test_scaler)
)

# Evaluation metrics on the training split
r2_train_lr, adj_r2_train_lr = r_squared(y_train, y_train_pred_lr)
mae_train_lr = mean_absolute_error(y_train, y_train_pred_lr)
mse_train_lr = mean_squared_error(y_train, y_train_pred_lr)
rmse_train_lr = rmse(y_train, y_train_pred_lr)

# Evaluation metrics on the test split
r2_test_lr, adj_r2_test_lr = r_squared(y_test, y_test_pred_lr)
mae_test_lr = mean_absolute_error(y_test, y_test_pred_lr)
mse_test_lr = mean_squared_error(y_test, y_test_pred_lr)
rmse_test_lr = rmse(y_test, y_test_pred_lr)

print(
    f"train \nR²: {r2_train_lr}, \nAdjusted R²: {adj_r2_train_lr}, "
    f"\nMAE: {mae_train_lr}, \nMSE: {mse_train_lr}, \nRMSE: {rmse_train_lr}"
)
print(
    f"test \nR²: {r2_test_lr}, \nAdjusted R²: {adj_r2_test_lr}, "
    f"\nMAE: {mae_test_lr}, \nMSE: {mse_test_lr}, \nRMSE: {rmse_test_lr}"
)


In [None]:
# Ridge
# L2-regularised linear regression (alpha=1.0) fitted on the min-max scaled features
model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X_train_scaler, y_train)
y_train_pred_ridge, y_test_pred_ridge = (
    model_ridge.predict(split) for split in (X_train_scaler, X_test_scaler)
)

# Evaluation metrics on the training split
r2_train_ridge, adj_r2_train_ridge = r_squared(y_train, y_train_pred_ridge)
mae_train_ridge = mean_absolute_error(y_train, y_train_pred_ridge)
mse_train_ridge = mean_squared_error(y_train, y_train_pred_ridge)
rmse_train_ridge = rmse(y_train, y_train_pred_ridge)

# Evaluation metrics on the test split
r2_test_ridge, adj_r2_test_ridge = r_squared(y_test, y_test_pred_ridge)
mae_test_ridge = mean_absolute_error(y_test, y_test_pred_ridge)
mse_test_ridge = mean_squared_error(y_test, y_test_pred_ridge)
rmse_test_ridge = rmse(y_test, y_test_pred_ridge)

print(
    f"train \nR²: {r2_train_ridge}, \nAdjusted R²: {adj_r2_train_ridge}, "
    f"\nMAE: {mae_train_ridge}, \nMSE: {mse_train_ridge}, \nRMSE: {rmse_train_ridge}"
)
print(
    f"test \nR²: {r2_test_ridge}, \nAdjusted R²: {adj_r2_test_ridge}, "
    f"\nMAE: {mae_test_ridge}, \nMSE: {mse_test_ridge}, \nRMSE: {rmse_test_ridge}"
)

In [None]:
# SVM
# Support vector regression; hyper-parameters chosen by 5-fold grid search
# on the min-max scaled training features
model_svr = SVR()
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
}
grid_search = GridSearchCV(model_svr, param_grid, cv=5)
grid_search.fit(X_train_scaler, y_train)

# Best estimator found by the search, used for both splits
best_model_svr = grid_search.best_estimator_
y_train_pred_svr, y_test_pred_svr = (
    best_model_svr.predict(split) for split in (X_train_scaler, X_test_scaler)
)

# Evaluation metrics on the training split
r2_train_svr, adj_r2_train_svr = r_squared(y_train, y_train_pred_svr)
mae_train_svr = mean_absolute_error(y_train, y_train_pred_svr)
mse_train_svr = mean_squared_error(y_train, y_train_pred_svr)
rmse_train_svr = rmse(y_train, y_train_pred_svr)

# Evaluation metrics on the test split
r2_test_svr, adj_r2_test_svr = r_squared(y_test, y_test_pred_svr)
mae_test_svr = mean_absolute_error(y_test, y_test_pred_svr)
mse_test_svr = mean_squared_error(y_test, y_test_pred_svr)
rmse_test_svr = rmse(y_test, y_test_pred_svr)

print(
    f"train \nR²: {r2_train_svr}, \nAdjusted R²: {adj_r2_train_svr}, "
    f"\nMAE: {mae_train_svr}, \nMSE: {mse_train_svr}, \nRMSE: {rmse_train_svr}"
)
print(
    f"test \nR²: {r2_test_svr}, \nAdjusted R²: {adj_r2_test_svr}, "
    f"\nMAE: {mae_test_svr}, \nMSE: {mse_test_svr}, \nRMSE: {rmse_test_svr}"
)

In [None]:
# RandomForest
# Random forest regressor tuned by 5-fold grid search on the unscaled features
model_rf = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
}
grid_search = GridSearchCV(model_rf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator found by the search, used for both splits
best_model_rf = grid_search.best_estimator_
y_train_pred_rf, y_test_pred_rf = (
    best_model_rf.predict(split) for split in (X_train, X_test)
)

# Evaluation metrics on the training split
r2_train_rf, adj_r2_train_rf = r_squared(y_train, y_train_pred_rf)
mae_train_rf = mean_absolute_error(y_train, y_train_pred_rf)
mse_train_rf = mean_squared_error(y_train, y_train_pred_rf)
rmse_train_rf = rmse(y_train, y_train_pred_rf)

# Evaluation metrics on the test split
r2_test_rf, adj_r2_test_rf = r_squared(y_test, y_test_pred_rf)
mae_test_rf = mean_absolute_error(y_test, y_test_pred_rf)
mse_test_rf = mean_squared_error(y_test, y_test_pred_rf)
rmse_test_rf = rmse(y_test, y_test_pred_rf)

print(
    f"train \nR²: {r2_train_rf}, \nAdjusted R²: {adj_r2_train_rf}, "
    f"\nMAE: {mae_train_rf}, \nMSE: {mse_train_rf}, \nRMSE: {rmse_train_rf}"
)
print(
    f"test \nR²: {r2_test_rf}, \nAdjusted R²: {adj_r2_test_rf}, "
    f"\nMAE: {mae_test_rf}, \nMSE: {mse_test_rf}, \nRMSE: {rmse_test_rf}"
)

  _data = np.array(data, dtype=dtype, copy=copy,


RandomForest模型效果:
训练集 
R²: 0.8737754235077884, 
Adjusted R²: 0.873511355356131, 
MAE: 0.02496782659974799, 
MSE: 0.0010411529954326244, 
RMSE: 0.03226690247657225
测试集 
R²: 0.857684711244072, 
Adjusted R²: 0.8564858212991509, 
MAE: 0.026873216553710392, 
MSE: 0.0012077742535447536, 
RMSE: 0.03475304668003589


In [None]:
# AdaBoost
# AdaBoost regressor tuned by 5-fold grid search on the unscaled features
model_ab = AdaBoostRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(model_ab, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator found by the search, used for both splits
best_model_ab = grid_search.best_estimator_
y_train_pred_ab = best_model_ab.predict(X_train)
y_test_pred_ab = best_model_ab.predict(X_test)

# Calculating evaluation metrics
# r_squared is called as (y_true, y_pred), the same form used by the
# Linear / Ridge / SVM / RandomForest cells.
r2_train_ab, adj_r2_train_ab = r_squared(y_train, y_train_pred_ab)
r2_test_ab, adj_r2_test_ab = r_squared(y_test, y_test_pred_ab)
mae_train_ab = mean_absolute_error(y_train, y_train_pred_ab)
mae_test_ab = mean_absolute_error(y_test, y_test_pred_ab)
mse_train_ab = mean_squared_error(y_train, y_train_pred_ab)
mse_test_ab = mean_squared_error(y_test, y_test_pred_ab)
rmse_train_ab = rmse(y_train, y_train_pred_ab)
rmse_test_ab = rmse(y_test, y_test_pred_ab)

print(f"train \nR²: {r2_train_ab}, \nAdjusted R²: {adj_r2_train_ab}, \nMAE: {mae_train_ab}, \nMSE: {mse_train_ab}, \nRMSE: {rmse_train_ab}")
print(f"test \nR²: {r2_test_ab}, \nAdjusted R²: {adj_r2_test_ab}, \nMAE: {mae_test_ab}, \nMSE: {mse_test_ab}, \nRMSE: {rmse_test_ab}")

In [None]:
# XGBoost
# XGBoost regressor tuned by 5-fold grid search on the unscaled features
model_xgb = xgb.XGBRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(model_xgb, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator found by the search; reused later by the ensemble and SHAP cells
best_model_xgb = grid_search.best_estimator_
y_train_pred_xgb = best_model_xgb.predict(X_train)
y_test_pred_xgb = best_model_xgb.predict(X_test)

# Calculating evaluation metrics
# r_squared is called as (y_true, y_pred), the same form used by the
# Linear / Ridge / SVM / RandomForest cells.
r2_train_xgb, adj_r2_train_xgb = r_squared(y_train, y_train_pred_xgb)
r2_test_xgb, adj_r2_test_xgb = r_squared(y_test, y_test_pred_xgb)
mae_train_xgb = mean_absolute_error(y_train, y_train_pred_xgb)
mae_test_xgb = mean_absolute_error(y_test, y_test_pred_xgb)
mse_train_xgb = mean_squared_error(y_train, y_train_pred_xgb)
mse_test_xgb = mean_squared_error(y_test, y_test_pred_xgb)
rmse_train_xgb = rmse(y_train, y_train_pred_xgb)
rmse_test_xgb = rmse(y_test, y_test_pred_xgb)

print(f"train \nR²: {r2_train_xgb}, \nAdjusted R²: {adj_r2_train_xgb}, \nMAE: {mae_train_xgb}, \nMSE: {mse_train_xgb}, \nRMSE: {rmse_train_xgb}")
print(f"test \nR²: {r2_test_xgb}, \nAdjusted R²: {adj_r2_test_xgb}, \nMAE: {mae_test_xgb}, \nMSE: {mse_test_xgb}, \nRMSE: {rmse_test_xgb}")

In [None]:
# LightGBM
# LightGBM regressor tuned by 5-fold grid search on the unscaled features
model_lgb = lgb.LGBMRegressor(random_state=42)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(model_lgb, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator found by the search; reused later by the ensemble and SHAP cells
best_model_lgb = grid_search.best_estimator_
y_train_pred_lgb = best_model_lgb.predict(X_train)
y_test_pred_lgb = best_model_lgb.predict(X_test)

# Calculating evaluation metrics
# r_squared is called as (y_true, y_pred), the same form used by the
# Linear / Ridge / SVM / RandomForest cells.
r2_train_lgb, adj_r2_train_lgb = r_squared(y_train, y_train_pred_lgb)
r2_test_lgb, adj_r2_test_lgb = r_squared(y_test, y_test_pred_lgb)
mae_train_lgb = mean_absolute_error(y_train, y_train_pred_lgb)
mae_test_lgb = mean_absolute_error(y_test, y_test_pred_lgb)
mse_train_lgb = mean_squared_error(y_train, y_train_pred_lgb)
mse_test_lgb = mean_squared_error(y_test, y_test_pred_lgb)
rmse_train_lgb = rmse(y_train, y_train_pred_lgb)
rmse_test_lgb = rmse(y_test, y_test_pred_lgb)

print(f"train: \nR²: {r2_train_lgb}, \nAdjusted R²: {adj_r2_train_lgb}, \nMAE: {mae_train_lgb}, \nMSE: {mse_train_lgb}, \nRMSE: {rmse_train_lgb}")
print(f"test: \nR²: {r2_test_lgb}, \nAdjusted R²: {adj_r2_test_lgb}, \nMAE: {mae_test_lgb}, \nMSE: {mse_test_lgb}, \nRMSE: {rmse_test_lgb}")

In [None]:
# CatBoost
# CatBoost regressor tuned by 5-fold grid search on the unscaled features
model_cat = CatBoostRegressor(random_state=42)
param_grid = {
    'iterations': [100, 200, 300],
    'depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(model_cat, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator found by the search; reused later by the ensemble and SHAP cells
best_model_cat = grid_search.best_estimator_
y_train_pred_cat = best_model_cat.predict(X_train)
y_test_pred_cat = best_model_cat.predict(X_test)

# Calculating evaluation metrics
# r_squared is called as (y_true, y_pred), the same form used by the
# Linear / Ridge / SVM / RandomForest cells.
r2_train_cat, adj_r2_train_cat = r_squared(y_train, y_train_pred_cat)
r2_test_cat, adj_r2_test_cat = r_squared(y_test, y_test_pred_cat)
mae_train_cat = mean_absolute_error(y_train, y_train_pred_cat)
mae_test_cat = mean_absolute_error(y_test, y_test_pred_cat)
mse_train_cat = mean_squared_error(y_train, y_train_pred_cat)
mse_test_cat = mean_squared_error(y_test, y_test_pred_cat)
rmse_train_cat = rmse(y_train, y_train_pred_cat)
rmse_test_cat = rmse(y_test, y_test_pred_cat)

print(f"train \nR²: {r2_train_cat}, \nAdjusted R²: {adj_r2_train_cat}, \nMAE: {mae_train_cat}, \nMSE: {mse_train_cat}, \nRMSE: {rmse_train_cat}")
print(f"test \nR²: {r2_test_cat}, \nAdjusted R²: {adj_r2_test_cat}, \nMAE: {mae_test_cat}, \nMSE: {mse_test_cat}, \nRMSE: {rmse_test_cat}")

In [None]:
# Build an ensemble model and predict
# The ensemble prediction is the unweighted mean of the XGBoost, LightGBM and CatBoost predictions.
y_train_pred_avg = (y_train_pred_xgb + y_train_pred_lgb + y_train_pred_cat) / 3
y_test_pred_avg = (y_test_pred_xgb + y_test_pred_lgb + y_test_pred_cat) / 3

# Evaluation metrics on the training split
r2_train_avg = r2_score(y_train, y_train_pred_avg)
mae_train_avg = mean_absolute_error(y_train, y_train_pred_avg)
mse_train_avg = mean_squared_error(y_train, y_train_pred_avg)
rmse_train_avg = sqrt(mse_train_avg)

# Evaluation metrics on the test split
r2_test_avg = r2_score(y_test, y_test_pred_avg)
mae_test_avg = mean_absolute_error(y_test, y_test_pred_avg)
mse_test_avg = mean_squared_error(y_test, y_test_pred_avg)
rmse_test_avg = sqrt(mse_test_avg)

print(
    f"train \nR²: {r2_train_avg}, \nMAE: {mae_train_avg}, "
    f"\nMSE: {mse_train_avg}, \nRMSE: {rmse_train_avg}"
)
print(
    f"test \nR²: {r2_test_avg}, \nMAE: {mae_test_avg}, "
    f"\nMSE: {mse_test_avg}, \nRMSE: {rmse_test_avg}"
)


In [None]:
# Apply the ensemble model to each cluster and evaluate the model performance.
# Functions that handle each cluster
def process_cluster(cluster_num):
    """Evaluate the averaged XGBoost/LightGBM/CatBoost ensemble on one cluster.

    Selects the rows of the notebook-level ``df`` whose ``cluster`` column
    equals ``cluster_num``, predicts ``kNDVI`` with the three tuned models,
    averages the predictions, and prints R², MAE, MSE and RMSE.

    Parameters
    ----------
    cluster_num
        Value of the ``cluster`` column identifying the subset to evaluate.
    """
    in_cluster = df["cluster"] == cluster_num
    X_cluster = df.loc[in_cluster, features]
    y_cluster = df.loc[in_cluster, "kNDVI"]

    # One prediction vector per ensemble member, in the order xgb, lgb, cat
    member_preds = [
        member.predict(X_cluster)
        for member in (best_model_xgb, best_model_lgb, best_model_cat)
    ]

    # Unweighted mean of the three members
    y_pred_avg = (member_preds[0] + member_preds[1] + member_preds[2]) / 3

    # Evaluation metrics for the ensemble prediction
    r2_avg = r2_score(y_cluster, y_pred_avg)
    mae_avg = mean_absolute_error(y_cluster, y_pred_avg)
    mse_avg = mean_squared_error(y_cluster, y_pred_avg)
    rmse_avg = sqrt(mse_avg)

    print(f" ensemble model (cluster {cluster_num}):")
    print(f"R²: {r2_avg}, \nMAE: {mae_avg}, \nMSE: {mse_avg}, \nRMSE: {rmse_avg}")

# Evaluate the ensemble on clusters 1, 2 and 3 in turn
for cluster_num in (1, 2, 3):
    process_cluster(cluster_num)

In [23]:
# Calculating SHAP Value
# Define a general function to process each cluster and calculate the SHAP value
def shap_cluster(model, model_name, cluster_num):
    """Compute SHAP values of ``model`` for one cluster and save them as CSV.

    Rows of the notebook-level ``df`` whose ``cluster`` column equals
    ``cluster_num`` are explained with ``shap.TreeExplainer``. The output
    file holds the cluster's non-feature columns followed by one SHAP
    column per entry of ``features``.

    Parameters
    ----------
    model
        Fitted tree-based model passed to ``shap.TreeExplainer``.
    model_name : str
        Short label used in the output file name.
    cluster_num
        Value of the ``cluster`` column identifying the subset to explain.
    """
    cluster_data = df[df["cluster"] == cluster_num]
    X_cluster = cluster_data[features]

    # Create an Explainer and calculate the SHAP value
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_cluster)

    # Convert SHAP values to DataFrame.
    # The frame must carry the cluster's own index: pd.concat(axis=1) aligns
    # rows by index label, and a fresh 0..n-1 index would not match the labels
    # kept by the filtered cluster rows, yielding misaligned, NaN-padded output.
    shap_values_df = pd.DataFrame(shap_values, columns=features, index=X_cluster.index)
    shap_df = pd.concat([cluster_data.drop(columns=features), shap_values_df], axis=1)

    # Dynamically generate file name and save
    file_path = f'shap_values_{model_name}_cluster_{cluster_num}.csv'
    shap_df.to_csv(file_path, index=False)

# Process each cluster
def process_clusters(models, cluster_nums):
    """Run ``shap_cluster`` for every (model, cluster) combination.

    Parameters
    ----------
    models : dict
        Mapping of model label to fitted model.
    cluster_nums : iterable
        Cluster identifiers to process for each model.
    """
    for label in models:
        fitted_model = models[label]
        for cluster_id in cluster_nums:
            shap_cluster(fitted_model, label, cluster_id)

# Tuned ensemble members, keyed by the label used in the SHAP file names
models = dict(
    xgb=best_model_xgb,
    lgb=best_model_lgb,
    cat=best_model_cat,
)

# Write one SHAP file per model and cluster
process_clusters(models, [1, 2, 3])

# Define the function that calculates and saves the average SHAP value
def process_cluster_avg(cluster_num):
    """Average the per-model SHAP files of one cluster and save the result.

    Reads ``shap_values_<model>_cluster_<cluster_num>.csv`` for every key of
    the notebook-level ``models`` dict, averages the ``features`` columns
    across those files, and writes the averaged values next to the
    non-feature columns of the first file to
    ``shap_values_avg_cluster_<cluster_num>.csv``.

    Parameters
    ----------
    cluster_num
        Cluster identifier used in the input and output file names.
    """
    # Load one SHAP table per model
    per_model_frames = []
    for model_name in models:
        per_model_frames.append(
            pd.read_csv(f'shap_values_{model_name}_cluster_{cluster_num}.csv')
        )

    # Element-wise mean of the SHAP columns across models
    shap_total = sum(frame[features] for frame in per_model_frames)
    avg_shap_values = shap_total / len(per_model_frames)

    # Non-feature columns are taken from the first model's file
    other_columns = per_model_frames[0].drop(columns=features)
    merged_shap = pd.concat([other_columns, avg_shap_values], axis=1)

    # Save the merged data
    merged_shap.to_csv(f'shap_values_avg_cluster_{cluster_num}.csv', index=False)

# Write the model-averaged SHAP file for clusters 1, 2 and 3
for cluster_num in (1, 2, 3):
    process_cluster_avg(cluster_num)