## **TRAIN**

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

# Load the labelled training rows and the rows we want predictions for.
train_data = pd.read_csv('training_data.csv')
df_prediction = pd.read_csv('predicting_data.csv')

# Columns used as model inputs.
FEATURES = ['lat', 'long', 'alt',
       'GDD', 'GDD_cumsum', 'GDD_30d',
       'GDD_60d', 'GDD_120d', 'GDD_rate_change_30_60',
       'GDD_rate_change_60_120', 'TAVG_7d', 'TAVG_30d', 'PRCP_cumsum',
       'PRCP_7d_cumsum', 'PRCP_30d_cumsum', 'PRCP_60d_cumsum',
       'PRCP_120d_cumsum', 'PRCP_dry_days', 'PRCP_rainy_days',
       'Frost_days_30d', 'Heat_days_30d', 'TMAX_fluctuation',
       'Frost_days_365d', 'Heat_days_365d']

# Name of the column the models are trained to predict.
TARGET = "bloom_doy"

# Prepare the data.
X = train_data[FEATURES]
y = train_data[TARGET]

# Split into training and test sets (80% train, 20% test) BEFORE scaling.
# Fitting the scaler on the full dataset would let the test rows influence
# the mean/variance applied to the training rows (data leakage) and make the
# held-out scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=333)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Keep X as the scaled full feature matrix, as it was before this change,
# so any later code that reads X still receives a scaled array.
X = scaler.transform(X)

In [None]:
# Candidate linear models, keyed by the label used in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "ElasticNet": ElasticNet(),
}

# Hyper-parameter grids; a model without an entry is fitted with defaults.
param_grids = {
    # 100 log-spaced values from 1e-3 to 1e3
    "Ridge Regression": dict(alpha=np.logspace(-3, 3, 100)),
    # 100 log-spaced values from 1e-6 to 1e2
    "Lasso Regression": dict(alpha=np.logspace(-6, 2, 100)),
    "ElasticNet": dict(
        alpha=np.logspace(-6, 3, 100),
        # roughly 0.1, 0.3, 0.5, 0.7, 0.9 (step 0.2)
        l1_ratio=np.arange(0.1, 1.0, 0.2),
    ),
}

# Per-model test metrics and the fitted estimator chosen for each model.
results, best_models = {}, {}

for name, model in models.items():
    print(f"Training {name}...")

    search_space = param_grids.get(name)
    if search_space is None:
        # No grid defined: fit the estimator as-is.
        best_model, best_params = model.fit(X_train, y_train), "Default"
    else:
        # 5-fold cross-validated grid search, selecting by (negated) MSE.
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=search_space,
            scoring="neg_mean_squared_error",
            cv=5,
            n_jobs=-1,
        )
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

    # Score the selected estimator on the held-out split.
    y_pred = best_model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

    best_models[name] = best_model
    results[name] = {"MSE": mse, "R²": r2, "Best Params": best_params}

# One row per model, one column per recorded field.
df_results = pd.DataFrame(results).transpose()

## **PREDICT**

In [None]:
# Map each city name (looked up from its station id via city_dict) to the
# prediction rows recorded for that station.
df_prediction_dict = {
    city_dict[station_id]: df_prediction[df_prediction['STATION'] == station_id]
    for station_id in df_prediction['STATION'].unique()
}

In [None]:
# Predict the bloom day-of-year for every city with every fitted model.
# The feature columns come from the module-level FEATURES list (the same list
# the scaler was fitted with), so there is a single source of truth.
for city in df_prediction_dict:
    # Feature selection and scaling depend only on the city, not on the
    # model, so do them once per city instead of once per (city, model).
    X_new = df_prediction_dict[city][FEATURES]
    X_new_scaled = scaler.transform(X_new)

    for model, best_model in best_models.items():
        y_pred = best_model.predict(X_new_scaled)
        print(f'({model}) The predicted bloom doy of {city} is: {y_pred.round()}.')