In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import joblib
from itertools import product
import time
from tqdm import tqdm

In [2]:
df_train = pd.read_csv('train_dataset.csv')
df_valid = pd.read_csv('valid_dataset.csv')
df_test = pd.read_csv('test_dataset.csv')

In [3]:
X_train = df_train.drop(columns=["avg_available_spots"])
y_train = df_train["avg_available_spots"]
X_valid = df_valid.drop(columns=["avg_available_spots"])
y_valid = df_valid["avg_available_spots"]
X_test = df_test.drop(columns=["avg_available_spots"])
y_test = df_test["avg_available_spots"]

In [4]:
model = RandomForestRegressor(random_state=42)

In [5]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
}

In [6]:
param_combinations = list(product(param_grid['n_estimators'], param_grid['max_depth']))

best_model = None
best_mse = float('inf')
best_result = None

In [None]:
# calculate ajusted R²
def adjusted_r2(r2, n_samples, n_features):
    return 1 - (1 - r2) * ((n_samples - 1) / (n_samples - n_features - 1))

n_valid_samples = X_valid.shape[0]
n_features = X_valid.shape[1]

In [8]:
for n_estimators, max_depth in tqdm(param_combinations):
    print(f"\nTraining with n_estimators={n_estimators}, max_depth={max_depth}")
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1
    )
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    mse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mse)
    adj_r2 = adjusted_r2(r2, n_valid_samples, n_features)
    end_time = time.time()
    time_elapsed = end_time - start_time
    
    print(f"→ MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}, Adjusted R²: {adj_r2:.4f}, time elapsed:{time_elapsed:.4f}")

    if mse < best_mse:
        best_mse = mse
        best_model = model
        best_result = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'mse': mse,
            'rmse': rmse,
            'r2': r2,
        }

  0%|          | 0/4 [00:00<?, ?it/s]


Training with n_estimators=100, max_depth=10


 25%|██▌       | 1/4 [18:55<56:47, 1135.69s/it]

→ MSE: 249.2706, RMSE: 15.7883, R²: 0.9314, Adjusted R²: 0.9314, time elapsed:1135.6865

Training with n_estimators=100, max_depth=20


 50%|█████     | 2/4 [49:48<51:54, 1557.39s/it]

→ MSE: 250.6828, RMSE: 15.8330, R²: 0.9310, Adjusted R²: 0.9310, time elapsed:1852.5415

Training with n_estimators=200, max_depth=10


 75%|███████▌  | 3/4 [1:25:48<30:32, 1832.77s/it]

→ MSE: 249.8893, RMSE: 15.8079, R²: 0.9312, Adjusted R²: 0.9312, time elapsed:2160.2114

Training with n_estimators=200, max_depth=20


100%|██████████| 4/4 [2:26:36<00:00, 2199.17s/it]

→ MSE: 252.4147, RMSE: 15.8876, R²: 0.9305, Adjusted R²: 0.9305, time elapsed:3647.9366





In [9]:
print("Best parameters (lowest MSE):", best_result)

Best parameters (lowest MSE): {'n_estimators': 100, 'max_depth': 10, 'mse': 249.27064355725577, 'rmse': np.float64(15.78830717832839), 'r2': 0.9314085278550577}


In [None]:
# create feature importance series and sort it
feature_importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
sorted_importances = feature_importances.sort_values(ascending=False)

with open("feature_importances_all_data_v6.txt", "w", encoding="utf-8") as f:
    for feature, importance in sorted_importances.items():
        line = f"{feature}: {importance:.6f}"
        print(line)           
        f.write(line + "\n")  

TotalSpaces: 0.887314
day_off: 0.049817
half_hour_sin: 0.018329
lon: 0.010231
half_hour_cos: 0.006784
weekday_num_4: 0.002670
firstHourFee: 0.002420
laterHourFee: 0.002232
weekday_num_5: 0.002147
weekday_num_6: 0.002081
relative_humidity_2m: 0.001662
day_sin: 0.001593
month_val_4: 0.001229
et0_fao_evapotranspiration: 0.001127
month_val_2: 0.001030
day_cos: 0.000964
lat: 0.000804
surface_pressure: 0.000585
pressure_msl: 0.000583
weekday_num_0: 0.000542
vapour_pressure_deficit: 0.000452
district: 0.000438
dew_point_2m: 0.000360
terrestrial_radiation_instant: 0.000310
weekday_num_1: 0.000278
month_val_10: 0.000276
apparent_temperature: 0.000270
year_val: 0.000257
wind_gusts_10m: 0.000243
temperature_2m: 0.000239
month_val_5: 0.000236
wind_direction_10m_cos: 0.000202
month_val_7: 0.000179
temperature_120m: 0.000161
cloud_cover: 0.000150
cloud_cover_mid: 0.000137
wind_speed_10m: 0.000135
wind_direction_10m_sin: 0.000131
temperature_80m: 0.000131
cloud_cover_high: 0.000129
wind_direction_120

In [None]:
sfm = SelectFromModel(best_model, threshold='mean')  
X_selected = sfm.transform(X_train)
selected_features = X_train.columns[sfm.get_support()]
print("被選中的特徵：", selected_features.tolist())



被選中的特徵： ['TotalSpaces', 'half_hour_sin', 'day_off']


In [None]:
# y_test: true value of the test set
# y_pred: model prediction
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

n = len(y_test)         # number of samples
p = X_test.shape[1]     # number of features
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"R² score: {r2:.4f}, Adjusted R^2:{adjusted_r2:.4f}")
# calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE (Mean Absolute Error): {mae:.4f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")

R² score: 0.9765, Adjusted R^2:0.9765
MAE (Mean Absolute Error): 3.6189
RMSE (Root Mean Squared Error): 9.3546


In [None]:


# save model
joblib.dump(best_model, 'random_forest_model_all_feature_v6.pkl')

['random_forest_model_all_feature_v6.pkl']

In [14]:
print(len(df_train))

2079088


In [15]:
print(df_train.columns)

Index(['year_val', 'district', 'avg_available_spots', 'TotalSpaces', 'lat',
       'lon', 'firstHourFee', 'laterHourFee', 'precipitation',
       'apparent_temperature', 'relative_humidity_2m', 'temperature_2m',
       'cloud_cover', 'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high',
       'visibility', 'wind_speed_10m', 'temperature_80m', 'temperature_120m',
       'showers', 'pressure_msl', 'surface_pressure', 'wind_speed_80m',
       'wind_speed_120m', 'wind_gusts_10m', 'evapotranspiration',
       'et0_fao_evapotranspiration', 'vapour_pressure_deficit', 'dew_point_2m',
       'terrestrial_radiation_instant', 'weekday_num_0', 'weekday_num_1',
       'weekday_num_2', 'weekday_num_3', 'weekday_num_4', 'weekday_num_5',
       'weekday_num_6', 'month_val_1', 'month_val_2', 'month_val_3',
       'month_val_4', 'month_val_5', 'month_val_6', 'month_val_7',
       'month_val_8', 'month_val_9', 'month_val_10', 'month_val_11',
       'month_val_12', 'day_sin', 'day_cos', 'half_hour_si