In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import joblib
from itertools import product
import time
from tqdm import tqdm

In [2]:
# Feature-set tag used to build every input/output file name below.
# Tags noted by the author: vote_3, vote_4, elastic_feature
typename = "elastic_feature"
best_model_filename = f"random_forest_model_{typename}_v6.pkl"
best_model_feature_importance = f"feature_importances_{typename}_v6.txt"
# NOTE: the name is misspelled ("perforamnce") but a later cell reads it as-is.
best_model_perforamnce = f"performance_{typename}.txt"

In [3]:
# Load the train / validation / test splits from their CSV files, in that order.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_dataset.csv', 'valid_dataset.csv', 'gnn_test_discrete.csv')
)

In [None]:
# Read the list of feature column names for the current feature set, one name
# per non-blank line. List files noted by the author:
# discrete_elastic_feature.txt
# discrete_vote_3.txt
# discrete_vote_4.txt
with open("discrete_" + typename + ".txt", "r") as f:
    feature_names = [line.strip() for line in f if line.strip()]

# Keep the target next to the features. dict.fromkeys drops repeated names while
# preserving order, so a list file that already contains the target (or repeats
# a feature) cannot produce duplicate column labels -- with duplicates,
# df["avg_available_spots"] would return a DataFrame instead of a Series.
columns_to_keep = list(dict.fromkeys(feature_names + ["avg_available_spots"]))

# filter columns in the DataFrame (same column set for all three splits)
df_train = df_train[columns_to_keep]
df_valid = df_valid[columns_to_keep]
df_test = df_test[columns_to_keep]



In [5]:
def _split_xy(frame, target="avg_available_spots"):
    """Return (features, target): `frame` without the target column, and that column."""
    return frame.drop(columns=[target]), frame[target]


X_train, y_train = _split_xy(df_train)
X_valid, y_valid = _split_xy(df_valid)
X_test, y_test = _split_xy(df_test)

In [6]:
# Hyperparameter values explored by the grid search below.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
)

In [7]:
# Every (n_estimators, max_depth) pair, n_estimators varying slowest
# (the same order itertools.product would yield).
param_combinations = [
    (n_trees, depth)
    for n_trees in param_grid['n_estimators']
    for depth in param_grid['max_depth']
]

# Best-so-far trackers, updated by the grid-search loop.
best_model, best_mse, best_result = None, float('inf'), None

In [None]:
# calculate adjusted R²
def adjusted_r2(r2, n_samples, n_features):
    """Return R² adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n_samples : int
        Number of observations the score was computed on.
    n_features : int
        Number of predictor columns.

    Raises
    ------
    ValueError
        If ``n_samples - n_features - 1`` is not positive. The adjustment is
        undefined there: the formula would divide by zero or silently return a
        meaningless value.
    """
    dof = n_samples - n_features - 1
    if dof <= 0:
        raise ValueError(
            "adjusted R² needs n_samples > n_features + 1 "
            f"(got n_samples={n_samples}, n_features={n_features})"
        )
    return 1 - (1 - r2) * ((n_samples - 1) / dof)

# Rows and columns of the validation feature matrix, used for adjusted R².
n_valid_samples, n_features = X_valid.shape

In [9]:
# Display the column labels of the filtered training frame (features + target).
df_train.columns

Index(['TotalSpaces', 'lon', 'district', 'half_hour_sin', 'day_off',
       'half_hour_cos', 'terrestrial_radiation_instant', 'lat', 'firstHourFee',
       'laterHourFee', 'et0_fao_evapotranspiration', 'relative_humidity_2m',
       'weekday_num_5', 'dew_point_2m', 'weekday_num_6', 'surface_pressure',
       'pressure_msl', 'wind_speed_10m', 'avg_available_spots'],
      dtype='object')

In [10]:
# Preview the first five rows of the filtered training frame.
df_train.head(5)

Unnamed: 0,TotalSpaces,lon,district,half_hour_sin,day_off,half_hour_cos,terrestrial_radiation_instant,lat,firstHourFee,laterHourFee,et0_fao_evapotranspiration,relative_humidity_2m,weekday_num_5,dew_point_2m,weekday_num_6,surface_pressure,pressure_msl,wind_speed_10m,avg_available_spots
0,3,120.19519,1,-0.258819,1,0.965926,0.0,22.9948,0,0,0.01,78,1,25.2,0,1009.5,1010.1,5.8,1.0
1,3,120.19519,1,-0.866025,0,0.5,0.0,22.9948,0,0,0.03,94,0,25.8,0,984.9,985.5,45.1,1.0
2,3,120.19519,1,0.707107,0,-0.707107,967.1,22.9948,20,20,0.15,82,0,25.0,0,988.5,989.1,45.3,1.0
3,3,120.19519,1,0.92388,0,0.382683,0.0,22.9948,0,0,0.04,73,0,21.5,0,1009.8,1010.4,10.1,2.0
4,3,120.19519,1,0.608761,0,0.793353,0.0,22.9948,0,0,0.02,73,0,22.2,0,1006.2,1006.8,7.1,1.0


In [11]:
# Grid search: fit one forest per (n_estimators, max_depth) pair, score it on the
# validation split, and remember the candidate with the lowest validation MSE.
for n_estimators, max_depth in tqdm(param_combinations):
    print(f"\nTraining with n_estimators={n_estimators}, max_depth={max_depth}")

    forest_settings = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'random_state': 42,   # fixed seed so every candidate is reproducible
        'n_jobs': -1,
    }
    model = RandomForestRegressor(**forest_settings)

    # The timed window covers fitting, validation prediction and metric computation.
    start_time = time.time()
    y_pred = model.fit(X_train, y_train).predict(X_valid)  # fit() returns the estimator

    mse = mean_squared_error(y_valid, y_pred)
    r2 = r2_score(y_valid, y_pred)
    rmse = np.sqrt(mse)
    adj_r2 = adjusted_r2(r2, n_valid_samples, n_features)
    end_time = time.time()
    time_elapsed = end_time - start_time

    print(f"→ MSE: {mse:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}, Adjusted R²: {adj_r2:.4f}, time elapsed:{time_elapsed:.4f}")

    # Only a strictly lower MSE replaces the current best; written as a negated
    # "<" so a NaN score is skipped exactly as a plain `if mse < best_mse` would.
    if not mse < best_mse:
        continue

    best_mse, best_model = mse, model
    best_result = dict(zip(
        ('n_estimators', 'max_depth', 'mse', 'rmse', 'r2', 'Adjusted r2'),
        (n_estimators, max_depth, mse, rmse, r2, adj_r2),
    ))

  0%|          | 0/4 [00:00<?, ?it/s]


Training with n_estimators=100, max_depth=10


 25%|██▌       | 1/4 [06:44<20:12, 404.18s/it]

→ MSE: 213.2714, RMSE: 14.6038, R²: 0.9413, Adjusted R²: 0.9413, time elapsed:404.1793

Training with n_estimators=100, max_depth=20


 50%|█████     | 2/4 [17:16<17:57, 538.53s/it]

→ MSE: 222.2438, RMSE: 14.9078, R²: 0.9388, Adjusted R²: 0.9388, time elapsed:632.5791

Training with n_estimators=200, max_depth=10


 75%|███████▌  | 3/4 [30:46<11:02, 662.28s/it]

→ MSE: 215.1088, RMSE: 14.6666, R²: 0.9408, Adjusted R²: 0.9408, time elapsed:809.2876

Training with n_estimators=200, max_depth=20


100%|██████████| 4/4 [51:18<00:00, 769.73s/it]

→ MSE: 222.8368, RMSE: 14.9277, R²: 0.9387, Adjusted R²: 0.9387, time elapsed:1232.5873





In [12]:
# Report the hyperparameters and validation metrics of the lowest-MSE candidate.
print("Best parameters (lowest MSE):", best_result)

Best parameters (lowest MSE): {'n_estimators': 100, 'max_depth': 10, 'mse': 213.2714055025146, 'rmse': np.float64(14.603814758566154), 'r2': 0.9413143904108454, 'Adjusted r2': 0.9413118266444597}


In [None]:
# Rank the best model's feature importances, highest first.
feature_importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
sorted_importances = feature_importances.sort_values(ascending=False)

# One "name: importance" line per feature, six decimals.
importance_lines = [
    f"{feature}: {importance:.6f}"
    for feature, importance in sorted_importances.items()
]

# Echo each line to the notebook and write it to the importance file.
with open(best_model_feature_importance, "w", encoding="utf-8") as f:
    for line in importance_lines:
        print(line)
        f.write(f"{line}\n")

TotalSpaces: 0.891228
day_off: 0.050018
half_hour_sin: 0.018497
lon: 0.010246
half_hour_cos: 0.006895
weekday_num_5: 0.003081
weekday_num_6: 0.002616
firstHourFee: 0.002365
relative_humidity_2m: 0.002337
laterHourFee: 0.002252
et0_fao_evapotranspiration: 0.001860
pressure_msl: 0.001768
dew_point_2m: 0.001722
surface_pressure: 0.001632
wind_speed_10m: 0.001486
lat: 0.000815
terrestrial_radiation_instant: 0.000676
district: 0.000505


In [None]:
# Select the features whose importance exceeds the mean importance.
# best_model is already fitted, so it is declared as prefit: this is the
# documented way to call transform() without fitting the selector, and it does
# not rely on version-specific tolerance of an unfitted SelectFromModel.
sfm = SelectFromModel(best_model, threshold='mean', prefit=True)
X_selected = sfm.transform(X_train)
selected_features = X_train.columns[sfm.get_support()]
# The printed label is Chinese for "Selected features:".
print("被選中的特徵：", selected_features.tolist())



被選中的特徵： ['TotalSpaces']


In [None]:
# Evaluate the selected model on the test split.
# y_test: true values of the test set
# y_pred: model predictions for X_test
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

n = len(y_test)         # number of samples
p = X_test.shape[1]     # number of features
# Call the adjusted_r2() helper and keep the result under its own name.
# Assigning a float to `adjusted_r2` would rebind the helper, and re-running the
# training loop would then fail with "'float' object is not callable".
test_adj_r2 = adjusted_r2(r2, n, p)

print(f"R² score: {r2:.4f}, Adjusted R^2:{test_adj_r2:.4f}")
# calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"MAE (Mean Absolute Error): {mae:.4f}")
print(f"RMSE (Root Mean Squared Error): {rmse:.4f}")

R² score: 0.9700, Adjusted R^2:0.9700
MAE (Mean Absolute Error): 3.8846
RMSE (Root Mean Squared Error): 10.7512


In [None]:


# Save the best model to best_model_filename with joblib; as the cell's last
# expression, the return value of joblib.dump is displayed.
joblib.dump(best_model, best_model_filename)

['random_forest_model_elastic_feature_v6.pkl']

In [17]:
# Write the best candidate's hyperparameters and validation metrics, one
# "name: value" line each. best_result holds the keys n_estimators, max_depth,
# mse, rmse, r2 and 'Adjusted r2'.
with open(best_model_perforamnce, "w", encoding="utf-8") as f:
    for metric, value in best_result.items():
        if isinstance(value, (int, np.integer)):
            # Integer hyperparameters are written verbatim ("100", not "100.0000").
            f.write(f"{metric}: {value}\n")
        else:
            f.write(f"{metric}: {value:.4f}\n")

In [18]:
# Number of rows in the training frame.
print(len(df_train))

2079088


In [19]:
# Print the training frame's column labels (same information as the earlier
# `df_train.columns` cell).
print(df_train.columns)

Index(['TotalSpaces', 'lon', 'district', 'half_hour_sin', 'day_off',
       'half_hour_cos', 'terrestrial_radiation_instant', 'lat', 'firstHourFee',
       'laterHourFee', 'et0_fao_evapotranspiration', 'relative_humidity_2m',
       'weekday_num_5', 'dew_point_2m', 'weekday_num_6', 'surface_pressure',
       'pressure_msl', 'wind_speed_10m', 'avg_available_spots'],
      dtype='object')
