In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import numpy as np
import joblib
from itertools import product
import time
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, roc_curve, auc
)

In [3]:
df_train = pd.read_csv('train_dataset_bool.csv')
df_valid = pd.read_csv('valid_dataset_bool.csv')
df_test = pd.read_csv('gnn_test_bool.csv')

In [4]:
X_train = df_train.drop(columns=["is_full"])
y_train = df_train["is_full"]
X_valid = df_valid.drop(columns=["is_full"])
y_valid = df_valid["is_full"]
X_test = df_test.drop(columns=["is_full"])
y_test = df_test["is_full"]

In [5]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
}

In [6]:
param_combinations = list(product(param_grid['n_estimators'], param_grid['max_depth']))

best_score = -1
best_model = None
best_params = None
best_metrics = {}

In [7]:
for n_estimators, max_depth in tqdm(param_combinations):
    print(f"\nTraining with n_estimators={n_estimators}, max_depth={max_depth}")
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1
    )
    start_time = time.time()
    model.fit(X_train, y_train)
    #y_pred = model.predict(X_valid)
    y_proba = model.predict_proba(X_valid)[:, 1]
    
    precisions, recalls, thresholds = precision_recall_curve(y_valid, y_proba)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    best_idx = f1_scores.argmax()
    best_threshold = thresholds[best_idx]
    prc_auc = auc(recalls, precisions)

    y_pred = (y_proba >= best_threshold).astype(int)

    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_proba)
    

    #fpr, tpr, thresholds = roc_curve(y_valid, y_proba)
    #youden_j = tpr - fpr
    #best_threshold_index = np.argmax(youden_j)
    #best_threshold = thresholds[best_threshold_index]
    
    end_time = time.time()
    time_elapsed = end_time - start_time
    
    print(f"→ accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}, roc_auc: {roc_auc:.4f}")
    print(f"→ prc_auc: {prc_auc:.4f}, best_threshold: {best_threshold:.4f}, time elapsed:{time_elapsed:.4f}")

    if f1 > best_score:
            best_score = f1
            best_model = model
            best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}
            best_metrics = {
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'ROC-AUC': roc_auc,
                'PRC-AUC': prc_auc,
                'Best Threshold': best_threshold
            }

  0%|          | 0/4 [00:00<?, ?it/s]


Training with n_estimators=100, max_depth=10


 25%|██▌       | 1/4 [03:36<10:48, 216.24s/it]

→ accuracy: 0.8952, precision: 0.3252, recall: 0.7466, f1: 0.4531, roc_auc: 0.9083
→ prc_auc: 0.4013, best_threshold: 0.1284, time elapsed:216.2365

Training with n_estimators=100, max_depth=20


 50%|█████     | 2/4 [08:59<09:18, 279.18s/it]

→ accuracy: 0.9113, precision: 0.3544, recall: 0.6391, f1: 0.4560, roc_auc: 0.9192
→ prc_auc: 0.4016, best_threshold: 0.2407, time elapsed:323.2316

Training with n_estimators=200, max_depth=10


 75%|███████▌  | 3/4 [16:05<05:46, 346.42s/it]

→ accuracy: 0.8993, precision: 0.3321, recall: 0.7231, f1: 0.4551, roc_auc: 0.9089
→ prc_auc: 0.4023, best_threshold: 0.1453, time elapsed:426.4481

Training with n_estimators=200, max_depth=20


100%|██████████| 4/4 [26:38<00:00, 399.69s/it]

→ accuracy: 0.9058, precision: 0.3436, recall: 0.6823, f1: 0.4571, roc_auc: 0.9202
→ prc_auc: 0.4013, best_threshold: 0.2184, time elapsed:632.8296





In [24]:
print("Best parameters (highest f1):", best_metrics["F1 Score"])

Best parameters (highest f1): 0.4570557722690603


In [None]:
# create feature importance series and sort it
feature_importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
sorted_importances = feature_importances.sort_values(ascending=False)


with open("feature_importances_all_data_v6_bool.txt", "w", encoding="utf-8") as f:
    for feature, importance in sorted_importances.items():
        line = f"{feature}: {importance:.6f}"
        print(line)           
        f.write(line + "\n")  

TotalSpaces: 0.338827
lon: 0.126238
lat: 0.125860
half_hour_sin: 0.026278
half_hour_cos: 0.020237
laterHourFee: 0.017789
dew_point_2m: 0.014492
pressure_msl: 0.014311
surface_pressure: 0.014284
day_sin: 0.012727
apparent_temperature: 0.012125
vapour_pressure_deficit: 0.011959
relative_humidity_2m: 0.011463
temperature_80m: 0.011228
district: 0.011145
temperature_120m: 0.011117
temperature_2m: 0.010970
day_cos: 0.010890
wind_speed_10m: 0.010550
wind_speed_120m: 0.010544
firstHourFee: 0.010405
wind_speed_80m: 0.010363
et0_fao_evapotranspiration: 0.010290
wind_gusts_10m: 0.010205
wind_direction_10m_sin: 0.009739
terrestrial_radiation_instant: 0.009731
wind_direction_10m_cos: 0.009389
wind_direction_120m_sin: 0.009211
wind_direction_80m_sin: 0.009186
wind_direction_80m_cos: 0.009130
wind_direction_120m_cos: 0.009086
day_off: 0.006306
cloud_cover: 0.006183
cloud_cover_low: 0.005451
cloud_cover_high: 0.004939
month_val_2: 0.004560
cloud_cover_mid: 0.004415
weekday_num_6: 0.003013
weekday_num

In [None]:
sfm = SelectFromModel(best_model, threshold='mean')  
X_selected = sfm.transform(X_train)
selected_features = X_train.columns[sfm.get_support()]
print("被選中的特徵：", selected_features.tolist())



被選中的特徵： ['TotalSpaces', 'lat', 'lon', 'laterHourFee', 'half_hour_sin', 'half_hour_cos']


In [None]:
# y_test: true value of the test set
# y_pred: model prediction
y_proba = model.predict_proba(X_test)[:, 1]
    
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
prc_auc = auc(recalls, precisions)

y_pred = (y_proba >= best_metrics["Best Threshold"]).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

best_threshold = best_metrics["Best Threshold"]


print(f"accuracy: {accuracy:.4f}")
print(f"precision: {precision:.4f}")
print(f"recall: {recall:.4f}")
print(f"f1: {f1:.4f}")
print(f"roc_auc: {roc_auc:.4f}")
print(f"prc_auc: {prc_auc:.4f}")
print(f"best_threshold: {best_threshold:.4f}")

test_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'ROC-AUC': roc_auc,
    'PRC-AUC': prc_auc,
    'Best Threshold': best_threshold
}

accuracy: 0.9441
precision: 0.4728
recall: 0.5114
f1: 0.4913
roc_auc: 0.9243
prc_auc: 0.4177
best_threshold: 0.2184


In [36]:
with open("performance_all_feature_bool.txt", "w") as f:
    f.write(f"Best Parameters: {best_params}\n")
    f.write(f"\nValid Performance:\n")
    for metric, value in best_metrics.items():
        f.write(f"{metric}: {value:.4f}\n")
    f.write(f"\nTest Performance:\n")
    for metric, value in test_metrics.items():
        f.write(f"{metric}: {value:.4f}\n")

In [None]:
# save model
joblib.dump(best_model, 'random_forest_model_all_feature_v2_bool.pkl')

['random_forest_model_all_feature_v2_bool.pkl']

In [38]:
print(len(df_train))

2079088


In [39]:
print(df_train.columns)

Index(['year_val', 'district', 'TotalSpaces', 'lat', 'lon', 'firstHourFee',
       'laterHourFee', 'precipitation', 'apparent_temperature',
       'relative_humidity_2m', 'temperature_2m', 'cloud_cover',
       'cloud_cover_low', 'cloud_cover_mid', 'cloud_cover_high', 'visibility',
       'wind_speed_10m', 'temperature_80m', 'temperature_120m', 'showers',
       'pressure_msl', 'surface_pressure', 'wind_speed_80m', 'wind_speed_120m',
       'wind_gusts_10m', 'evapotranspiration', 'et0_fao_evapotranspiration',
       'vapour_pressure_deficit', 'dew_point_2m',
       'terrestrial_radiation_instant', 'weekday_num_0', 'weekday_num_1',
       'weekday_num_2', 'weekday_num_3', 'weekday_num_4', 'weekday_num_5',
       'weekday_num_6', 'month_val_1', 'month_val_2', 'month_val_3',
       'month_val_4', 'month_val_5', 'month_val_6', 'month_val_7',
       'month_val_8', 'month_val_9', 'month_val_10', 'month_val_11',
       'month_val_12', 'day_sin', 'day_cos', 'half_hour_sin', 'half_hour_cos',
  