In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import numpy as np
import joblib
from itertools import product
import time
from tqdm import tqdm
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, precision_recall_curve, roc_curve, auc
)

In [2]:
#elastic_feature
#smote
typename = "smote"
feature_filename = "bool_elastic_feature.txt"
best_model_filename = 'random_forest_model_' + typename + '_v6_bool.pkl'
best_model_feature_importance = "feature_importances_" + typename + "_v6_bool.txt"
best_model_performance = "performance_" + typename + "_bool.txt"

In [3]:
if typename == "elastic_feature":
    df_train = pd.read_csv('train_dataset_bool.csv')
elif typename == "smote":
    df_train = pd.read_csv('smote_data_raw.csv')
df_valid = pd.read_csv('valid_dataset_bool.csv')
df_test = pd.read_csv('gnn_test_bool.csv')

In [None]:
if typename == "elastic_feature":
    with open(feature_filename, "r") as f:
        columns_to_keep = [line.strip() for line in f if line.strip()]

    columns_to_keep.append("is_full")

    # filter columns in the DataFrame
    df_train = df_train[columns_to_keep]
    df_valid = df_valid[columns_to_keep]
    df_test = df_test[columns_to_keep]
elif typename == "smote":
    with open(feature_filename, "r") as f:
        columns_to_keep = [line.strip() for line in f if line.strip()]

    columns_to_keep.append("is_full")

    # filter columns in the DataFrame
    df_valid = df_valid[columns_to_keep]
    df_test = df_test[columns_to_keep]

In [5]:
print(df_train.columns)

Index(['laterHourFee', 'firstHourFee', 'day_off', 'district', 'half_hour_cos',
       'lat', 'year_val', 'month_val_2', 'TotalSpaces', 'month_val_12', 'lon',
       'terrestrial_radiation_instant', 'dew_point_2m', 'weekday_num_2',
       'day_sin', 'weekday_num_1', 'half_hour_sin', 'month_val_5',
       'cloud_cover_mid', 'weekday_num_0', 'pressure_msl',
       'vapour_pressure_deficit', 'month_val_11', 'wind_speed_10m',
       'cloud_cover', 'precipitation', 'surface_pressure', 'is_full'],
      dtype='object')


In [6]:
df_train.head(5)

Unnamed: 0,laterHourFee,firstHourFee,day_off,district,half_hour_cos,lat,year_val,month_val_2,TotalSpaces,month_val_12,...,cloud_cover_mid,weekday_num_0,pressure_msl,vapour_pressure_deficit,month_val_11,wind_speed_10m,cloud_cover,precipitation,surface_pressure,is_full
0,0.0,0.0,True,False,0.965926,22.9948,2024,False,3.0,False,...,0.0,False,1010.1,0.9,False,5.8,15.0,0.8,1009.5,False
1,0.0,0.0,False,False,0.5,22.9948,2024,False,3.0,False,...,100.0,False,985.5,0.21,False,45.1,100.0,15.2,984.9,False
2,20.0,20.0,False,False,-0.707107,22.9948,2024,False,3.0,False,...,100.0,False,989.1,0.7,False,45.3,100.0,0.8,988.5,False
3,0.0,0.0,False,False,0.382683,22.9948,2024,False,3.0,False,...,90.0,False,1010.4,0.95,False,10.1,99.0,0.0,1009.8,False
4,0.0,0.0,False,False,0.793353,22.9948,2024,False,3.0,False,...,5.0,True,1006.8,0.99,False,7.1,100.0,0.0,1006.2,False


In [7]:
X_train = df_train.drop(columns=["is_full"])
y_train = df_train["is_full"]
X_valid = df_valid.drop(columns=["is_full"])
y_valid = df_valid["is_full"]
X_test = df_test.drop(columns=["is_full"])
y_test = df_test["is_full"]

In [8]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
}

In [9]:
param_combinations = list(product(param_grid['n_estimators'], param_grid['max_depth']))

best_score = -1
best_model = None
best_params = None
best_metrics = {}

In [10]:
print(df_train.columns)
print(df_valid.columns)

Index(['laterHourFee', 'firstHourFee', 'day_off', 'district', 'half_hour_cos',
       'lat', 'year_val', 'month_val_2', 'TotalSpaces', 'month_val_12', 'lon',
       'terrestrial_radiation_instant', 'dew_point_2m', 'weekday_num_2',
       'day_sin', 'weekday_num_1', 'half_hour_sin', 'month_val_5',
       'cloud_cover_mid', 'weekday_num_0', 'pressure_msl',
       'vapour_pressure_deficit', 'month_val_11', 'wind_speed_10m',
       'cloud_cover', 'precipitation', 'surface_pressure', 'is_full'],
      dtype='object')
Index(['laterHourFee', 'firstHourFee', 'lon', 'day_off', 'half_hour_cos',
       'lat', 'month_val_2', 'TotalSpaces', 'month_val_12',
       'terrestrial_radiation_instant', 'dew_point_2m', 'weekday_num_2',
       'day_sin', 'year_val', 'half_hour_sin', 'cloud_cover_mid',
       'weekday_num_1', 'pressure_msl', 'vapour_pressure_deficit',
       'weekday_num_0', 'wind_speed_10m', 'cloud_cover', 'month_val_5',
       'precipitation', 'surface_pressure', 'is_full'],
      dtype='o

In [None]:
for n_estimators, max_depth in tqdm(param_combinations):
    print(f"\nTraining with n_estimators={n_estimators}, max_depth={max_depth}")
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42,
        n_jobs=-1
    )
    start_time = time.time()
    model.fit(X_train, y_train)

    y_proba = model.predict_proba(X_valid)[:, 1]
    
    precisions, recalls, thresholds = precision_recall_curve(y_valid, y_proba)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    best_idx = f1_scores.argmax()
    best_threshold = thresholds[best_idx]
    prc_auc = auc(recalls, precisions)

    y_pred = (y_proba >= best_threshold).astype(int)

    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred)
    recall = recall_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred)
    roc_auc = roc_auc_score(y_valid, y_proba)
    
    end_time = time.time()
    time_elapsed = end_time - start_time
    
    print(f"→ accuracy: {accuracy:.4f}, precision: {precision:.4f}, recall: {recall:.4f}, f1: {f1:.4f}, roc_auc: {roc_auc:.4f}")
    print(f"→ prc_auc: {prc_auc:.4f}, best_threshold: {best_threshold:.4f}, time elapsed:{time_elapsed:.4f}")

    if f1 > best_score:
            best_score = f1
            best_model = model
            best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}
            best_metrics = {
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1 Score': f1,
                'ROC-AUC': roc_auc,
                'PRC-AUC': prc_auc,
                'Best Threshold': best_threshold
            }

In [None]:
print("Best parameters (highest f1):", best_metrics["F1 Score"])

In [None]:
# create feature importance series and sort it
feature_importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
sorted_importances = feature_importances.sort_values(ascending=False)


with open(best_model_feature_importance, "w", encoding="utf-8") as f:
    for feature, importance in sorted_importances.items():
        line = f"{feature}: {importance:.6f}"
        print(line)           
        f.write(line + "\n")  

In [None]:
sfm = SelectFromModel(best_model, threshold='mean')  
X_selected = sfm.transform(X_train)
selected_features = X_train.columns[sfm.get_support()]
print("被選中的特徵：", selected_features.tolist())

In [None]:
# y_test: true value of the test set
# y_pred: model prediction
y_proba = model.predict_proba(X_test)[:, 1]
    
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
prc_auc = auc(recalls, precisions)

y_pred = (y_proba >= best_metrics["Best Threshold"]).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

best_threshold = best_metrics["Best Threshold"]

print(f"accuracy: {accuracy:.4f}")
print(f"precision: {precision:.4f}")
print(f"recall: {recall:.4f}")
print(f"f1: {f1:.4f}")
print(f"roc_auc: {roc_auc:.4f}")
print(f"prc_auc: {prc_auc:.4f}")
print(f"best_threshold: {best_threshold:.4f}")

test_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'ROC-AUC': roc_auc,
    'PRC-AUC': prc_auc,
    'Best Threshold': best_threshold
}

In [None]:
with open(best_model_performance, "w") as f:
    f.write(f"Best Parameters: {best_params}\n")
    f.write(f"\nValid Performance:\n")
    for metric, value in best_metrics.items():
        f.write(f"{metric}: {value:.4f}\n")
    f.write(f"\nTest Performance:\n")
    for metric, value in test_metrics.items():
        f.write(f"{metric}: {value:.4f}\n")

In [None]:
# save model
joblib.dump(best_model, best_model_filename)

In [None]:
print(len(df_train))

In [None]:
print(df_train.columns)