# Goal: Predict the likelihood of accidents on different types of roads.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error

from autogluon.tabular import TabularPredictor
from autogluon.common import space as ag

import matplotlib.pyplot as plt
plt.figure(figsize=(8, 6))

  from .autonotebook import tqdm as notebook_tqdm


<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

# Feature Engineering

In [2]:
def feature_engineering(df):
    df = df.copy()

    # --- 衍生邏輯特徵 ---
    df["is_highway"] = (df["road_type"] == "highway")
    df["is_dark"] = df["lighting"].isin(["dim", "night"])
    df["is_peak_hour"] = df["time_of_day"].isin(["morning", "evening"])
    df["is_bad_weather"] = df["weather"].isin(["rainy", "foggy"]) # new

    # --- 交互與比值特徵 ---
    df["lane_density"] = df["num_reported_accidents"] / (df["num_lanes"] + 1e-5) # 1e-5 避免 num_lanes 為 0 的情況下進行除法

    # --- 分段速度類別 ---
    df["speed_zone"] = pd.cut(
        df["speed_limit"],
        bins=[0, 30, 50, 70, 100],
        labels=["low", "medium", "high", "extreme"]
    )

    return df

# Data Preprocessing

In [3]:
cat_cols = [
    "road_type",
    "lighting",
    "weather",
    "time_of_day",
    "speed_zone"
]

num_cols = [
    "num_lanes",
    "curvature",
    "num_reported_accidents",
    "lane_density",
    "speed_limit"
]

bool_cols = [
    "road_signs_present",
    "public_road",
    "holiday",
    "school_season",
    "is_highway",
    "is_dark",
    "is_peak_hour",
    "is_bad_weather"
]

In [4]:
def preprocess_data(df, cat_cols, num_cols, bool_cols, label):
    print(f"初始特徵數量：{df.shape[1]}")
    # Feature Engineering
    df = feature_engineering(df)
    print(f"[Feature Engineering] 特徵數量：{df.shape[1]}")

    # Select features and target
    print("Drop 'ID' and 'accident_risk'")
    drop_cols = ["id"] 
    X = df.drop(columns=drop_cols + [label])
    y = df[label]

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature Selection - Variance Threshold
    vt = VarianceThreshold(threshold=0.001)   # if variance is less than 0.01%, remove it
    X_num_bool = X_train[num_cols + bool_cols]
    vt.fit(X_num_bool)
    kept_cols = X_num_bool.columns[vt.get_support()].tolist()

    print(f"[num_cols+bool_cols] 低變異移除後特徵數量：{len(kept_cols)} / {len(X_num_bool.columns)}")
    print(f"被移除的特徵：{[c for c in X_num_bool.columns if c not in kept_cols]}")

    # Update num_cols and bool_cols based on Variance Threshold
    num_cols = [c for c in num_cols if c in kept_cols]
    bool_cols = [c for c in bool_cols if c in kept_cols]
    
    # 過濾X_train與X_test中的低變異欄位
    keep_all_cols = list(set(cat_cols + num_cols + bool_cols))
    X_train = X_train[keep_all_cols].copy()
    X_test = X_test[keep_all_cols].copy()

    print(f"[Preprocessing 完成] 最終特徵數量：{X_train.shape[1]}")
    return X_train, X_test, y_train, y_test

# Model Training with AutoGluon

In [5]:
train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.3
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56


In [6]:
label = "accident_risk"

In [7]:
X_train, X_test, y_train, y_test = preprocess_data(train_df, cat_cols, num_cols, bool_cols, label)

初始特徵數量：14
[Feature Engineering] 特徵數量：20
Drop 'ID' and 'accident_risk'
[num_cols+bool_cols] 低變異移除後特徵數量：13 / 13
被移除的特徵：[]
[Preprocessing 完成] 最終特徵數量：18


In [8]:
X_train.head()

Unnamed: 0,is_bad_weather,road_type,school_season,speed_zone,num_lanes,lane_density,is_highway,curvature,weather,is_dark,is_peak_hour,time_of_day,public_road,num_reported_accidents,holiday,road_signs_present,lighting,speed_limit
143159,False,rural,True,high,2,0.999995,False,0.43,clear,True,False,afternoon,False,2,True,False,dim,60
20172,False,highway,False,low,4,0.249999,True,0.18,clear,True,False,afternoon,False,1,True,False,night,25
57926,False,urban,True,high,1,0.99999,False,0.2,clear,True,True,morning,True,1,True,False,night,70
193319,True,urban,True,high,3,0.333332,False,0.81,foggy,False,True,evening,False,1,True,True,daylight,60
213938,True,rural,True,medium,3,0.666664,False,0.43,rainy,True,True,morning,True,2,True,False,night,35


In [9]:
X_train[label] = y_train.values

In [None]:
eval_metric = "rmse"
time_limit = 60 * 45

hyperparameters = {
    "GBM": {
        "learning_rate": ag.Real(0.01, 0.1, log=True),
        "num_leaves": ag.Int(31, 255),
        "n_estimators": ag.Int(2000, 5000),
        "max_depth": ag.Int(3, 12),
        "random_state": 42
    },
    "XGB": {
        "learning_rate": ag.Real(0.01, 0.1, log=True),
        "max_depth": ag.Int(3, 12),
        "subsample": ag.Real(0.7, 1.0),
        "colsample_bytree": ag.Real(0.6, 1.0),
        "random_state": 42
    },
    "CAT": {
        "iterations": ag.Int(2000, 5000),
        "depth": ag.Int(4, 10),
        "learning_rate": ag.Real(0.01, 0.1, log=True),
        "random_state": 42
    },
    "NN_TORCH": {
        "num_layers": ag.Int(2, 6),
        "hidden_size": ag.Int(256, 1024),
        "dropout_prob": ag.Real(0.1, 0.5),
        "learning_rate": ag.Real(0.01, 0.1, log=True),
        "random_state": 42
    },
}

presets = "best_quality"

predictor = TabularPredictor(
    label=label,
    eval_metric=eval_metric,
    problem_type="regression"
).fit(
    train_data=X_train,
    time_limit=time_limit,
    hyperparameters=hyperparameters,
    presets=presets,
    num_bag_folds=5,
    num_stack_levels=1,
    hyperparameter_tune_kwargs="auto",
    refit_full=True
)


In [None]:
predictor.leaderboard()

In [None]:
predictor.fit_summary().get("model_paths")

In [None]:
predictor.feature_importance(X_train)

In [None]:
y_val_pred = predictor.predict(X_test)
rmse_val = np.sqrt(mean_squared_error(y_test, round(y_val_pred, 2)))
print(f"Validation RMSE (AutoGluon best model): {rmse_val:.5f}")

# Inference

In [10]:
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,517754,highway,2,0.34,45,night,clear,True,True,afternoon,True,True,1
1,517755,urban,3,0.04,45,dim,foggy,True,False,afternoon,True,False,0
2,517756,urban,2,0.59,35,dim,clear,True,False,afternoon,True,True,1
3,517757,rural,4,0.95,35,daylight,rainy,False,False,afternoon,False,False,2
4,517758,highway,2,0.86,35,daylight,clear,True,False,evening,False,True,3


In [None]:
X_test_final = feature_engineering(test_df)
y_test_pred = predictor.predict(X_test_final)

In [None]:
submission = pd.DataFrame({
    "id": test_df["id"],
    "accident_risk": y_test_pred
})
submission.to_csv("autogluon_test_v6.csv", index=False)