# Import

In [None]:
import polars as pl
import polars.selectors as cs
import plotly.express as px
from pathlib import Path
from sklearn.linear_model import RidgeCV, Ridge, LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, root_mean_squared_error
from xgboost import XGBRegressor
import numpy as np
import time

import warnings

warnings.filterwarnings("ignore")

In [None]:
# Read the training split into a polars DataFrame
train = pl.read_csv(source="../data/train.csv")

In [None]:
# Notebook-wide constants
SAMPLE_SIZE = 5_000  # rows drawn when a plot only needs a random subset
PLOT_FOLDER = Path("../plots")  # destination for exported figures
PLOT_FOLDER.mkdir(exist_ok=True)

# EDA

In [None]:
# Compact per-column preview of the training data
train.glimpse()

In [None]:
# Summary statistics restricted to the numeric columns
train.select(cs.numeric()).describe()

No null values

In [None]:
# String- and boolean-typed columns are treated as categorical;
# draw one histogram per column to inspect class balance.
cat_cols = train.select([cs.string(), cs.boolean()]).columns
print(len(cat_cols))  # 8
for col in cat_cols:
    histogram = px.histogram(train, x=col, color=col)
    histogram.show()

No imbalance issues for any variables.

In [None]:
# Numeric predictors: every numeric column except the row id and the target
y_col = "accident_risk"
num_cols = [
    name
    for name in train.select(cs.numeric()).columns
    if name not in ("id", "accident_risk")
]
print(len(num_cols))  # 4
# Box plots for each numeric predictor, then for the target itself
for col in [*num_cols, y_col]:
    box = px.box(train, x=col)
    box.show()

Rows with `num_reported_accidents` >= 4 are outliers, so a binary feature such as `new_col = num_reported_accidents >= 4` may be worth trying.
Values of `accident_risk` above 0.8 are outliers.

In [None]:
# Correlation matrix of the target and the numeric predictors, shown as a heatmap
correlation = train.select([y_col, *num_cols]).corr()
fig = px.imshow(
    correlation,
    y=correlation.columns,
    title="Numeric variable correlation",
    color_continuous_scale="RdBu_r",
    aspect="equal",
    text_auto=True,
)
# Fix the figure to a 700 x 700 canvas
fig.update_layout(width=700, height=700)
fig.show()
fig.write_image(PLOT_FOLDER / "numeric_corr.png")

1. Low correlations among variables.
2. Positive correlation with y: curvature > speed limit > num reported accidents
3. Low correlation with y: num of lanes (-0.006)

In [None]:
# Target distribution per level of each categorical column:
# print mean / standard deviation by level, then show a grouped box plot.
for col in cat_cols:
    print(f"For the column {col.replace('_', ' ').title()}")
    target = pl.col(y_col)
    level_stats = train.group_by(col).agg(
        target.mean().alias("Mean accident rate"),
        target.std().alias("Standard deviation"),
    )
    print(level_stats)
    level_box = px.box(train, x=col, y=y_col, color=col)
    level_box.show()

From visualization, we identify strong candidates: 
- road type: urban
- lighting: night
- weather: foggy, rainy
- time of day: evening
- road signs present: no significant difference
- public road: True (moderately strong)
- holiday: True
- school season: no significant difference

# Modeling

## Baseline

In [None]:
# Baseline design matrix: numeric predictors plus dummy-encoded categoricals
baseline_frame = train.select(num_cols + cat_cols)
X = baseline_frame.to_dummies(cat_cols).to_numpy()
# Target kept as a single-column 2-D array (DataFrame -> numpy)
y = train.select(y_col).to_numpy()
# Shared 5-fold splitter with a fixed seed so every experiment uses the same folds
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def cv_score_regression(
    estimator, X=X, y=y, scoring="neg_root_mean_squared_error", cv=kfold
):
    """Cross-validate ``estimator`` and print its mean error score and mean R2.

    Args:
        estimator: scikit-learn compatible regressor to evaluate.
        X: Feature matrix. Defaults to the baseline ``X`` that existed when
            this function was defined.
        y: Target values. Defaults to the baseline ``y`` that existed when
            this function was defined.
        scoring: Scorer reported under the "RMSE" label. Its mean is negated
            before printing, so it is expected to be a ``neg_*`` error scorer.
        cv: Cross-validation splitter (or fold count) used for both metrics.

    Returns:
        None. The scores and the elapsed time are printed.
    """
    # Local import keeps this helper self-contained.
    from sklearn.model_selection import cross_validate

    start_time = time.perf_counter()
    print("====== Start training =======")
    print(f"Model: {estimator}")
    # Score both metrics from a single set of fits instead of re-training the
    # model once per metric.
    results = cross_validate(
        estimator, X, y, scoring={"error": scoring, "r2": "r2"}, cv=cv
    )
    cv_rmse = np.mean(results["test_error"])
    cv_r2 = np.mean(results["test_r2"])
    print(f"RMSE: {-cv_rmse}")
    print(f"R2: {cv_r2}")
    end_time = time.perf_counter()
    print(f"Training took {end_time - start_time} seconds.")
    print("====== Finished training ======")

In [None]:
# Define baseline models
ridge_model = Ridge(alpha=1)
lasso_model = Lasso(alpha=0.01)
# rf_model = RandomForestRegressor(
#     n_estimators=100,
#     criterion="squared_error",
#     n_jobs=-1,
#     max_depth=10,
#     min_samples_split=10,
#     random_state=42,
# )
xgb_model = XGBRegressor(
    n_estimators=100,
    # `criterion` belongs to scikit-learn's tree estimators, not XGBoost;
    # XGBoost selects its loss through `objective`.
    objective="reg:squarederror",
    n_jobs=-2,
    random_state=42,
)
models = [ridge_model, lasso_model, xgb_model]

In [None]:
# Evaluate every baseline estimator with the shared cross-validation helper.
# NOTE: the loop variable `model` stays bound to the last estimator after the
# loop finishes; avoid relying on it in later cells.
for model in models:
    cv_score_regression(model)

## Baseline Performance
1. Ridge Regression:
- RMSE: 0.07350
- R2: 0.8049
2. XGB:
- RMSE: 0.05614
- R2: 0.8862

In [None]:
def cv_score_improvement_by_new_features(
    new_X,
    estimator=ridge_model,
    old_X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=kfold,
):
    """Compare cross-validated scores of a candidate feature matrix and a baseline.

    Both matrices are scored with the same estimator, scorer and splitter.
    The scorer must follow the scikit-learn convention that greater is better
    (true for ``neg_*`` error scorers and for ``r2``).

    Args:
        new_X: Candidate feature matrix.
        estimator: Regressor used for both runs; defaults to the baseline
            ridge model that existed when this function was defined.
        old_X: Reference feature matrix; defaults to the baseline ``X``.
        y: Target values.
        scoring: scikit-learn scorer name or callable.
        cv: Cross-validation splitter or fold count.

    Returns:
        None. The two mean scores and their difference are printed.
    """
    old_score = np.mean(cross_val_score(estimator, old_X, y, scoring=scoring, cv=cv))
    new_score = np.mean(cross_val_score(estimator, new_X, y, scoring=scoring, cv=cv))
    difference = new_score - old_score
    print(f"Baseline score: {old_score}")
    print(f"Score with new features: {new_score}")
    if difference == 0:
        print("The model score is unchanged")
        return
    # Normalise by the magnitude of the baseline so the percentage keeps the
    # right sign for negative scorers (neg RMSE) and positive ones (R2) alike.
    if old_score != 0:
        relative = f"{abs(difference) / abs(old_score) * 100:.2f}%"
    else:
        relative = "n/a"
    if difference > 0:
        print(f"The model improves by {difference} ({relative})")
    else:
        print(f"The model degrades by {-difference} ({relative})")

# Feature Engineering

In [None]:
def make_new_X(new_train, test_set=False):
    """Turn a feature frame into a numpy design matrix.

    The ``id`` column is always removed; the target column is removed as well
    unless ``test_set`` is true. String and boolean columns are dummy-encoded.

    Args:
        new_train: polars DataFrame holding the features (and, for training
            data, the target column).
        test_set: Set to True when the frame has no target column.

    Returns:
        The encoded features as a numpy array.
    """
    categorical = new_train.select([cs.string(), cs.boolean()]).columns
    to_drop = ["id"] if test_set else ["id", y_col]
    return new_train.drop(to_drop).to_dummies(categorical).to_numpy()

## Curvature: explore

In [None]:
# Scatter of curvature against the target on a random SAMPLE_SIZE-row subset,
# with a marginal histogram on the x axis and an OLS trendline.
px.scatter(
    train.sample(n=SAMPLE_SIZE),
    x="curvature",
    y=y_col,
    marginal_x="histogram",
    trendline="ols",
)

## Abnormal reported

In [None]:
# Mean target for rows with num_reported_accidents >= 4 versus all other rows
train.select(["num_reported_accidents", y_col]).with_columns(
    (pl.col("num_reported_accidents") >= 4).alias("abnormal_reported")
).group_by("abnormal_reported").agg(pl.mean(y_col))

In [None]:
# Add the "abnormal number of reported accidents" flag and score it against
# the baseline features.
# NOTE(review): the flag uses >= 3 here, while the exploratory group-by in
# this section uses >= 4 - confirm which threshold is intended.
new_train = train.with_columns(
    abnormal_reported=pl.col("num_reported_accidents") >= 3
)
new_X = make_new_X(new_train)
cv_score_improvement_by_new_features(new_X)

Only minor improvement. However, this may be due to the small sample size of abnormal reported == True. With simple average comparison, there is a significant difference.

In [None]:
# Draw one random subset up front and reuse it for every scatter plot, so the
# charts are comparable and the data is not re-sampled on each iteration.
train_for_plotting = train.sample(n=SAMPLE_SIZE)
for col in num_cols:
    print(f"Column: {col}")
    # Mean target and row count per distinct value, computed on the full data
    print(
        train.group_by(col)
        .agg([pl.col(y_col).mean().alias("Mean"), pl.col(y_col).count().alias("Count")])
        .sort("Mean", descending=True)
    )
    px.scatter(
        train_for_plotting,
        x=col,
        y=y_col,
        title=f"{col} vs Accident rate",
    ).show()

## high speed limit

In [None]:
# Mean target per distinct speed_limit value, highest mean first
train.group_by("speed_limit").agg(pl.col(y_col).mean().alias("Mean")).sort(
    "Mean", descending=True
)

In [None]:
# Flag speed limits above 50 and measure the gain for Ridge and for XGBoost
new_train = train.with_columns(high_speed_limit=pl.col("speed_limit") > 50)
new_X = make_new_X(new_train)
cv_score_improvement_by_new_features(new_X)
cv_score_improvement_by_new_features(new_X, estimator=xgb_model)

We identified a significant predictor for linear regression (speed limit > 50). It helps XGBoost much less, since tree-based models already handle threshold-type features well.

In [None]:
# Keep the high-speed-limit variant under its own names for later comparisons
new_train_1 = train.with_columns(high_speed_limit=pl.col("speed_limit") > 50)
new_X_1 = make_new_X(new_train_1)

In [None]:
# try speed limit in combination with other categorical variables
overall_by_speed = new_train_1.group_by("high_speed_limit").agg(
    pl.col(y_col).mean().alias("Mean accident rate")
)
print(overall_by_speed)
for col in cat_cols:
    print(f"Column: {col}")
    # Mean target per (flag, level) pair, pivoted so each flag value is a column
    per_level = new_train_1.group_by(["high_speed_limit", col]).agg(
        pl.col(y_col).mean().alias("mean_accident_rate")
    )
    print(
        per_level.pivot(index=col, on="high_speed_limit", values="mean_accident_rate")
    )
    faceted_box = px.box(
        new_train_1.sample(n=SAMPLE_SIZE), x=col, y=y_col, facet_col="high_speed_limit"
    )
    faceted_box.show()

In [None]:
# interaction term: high speed limit & lighting night
new_train_2 = new_train_1.with_columns(
    (pl.col("high_speed_limit") & (pl.col("lighting") == "night")).alias(
        "high_speed_night"
    )
)
# Build the matrix from new_train_2 so the interaction column is actually
# part of the features being evaluated.
new_X_2 = make_new_X(new_train_2)
# Gain over the high-speed-limit features (Ridge, then XGBoost) ...
cv_score_improvement_by_new_features(new_X_2, old_X=new_X_1)
cv_score_improvement_by_new_features(new_X_2, old_X=new_X_1, estimator=xgb_model)
# ... and gain over the original baseline features for XGBoost
cv_score_improvement_by_new_features(new_X_2, estimator=xgb_model)

## interaction: numerical vars

In [None]:
# interaction between numerical variables
# 1. speed limit X # lanes
new_train = train.with_columns(combo=pl.col("speed_limit") * pl.col("num_lanes"))
new_X = make_new_X(new_train)
# Score against the baseline with the default Ridge model, then with XGBoost
cv_score_improvement_by_new_features(new_X)
cv_score_improvement_by_new_features(new_X, estimator=xgb_model)

In [None]:
# interaction between numerical variables
# 2. # lanes X curvature
new_train = train.with_columns(combo=pl.col("curvature") * pl.col("num_lanes"))
new_X = make_new_X(new_train)
# Score against the baseline with the default Ridge model, then with XGBoost
cv_score_improvement_by_new_features(new_X)
cv_score_improvement_by_new_features(new_X, estimator=xgb_model)

In [None]:
# interaction between numerical variables
# 3. curvature X speed limit
new_train = train.with_columns(combo=pl.col("curvature") * pl.col("speed_limit"))
new_X = make_new_X(new_train)
# Score against the baseline with the default Ridge model, then with XGBoost
cv_score_improvement_by_new_features(new_X)
cv_score_improvement_by_new_features(new_X, estimator=xgb_model)

Doesn't seem to help at all.

## speed limit ** 2

In [None]:
# Quadratic speed-limit term, compared against the high-speed-limit features
new_train = train.with_columns(speed_squared=pl.col("speed_limit") ** 2)
new_X = make_new_X(new_train)
cv_score_improvement_by_new_features(new_X, old_X=new_X_1)
cv_score_improvement_by_new_features(new_X, old_X=new_X_1, estimator=xgb_model)

Doesn't seem to help either. The explanatory power of speed limit seems exhausted here.

## Try dropping some columns

In [None]:
# Remove three columns and check the effect on Ridge and XGBoost
new_train = train.drop("num_lanes", "road_signs_present", "school_season")
new_X = make_new_X(new_train)
cv_score_improvement_by_new_features(new_X)
cv_score_improvement_by_new_features(new_X, estimator=xgb_model)

Worse overall

## Combine everything so far

In [None]:
# All three engineered flags together, scored against the baseline features
new_train = train.with_columns(
    abnormal_reports=pl.col("num_reported_accidents") >= 3,
    high_speed_limit=pl.col("speed_limit") > 50,
    high_speed_night=(pl.col("speed_limit") > 50) & (pl.col("lighting") == "night"),
)
new_X = make_new_X(new_train)
cv_score_improvement_by_new_features(new_X)
cv_score_improvement_by_new_features(new_X, estimator=xgb_model)

Through feature engineering, I managed to close the gap between Ridge regression and XGBoost. 

# Tune hyperparameters

In [None]:
# Use the best features so far
new_train = train.with_columns(
    [
        (pl.col("num_reported_accidents") >= 3).alias("abnormal_reports"),
        (pl.col("speed_limit") > 50).alias("high_speed_limit"),
        ((pl.col("speed_limit") > 50) & (pl.col("lighting") == "night")).alias(
            "high_speed_night"
        ),
    ]
)
new_X = make_new_X(new_train)

# xgboost with full dataset
# Define hyperparameter grid
param_grid = {
    "n_estimators": [100, 300],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.05, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
}

xgb_model = XGBRegressor(
    random_state=42,
)

# Create GridSearchCV object.
# Tune the estimator built just above rather than whatever object the name
# `model` happens to be bound to from an earlier cell.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search

grid_search.fit(new_X, y)

# Get results
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Use the best model
best_xgb_model = grid_search.best_estimator_


In [None]:
# Use the best features so far
new_train = train.with_columns(
    [
        (pl.col("num_reported_accidents") >= 3).alias("abnormal_reports"),
        (pl.col("speed_limit") > 50).alias("high_speed_limit"),
        ((pl.col("speed_limit") > 50) & (pl.col("lighting") == "night")).alias(
            "high_speed_night"
        ),
    ]
)
new_X = make_new_X(new_train)
# Ridge Regression with full dataset
# Define hyperparameter grid.
# GridSearchCV already cross-validates every candidate, so tune a plain Ridge
# over `alpha` instead of wrapping RidgeCV (which would run its own inner
# search over a single value per candidate).
param_grid = {"alpha": [0.1, 0.25, 0.5, 0.75, 1, 2]}

model = Ridge()

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=2,
)

# Fit the grid search

grid_search.fit(new_X, y)

# Get results
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Use the best model (refit on all rows with the best alpha)
best_ridge_model = grid_search.best_estimator_


# Submit results

In [None]:
# Read the held-out test features and the submission template
test = pl.read_csv(source="../data/test.csv")
sample_submission = pl.read_csv(source="../data/sample_submission.csv")

In [None]:
# ridge model output
new_test = test.with_columns(
    [
        (pl.col("num_reported_accidents") >= 3).alias("abnormal_reports"),
        (pl.col("speed_limit") > 50).alias("high_speed_limit"),
        ((pl.col("speed_limit") > 50) & (pl.col("lighting") == "night")).alias(
            "high_speed_night"
        ),
    ]
)
new_X = make_new_X(new_test, test_set=True)
# The model was fitted on a single-column 2-D target, so its predictions may
# come back as an (n, 1) array; flatten to get a plain numeric column.
ridge_pred = np.ravel(best_ridge_model.predict(new_X))
ridge_output = pl.DataFrame(
    {"id": test.get_column("id"), "accident_risk": ridge_pred}
)
# Make sure the output folder exists before writing
prediction_dir = Path("../prediction")
prediction_dir.mkdir(parents=True, exist_ok=True)
ridge_output.write_parquet(prediction_dir / "ridge_submission.parquet")

In [None]:
# xgboost model output
# The test matrix must contain the same engineered columns, in the same order,
# as the matrix the tuned XGBoost model was fitted on - including
# `high_speed_limit`.
new_test = test.with_columns(
    [
        (pl.col("num_reported_accidents") >= 3).alias("abnormal_reports"),
        (pl.col("speed_limit") > 50).alias("high_speed_limit"),
        ((pl.col("speed_limit") > 50) & (pl.col("lighting") == "night")).alias(
            "high_speed_night"
        ),
    ]
)
new_X = make_new_X(new_test, test_set=True)
# Flatten defensively so the prediction is always stored as a plain column
xgb_pred = np.ravel(best_xgb_model.predict(new_X))
xgb_output = pl.DataFrame({"id": test.get_column("id"), "accident_risk": xgb_pred})
# Make sure the output folder exists before writing
prediction_dir = Path("../prediction")
prediction_dir.mkdir(parents=True, exist_ok=True)
xgb_output.write_parquet(prediction_dir / "xgb_submission.parquet")

In [None]:
# Saving the models
import joblib

# Create the target folder first so the dumps do not fail on a fresh checkout
Path("../models").mkdir(parents=True, exist_ok=True)
joblib.dump(best_ridge_model, "../models/ridge_model.joblib")
joblib.dump(best_xgb_model, "../models/xgb_model.joblib")

# Interpret Ridge Regression

In [None]:
# Rebuild the engineered training features as a DataFrame (rather than a numpy
# array) so the dummy-encoded column names are available for interpretation.
df = train.with_columns(
    abnormal_reports=pl.col("num_reported_accidents") >= 3,
    high_speed_limit=pl.col("speed_limit") > 50,
    high_speed_night=(pl.col("speed_limit") > 50) & (pl.col("lighting") == "night"),
)
cat = df.select([cs.string(), cs.boolean()]).columns
df = df.drop(["id", y_col])
df = df.to_dummies(cat)

In [None]:
# Load the saved ridge model and pair each fitted parameter with its feature name
model = joblib.load("../models/ridge_model.joblib")
feature_names = ["intercept"] + df.columns
# intercept_ / coef_ may be scalars, 1-D or (1, n_features) arrays depending on
# the shape of the target used for fitting; flatten both so the list lines up
# one-to-one with feature_names.
coefficient = np.ravel(model.intercept_).tolist() + np.ravel(model.coef_).tolist()
coef_df = pl.DataFrame({"features": feature_names, "coef": coefficient})

In [None]:
# In-sample fit of the loaded ridge model on the engineered training features
pred = model.predict(df.to_numpy())
rmse, r2 = root_mean_squared_error(y, pred), r2_score(y, pred)
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

In [None]:
# Horizontal bar chart of the coefficients; the figure height grows with the
# number of rows so every label stays readable.
plot_height = 100 + 25 * len(coef_df)
ranked_coefs = coef_df.sort("coef", descending=True)
fig = px.bar(
    ranked_coefs,
    orientation="h",
    x="coef",
    y="features",
    height=plot_height,
)
fig.update_yaxes(dtick=1)  # one tick per feature
fig.show()
fig.write_image(PLOT_FOLDER / "coefficients.png")


## Key observations:
1. Positive coefficients:
- Curvature (0.30): greatest contributor to accident risk
- Intercept (0.28): high baseline accident risk
- Lighting - night (0.13)
- High Speed Limit - True (> 50) (0.09): a better indicator than pure numeric speed limit
- Abnormal Reported Accidents - True (>= 3) (0.05): a better indicator than numeric number of accidents reported
- Weather - raining/foggy (0.03): similar effects, moderate increase in accidents
2. Negative coefficients:
- High Speed Limit - False (-0.09): best indicator to lower accident risks
- Lighting - Daylight (-0.06)
- Weather - Clear (-0.06)
- Lighting - Dim (-0.06): could be due to dim light causing drivers to drive more carefully
- Abnormal Reported - False (-0.05)
3. Almost all significant variables make intuitive sense. Feature engineering helps the model perform better and makes it more interpretable.

In [None]:
# Calculate residuals
# Flatten both arrays: the target is stored as a single-column 2-D array and
# the prediction may have the same shape, but each must be a plain numeric
# column for the subtraction below.
res_df = pl.DataFrame({"y": np.ravel(y), "pred": np.ravel(pred)})
res_df = res_df.with_columns((pl.col("y") - pl.col("pred")).alias("residual"))
# Plot them on a random subset, with a reference line at zero residual
fig = px.scatter(data_frame=res_df.sample(n=SAMPLE_SIZE), x="pred", y="residual")
fig.add_hline(y=0, line_dash="dash", line_color="red")
fig.show()

## Alternative exploration

In [None]:
# try getting rid of speed limit to potentially reduce multicollinearity issues
# Reference: all engineered flags with the raw speed_limit column kept
old_train = train.with_columns(
    abnormal_reports=pl.col("num_reported_accidents") >= 3,
    high_speed_limit=pl.col("speed_limit") > 50,
    high_speed_night=(pl.col("speed_limit") > 50) & (pl.col("lighting") == "night"),
)
# Candidate: identical features minus the raw speed_limit column
new_train = old_train.drop("speed_limit")
new_X = make_new_X(new_train)
old_X = make_new_X(old_train)
cv_score_improvement_by_new_features(new_X, old_X=old_X)

Dropping the raw `speed_limit` column doesn't seem to improve the model performance, which suggests multicollinearity issues are not present.