In [2]:
# ============================================================
# Import required libraries
# ============================================================
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from xgboost import XGBRegressor

In [3]:
# ============================================================
# Function: Load data from CSV files
# ============================================================
def load_data():
    # Read training, test, and sample submission datasets
    train = pd.read_csv("input/train.csv")
    test = pd.read_csv("input/test.csv")
    sample_submission = pd.read_csv("input/sample_submission.csv")

    # Return all three datasets
    return train, test, sample_submission

load_data()
train, test, smaple = load_data()
print(train.head())

   id road_type  num_lanes  curvature  speed_limit  lighting weather  \
0   0     urban          2       0.06           35  daylight   rainy   
1   1     urban          4       0.99           35  daylight   clear   
2   2     rural          4       0.63           70       dim   clear   
3   3   highway          4       0.07           35       dim   rainy   
4   4     rural          1       0.58           60  daylight   foggy   

   road_signs_present  public_road time_of_day  holiday  school_season  \
0               False         True   afternoon    False           True   
1                True        False     evening     True           True   
2               False         True     morning     True          False   
3                True         True     morning    False          False   
4               False        False     evening     True          False   

   num_reported_accidents  accident_risk  
0                       1           0.13  
1                       0           

In [4]:
# ============================================================
# Function: Create engineered features
# ============================================================
def create_engineered_features(df):
    """
    Create 2 powerful engineered features for accident risk prediction

    Parameters:
    df: DataFrame with raw features

    Returns:
    DataFrame with added engineered features
    """
    df = df.copy()

    # Feature 1: Risk Density Score
    # Combines accident history with road capacity
    # Higher values indicate more accidents per lane (higher risk concentration)
    df['risk_density'] = df['num_reported_accidents'] / (df['num_lanes'] + 1)

    # Feature 2: Dangerous Conditions Index
    # Interaction between environmental/temporal risk factors
    # Creates binary flags for high-risk conditions and combines them

    # High-risk weather conditions (excluding clear/sunny)
    weather_risk = (~df['weather'].isin(['clear', 'sunny'])).astype(int)

    # Poor lighting conditions
    lighting_risk = (df['lighting'].isin(['dark', 'dusk'])).astype(int)

    # High-risk time periods (rush hours and night)
    time_risk = (df['time_of_day'].isin(['evening', 'night', 'morning'])).astype(int)

    # Dangerous road characteristics
    road_risk = (
        (df['curvature'] > df['curvature'].median()).astype(int) +
        (df['speed_limit'] > df['speed_limit'].median()).astype(int)
    )

    # Combine all risk factors into composite score
    df['dangerous_conditions_index'] = (
        weather_risk * 2 +  # Weather weighted heavily
        lighting_risk * 1.5 +  # Lighting is important
        time_risk * 1.2 +  # Time of day matters
        road_risk * 0.8  # Road characteristics
    )

    return df

In [62]:
# ============================================================
# Function: Prepare features for model training
# ============================================================
def prepare_features(train, test):
    # Create engineered features
    train = create_engineered_features(train)
    test = create_engineered_features(test)

    # Separate features and target variable
    X = train.drop(columns=["accident_risk"])
    y = train["accident_risk"]

    # Define categorical feature names
    categorical_features = [
        "road_type",
        "lighting",
        "weather",
        "time_of_day"
    ]

    # Define numerical feature names
    numerical_features = [
        "num_lanes",
        "curvature",
        "speed_limit",
        "num_reported_accidents",
        "holiday",
        "school_season",
        "road_signs_present",
        "public_road",
        "risk_density",
        "dangerous_conditions_index"
    ]

    # One-hot encode categorical features
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    # Standardize numerical features
    numerical_transformer = StandardScaler()

    # Combine transformations into a preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_features),
            ("num", numerical_transformer, numerical_features)
        ]
    )

    # Return features, target, test set, and preprocessor
    return X, y, test, preprocessor

In [63]:
# ============================================================
# Function: Build LightGBM model
# ============================================================
def build_lgbm_model(preprocessor):
    # LightGBM hyperparameters
    params = {
        "n_estimators": 525,
        "learning_rate": 0.06,
        "max_depth": 8,
        "num_leaves": 64,
        "subsample": 0.8,
        "colsample_bytree": 0.9,
        "reg_lambda": 0.6,
        "reg_alpha": 0.2
    }

    # Create pipeline with preprocessing and LightGBM
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", LGBMRegressor(
            **params,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        ))
    ])
    return model

In [64]:
# ============================================================
# Function: Build XGBoost model
# ============================================================
def build_xgb_model(preprocessor):
    # XGBoost hyperparameters
    params = {
        "n_estimators": 525,
        "learning_rate": 0.06,
        "max_depth": 8,
        "subsample": 0.6,
        "colsample_bytree": 0.8,
        "reg_lambda": 0.6,
        "reg_alpha": 0.2,
        "tree_method": "hist",
        "n_jobs": -1
    }

    # Create pipeline with preprocessing and XGBoost
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(
            **params,
            random_state=42
        ))
    ])
    return model

In [65]:
# ============================================================
# Function: Train models and create averaged submission
# ============================================================
def generate_submission(lgbm_model, xgb_model, X, y, test, sample_submission):
    # Preprocess features
    X_processed = lgbm_model.named_steps["preprocessor"].fit_transform(X)
    test_processed = lgbm_model.named_steps["preprocessor"].transform(test)

    # Train LightGBM
    lgbm_model.named_steps["regressor"].fit(
        X_processed,
        y,
        eval_set=[(X_processed, y)],
        eval_metric="rmse",
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=50)
        ]
    )

    # Train XGBoost
    xgb_model.named_steps["preprocessor"].fit(X, y)
    xgb_model.named_steps["regressor"].fit(
        xgb_model.named_steps["preprocessor"].transform(X),
        y,
        eval_set=[(xgb_model.named_steps["preprocessor"].transform(X), y)],
        verbose=False
    )

    # Generate predictions from both models
    preds_lgbm = lgbm_model.named_steps["regressor"].predict(test_processed)
    preds_xgb = xgb_model.predict(test)

    # Average predictions
    final_predictions = (preds_lgbm + preds_xgb) / 2

    # Prepare submission file
    submission = sample_submission.copy()
    submission["accident_risk"] = final_predictions

    # Save CSV for Kaggle submission
    submission.to_csv("submission.csv", index=False)

    # Print confirmation
    print("Submission file saved as submission.csv (LGBM + XGBoost averaged)")

In [66]:
# ============================================================
# Main function: Complete execution flow
# ============================================================
def main():
    # Load datasets
    train, test, sample_submission = load_data()

    # Prepare features and preprocessing
    X, y, test, preprocessor = prepare_features(train, test)

    # Build both models
    lgbm_model = build_lgbm_model(preprocessor)
    xgb_model = build_xgb_model(preprocessor)

    # Generate final averaged submission
    generate_submission(lgbm_model, xgb_model, X, y, test, sample_submission)

In [67]:
# ============================================================
# Script entry point
# ============================================================
if __name__ == "__main__":
    main()

Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.0569097	training's l2: 0.00323871
[100]	training's rmse: 0.0560022	training's l2: 0.00313625
[150]	training's rmse: 0.0558697	training's l2: 0.00312143
[200]	training's rmse: 0.0557887	training's l2: 0.00311237
[250]	training's rmse: 0.055706	training's l2: 0.00310315
[300]	training's rmse: 0.0556386	training's l2: 0.00309565
[350]	training's rmse: 0.0555763	training's l2: 0.00308873
[400]	training's rmse: 0.0555212	training's l2: 0.0030826
[450]	training's rmse: 0.0554675	training's l2: 0.00307664
[500]	training's rmse: 0.0554228	training's l2: 0.00307169




Submission file saved as submission.csv (LGBM + XGBoost averaged)
