# LGBM regressor
LGM regressor scorer ofte bra, jeg har valgt å bruke denne for å se hva vi kan få ut av datasettet uten større mengde feature engineering.

## importere bibloteker

In [5]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.impute import SimpleImputer


## for å hente ut data

In [6]:
def load_data():
    # Read training, test, and sample submission datasets
    train = pd.read_csv("input/train.csv")
    test = pd.read_csv("input/test.csv")
    sample_submission = pd.read_csv("input/sample_submission.csv")

    # Return all three datasets
    return train, test, sample_submission


## Evt. feature engineering

In [7]:
def create_engineered_features(df):
    """
    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with raw features.

    Returns
    -------
    pandas.DataFrame
        DataFrame with added engineered features (non-destructive copy).
    """
    df = df.copy()

    # Feature 1: speed / curvature (protect against div-by-zero with small epsilon)
    speed_o_curve = df["speed_limit"] / (df["curvature"] + 1e-6)
    df["speed_o_curve"] = speed_o_curve.fillna(0)

    # Feature 2 (disabled): speed squared
    # speed_x_speed = df["speed_limit"] ** 2
    # df["speed_x_speed"] = speed_x_speed.fillna(0)

    # Feature 3 (disabled): speed * reported accidents
    # speed_x_accidents = df["speed_limit"] * df["num_reported_accidents"]
    # df["speed_x_accidents"] = speed_x_accidents.fillna(0)

    # Feature 4: visibility risk components (lighting + weather) and composite
    lighting_w = {"night": 0.9, "dim": 0.3, "daylight": 0.1}
    weather_w = {"foggy": 0.8, "rainy": 0.7, "clear": 0.1}

    df["lighting_risk"] = df["lighting"].map(lighting_w).fillna(0)
    df["weather_risk"] = df["weather"].map(weather_w).fillna(0)
    df["visibility_composite"] = (df["lighting_risk"] + df["weather_risk"]) / 2

    # Feature 5: time of day as ordinal integer
    time_order = {"morning": 1, "evening": 2, "afternoon": 3}
    df["time_as_int"] = df["time_of_day"].map(time_order)

    # # Feature 6: log1p(speed / curvature)
    # df["log_speed_o_curve"] = np.log1p(speed_o_curve)
    #
    # # Feature 7: curvature * speed
    # df["curvature_x_speed"] = df["curvature"] * df["speed_limit"]
    #
    # # Feature 8: reported accidents per lane (add 1 to avoid div-by-zero)
    # df["accidents_o_lanes"] = df["num_reported_accidents"] / (df["num_lanes"] + 1)
    #
    # # Feature 9: speed * time (ordinal)
    # df["speed_time_interaction"] = df["speed_limit"] * df["time_as_int"]
    #
    # # Feature 10: curvature * time (ordinal)
    # df["curvature_time_interaction"] = df["curvature"] * df["time_as_int"]

    return df


## preparere features for bruk i modell

In [8]:
def prepare_features(train, test):
    # Create engineered features
    train = create_engineered_features(train)
    test = create_engineered_features(test)

    # Separate features and target variable
    X = train.drop(columns=["accident_risk", "id"])
    y = train["accident_risk"]


    # Define categorical feature names
    categorical_features = [
        "road_type",
        "lighting",
        "weather",
        "time_of_day"
    ]

    # Define numerical feature names
    numerical_features = [
        "num_lanes",                # base
        "curvature",                # base
        "speed_limit",              # base
        "num_reported_accidents",   # base

        "speed_o_curve",            # Feature 1
        # "speed_x_speed",          # Feature 2 (disabled)
        # "speed_x_accidents",      # Feature 3 (disabled)

        "visibility_composite",     # Feature 4 (composite)
        "lighting_risk",            # Feature 4 (component)
        "weather_risk",             # Feature 4 (component)

        "time_as_int",              # Feature 5
        # "log_speed_o_curve",        # Feature 6
        # "curvature_x_speed",        # Feature 7
        # "accidents_o_lanes",        # Feature 8
        # "speed_time_interaction",   # Feature 9
        # "curvature_time_interaction",  # Feature 10
    ]


    boolean_features = [
        "holiday",
        "school_season",
        "road_signs_present",
        "public_road"
    ]

    print(f"Features: {X.shape[1]}")

        # --- Normalize column dtypes to avoid np.isnan / pd.NA type issues ---
    for col in categorical_features:
        if col in X.columns:
            X[col] = X[col].astype("string").fillna("__MISSING__")
            test[col] = test[col].astype("string").fillna("__MISSING__")


    for col in boolean_features:
        X[col] = X[col].astype("boolean")
        test[col] = test[col].astype("boolean")

    for col in numerical_features:
        # Coerce nullable numerics (like Int64) to float64
        X[col] = pd.to_numeric(X[col], errors="coerce")
        test[col] = pd.to_numeric(test[col], errors="coerce")

    # --- Pipelines for each type ---
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
    ])

    numerical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # with_mean=False keeps it compatible with sparse output
        ("scaler", StandardScaler(with_mean=False))
    ])

    # Combine everything in a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            #("cat", categorical_transformer, categorical_features),
            ("num", numerical_transformer, numerical_features),
            # You can include boolean features as numeric 0/1
            ("bool", "passthrough", boolean_features)
        ],
        sparse_threshold=0.3
    )

    # Return features, target, test set, and preprocessor
    return X, y, test, preprocessor

## bygge en stacked model

In [9]:
def build_lgbm_model(preprocessor):
    # LightGBM best hyperparameters
    params = {
        "n_estimators": 1542,
        "learning_rate": 0.004624613524705627,
        "max_depth": 12,
        "num_leaves": 258,
        "subsample": 0.6963927503033583,
        "colsample_bytree": 0.9755243540395523,
        "reg_lambda": 0.0034553945666010275,
        "reg_alpha": 0.12863137655092372,
        "min_child_samples": 8,
        "subsample_freq": 1
    }

    # Create pipeline with preprocessing and LightGBM
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", LGBMRegressor(
            **params,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        ))
    ])
    return model

# STacked bodel lgbm, rf, enet

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import RidgeCV, ElasticNet
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
import numpy as np

# --- Helpers ---
def get_lgbm_best():
    return LGBMRegressor(
        n_estimators=1542,
        learning_rate=0.004624613524705627,
        max_depth=12,
        num_leaves=258,
        subsample=0.6963927503033583,
        colsample_bytree=0.9755243540395523,
        reg_lambda=0.0034553945666010275,
        reg_alpha=0.12863137655092372,
        min_child_samples=8,
        subsample_freq=1,
        random_state=42,
        n_jobs=-1,
        verbosity=-1
    )

def build_lgbm_model(preprocessor):
    print("Building LightGBM model...")
    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", get_lgbm_best())
    ])

def build_stacked_model(preprocessor):
    print("Building Stacked model...")
    lgbm = get_lgbm_best()
    rf = RandomForestRegressor(
        n_estimators=200,        # cut trees
        max_depth=16,            # cap depth
        min_samples_leaf=5,      # fewer leaves
        max_features="sqrt",
        n_jobs=1,                # <<< 1
        random_state=42
    )
    print("Building Stacked model...")
    enet = ElasticNet(alpha=0.0005, l1_ratio=0.1, max_iter=2000, random_state=42)

    print("Building Stacked model...")
    stack = StackingRegressor(
        estimators=[("lgbm", lgbm), ("rf", rf), ("enet", enet)],
        final_estimator=RidgeCV(alphas=np.logspace(-4, 4, 25)),
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        passthrough=False,
        n_jobs=-1
    )
    return Pipeline(steps=[("preprocessor", preprocessor), ("stack", stack)])

def evaluate_cv(model, X, y, scoring="neg_root_mean_squared_error"):
    print("Evaluating CV...")
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    mean, std = np.mean(scores), np.std(scores)
    return mean, std, scores  # note: for neg metrics, higher is better


## skape submission fil

In [26]:
import os

def generate_submission(model, X, y, test, sample_submission, path="submissions/finalmodel2.csv"):
    # Fit end-to-end pipeline and predict
    model.fit(X, y)
    preds = model.predict(test)

    # Prepare submission
    submission = sample_submission.copy()
    submission["accident_risk"] = preds

    # Ensure folder exists
    os.makedirs(os.path.dirname(path), exist_ok=True)
    submission.to_csv(path, index=False)
    print(f"Submission saved to {path}")


In [27]:
def main():
    # Load datasets
    train, test, sample_submission = load_data()

    # Prepare features and preprocessing
    X, y, test, preprocessor = prepare_features(train, test)

    # Build candidate models
    # lgbm_model = build_lgbm_model(preprocessor)
    stacked_model = build_stacked_model(preprocessor)

    # CV comparison (change scoring to your metric if needed)
    # lgbm_mean, lgbm_std, _ = evaluate_cv(lgbm_model, X, y, scoring="neg_root_mean_squared_error")
    stack_mean, stack_std, _ = evaluate_cv(stacked_model, X, y, scoring="neg_root_mean_squared_error")

    winner = stacked_model

    # print(f"LGBM CV (neg RMSE): mean={lgbm_mean:.6f} ± {lgbm_std:.6f}")
    print(f"Stack CV (neg RMSE): mean={stack_mean:.6f} ± {stack_std:.6f}")

    # Pick the winner (higher is better for neg metrics)
    # winner = stacked_model if stack_mean > lgbm_mean else lgbm_model
    # print("Selected model:", "Stacked" if winner is stacked_model else "LGBM")

    # Fit on full training data and predict test
    winner.fit(X, y)
    preds = winner.predict(test)

    # Make submission
    generate_submission(winner, X, y, test, sample_submission)

In [28]:
main()

Features: 17
Building Stacked model...
Building Stacked model...
Building Stacked model...
Evaluating CV...
Stack CV (neg RMSE): mean=-0.055984 ± 0.000114




Submission saved to submissions/finalmodel2.csv
