# LGBM regressor
LGM regressor scorer ofte bra, jeg har valgt å bruke denne for å se hva vi kan få ut av datasettet uten større mengde feature engineering.

## importere bibloteker

In [35]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
import joblib, os


## for å hente ut data

In [36]:
def load_data():
    # Read training, test, and sample submission datasets
    train = pd.read_csv("input/train.csv")
    test = pd.read_csv("input/test.csv")
    sample_submission = pd.read_csv("input/sample_submission.csv")

    # Return all three datasets
    return train, test, sample_submission


## Evt. feature engineering

In [37]:
def create_engineered_features(df):
    """
    These are all the parameters that ended up being used in the final model
    """
    df = df.copy()

    # Feature 1: speed / curvature (protect against div-by-zero with small epsilon)
    speed_o_curve = df["speed_limit"] / (df["curvature"] + 1e-6)
    df["speed_o_curve"] = speed_o_curve.fillna(0)

    # Feature 4: visibility risk components (lighting + weather) and composite
    lighting_w = {"night": 0.9, "dim": 0.3, "daylight": 0.1}
    weather_w = {"foggy": 0.8, "rainy": 0.7, "clear": 0.1}

    df["lighting_risk"] = df["lighting"].map(lighting_w).fillna(0)
    df["weather_risk"] = df["weather"].map(weather_w).fillna(0)
    df["visibility_composite"] = (df["lighting_risk"] + df["weather_risk"]) / 2

    # Feature 5: time of day as ordinal integer
    time_order = {"morning": 1, "evening": 2, "afternoon": 3}
    df["time_as_int"] = df["time_of_day"].map(time_order)

    return df

## preparere features for bruk i modell

In [38]:
def prepare_features(train, test):
    # Create engineered features
    train = create_engineered_features(train)
    test = create_engineered_features(test)

    # Separate features and target variable
    X = train.drop(columns=["accident_risk", "id"])
    y = train["accident_risk"]


    # Define categorical feature names
    categorical_features = [
        "road_type",
        "lighting",
        "weather",
        "time_of_day"
    ]

    # Define numerical feature names
    numerical_features = [
        "num_lanes",                # base
        "curvature",                # base
        "speed_limit",              # base
        "num_reported_accidents",   # base
        "speed_o_curve",            # Feature 1
        "visibility_composite",     # Feature 4 (composite)
        "lighting_risk",            # Feature 4 (component)
        "weather_risk",             # Feature 4 (component)
        "time_as_int",              # Feature 5
    ]


    boolean_features = [
        "holiday",
        "school_season",
        "road_signs_present",
        "public_road"
    ]

    print(f"Features: {X.shape[1]}")

        # --- Normalize column dtypes to avoid np.isnan / pd.NA type issues ---
    for col in categorical_features:
        if col in X.columns:
            X[col] = X[col].astype("string").fillna("__MISSING__")
            test[col] = test[col].astype("string").fillna("__MISSING__")


    for col in boolean_features:
        X[col] = X[col].astype("boolean")
        test[col] = test[col].astype("boolean")

    for col in numerical_features:
        # Coerce nullable numerics (like Int64) to float64
        X[col] = pd.to_numeric(X[col], errors="coerce")
        test[col] = pd.to_numeric(test[col], errors="coerce")

    # --- Pipelines for each type ---
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True))
    ])

    numerical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # with_mean=False keeps it compatible with sparse output
        ("scaler", StandardScaler(with_mean=False))
    ])

    # Combine everything in a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            #("cat", categorical_transformer, categorical_features),
            ("num", numerical_transformer, numerical_features),
            # You can include boolean features as numeric 0/1
            ("bool", "passthrough", boolean_features)
        ],
        sparse_threshold=0.3
    )

    # Return features, target, test set, and preprocessor
    return X, y, test, preprocessor

# Stacked bodel lgbm, rf, enet

In [39]:
def get_lgbm_best():
    return LGBMRegressor(
        n_estimators=1542,
        learning_rate=0.004624613524705627,
        max_depth=12,
        num_leaves=258,
        subsample=0.6963927503033583,
        colsample_bytree=0.9755243540395523,
        reg_lambda=0.0034553945666010275,
        reg_alpha=0.12863137655092372,
        min_child_samples=8,
        subsample_freq=1,
        random_state=42,
        n_jobs=-1,
        verbosity=-1
    )

def build_lgbm_model(preprocessor):
    print("Building LightGBM model...")
    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", get_lgbm_best())
    ])

def build_stacked_model(preprocessor):
    print("Building Stacked model...")
    lgbm = get_lgbm_best()
    rf = RandomForestRegressor(
        n_estimators=200,        # cut trees
        max_depth=16,            # cap depth
        min_samples_leaf=5,      # fewer leaves
        max_features="sqrt",
        n_jobs=1,                # <<< 1
        random_state=42
    )
    print("Building Stacked model...")
    enet = ElasticNet(alpha=0.0005, l1_ratio=0.1, max_iter=2000, random_state=42)

    print("Building Stacked model...")
    stack = StackingRegressor(
        estimators=[("lgbm", lgbm), ("rf", rf), ("enet", enet)],
        final_estimator=RidgeCV(alphas=np.logspace(-4, 4, 25)),
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        passthrough=False,
        n_jobs=-1
    )
    return Pipeline(steps=[("preprocessor", preprocessor), ("stack", stack)])

def evaluate_cv(model, X, y, scoring="neg_root_mean_squared_error"):
    print("Evaluating CV...")
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    mean, std = np.mean(scores), np.std(scores)
    return mean, std, scores


## skape submission fil

In [44]:
def generate_submission(model, test, sample_submission, path="/sumbissions/final_stack.csv"):
    # Predict with the fitted pipeline
    preds = model.predict(test)

    # Prepare submission
    submission = sample_submission.copy()
    submission["accident_risk"] = preds

    # Ensure folder exists and save
    os.makedirs(os.path.dirname(path), exist_ok=True)
    submission.to_csv(path, index=False)
    print(f"Submission saved to {path}")


In [41]:
def main():
    # Load datasets
    train, test, sample_submission = load_data()

    # Prepare features and preprocessing
    X, y, test, preprocessor = prepare_features(train, test)

    # Build candidate models
    stacked_model = build_stacked_model(preprocessor)

    # Fit once here
    stacked_model.fit(X, y)

    os.makedirs("models", exist_ok=True)
    joblib.dump(stacked_model, "models/road_risk_lgbm_stacked.joblib")
    print("Model saved to models/road_risk_lgbm_stacked.joblib")

    # Make submission (no re-fit)
    generate_submission(stacked_model, test, sample_submission)

In [42]:
main()

Features: 17
Building Stacked model...
Building Stacked model...
Building Stacked model...




Submission saved to /kaggle/working/submission.csv


NameError: name 'stacked_model' is not defined