# LGBM regressor
LGBM-regressoren scorer ofte bra; jeg har valgt å bruke den for å se hva vi kan få ut av datasettet uten større mengder feature engineering.

## Importere biblioteker

In [70]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.impute import SimpleImputer


In [71]:
# Create a helper module that runs a grid search over FEATURE GROUPS by selecting columns via name patterns.
# This avoids editing your existing feature-engineering functions, and lets you toggle whole groups of features.
# You can import and run this inside your notebook.
#
# It will:
# - Build engineered features with your existing function (create_engineered_features)
# - Define feature groups via column-name regex patterns
# - For each combination of groups, select matching columns, build a fresh preprocessing+LGBM pipeline, and KFold-CV evaluate (RMSE)
# - Save a CSV of results sorted by best score
#
# Usage inside your notebook (after defining/using load_data, create_engineered_features):
# from feature_set_gridsearch import run_feature_set_grid_search, default_group_patterns
# train, test, _ = load_data()
# results = run_feature_set_grid_search(train, target_col="accident_risk",
#                                       group_patterns=default_group_patterns,
#                                       n_splits=5, random_state=42)
# results.head()
#
# To customize, edit default_group_patterns below to match your column prefixes/patterns.
# You can also pass your own dict of patterns to the function.
#
# The results CSV is written to `save_path`, which defaults to
# "feature_grid_results.csv" in the current working directory.

import os, re, itertools, json, math
from typing import Dict, List, Iterable, Tuple, Optional
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

try:
    from lightgbm import LGBMRegressor
except Exception as e:
    # LightGBM may not be installed in this execution env, but will be in your notebook.
    LGBMRegressor = None


def _column_matches_any(col: str, patterns: Iterable[str]) -> bool:
    for pat in patterns:
        if re.search(pat, col):
            return True
    return False


def build_dynamic_preprocessor(X: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted preprocessor: scale numeric columns, one-hot encode the rest.

    Columns are picked by dtype selectors when the transformer is fitted;
    ``X`` is accepted for interface symmetry but is not inspected here.
    """
    # with_mean=False: scale to unit variance without centering.
    scale_numeric = Pipeline(steps=[("scale", StandardScaler(with_mean=False))])
    # Categories unseen during fit are encoded as all-zero rows.
    encode_non_numeric = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

    return ColumnTransformer(
        transformers=[
            ("num", scale_numeric, selector(dtype_include=np.number)),
            ("cat", encode_non_numeric, selector(dtype_exclude=np.number)),
        ]
    )


def build_lgbm_pipeline(X: pd.DataFrame, random_state: int = 42) -> Pipeline:
    """Return a new, unfitted preprocessing + LightGBM pipeline.

    A fresh pipeline is built on every call so that each feature subset /
    fold is fitted independently.

    Raises
    ------
    RuntimeError
        If LightGBM could not be imported (``LGBMRegressor`` is None).
    """
    if LGBMRegressor is None:
        raise RuntimeError("LightGBM is not available in this environment. Run this inside your notebook.")

    regressor_params = {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": -1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": random_state,
        "n_jobs": -1,
    }
    return Pipeline(
        steps=[
            ("preprocess", build_dynamic_preprocessor(X)),
            ("regressor", LGBMRegressor(**regressor_params)),
        ]
    )


def select_columns_by_groups(
    X: pd.DataFrame, include_groups: List[str], group_patterns: Dict[str, List[str]]
) -> pd.DataFrame:
    """Return a copy of ``X`` restricted to columns matched by the included groups.

    The selection is the union over all groups in ``include_groups``; a group
    that is unknown or has no patterns contributes nothing. Columns are
    returned in sorted name order. When nothing is selected, an empty-column
    frame (same index) is returned so the caller can skip it.
    """
    # Flatten the patterns of every requested group; matching any of them
    # is the same as matching any pattern of any included group.
    active_patterns = [
        pattern
        for group in include_groups
        for pattern in (group_patterns.get(group) or [])
    ]
    if not active_patterns:
        return X.iloc[:, 0:0].copy()

    matched = {name for name in X.columns if _column_matches_any(name, active_patterns)}
    if not matched:
        # Columns that match no pattern are implicitly excluded.
        return X.iloc[:, 0:0].copy()

    return X.loc[:, sorted(matched)].copy()


def expand_boolean_grid(options: List[str], max_groups: Optional[int] = None) -> List[List[str]]:
    """Enumerate every non-empty combination of ``options``.

    Combinations are ordered by size (singles first), and within a size in
    ``itertools.combinations`` order. When ``max_groups`` is given, only
    combinations with at most that many members are produced.
    """
    sizes = range(1, len(options) + 1)
    if max_groups is not None:
        sizes = [size for size in sizes if size <= max_groups]

    return [
        list(members)
        for size in sizes
        for members in itertools.combinations(options, size)
    ]


def run_feature_set_grid_search(
    train: pd.DataFrame,
    target_col: str,
    group_patterns: Dict[str, List[str]],
    id_cols: Optional[List[str]] = None,
    n_splits: int = 5,
    random_state: int = 42,
    max_groups: Optional[int] = None,
    save_path: str = "feature_grid_results.csv"
) -> pd.DataFrame:
    """
    Cross-validate LightGBM over combinations of feature groups.

    Engineered features are built once on the whole ``train`` frame with
    ``create_engineered_features``; each group combination then selects its
    columns by name pattern and is scored with shuffled K-fold CV.

    Parameters
    ----------
    train : raw training frame containing ``target_col``.
    target_col : name of the regression target.
    group_patterns : mapping of group name -> list of regex patterns.
    id_cols : identifier columns to drop (defaults to ``["id"]``).
    n_splits, random_state : K-fold configuration.
    max_groups : optional cap on the number of groups per combination.
    save_path : CSV path the result table is written to.

    Returns
    -------
    DataFrame sorted by ``cv_rmse_mean`` with columns ``groups``,
    ``n_features``, ``cv_rmse_mean``, ``cv_rmse_std`` and ``fold_rmses``.
    The frame is empty (but has these columns) if no combination selected
    any column.

    Raises
    ------
    ValueError
        If ``target_col`` is not a column of ``train``.
    RuntimeError
        If ``create_engineered_features`` is not defined, or if no group
        combination can be generated.
    """
    id_cols = id_cols or ["id"]
    if target_col not in train.columns:
        raise ValueError(f"target_col '{target_col}' not found in train columns")

    # The feature-engineering function is expected to live in the notebook's
    # global namespace alongside this function.
    if "create_engineered_features" not in globals():
        raise RuntimeError("Expected 'create_engineered_features' to be defined in the notebook environment.")
    full = create_engineered_features(train.copy())

    # Split off the target and drop identifier columns from the features.
    y = full[target_col].copy()
    drop_cols = [c for c in id_cols if c in full.columns] + [target_col]
    X_full = full.drop(columns=drop_cols, errors="ignore")

    combos = expand_boolean_grid(list(group_patterns.keys()), max_groups=max_groups)
    if not combos:
        raise RuntimeError("No group combinations produced. Check your group_patterns.")

    result_columns = ["groups", "n_features", "cv_rmse_mean", "cv_rmse_std", "fold_rmses"]
    records = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    for include_groups in combos:
        X_sub = select_columns_by_groups(X_full, include_groups, group_patterns)

        if X_sub.shape[1] == 0:
            # Nothing matched for this combination; nothing to evaluate.
            continue

        rmses = []
        for tr_idx, va_idx in kf.split(X_sub):
            X_tr, X_va = X_sub.iloc[tr_idx], X_sub.iloc[va_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

            pipe = build_lgbm_pipeline(X_tr, random_state=random_state)
            pipe.fit(X_tr, y_tr)

            pred = pipe.predict(X_va)
            # mean_squared_error returns the MSE; take the root so the value
            # really is the RMSE that the result columns advertise.
            rmses.append(float(np.sqrt(mean_squared_error(y_va, pred))))

        records.append({
            "groups": include_groups,
            "n_features": X_sub.shape[1],
            "cv_rmse_mean": float(np.mean(rmses)),
            "cv_rmse_std": float(np.std(rmses)),
            "fold_rmses": rmses,
        })

    # Passing explicit columns keeps sort_values from raising KeyError when
    # every combination was skipped and `records` is empty.
    results = (
        pd.DataFrame(records, columns=result_columns)
        .sort_values("cv_rmse_mean")
        .reset_index(drop=True)
    )
    results.to_csv(save_path, index=False)
    print(f"Saved results to {save_path} with {len(results)} rows.")
    return results


# === EXAMPLE DEFAULT GROUP PATTERNS ===
# Customize this mapping to your engineered feature names.
# Use regex patterns; each group is a list of patterns. A column matches a group if it matches ANY pattern in that list.
# Tailored feature groups matching your exact columns
# Feature groups keyed by name; each value is a list of regex patterns that
# are matched against column names with re.search.
group_patterns = {
    # Categorical
    "categorical": [
        r"^(road_type|lighting|weather|time_of_day)$",
    ],

    # Boolean flags (treated as categorical in the pipeline unless you cast to int)
    "booleans": [
        r"^(holiday|school_season|road_signs_present|public_road)$",
    ],

    # Base numeric
    "base_numeric": [
        r"^(num_lanes|curvature|speed_limit|num_reported_accidents)$",
    ],

    # Core engineered single feature
    "speed_o_curve": [
        r"^speed_o_curve$",
    ],

    # Composite / risk components
    "composite_risk": [
        r"^(visibility_composite|lighting_risk|weather_risk)$",
    ],

    # Time-related
    "time": [
        r"^time_as_int$",
    ],

    # Log transforms
    "logs": [
        r"^log_speed_o_curve$",
    ],

    # Interactions
    "interactions": [
        r"^(curvature_x_speed|accidents_o_lanes|speed_time_interaction|curvature_time_interaction)$",
    ],
}

# The usage notes at the top of this cell refer to `default_group_patterns`;
# expose that name as an alias so both spellings work.
default_group_patterns = group_patterns

# (Optional) If you later add these, keep a parking group:
# "experimental_disabled": [r"^(speed_x_speed|speed_x_accidents)$"]



# If run as a script (optional): do nothing. The main use is to import from your notebook.



In [72]:
from copy import deepcopy
from sklearn.compose import ColumnTransformer
from sklearn.base import clone

def _filter_columns(cols, present_cols):
    """Return cols âˆ© present_cols if cols is a list/array of names; otherwise return cols unchanged."""
    if cols is None:
        return cols
    # Lists/tuples/Index of col names
    if isinstance(cols, (list, tuple)):
        return [c for c in cols if c in present_cols]
    try:
        # Pandas Index
        import pandas as pd
        if isinstance(cols, pd.Index):
            return cols.intersection(present_cols)
    except Exception:
        pass
    # Callables, selectors, slices, etc. leave as-is.
    return cols

def trim_preprocessor_columns(preprocessor, X_cols):
    """
    Return an unfitted clone of ``preprocessor`` whose explicit column-name
    lists only mention columns found in ``X_cols``.

    This avoids KeyErrors when the preprocessor is fitted on a column subset.
    A ``ColumnTransformer`` is trimmed directly; for a ``Pipeline``, every
    step that is a ``ColumnTransformer`` is trimmed. Any other estimator is
    returned as a plain clone. Column specifications that are not
    list/tuple/Index (e.g. callables) are left untouched by
    ``_filter_columns``.

    Errors are not suppressed: a failure to trim would otherwise resurface
    later as a harder-to-trace missing-column error during fitting.
    """
    from sklearn.pipeline import Pipeline

    present = set(X_cols)
    # clone() builds a new unfitted estimator with copied parameters, so the
    # caller's preprocessor is never modified.
    pp = clone(preprocessor)

    if isinstance(pp, ColumnTransformer):
        # 'transformers' is the raw list of (name, transformer, columns).
        pp.set_params(transformers=[
            (name, trans, _filter_columns(cols, present))
            for name, trans, cols in pp.transformers
        ])
    elif isinstance(pp, Pipeline):
        pp.set_params(steps=[
            (
                nm,
                trim_preprocessor_columns(step, X_cols)
                if isinstance(step, ColumnTransformer)
                else step,
            )
            for nm, step in pp.steps
        ])
    return pp


In [73]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def run_feature_set_grid_search_fold_safe(
    train: pd.DataFrame,
    target_col: str,
    group_patterns: dict,
    id_cols=None,
    n_splits: int = 5,
    random_state: int = 42,
    max_groups=None,
    save_path: str = "feature_grid_results_foldsafe.csv",
    verbose: bool = True,
    save_every: int = 0,          # e.g., 10 to save partial results every 10 combos; 0 disables
):
    """
    KFold CV with per-fold feature engineering:
      - Split RAW train -> (tr, va)
      - Run create_engineered_features(tr) and (...va) independently
      - Select columns by group patterns *based on the train-fold columns*
      - Fit pipeline on train-fold, evaluate on val-fold (RMSE)

    Progress logging:
      - Prints progress "[i/total]" per combination if verbose=True
      - Prints per-fold RMSEs and final mean/std per combo
      - Optionally saves partial CSV every `save_every` combos

    Returns a DataFrame sorted by ``cv_rmse_mean`` with columns ``groups``,
    ``n_features``, ``cv_rmse_mean``, ``cv_rmse_std`` and ``fold_rmses``; it
    is empty (but keeps these columns) when every combination was skipped.

    Raises ValueError if ``target_col`` is missing from ``train`` and
    RuntimeError if ``create_engineered_features`` is undefined or no group
    combination can be generated.
    """
    id_cols = id_cols or ["id"]
    if target_col not in train.columns:
        raise ValueError(f"target_col '{target_col}' not in raw train")

    if "create_engineered_features" not in globals():
        raise RuntimeError("Expected 'create_engineered_features' to be defined in the notebook.")

    all_groups = list(group_patterns.keys())
    combos = expand_boolean_grid(all_groups, max_groups=max_groups)
    if not combos:
        raise RuntimeError("No group combinations produced. Check your group_patterns.")

    # Explicit columns keep sort_values from failing on an empty record list.
    result_columns = ["groups", "n_features", "cv_rmse_mean", "cv_rmse_std", "fold_rmses"]
    records = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    total = len(combos)
    if verbose:
        print(f"Starting fold-safe grid over {total} group combinations "
              f"({n_splits}-fold CV).")

    for ci, include_groups in enumerate(combos, start=1):
        fold_rmses = []
        n_features_seen = None  # taken from the first evaluated fold

        if verbose:
            print(f"\n[{ci}/{total}] Groups: {include_groups}")

        for fold, (tr_idx, va_idx) in enumerate(kf.split(train), start=1):
            tr_raw = train.iloc[tr_idx].copy()
            va_raw = train.iloc[va_idx].copy()

            # Feature engineering runs separately on each side of the split.
            tr_fe = create_engineered_features(tr_raw)
            va_fe = create_engineered_features(va_raw)

            y_tr = tr_fe[target_col].copy()
            y_va = va_fe[target_col].copy()

            # Drop target + ids to form feature matrices.
            drop_cols = [c for c in id_cols if c in tr_fe.columns] + [target_col]
            X_tr_full = tr_fe.drop(columns=drop_cols, errors="ignore")
            X_va_full = va_fe.drop(columns=drop_cols, errors="ignore")

            # Select feature columns based on TRAIN-FOLD columns & requested groups.
            X_tr_sub = select_columns_by_groups(X_tr_full, include_groups, group_patterns)

            if X_tr_sub.shape[1] == 0:
                if verbose:
                    print("  -> Skipping: no columns selected for this combo.")
                fold_rmses = []
                break

            # Align validation to the selected columns (missing ones become NaN).
            X_va_sub = X_va_full.reindex(columns=X_tr_sub.columns, fill_value=np.nan)

            pipe = build_lgbm_pipeline(X_tr_sub, random_state=random_state)
            pipe.fit(X_tr_sub, y_tr)

            pred = pipe.predict(X_va_sub)
            # mean_squared_error returns the MSE; take the root so the logged
            # and stored value really is an RMSE.
            rmse = float(np.sqrt(mean_squared_error(y_va, pred)))
            fold_rmses.append(rmse)

            if n_features_seen is None:
                n_features_seen = X_tr_sub.shape[1]

            if verbose:
                print(f"  Fold {fold}/{n_splits}: RMSE = {rmse:.6f}")

        if not fold_rmses:
            continue

        mean_rmse = float(np.mean(fold_rmses))
        std_rmse = float(np.std(fold_rmses))

        if verbose:
            print(f"  -> n_features: {n_features_seen}, "
                  f"cv_rmse_mean: {mean_rmse:.6f}, cv_rmse_std: {std_rmse:.6f}")

        records.append({
            "groups": include_groups,
            "n_features": int(n_features_seen),
            "cv_rmse_mean": mean_rmse,
            "cv_rmse_std": std_rmse,
            "fold_rmses": fold_rmses,
        })

        # Optional partial saves
        if save_every and (ci % save_every == 0):
            tmp = (
                pd.DataFrame(records, columns=result_columns)
                .sort_values("cv_rmse_mean")
                .reset_index(drop=True)
            )
            tmp.to_csv(save_path, index=False)
            if verbose:
                print(f"  [autosave] Wrote partial results to {save_path} at combo {ci}/{total}")

    results = (
        pd.DataFrame(records, columns=result_columns)
        .sort_values("cv_rmse_mean")
        .reset_index(drop=True)
    )
    results.to_csv(save_path, index=False)
    if verbose:
        print(f"\nFinished. Saved fold-safe results to {save_path} with {len(results)} rows.")
    return results

## For å hente ut data

In [74]:
def load_data():
    """Read the train, test and sample-submission CSVs from ``input/``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, sample_submission)`` in that order.
    """
    train, test, sample_submission = (
        pd.read_csv(csv_path)
        for csv_path in (
            "input/train.csv",
            "input/test.csv",
            "input/sample_submission.csv",
        )
    )
    return train, test, sample_submission


## Evt. feature engineering

In [75]:
def create_engineered_features(df):
    """
    Add the engineered feature columns to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features. Must contain ``speed_limit``, ``curvature``,
        ``lighting``, ``weather``, ``time_of_day``,
        ``num_reported_accidents`` and ``num_lanes``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the engineered ones;
        the input frame is not modified.
    """
    out = df.copy()
    speed = out["speed_limit"]
    curvature = out["curvature"]

    # Feature 1: speed / curvature; the epsilon guards against division by zero.
    speed_per_curve = speed / (curvature + 1e-6)
    out["speed_o_curve"] = speed_per_curve.fillna(0)

    # Features 2 and 3 (speed squared, speed * reported accidents) are disabled.

    # Feature 4: per-condition risk weights (unknown labels -> 0) and their mean.
    risk_tables = (
        ("lighting", "lighting_risk", {"night": 0.9, "dim": 0.3, "daylight": 0.1}),
        ("weather", "weather_risk", {"foggy": 0.8, "rainy": 0.7, "clear": 0.1}),
    )
    for source_col, risk_col, weights in risk_tables:
        out[risk_col] = out[source_col].map(weights).fillna(0)
    out["visibility_composite"] = (out["lighting_risk"] + out["weather_risk"]) / 2

    # Feature 5: time of day as an integer code (unmapped labels stay NaN).
    # NOTE(review): evening=2 / afternoon=3 is not chronological - confirm
    # whether this ordering is intentional.
    out["time_as_int"] = out["time_of_day"].map({"morning": 1, "evening": 2, "afternoon": 3})
    time_code = out["time_as_int"]

    # Feature 6: log1p of the raw (un-filled) speed / curvature ratio.
    out["log_speed_o_curve"] = np.log1p(speed_per_curve)

    # Feature 7: curvature * speed.
    out["curvature_x_speed"] = curvature * speed

    # Feature 8: reported accidents per lane (+1 avoids division by zero).
    out["accidents_o_lanes"] = out["num_reported_accidents"] / (out["num_lanes"] + 1)

    # Features 9 and 10: speed and curvature interacted with the time code.
    out["speed_time_interaction"] = speed * time_code
    out["curvature_time_interaction"] = curvature * time_code

    return out


## preparere features for bruk i modell

In [76]:
def prepare_features(train, test):
    """Engineer features, normalise dtypes and build the preprocessing transformer.

    Returns
    -------
    tuple
        ``(X, y, test, preprocessor)``: the training features (without
        ``accident_risk`` and ``id``), the target series, the engineered test
        frame, and an unfitted ``ColumnTransformer``.
    """
    train = create_engineered_features(train)
    test = create_engineered_features(test)

    # Target first, then everything else except the identifier as features.
    y = train["accident_risk"]
    X = train.drop(columns=["accident_risk", "id"])

    print(f"Features: {X.shape[1]}")

    categorical_features = ["road_type", "lighting", "weather", "time_of_day"]

    numerical_features = [
        # base columns
        "num_lanes",
        "curvature",
        "speed_limit",
        "num_reported_accidents",
        # engineered columns (features 2 and 3 are disabled)
        "speed_o_curve",
        "visibility_composite",
        "lighting_risk",
        "weather_risk",
        "time_as_int",
        "log_speed_o_curve",
        "curvature_x_speed",
        "accidents_o_lanes",
        "speed_time_interaction",
        "curvature_time_interaction",
    ]

    boolean_features = ["holiday", "school_season", "road_signs_present", "public_road"]

    # Normalise dtypes on both frames to avoid np.isnan / pd.NA type issues.
    frames = (X, test)
    for col in categorical_features:
        for frame in frames:
            frame[col] = frame[col].astype("string").fillna("__MISSING__")

    for col in boolean_features:
        for frame in frames:
            frame[col] = frame[col].astype("boolean")

    for col in numerical_features:
        for frame in frames:
            # Unparseable values become NaN and are imputed further down.
            frame[col] = pd.to_numeric(frame[col], errors="coerce")

    # One sub-pipeline per column type.
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ])
    numerical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # with_mean=False keeps the scaler compatible with sparse output
        ("scaler", StandardScaler(with_mean=False)),
    ])

    # Boolean columns are passed through unchanged.
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_features),
            ("num", numerical_transformer, numerical_features),
            ("bool", "passthrough", boolean_features),
        ],
        sparse_threshold=0.3,
    )

    return X, y, test, preprocessor

## Bygge en modell (LGBM-regressor i dette eksempelet)

In [77]:
# ============================================================
# Function: Build LightGBM model
# ============================================================
def build_lgbm_model(preprocessor):
    """Wrap ``preprocessor`` and a configured ``LGBMRegressor`` in one pipeline.

    The pipeline steps are named ``"preprocessor"`` and ``"regressor"``.
    """
    regressor = LGBMRegressor(
        n_estimators=525,
        learning_rate=0.06,
        max_depth=8,
        num_leaves=64,
        subsample=0.8,
        colsample_bytree=0.9,
        reg_lambda=0.6,
        reg_alpha=0.2,
        random_state=42,
        n_jobs=-1,
        verbosity=-1,
    )
    return Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", regressor),
    ])

## skape submission fil

In [78]:
# ============================================================
# Function: Train models and create averaged submission
# ============================================================
def generate_submission(lgbm_model, xgb_model, X, y, test, sample_submission):
    """Fit the LightGBM pipeline on all training data and write a submission CSV.

    Parameters
    ----------
    lgbm_model : Pipeline with steps ``"preprocessor"`` and ``"regressor"``.
    xgb_model : unused; kept so existing callers keep working.
    X, y : training features and target.
    test : test features, transformed with the fitted preprocessor.
    sample_submission : frame copied as the template; its ``accident_risk``
        column is overwritten with the predictions.

    The file is written to
    ``submissions/22_engineered_features_basic_lgbm.csv``; the directory is
    created if it does not exist yet.
    """
    import os

    preprocessor = lgbm_model.named_steps["preprocessor"]
    regressor = lgbm_model.named_steps["regressor"]

    # Fit the preprocessing on train only, then apply it to test.
    X_processed = preprocessor.fit_transform(X)
    test_processed = preprocessor.transform(test)

    # NOTE(review): the eval set is the training data itself, so early
    # stopping monitors training RMSE rather than a held-out score -
    # confirm whether a validation split was intended.
    regressor.fit(
        X_processed,
        y,
        eval_set=[(X_processed, y)],
        eval_metric="rmse",
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=50)
        ]
    )

    # Only the LightGBM model contributes predictions.
    preds_lgbm = regressor.predict(test_processed)

    submission = sample_submission.copy()
    submission["accident_risk"] = preds_lgbm

    # Make sure the target folder exists so to_csv cannot fail on a fresh checkout.
    output_path = "submissions/22_engineered_features_basic_lgbm.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    submission.to_csv(output_path, index=False)

    print("Submission file saved in submissions folder.")
In [79]:
def main():
    """Load the data, build the LightGBM pipeline and write the submission file."""
    # Load datasets
    train, test, sample_submission = load_data()

    # Prepare features and preprocessing
    X, y, test, preprocessor = prepare_features(train, test)

    # Build the LightGBM pipeline (the XGBoost model is currently disabled).
    lgbm_model = build_lgbm_model(preprocessor)
    #xgb_model = build_xgb_model(preprocessor)

    # The xgb_model slot is unused by generate_submission. Pass None explicitly
    # instead of `_`, which is undefined outside IPython and, inside it, is
    # whatever the last cell happened to output.
    generate_submission(lgbm_model, None, X, y, test, sample_submission)

In [26]:
# Run the end-to-end pipeline: load data, fit LightGBM, write the submission CSV.
main()

Features: 22
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.0569068	training's l2: 0.00323838
[100]	training's rmse: 0.0559971	training's l2: 0.00313568
[150]	training's rmse: 0.0558546	training's l2: 0.00311973
[200]	training's rmse: 0.0557457	training's l2: 0.00310759
[250]	training's rmse: 0.0556473	training's l2: 0.00309662
[300]	training's rmse: 0.0555626	training's l2: 0.00308721
[350]	training's rmse: 0.0554857	training's l2: 0.00307866
[400]	training's rmse: 0.0554078	training's l2: 0.00307002
[450]	training's rmse: 0.0553369	training's l2: 0.00306218
[500]	training's rmse: 0.0552726	training's l2: 0.00305506




Submission file saved in submissions folder.


In [43]:
# Reload the raw data; the sample submission is not needed for the grid search.
train, test, _ = load_data()

# Customize these patterns to match your column names if needed
# (the assignment below is a no-op placeholder; edit a copy to experiment).
group_patterns = group_patterns  # or edit a copy

# Cross-validate every combination of feature groups.
results = run_feature_set_grid_search(
    train=train,
    target_col="accident_risk",     # adjust if your target name differs
    group_patterns=group_patterns,
    id_cols=["id"],                 # add any extra ID-like cols to drop
    n_splits=5,
    random_state=42,
    max_groups=None,                # or set to a small int to limit combo size
    save_path="feature_grid_results.csv"
)

results.head(20)



KeyboardInterrupt: 

In [83]:
# Same grid search, but with feature engineering applied per CV fold.
# NOTE(review): the recorded output for this cell shows an UnboundLocalError
# on 'rmse'; presumably it was produced by an earlier version of the
# function - re-run to confirm.
safe_results = run_feature_set_grid_search_fold_safe(
    train=train,
    target_col="accident_risk",
    group_patterns=group_patterns,   # your patterns dict
    id_cols=["id"],
    n_splits=5,
    random_state=42,
    max_groups=None,
    save_path="feature_grid_results_foldsafe.csv",
)

safe_results.head(20)

Starting fold-safe grid over 255 group combinations (5-fold CV).

[1/255] Groups: ['categorical']




UnboundLocalError: cannot access local variable 'rmse' where it is not associated with a value

In [82]:
# Restrict the grid to the raw (non-engineered) column groups: 3 groups -> 7 combinations.
safe_only_patterns = {
    "categorical": [r"^(road_type|lighting|weather|time_of_day)$"],
    "booleans":    [r"^(holiday|school_season|road_signs_present|public_road)$"],
    "base_numeric":[r"^(num_lanes|curvature|speed_limit|num_reported_accidents)$"],
}

# Results go to the function's default save_path since none is passed here.
safe_only_results = run_feature_set_grid_search_fold_safe(
    train=train,
    target_col="accident_risk",
    group_patterns=safe_only_patterns,
    id_cols=["id"],
    n_splits=5,
    random_state=42,
    verbose=True
)
safe_only_results.head()

Starting fold-safe grid over 7 group combinations (5-fold CV).

[1/7] Groups: ['categorical']




UnboundLocalError: cannot access local variable 'rmse' where it is not associated with a value

In [84]:
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


def compute_rmse(y_true, y_pred) -> float:
    """Return the root mean squared error of ``y_pred`` against ``y_true`` as a float."""
    # `mse` is the notebook-level alias for sklearn's mean_squared_error.
    squared_error = mse(y_true, y_pred)
    return float(np.sqrt(squared_error))


def run_feature_set_grid_search_replicate_main(
    train, test, target_col, group_patterns, id_cols=None,
    n_splits=5, random_state=42, max_groups=None,
    save_path="feature_grid_results_replicate_main.csv",
    verbose=True, save_every=0,
):
    """Grid-search feature groups using the same preprocessing and model as ``main()``.

    Features, target and preprocessor come from ``prepare_features``; for each
    group combination the preprocessor is trimmed to the selected columns and
    the ``build_lgbm_model`` pipeline is scored with shuffled K-fold CV (RMSE).

    Parameters
    ----------
    train, test : raw frames passed (as copies) to ``prepare_features``.
    target_col : accepted for signature parity with the other grid-search
        functions; the target actually used is whatever ``prepare_features``
        returns.
    group_patterns : mapping of group name -> list of regex patterns.
    id_cols : extra identifier columns to drop from the features
        (defaults to ``["id"]``).
    n_splits, random_state : K-fold configuration.
    max_groups : optional cap on the number of groups per combination.
    save_path : CSV path for the (partial and final) result table.
    verbose : print progress, per-fold RMSE and a fold-1 mean-prediction baseline.
    save_every : write partial results every N combinations; 0 disables.

    Returns
    -------
    DataFrame sorted by ``cv_rmse_mean``; empty (with the expected columns)
    if no combination selected any column.

    Raises
    ------
    RuntimeError
        If no group combination can be generated.
    """
    id_cols = id_cols or ["id"]

    # Prepare once exactly like main(); the prepared test frame is not needed.
    X, y, _prepared_test, preprocessor = prepare_features(train.copy(), test.copy())
    X = X.drop(columns=[c for c in id_cols if c in X.columns])

    combos = expand_boolean_grid(list(group_patterns.keys()), max_groups=max_groups)
    if not combos:
        raise RuntimeError("No group combinations produced. Check your group_patterns.")
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Explicit columns keep sort_values from failing on an empty record list.
    result_columns = ["groups", "n_features", "cv_rmse_mean", "cv_rmse_std", "fold_rmses"]
    records = []
    total = len(combos)
    if verbose:
        print(f"Starting grid over {total} combos ({n_splits}-fold CV) using your main() preprocessing.")

    for ci, include_groups in enumerate(combos, start=1):
        if verbose:
            print(f"\n[{ci}/{total}] Groups: {include_groups}")

        X_sub = select_columns_by_groups(X, include_groups, group_patterns)
        if X_sub.shape[1] == 0:
            if verbose:
                print("  -> Skipping: no columns selected for this combo.")
            continue

        fold_rmses = []
        n_features_seen = X_sub.shape[1]

        for fold, (tr_idx, va_idx) in enumerate(kf.split(X_sub), start=1):
            X_tr, X_va = X_sub.iloc[tr_idx], X_sub.iloc[va_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

            # A fresh, trimmed clone per fold so no fitted state is shared.
            pp_trim = trim_preprocessor_columns(preprocessor, X_tr.columns)
            model = build_lgbm_model(pp_trim)

            model.fit(X_tr, y_tr)
            pred = model.predict(X_va)
            rmse_val = compute_rmse(y_va, pred)
            fold_rmses.append(rmse_val)

            if verbose:
                if fold == 1:
                    # Baseline: always predict the train-fold mean. np.full
                    # keeps the mean as a float even for an integer target,
                    # where np.full_like would truncate it.
                    null_rmse = compute_rmse(y_va, np.full(len(y_va), y_tr.mean()))
                    print(f"  Fold 1 null baseline RMSE = {null_rmse:.6f}")
                print(f"  Fold {fold}/{n_splits}: RMSE = {rmse_val:.6f}")

        mean_rmse = float(np.mean(fold_rmses))
        std_rmse = float(np.std(fold_rmses))
        if verbose:
            print(f"  -> n_features: {n_features_seen}, "
                  f"cv_rmse_mean: {mean_rmse:.6f}, cv_rmse_std: {std_rmse:.6f}")

        records.append({
            "groups": include_groups,
            "n_features": int(n_features_seen),
            "cv_rmse_mean": mean_rmse,
            "cv_rmse_std": std_rmse,
            "fold_rmses": fold_rmses,
        })

        if save_every and (ci % save_every == 0):
            tmp = (
                pd.DataFrame(records, columns=result_columns)
                .sort_values("cv_rmse_mean")
                .reset_index(drop=True)
            )
            tmp.to_csv(save_path, index=False)
            if verbose:
                print(f"  [autosave] Wrote partial results to {save_path} at combo {ci}/{total}")

    results = (
        pd.DataFrame(records, columns=result_columns)
        .sort_values("cv_rmse_mean")
        .reset_index(drop=True)
    )
    results.to_csv(save_path, index=False)
    if verbose:
        print(f"\nFinished. Saved results to {save_path} with {len(results)} rows.")
    return results


In [None]:
# Grid search that reuses the main() preprocessing and model;
# partial results are autosaved every 10 combinations.
rep_results = run_feature_set_grid_search_replicate_main(
    train=train,
    test=test,
    target_col="accident_risk",
    group_patterns=group_patterns,
    id_cols=["id"],
    n_splits=5,
    random_state=42,
    max_groups=None,
    save_path="feature_grid_results_replicate_main.csv",
    verbose=True,
    save_every=10,
)
rep_results.head(20)


Features: 22
Starting grid over 255 combos (5-fold CV) using your main() preprocessing.

[1/255] Groups: ['categorical']




  Fold 1 null baseline RMSE = 0.166173
  Fold 1/5: RMSE = 0.142244




  Fold 2/5: RMSE = 0.142356




  Fold 3/5: RMSE = 0.142692




  Fold 4/5: RMSE = 0.142181




  Fold 5/5: RMSE = 0.142438
  -> n_features: 4, cv_rmse_mean: 0.142382, cv_rmse_std: 0.000179

[2/255] Groups: ['booleans']




  Fold 1 null baseline RMSE = 0.166173
  Fold 1/5: RMSE = 0.165856




  Fold 2/5: RMSE = 0.166176




  Fold 3/5: RMSE = 0.166567




  Fold 4/5: RMSE = 0.165705




  Fold 5/5: RMSE = 0.166269
  -> n_features: 4, cv_rmse_mean: 0.166114, cv_rmse_std: 0.000306

[3/255] Groups: ['base_numeric']




  Fold 1 null baseline RMSE = 0.166173
  Fold 1/5: RMSE = 0.105991




  Fold 2/5: RMSE = 0.106364




  Fold 3/5: RMSE = 0.106357




  Fold 4/5: RMSE = 0.105707
