# LGBM regressor
LGM regressor scorer ofte bra, jeg har valgt å bruke denne for å se hva vi kan få ut av datasettet uten større mengde feature engineering.

## importere bibloteker

In [65]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error as mse
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from lightgbm import LGBMRegressor, early_stopping, log_evaluation


## for å hente ut data

In [66]:
def load_data():
    # Read training, test, and sample submission datasets
    train = pd.read_csv("input/train.csv")
    test = pd.read_csv("input/test.csv")
    sample_submission = pd.read_csv("input/sample_submission.csv")

    # Return all three datasets
    return train, test, sample_submission


## Evt. feature engineering

## preparere features for bruk i modell

In [68]:
def prepare_features(train, test):


    # Separate features and target variable
    X = train.drop(columns=["accident_risk", "id"])
    y = train["accident_risk"]

    print(f"Features: {X.shape[1]}")
    # Define categorical feature names
    categorical_features = [
        "road_type",
        "lighting",
        "weather",
        "time_of_day"
    ]

    # Define numerical feature names
    numerical_features = [
        "num_lanes",
        "curvature",
        "speed_limit",
        "num_reported_accidents",
        # "risk_density", #forslag til engineerd features
        #"dangerous_conditions_index" #forslag til engineerd features
    ]

    boolean_features = [
        "holiday",
        "school_season",
        "road_signs_present",
        "public_road"
    ]

    for col in categorical_features:
        X[col] = X[col].astype("string")
        test[col] = test[col].astype("string")

    for col in boolean_features:
        X[col] = X[col].astype("boolean")
        test[col] = test[col].astype("boolean")

    # One-hot encode categorical features
    categorical_transformer = OneHotEncoder(handle_unknown="ignore")

    # Standardize numerical features
    numerical_transformer = StandardScaler()

    # Combine transformations into a preprocessing pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categorical_features),
            ("num", numerical_transformer, numerical_features )
        ]
    )

    # Return features, target, test set, and preprocessor
    return X, y, test, preprocessor

## bygge en modell (LGBM regressor i dette eksempelet

In [69]:
# ============================================================
# Function: Build LightGBM model
# ============================================================
def build_lgbm_model(preprocessor):
    # LightGBM hyperparameters
    params = {
        "n_estimators": 525,
        "learning_rate": 0.06,
        "max_depth": 8,
        "num_leaves": 64,
        "subsample": 0.8,
        "colsample_bytree": 0.9,
        "reg_lambda": 0.6,
        "reg_alpha": 0.2
    }

    # Create pipeline with preprocessing and LightGBM
    model = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("regressor", LGBMRegressor(
            **params,
            random_state=42,
            n_jobs=-1,
            verbosity=-1
        ))
    ])
    return model

## skape submission fil

In [70]:
# ============================================================
# Function: Train models and create averaged submission
# ============================================================
def generate_submission(lgbm_model, xgb_model, X, y, test, sample_submission):
    # Preprocess features
    X_processed = lgbm_model.named_steps["preprocessor"].fit_transform(X)
    test_processed = lgbm_model.named_steps["preprocessor"].transform(test)

    # Train LightGBM
    lgbm_model.named_steps["regressor"].fit(
        X_processed,
        y,
        eval_set=[(X_processed, y)],
        eval_metric="rmse",
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=50)
        ]
    )

    # # Train XGBoost
    # xgb_model.named_steps["preprocessor"].fit(X, y)
    # xgb_model.named_steps["regressor"].fit(
    #     xgb_model.named_steps["preprocessor"].transform(X),
    #     y,
    #     eval_set=[(xgb_model.named_steps["preprocessor"].transform(X), y)],
    #     verbose=False
    # )

    # Generate predictions from both models
    preds_lgbm = lgbm_model.named_steps["regressor"].predict(test_processed)
    # preds_xgb = xgb_model.predict(test)

    # Average predictions
    # final_predictions = (preds_lgbm + preds_xgb) / 2

    # Prepare submission file
    submission = sample_submission.copy()
    # submission["accident_risk"] = final_predictions
    submission["accident_risk"] = preds_lgbm

    # Save CSV for Kaggle submission
    submission.to_csv("submissions/clean_lgbm.csv", index=False)


    print("Submission file saved in submissions folder.")

In [71]:
def main():
    # Load datasets
    train, test, sample_submission = load_data()

    # Prepare features and preprocessing
    X, y, test, preprocessor = prepare_features(train, test)

    # Build both models
    lgbm_model = build_lgbm_model(preprocessor)
    #xgb_model = build_xgb_model(preprocessor)

    # Generate final averaged submission
    generate_submission(lgbm_model, _, X, y, test, sample_submission)

In [72]:
main()

Features: 12
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.0570194	training's l2: 0.00325122
[100]	training's rmse: 0.0560842	training's l2: 0.00314544
[150]	training's rmse: 0.0559765	training's l2: 0.00313337
[200]	training's rmse: 0.055916	training's l2: 0.0031266
[250]	training's rmse: 0.0558535	training's l2: 0.00311961
[300]	training's rmse: 0.0558079	training's l2: 0.00311452
[350]	training's rmse: 0.0557615	training's l2: 0.00310935
[400]	training's rmse: 0.0557197	training's l2: 0.00310469
[450]	training's rmse: 0.0556819	training's l2: 0.00310048
[500]	training's rmse: 0.0556454	training's l2: 0.00309641




Submission file saved in submissions folder.
