# Vi bruker en enkel Random Forest for å få en base-case som vi kan sammenligne med

## Del 1
Vi importerer relevante biblioteker

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score



Vi lager en funksjon for å hente ut datafilene som vi kan trenge


In [2]:
def load_data(data_dir="input"):
    """Read the training and test datasets from CSV files.

    Args:
        data_dir: Directory that contains ``train.csv`` and ``test.csv``.
            Defaults to ``"input"``, the location that used to be hard-coded,
            so existing ``load_data()`` calls behave as before.

    Returns:
        A ``(train, test)`` tuple of pandas DataFrames, in that order.
    """
    # Only two files are read; no sample-submission file is loaded here.
    train = pd.read_csv(f"{data_dir}/train.csv")
    test = pd.read_csv(f"{data_dir}/test.csv")

    return train, test

# Load the data once. Only the training frame is used in the cells below,
# so the test frame is discarded.
train, _ = load_data()
# Sanity check: uncomment to inspect the first five rows after loading.
# print(train.head())

Vi gjør en veldig enkel preprosessering

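- dropper target (`accident_risk`) fra X
- one-hot-enkoder string-kategoriene
- gjør boolean-kategoriene om til int
- sender de numeriske kolonnene uendret videre (passthrough) – StandardScaler brukes ikke, siden en random forest ikke er avhengig av skalering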

In [3]:

# Feature groups; each list selects columns for the transformer below.
categorical_cols = ["road_type", "lighting", "weather", "time_of_day"]
boolean_cols = ["road_signs_present", "public_road", "holiday", "school_season"]
numeric_cols = ["num_lanes", "curvature", "speed_limit", "num_reported_accidents"]

# Target vector: the accident_risk column.
y = train["accident_risk"]
# Feature frame: everything except the target, with the boolean flags cast to int.
X = train.drop(columns=["accident_risk"]).astype({col: int for col in boolean_cols})

# One-hot encode the categorical columns (handle_unknown="ignore" avoids errors on
# categories not seen during fit); numeric and boolean columns are passed through
# without any scaling.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols + boolean_cols),
    ]
)

# Complete model: preprocessing followed by a seeded random-forest regressor.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("rf", RandomForestRegressor(random_state=42)),
    ]
)



In [4]:


# Keep 20% of the rows aside as a hold-out set; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the remaining rows, then predict the held-out ones.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Hold-out RMSE: square root of the mean squared error on the held-out rows.
rmse_holdout = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
print(f"Hold-out RMSE: {rmse_holdout:.5f}")

# 5-fold cross-validated RMSE on the full X/y using the built-in scorer.
# The scorer reports the NEGATIVE RMSE (higher-is-better convention),
# so the sign is flipped afterwards.
cv_neg_rmse = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
)
rmse_cv = np.negative(cv_neg_rmse)
print(f"CV RMSE (per fold): {rmse_cv}")
print(f"CV RMSE (mean ± std): {rmse_cv.mean():.5f} ± {rmse_cv.std():.5f}")

Hold-out RMSE: 0.05941
CV RMSE (per fold): [0.0595928  0.05900487 0.05946114 0.05933419 0.05924238]
CV RMSE (mean ± std): 0.05933 ± 0.00020


In [7]:
# Example: one hand-written observation with a value for every feature group column.
single_input = dict(
    road_type="rural",
    lighting="daylight",
    weather="clear",
    time_of_day="night",
    num_lanes=2,
    curvature=0,
    speed_limit=60,
    num_reported_accidents=0,
    road_signs_present=True,
    public_road=True,
    holiday=False,
    school_season=False,
)

# One-row DataFrame, with the boolean flags cast to int exactly as was done
# for the training features (the pipeline itself handles the categoricals).
single_input_df = pd.DataFrame([single_input]).astype(
    {col: int for col in boolean_cols}
)

# Run the row through the fitted pipeline to get the predicted accident risk.
single_input_prediction = model.predict(single_input_df)

print(f"Predicted accident risk: {single_input_prediction[0]:.5f}")

Predicted accident risk: 0.21485
