In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, cross_val_score ,train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import  mean_squared_error

import xgboost as xgb

## Data Loading

In [2]:
TRAIN_PATH = 'dataset/train.csv'
TEST_PATH = 'dataset/train.csv'

assert os.path.exists(TRAIN_PATH) , f"This path {TRAIN_Path} doesnt exist. Make sure it exists"
assert os.path.exists(TEST_PATH) , f"This path {TEST_PATH} doesnt exist. Make sure it exists"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Feature and column

In [3]:
TARGET = 'accident_risk'
ID_COL = 'id'

FEATURES_COLS = [c for c in train.columns]
DROP_COLS = ["road_type","time_of_day","id"]
COLS = [c for c in FEATURES_COLS if c not in DROP_COLS ]
print(COLS)

# Set categorical and numeric cols

CATEGORICAL_COLS = []
NUMERIQUES_COLS = []

for c in COLS:
    if c == ID_COL:
        continue
    if pd.api.types.is_numeric_dtype(train[c]):
        NUMERIQUES_COLS.append(c)
    else:
        CATEGORICAL_COLS.append(c)

print(f"Numeric Columns",NUMERIQUES_COLS)
print(f"Categorical Columns",CATEGORICAL_COLS)

['num_lanes', 'curvature', 'speed_limit', 'lighting', 'weather', 'road_signs_present', 'public_road', 'holiday', 'school_season', 'num_reported_accidents', 'accident_risk']
Numeric Columns ['num_lanes', 'curvature', 'speed_limit', 'road_signs_present', 'public_road', 'holiday', 'school_season', 'num_reported_accidents', 'accident_risk']
Categorical Columns ['lighting', 'weather']


# Pipeline

In [32]:
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler',StandardScaler())
])

categoric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore',sparse_output= False))
])

preprocess = ColumnTransformer(
   transformers = [
       ('num',numeric_transformer,NUMERIQUES_COLS),
       ('cat',categoric_transformer,CATEGORICAL_COLS)
   ]
)

model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=600,
    learning_rate=0.08,
    random_state=42,
)

pipeline = Pipeline(steps=[
    ('preprocess',preprocess),
    ('model',model)
]
)

X = train[COLS].drop(columns=[DROP_COLS],errors='ignore')
y = train[TARGET]

le = LabelEncoder()

for col in COLS:
    X[col] = le.fit_transform(X[col])

In [33]:
X.head()

Unnamed: 0,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,holiday,school_season,num_reported_accidents,accident_risk
0,1,7,1,0,2,0,1,0,1,1,13
1,3,259,1,0,0,1,0,1,1,0,35
2,3,218,4,1,0,0,1,1,0,2,30
3,3,8,1,1,2,1,1,0,0,1,21
4,0,209,3,0,1,0,0,1,0,1,56


# Split data  in train and test 

In [45]:
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.01)

# Training

In [46]:
pipeline.fit(X_train,y_train)

y_predict = pipeline.predict(X_test)


# Predicting

In [47]:
y_predict = pipeline.predict(X_test)

mse = mean_squared_error(y_test,y_predict)
print(mse)

5.57879699293683e-12
