In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [79]:
# Read both splits from CSV files located relative to the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv'))

In [80]:
# Preview the raw training frame (bare last expression -> rich display).
train

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents,accident_risk
0,0,urban,2,0.06,35,daylight,rainy,False,True,afternoon,False,True,1,0.13
1,1,urban,4,0.99,35,daylight,clear,True,False,evening,True,True,0,0.35
2,2,rural,4,0.63,70,dim,clear,False,True,morning,True,False,2,0.30
3,3,highway,4,0.07,35,dim,rainy,True,True,morning,False,False,1,0.21
4,4,rural,1,0.58,60,daylight,foggy,False,False,evening,True,False,1,0.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517749,517749,highway,4,0.10,70,daylight,foggy,True,True,afternoon,False,False,2,0.32
517750,517750,rural,4,0.47,35,daylight,rainy,True,True,morning,False,False,1,0.26
517751,517751,urban,4,0.62,25,daylight,foggy,False,False,afternoon,False,True,0,0.19
517752,517752,highway,3,0.63,25,night,clear,True,False,afternoon,True,True,3,0.51


In [81]:
# Preview the raw test frame (bare last expression -> rich display).
test

Unnamed: 0,id,road_type,num_lanes,curvature,speed_limit,lighting,weather,road_signs_present,public_road,time_of_day,holiday,school_season,num_reported_accidents
0,517754,highway,2,0.34,45,night,clear,True,True,afternoon,True,True,1
1,517755,urban,3,0.04,45,dim,foggy,True,False,afternoon,True,False,0
2,517756,urban,2,0.59,35,dim,clear,True,False,afternoon,True,True,1
3,517757,rural,4,0.95,35,daylight,rainy,False,False,afternoon,False,False,2
4,517758,highway,2,0.86,35,daylight,clear,True,False,evening,False,True,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172580,690334,rural,2,0.01,45,dim,rainy,False,False,afternoon,True,True,2
172581,690335,rural,1,0.74,70,daylight,foggy,False,True,afternoon,False,False,2
172582,690336,urban,2,0.14,70,dim,clear,False,False,evening,True,True,1
172583,690337,urban,1,0.09,45,daylight,foggy,True,True,morning,False,True,0


# BASIC EDA


In [82]:
# Column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517754 entries, 0 to 517753
Data columns (total 14 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      517754 non-null  int64  
 1   road_type               517754 non-null  object 
 2   num_lanes               517754 non-null  int64  
 3   curvature               517754 non-null  float64
 4   speed_limit             517754 non-null  int64  
 5   lighting                517754 non-null  object 
 6   weather                 517754 non-null  object 
 7   road_signs_present      517754 non-null  bool   
 8   public_road             517754 non-null  bool   
 9   time_of_day             517754 non-null  object 
 10  holiday                 517754 non-null  bool   
 11  school_season           517754 non-null  bool   
 12  num_reported_accidents  517754 non-null  int64  
 13  accident_risk           517754 non-null  float64
dtypes: bool(4), float64(

In [83]:
# Number of distinct values per column.
train.nunique()

id                        517754
road_type                      3
num_lanes                      4
curvature                    261
speed_limit                    5
lighting                       3
weather                        3
road_signs_present             2
public_road                    2
time_of_day                    3
holiday                        2
school_season                  2
num_reported_accidents         8
accident_risk                 98
dtype: int64

In [84]:
# Count of missing values per column.
train.isna().sum()

id                        0
road_type                 0
num_lanes                 0
curvature                 0
speed_limit               0
lighting                  0
weather                   0
road_signs_present        0
public_road               0
time_of_day               0
holiday                   0
school_season             0
num_reported_accidents    0
accident_risk             0
dtype: int64

In [85]:
# Show the distinct levels of every string-valued column, one line per column.
for categorical_col in ('road_type', 'lighting', 'weather', 'time_of_day'):
    print(train[categorical_col].unique())

['urban' 'rural' 'highway']
['daylight' 'dim' 'night']
['rainy' 'clear' 'foggy']
['afternoon' 'evening' 'morning']


# PREPROCESSING

In [86]:
# One-hot encode the nominal columns using a category vocabulary shared by both
# splits. Encoding each frame independently lets get_dummies derive (and, with
# drop_first=True, drop) levels from whatever values happen to occur in that
# frame, so a level missing from one split would silently change its column set
# or its dropped baseline. Pinning the categories up front keeps the encoded
# train/test columns identical by construction.
categorical_cols = ['road_type', 'lighting', 'weather', 'time_of_day']
for col in categorical_cols:
    # Sorted to reproduce the column order get_dummies gives for string columns.
    levels = sorted(train[col].dropna().unique())
    train[col] = pd.Categorical(train[col], categories=levels)
    # Values not present in train become NaN here and encode as all-zero dummies.
    test[col] = pd.Categorical(test[col], categories=levels)

train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)
test = pd.get_dummies(test, columns=categorical_cols, drop_first=True)

# Fail loudly if the two frames disagree on anything other than the target.
assert list(train.columns.drop('accident_risk')) == list(test.columns), (
    "train/test columns diverged after encoding"
)

In [87]:
# List the column names after one-hot encoding.
train.columns

Index(['id', 'num_lanes', 'curvature', 'speed_limit', 'road_signs_present',
       'public_road', 'holiday', 'school_season', 'num_reported_accidents',
       'accident_risk', 'road_type_rural', 'road_type_urban', 'lighting_dim',
       'lighting_night', 'weather_foggy', 'weather_rainy',
       'time_of_day_evening', 'time_of_day_morning'],
      dtype='object')

In [88]:
# Cast the boolean flags and the one-hot indicator columns to integers.
train_binary_cols = [
    "road_signs_present",
    "public_road",
    "holiday",
    "school_season",
    "road_type_rural",
    "road_type_urban",
    "lighting_dim",
    "lighting_night",
    "weather_foggy",
    "weather_rainy",
    "time_of_day_evening",
    "time_of_day_morning",
]
train[train_binary_cols] = train[train_binary_cols].astype(int)


In [89]:
# Apply the same integer cast to the flag / indicator columns of the test frame.
test_binary_cols = [
    "road_signs_present",
    "public_road",
    "holiday",
    "school_season",
    "road_type_rural",
    "road_type_urban",
    "lighting_dim",
    "lighting_night",
    "weather_foggy",
    "weather_rainy",
    "time_of_day_evening",
    "time_of_day_morning",
]
test[test_binary_cols] = test[test_binary_cols].astype(int)

In [90]:
# Preview the training frame after encoding and integer casting.
train

Unnamed: 0,id,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,accident_risk,road_type_rural,road_type_urban,lighting_dim,lighting_night,weather_foggy,weather_rainy,time_of_day_evening,time_of_day_morning
0,0,2,0.06,35,0,1,0,1,1,0.13,0,1,0,0,0,1,0,0
1,1,4,0.99,35,1,0,1,1,0,0.35,0,1,0,0,0,0,1,0
2,2,4,0.63,70,0,1,1,0,2,0.30,1,0,1,0,0,0,0,1
3,3,4,0.07,35,1,1,0,0,1,0.21,0,0,1,0,0,1,0,1
4,4,1,0.58,60,0,0,1,0,1,0.56,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517749,517749,4,0.10,70,1,1,0,0,2,0.32,0,0,0,0,1,0,0,0
517750,517750,4,0.47,35,1,1,0,0,1,0.26,1,0,0,0,0,1,0,1
517751,517751,4,0.62,25,0,0,0,1,0,0.19,0,1,0,0,1,0,0,0
517752,517752,3,0.63,25,1,0,1,1,3,0.51,0,0,0,1,0,0,0,0


In [91]:
# Target is accident_risk; features are every remaining column except the row id.
y = train['accident_risk']
X = train.drop(columns=['id', 'accident_risk'])

In [92]:
# Preview the test frame after encoding and integer casting.
test

Unnamed: 0,id,num_lanes,curvature,speed_limit,road_signs_present,public_road,holiday,school_season,num_reported_accidents,road_type_rural,road_type_urban,lighting_dim,lighting_night,weather_foggy,weather_rainy,time_of_day_evening,time_of_day_morning
0,517754,2,0.34,45,1,1,1,1,1,0,0,0,1,0,0,0,0
1,517755,3,0.04,45,1,0,1,0,0,0,1,1,0,1,0,0,0
2,517756,2,0.59,35,1,0,1,1,1,0,1,1,0,0,0,0,0
3,517757,4,0.95,35,0,0,0,0,2,1,0,0,0,0,1,0,0
4,517758,2,0.86,35,1,0,0,1,3,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172580,690334,2,0.01,45,0,0,1,1,2,1,0,1,0,0,1,0,0
172581,690335,1,0.74,70,0,1,0,0,2,1,0,0,0,1,0,0,0
172582,690336,2,0.14,70,0,0,1,1,1,0,1,1,0,0,0,1,0
172583,690337,1,0.09,45,1,1,0,1,0,0,1,0,0,1,0,0,1


# MODEL



In [93]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold

# 5-fold shuffled cross-validation of a default LightGBM regressor.
# Each fold's validation split doubles as the early-stopping eval set, so the
# reported RMSE is measured on the same rows that pick the stopping round.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oof_scores = []  # one validation RMSE per fold
for train_idx, val_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = lgb.LGBMRegressor()
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(20)],
    )

    preds = model.predict(X_val)
    err = root_mean_squared_error(y_val, preds)
    print("RMSE for this fold", err)
    oof_scores.append(err)

print(f"Average validation RMSE: {np.mean(oof_scores):.4f}")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007681 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 173
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 16
[LightGBM] [Info] Start training from score 0.352605
Training until validation scores don't improve for 20 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's l2: 0.00318309
RMSE for this fold 0.05641889112483569
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 168
[LightGBM] [Info] Number of data points in the train set: 414203, number of used features: 16
[LightGBM] [Info] Start training from score 0.352104
Training 

In [94]:
# Refit a default-parameter LightGBM regressor on the full training set.
# fit() is left as the cell's last expression so the fitted estimator is displayed.
final_model = lgb.LGBMRegressor()
final_model.fit(X, y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 177
[LightGBM] [Info] Number of data points in the train set: 517754, number of used features: 16
[LightGBM] [Info] Start training from score 0.352377


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [95]:
# Build the test feature matrix by selecting the training feature columns by
# name, so the column set and order match what final_model was fitted on.
# Dropping 'id' and trusting the remaining layout would only line up by
# coincidence; here a column missing from `test` raises KeyError instead of
# yielding silently misaligned predictions.
X_test = test[list(X.columns)]
final_preds = final_model.predict(X_test)


In [96]:
# Number of predictions produced for the test frame.
len(final_preds)

172585

In [97]:
# Pair each test row id with its predicted accident risk.
submission = test[['id']].assign(accident_risk=final_preds)

submission




Unnamed: 0,id,accident_risk
0,517754,0.291644
1,517755,0.125570
2,517756,0.191306
3,517757,0.323335
4,517758,0.413796
...,...,...
172580,690334,0.111008
172581,690335,0.523718
172582,690336,0.250037
172583,690337,0.128142


In [98]:
# Write the submission file; index=False keeps the row index out of the CSV,
# leaving only the id and accident_risk columns.
submission.to_csv('submission.csv', index=False)
# Display the first rows of the frame that was written.
submission.head()

Unnamed: 0,id,accident_risk
0,517754,0.291644
1,517755,0.12557
2,517756,0.191306
3,517757,0.323335
4,517758,0.413796
