In [22]:
import pandas as pd
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [23]:
def corriger_encodage(df):
    # Fonction pour corriger l'encodage d'une chaîne de caractères
    def corriger_chaine(chaine):
        if isinstance(chaine, str):
            try:
                return chaine.encode('latin1').decode('utf-8')
            except UnicodeEncodeError:
                return chaine
        return chaine

    # Corriger les valeurs dans le DataFrame
    for col in df.columns:
        df[col] = df[col].apply(corriger_chaine)

    # Corriger les noms de colonnes
    df.columns = [corriger_chaine(col) for col in df.columns]

    return df

In [24]:
df = corriger_encodage(pd.read_csv('./data/traffic_to_ml.csv', encoding='unicode_escape').drop(columns=["Unnamed: 0"]))

In [25]:
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns

In [26]:
df.columns

Index(['Year', 'Area', 'Date', 'Road', 'Municipality Name', 'County Name',
       'Province Name', 'Fatalties', 'Serious Injuries', 'Light Injuries',
       'Total Victims', 'Units Involved', 'Pedestrains Involved',
       'Bicycles Involved', 'Mopeds Involved', 'Motorcycles',
       'Light Vehicles Involved', 'Heavy Vehicles Involved',
       'Other Units Involved', 'Unspecified Units Involved',
       'Road Speed Limit', 'Accident with Hit and Run', 'Fog Presence',
       'Surrounding Environment', 'Special Lane Presence',
       'Special Traffic Measures', 'Weather Conditions',
       'Special Road Functions', 'Severity of Accident', 'Influence of Fog',
       'Influence of Environment', 'Influence of Traffic',
       'Influence of Weather', 'Influence of Wind Intensity',
       'Influence of Lighting', 'Influence of Special Measures',
       'Influence of Road Objects', 'Influence of Road Surface',
       'Influence of Visibility', 'Intersection Characteristics',
       'Lighting C

# FATALTIES

In [27]:
X_fatalties = df[numerical_cols].drop(['Fatalties', 'Total Victims'], axis=1)  # Drop non-feature columns
y_fatalties = df[numerical_cols]['Fatalties']  #

In [28]:
X_train_fatalties, X_test_fatalties, y_train_fatalties, y_test_fatalties = train_test_split(X_fatalties, y_fatalties, test_size=0.2, random_state=42)  # Notice shuffle=False for time series

In [29]:
scaler = StandardScaler()
X_train_scaled_fatalties = scaler.fit_transform(X_train_fatalties)
X_test_scaled_fatalties = scaler.transform(X_test_fatalties)

In [30]:
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models_fatalties, predictions_fatalties = reg.fit(X_train_scaled_fatalties, X_test_scaled_fatalties, y_train_fatalties, y_test_fatalties)

 98%|█████████▊| 41/42 [12:18<00:04,  4.39s/it] 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 16881, number of used features: 48
[LightGBM] [Info] Start training from score 0.140158


100%|██████████| 42/42 [12:19<00:00, 17.60s/it]


# Serious Injuries

In [31]:
X_serious = df[numerical_cols].drop(['Serious Injuries', 'Total Victims'], axis=1)  # Drop non-feature columns
y_serious = df[numerical_cols]['Serious Injuries']  #

In [32]:
X_train_serious, X_test_serious, y_train_serious, y_test_serious = train_test_split(X_serious, y_serious, test_size=0.2, random_state=42)  # Notice shuffle=False for time series

In [33]:
scaler = StandardScaler()
X_train_scaled_serious = scaler.fit_transform(X_train_serious)
X_test_scaled_serious = scaler.transform(X_test_serious)

In [34]:
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models_serious, predictions_serious = reg.fit(X_train_scaled_serious, X_test_scaled_serious, y_train_serious, y_test_serious)

100%|██████████| 42/42 [12:45<00:00,  7.07s/it] 

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 578
[LightGBM] [Info] Number of data points in the train set: 16881, number of used features: 48
[LightGBM] [Info] Start training from score 0.999171


100%|██████████| 42/42 [12:45<00:00, 18.22s/it]


# Light Injuries

In [35]:
X_light = df[numerical_cols].drop(['Light Injuries', 'Total Victims'], axis=1)  # Drop non-feature columns
y_light  = df[numerical_cols]['Light Injuries']  #

In [36]:
X_train_light, X_test_light, y_train_light, y_test_light = train_test_split(X_light, y_light, test_size=0.2, random_state=42)  # Notice shuffle=False for time series

In [37]:
scaler = StandardScaler()
X_train_scaled_light = scaler.fit_transform(X_train_light)
X_test_scaled_light = scaler.transform(X_test_light)

In [38]:
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models_light, predictions_light = reg.fit(X_train_scaled_light, X_test_scaled_light, y_train_light, y_test_light)

 95%|█████████▌| 40/42 [03:57<00:08,  4.26s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 573
[LightGBM] [Info] Number of data points in the train set: 16881, number of used features: 48
[LightGBM] [Info] Start training from score 0.398910


100%|██████████| 42/42 [03:57<00:00,  5.66s/it]


In [39]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Year

In [40]:
yearly_accidents = df.groupby('Year').size()

# Split the data (assuming you have multiple years, and you leave one year for testing)
train_data = yearly_accidents[:-1]
test_data = yearly_accidents[-1:]

In [41]:
# You may need to find the best parameters (p, d, q) (P, D, Q, s) using grid search or AIC
sarima_model = SARIMAX(train_data, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # these are example parameters
sarima_result = sarima_model.fit()

# Forecast
sarima_forecast = sarima_result.get_forecast(steps=1)  # forecast next year
sarima_forecast.summary_frame()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


y,mean,mean_se,mean_ci_lower,mean_ci_upper
11,1208.0,1000.0,-751.96,3167.96


In [43]:
# This is Holt-Winters method with additive trend and seasonality
hw_model = ExponentialSmoothing(train_data, trend='add', seasonal='add', seasonal_periods=4).fit()  # adjust seasonal_periods based on your data's seasonality

# Forecast
hw_forecast = hw_model.forecast(steps=1)  # forecast next year

  self._init_dates(dates, freq)
  return get_prediction_index(


In [47]:
# For SARIMA
sarima_pred = sarima_result.predict(start=test_data.index[0], end=test_data.index[0])
print('SARIMA prediction:', sarima_pred)
print('Actual:', test_data)

# For Exponential Smoothing
print('Holt-Winters prediction:', hw_forecast)

SARIMA prediction: 2021   39104.08
dtype: float64
Actual: Year
2021    1446
dtype: int64
Holt-Winters prediction: 11   1338.08
dtype: float64


  return get_prediction_index(


# Month

In [48]:
monthly_accidents = df.groupby('Month').size()

# Split the data (assuming you have multiple years, and you leave one year for testing)
train_data = monthly_accidents[:-1]
test_data = monthly_accidents[-1:]

In [49]:
# You may need to find the best parameters (p, d, q) (P, D, Q, s) using grid search or AIC
sarima_model = SARIMAX(train_data, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))  # these are example parameters
sarima_result = sarima_model.fit()

# Forecast
sarima_forecast = sarima_result.get_forecast(steps=1)  # forecast next year
sarima_forecast.summary_frame()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


y,mean,mean_se,mean_ci_lower,mean_ci_upper
11,1736.0,1000.0,-223.96,3695.96


In [52]:
# This is Holt-Winters method with additive trend and seasonality
hw_model = ExponentialSmoothing(train_data, trend='add', seasonal='add', seasonal_periods=2).fit()  # adjust seasonal_periods based on your data's seasonality

# Forecast
hw_forecast = hw_model.forecast(steps=12)  # forecast next year

  self._init_dates(dates, freq)
  return get_prediction_index(


In [53]:
# For SARIMA
sarima_pred = sarima_result.predict(start=test_data.index[0], end=test_data.index[0])
print('SARIMA prediction:', sarima_pred)
print('Actual:', test_data)

# For Exponential Smoothing
print('Holt-Winters prediction:', hw_forecast)

SARIMA prediction: 12   2529.00
dtype: float64
Actual: Month
12    1587
dtype: int64
Holt-Winters prediction: 11   1890.07
12   1929.39
13   1945.19
14   1984.51
15   2000.32
16   2039.63
17   2055.44
18   2094.75
19   2110.56
20   2149.87
21   2165.68
22   2204.99
dtype: float64


  return get_prediction_index(


# Day

In [54]:
day_accidents = df.groupby('Day').size()

# Split the data (assuming you have multiple years, and you leave one year for testing)
train_data = day_accidents[:-1]
test_data = day_accidents[-1:]

In [55]:
# You may need to find the best parameters (p, d, q) (P, D, Q, s) using grid search or AIC
sarima_model = SARIMAX(train_data, order=(1, 1, 1), seasonal_order=(1, 1, 1, 7))  # these are example parameters
sarima_result = sarima_model.fit()

# Forecast
sarima_forecast = sarima_result.get_forecast(steps=1)  # forecast next year
sarima_forecast.summary_frame()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


y,mean,mean_se,mean_ci_lower,mean_ci_upper
30,658.79,27.12,605.63,711.95


In [56]:
# This is Holt-Winters method with additive trend and seasonality
hw_model = ExponentialSmoothing(train_data, trend='add', seasonal='add', seasonal_periods=7).fit()  # adjust seasonal_periods based on your data's seasonality

# Forecast
hw_forecast = hw_model.forecast(steps=30)  # forecast next year

  self._init_dates(dates, freq)
  return get_prediction_index(


In [58]:
# For SARIMA
sarima_pred = sarima_result.predict(start=test_data.index[0], end=test_data.index[0])
print('SARIMA prediction:', sarima_pred)
print('Actual:', test_data)

# For Exponential Smoothing
print('Holt-Winters prediction:', hw_forecast)

SARIMA prediction: 31   665.15
dtype: float64
Actual: Day
31    408
dtype: int64
Holt-Winters prediction: 30   692.96
31   677.05
32   699.44
33   693.56
34   706.83
35   682.55
36   685.52
37   693.24
38   677.33
39   699.71
40   693.84
41   707.11
42   682.83
43   685.80
44   693.52
45   677.60
46   699.99
47   694.11
48   707.39
49   683.11
50   686.07
51   693.79
52   677.88
53   700.26
54   694.39
55   707.66
56   683.38
57   686.35
58   694.07
59   678.15
dtype: float64


  return get_prediction_index(


# RESULT

In [60]:
display(models_fatalties)
display(models_serious)
display(models_light)

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,0.89,0.89,0.13,1.13
XGBRegressor,0.88,0.89,0.13,0.45
LGBMRegressor,0.88,0.88,0.13,0.36
ElasticNetCV,0.88,0.88,0.13,0.44
LarsCV,0.88,0.88,0.13,0.57
LassoLarsCV,0.88,0.88,0.13,0.31
LassoCV,0.88,0.88,0.13,0.43
LassoLarsIC,0.88,0.88,0.13,0.25
OrthogonalMatchingPursuit,0.88,0.88,0.13,0.06
OrthogonalMatchingPursuitCV,0.87,0.88,0.13,0.19


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LGBMRegressor,0.41,0.42,0.37,0.3
HistGradientBoostingRegressor,0.4,0.41,0.38,0.74
XGBRegressor,0.36,0.37,0.39,0.27
RandomForestRegressor,0.34,0.34,0.4,13.25
ExtraTreesRegressor,0.33,0.34,0.4,9.02
SVR,0.33,0.34,0.4,10.51
LassoLarsIC,0.29,0.3,0.41,0.15
BayesianRidge,0.29,0.3,0.41,0.09
ElasticNetCV,0.29,0.3,0.41,1.07
LassoCV,0.29,0.3,0.41,1.33


Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HistGradientBoostingRegressor,0.22,0.22,0.92,0.49
LGBMRegressor,0.21,0.22,0.92,0.52
GradientBoostingRegressor,0.2,0.21,0.93,2.82
XGBRegressor,0.19,0.2,0.93,0.19
RandomForestRegressor,0.19,0.2,0.93,16.4
SVR,0.15,0.16,0.95,19.96
ExtraTreesRegressor,0.15,0.16,0.95,13.65
NuSVR,0.14,0.15,0.96,56.18
LassoLarsIC,0.13,0.14,0.96,0.11
Ridge,0.13,0.14,0.96,0.06
