In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pickle

In [2]:
# Load the accidents CSV (clean_df.csv) into a DataFrame.
project_accidents_df = pd.read_csv('../data/clean_df.csv')

In [48]:
# Preview the loaded accidents table.
project_accidents_df

Unnamed: 0,date,day_of_week,hour,accident_number,road_category,road_layout,num_lanes,reserved_lane,road_profile,road_shape,...,users_involved,light_conditions,department,urban_area,intersection_type,weather,collision_type,latitude,longitude,region
0,2019-11-30,Saturday,1,201900000001,Major Roads,Multi Lane,10,,Flat,Curved,...,3,Night,93,Outside urban area,No junction,Normal,2-car collision,48.896210,2.470120,Île-de-France
1,2019-11-30,Saturday,2,201900000002,Major Roads,One Way,2,,Slope / Near Slope,Curved,...,1,Night,93,Outside urban area,No junction,Normal,Multi-car collision,48.930700,2.368800,Île-de-France
2,2019-11-28,Thursday,15,201900000003,Major Roads,Multi Lane,8,,Flat,Curved,...,4,Day,92,Outside urban area,No junction,Normal,Multi-car collision,48.935872,2.319174,Île-de-France
3,2019-11-30,Saturday,20,201900000004,Major Roads,Multi Lane,5,,Flat,Straight,...,4,Night,94,Outside urban area,No junction,Normal,Multi-car collision,48.817329,2.428150,Île-de-France
4,2019-11-30,Saturday,4,201900000005,Major Roads,One Way,3,,Flat,Curved,...,3,Night,94,Outside urban area,No junction,Normal,2-car collision,48.776362,2.433254,Île-de-France
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216076,2023-07-06,Thursday,11,202300054813,Secondary Roads,Two Way,2,,Flat,Curved,...,1,Night,66,Outside urban area,No junction,Normal,No collision,42.772286,2.842804,Occitanie
216077,2023-07-03,Monday,17,202300054814,Secondary Roads,Multi Lane,4,,Flat,Straight,...,2,Day,13,Outside urban area,No junction,Normal,Multi-car collision,43.492822,5.227633,Provence-Alpes-Côte d’Azur
216078,2023-07-06,Thursday,11,202300054815,Secondary Roads,Two Way,2,,Flat,Straight,...,1,Day,13,Inside urban area,No junction,Normal,2-car collision,43.450575,5.683472,Provence-Alpes-Côte d’Azur
216079,2023-06-28,Wednesday,12,202300054816,Secondary Roads,Two Way,2,,Slope / Near Slope,Straight,...,3,Day,13,Outside urban area,No junction,Normal,2-car collision,43.620450,5.491925,Provence-Alpes-Côte d’Azur


In [50]:
# Columns kept for modelling: 'region' is used as the predictor and
# 'injury_severity' is the source of the binary target derived further down.
features = ['region', 'injury_severity']

In [51]:
# Draw a reproducible 2,000-row working sample restricted to the modelling
# columns. The column list comes from `features` (rather than a repeated
# literal) so this sample and the full extract always stay in sync.
# The index is renumbered from 0 after sampling.
project_accidents_df_2000 = (
    project_accidents_df[features]
    .sample(n=2000, random_state=42)
    .reset_index(drop=True)
)

# Persist the sample without the index column.
project_accidents_df_2000.to_csv('../data/clean_df_2000.csv', index=False)

In [53]:
# Binary target: 1 where the recorded severity is exactly 'Killed', else 0.
project_accidents_df_2000['is_killed'] = (
    project_accidents_df_2000['injury_severity']
    .eq('Killed')
    .astype(int)
)

In [54]:
# Full-dataset extract limited to the modelling columns, with a fresh 0-based
# index, written out without the index column.
project_accidents_df_all = (
    project_accidents_df
    .loc[:, features]
    .reset_index(drop=True)
)
project_accidents_df_all.to_csv('../data/clean_df_all.csv', index=False)

In [60]:
# Predictor kept as a one-column DataFrame (2-D); target taken as a Series.
X = project_accidents_df_2000.loc[:, ['region']]
y = project_accidents_df_2000.loc[:, 'is_killed']

# Class balance of the target.
y.value_counts()

is_killed
0    1495
1     505
Name: count, dtype: int64

In [61]:
# Check the dimensions of the feature frame.
X.shape

(2000, 1)

In [62]:
# Inspect the container type of the target before it is converted.
type(y)

pandas.core.series.Series

In [63]:
# Convert the target to a NumPy array. It is derived from the source frame
# rather than from `y` itself so the cell can be re-run safely: `y = y.values`
# fails on a second execution because an ndarray has no `.values` attribute.
y = project_accidents_df_2000['is_killed'].values

In [64]:
# Verify the length and container type of the target.
len(y), type(y)

(2000, numpy.ndarray)

In [65]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [66]:
# Compare the shapes of the train and test partitions.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1600, 1), (400, 1), (1600,), (400,))

In [67]:
# Preview the training features.
X_train

Unnamed: 0,region
1688,Occitanie
670,Auvergne-Rhône-Alpes
1750,Bretagne
1683,Île-de-France
543,Île-de-France
...,...
736,Bourgogne-Franche-Comté
43,Pays de la Loire
1012,Provence-Alpes-Côte d’Azur
11,Provence-Alpes-Côte d’Azur


In [68]:
# One-hot encode the categorical input, then fit a logistic-regression
# classifier on the encoded columns.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see during fit instead of raising at predict time. This matters
# because the pipeline is trained on a 2,000-row sample and later pickled for
# reuse.
pipe = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('logreg', LogisticRegression())
])

In [69]:
# Display the pipeline object.
pipe

In [70]:
# Fit the encoder and the classifier on the training partition.
pipe.fit(X_train, y_train)

In [71]:
# Fixed seed so the spot-check rows are the same on every run.
sample = X_test.sample(10, random_state=42)

# y_test is a plain NumPy array aligned with X_test by position, so translate
# the sampled index labels into positions inside X_test. This keeps y_sample in
# the same row order as `sample`. A boolean `isin` mask would instead return
# the labels in X_test order and pair them with the wrong displayed rows.
sample_positions = X_test.index.get_indexer(sample.index)
y_sample = y_test[sample_positions]

In [72]:
# Show the sampled feature rows followed by the labels selected for them.
display(sample)
display(y_sample)

Unnamed: 0,region
943,Île-de-France
1292,Île-de-France
315,Bretagne
1554,Île-de-France
1487,Hauts-de-France
1983,Nouvelle-Aquitaine
1635,Bretagne
1159,Île-de-France
1977,Île-de-France
907,Bourgogne-Franche-Comté


array([0, 0, 1, 0, 0, 0, 0, 0, 0, 1])

In [73]:
# Predict classes for the sampled rows.
pipe.predict(sample)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [74]:
def from_prediction_to_accident_severity(prediction):
    """Translate a binary model output into a human-readable severity label.

    Args:
        prediction: Predicted class; a value equal to 1 denotes a fatality.

    Returns:
        'Fatality (Killed)' when ``prediction == 1``, otherwise
        'Non-Fatality / Minor'.
    """
    return 'Fatality (Killed)' if prediction == 1 else 'Non-Fatality / Minor'


In [75]:
# Re-display the sampled rows.
sample

Unnamed: 0,region
943,Île-de-France
1292,Île-de-France
315,Bretagne
1554,Île-de-France
1487,Hauts-de-France
1983,Nouvelle-Aquitaine
1635,Bretagne
1159,Île-de-France
1977,Île-de-France
907,Bourgogne-Franche-Comté


In [76]:
# Single-row input frame with the same 'region' column the pipeline was fit on.
X_new = pd.DataFrame({'region': ['Occitanie']})

In [77]:
# Show the single-row input frame.
X_new

Unnamed: 0,region
0,Occitanie


In [78]:
# Predicted class for the single new row (first element of the result array).
pipe.predict(X_new)[0]

0

In [79]:
# Convert the predicted class for the new row into its text label.
from_prediction_to_accident_severity(pipe.predict(X_new)[0])

'Non-Fatality / Minor'

In [81]:
# Serialise the pipeline object to disk in binary mode.
with open('../models/project_accidents_model.pkl', mode='wb') as model_file:
    pickle.dump(pipe, model_file)

In [80]:
# Reload the pipeline from the path the dump cell writes to.
# Only unpickle files you trust: pickle can execute arbitrary code on load.
with open('../models/project_accidents_model.pkl', mode='rb') as model_file:
    model = pickle.load(model_file)

In [82]:
# Predict with the reloaded model on the same single-row input.
model.predict(X_new)[0]

0

In [83]:
# Re-display the single-row input frame.
X_new

Unnamed: 0,region
0,Occitanie


In [84]:
# Display the reloaded model object.
model

In [85]:
# List the columns of the full-dataset extract.
project_accidents_df_all.columns

Index(['region', 'injury_severity'], dtype='object')

In [86]:
# Enumerate the distinct region values present in the full extract.
project_accidents_df_all['region'].unique()

array(['Île-de-France', 'Nouvelle-Aquitaine', 'Auvergne-Rhône-Alpes',
       'Occitanie', 'Provence-Alpes-Côte d’Azur', 'Hauts-de-France',
       'Normandie', 'Pays de la Loire', 'Bourgogne-Franche-Comté',
       'Grand Est', 'Centre-Val de Loire', 'Bretagne'], dtype=object)