# Predict the accident group depending the conditons

In [2]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import folium

from pyproj import Proj, transform

from datetime import datetime
from datetime import date, time
from dateutil.parser import parse

In [3]:
# Download
acc_df = pd.read_pickle('acc_df.pkl')

In [18]:
# Import machine learning libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

### 3.1 Feature transformation

In [20]:
# Add is_accident and is_incident features (0: incident, 1: accident)
is_acc = (acc_new.NB_BLESSES_LEGERS > 0)|(acc_new.NB_BLESSES_GRAVES > 0)|(acc_new.NB_BLESSES_GRAVES > 0)
acc_new['is_acc'] = is_acc.astype(int)

In [21]:
# Change data type for time features and add some features
acc_new['DATE_'] = acc_new['DATE_'].apply(lambda d: pd.to_datetime(d))
acc_new['HEURE'] = acc_new['HEURE'].apply(lambda d: pd.to_datetime(d))
acc_new['HOUR'] = [date.hour for date in acc_new['HEURE']]
acc_new.drop('HEURE', axis=1, inplace=True)

In order to create a new feature of `DayTime`, the HOUR featured is aggregated in bins as follows:
- From 22 to 6: Night
- From 6 to 11: Morning
- From 11 to 13: Mid-day
- From 13 to 17: Afternoon
- From 17 to 22: Evening

In [35]:
# Bin by daytime
bins = (-0.1, 5, 10, 13, 17, 22, 24.1)
group_names = ['Night', 'Morning', 'Mid-day', 'Afternoon','Evening', 'Night']
daytime = pd.cut(acc_new.HOUR, bins, labels=group_names)
acc_new['DayTime'] = daytime

In [39]:
# Bin by season
bins = (0, 3, 6, 8, 11, 12.1)
group_names1 = ['Winter', 'Spring', 'Summer', 'Autumn', 'Winter']
season = pd.cut(acc_new.MONTH, bins, labels=group_names1)
acc_new['SEASON'] = season

In [42]:
# Feature selection
features_to_cluster = ['CONDITIONS_LUMINEUSES','CONDITIONS_METEO','ETAT_ROUTE',
                       'GENRE_ROUTE', 'JOUR','MONTH','DayTime', 'SEASON','GROUPE_ACCIDENT']

In [43]:
df = acc_new[features_to_cluster]

In [None]:
acc_new.CONDITIONS_LUMINEUSES.value_counts()

In [None]:
acc_new.CONDITIONS_METEO.value_counts()

In [None]:
acc_new.DayTime.value_counts()

In [None]:
acc_new.DayTime.value_counts()

In [45]:
for feature in features_to_cluster:
    le = LabelEncoder()
    le = le.fit(df[feature])
    df[feature] = le.transform(df[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [48]:
predictors = ['CONDITIONS_LUMINEUSES','CONDITIONS_METEO','ETAT_ROUTE',
                       'GENRE_ROUTE', 'JOUR','MONTH','DayTime', 'SEASON']
predicted = 'GROUPE_ACCIDENT'

X = df[predictors]
y = df[predicted]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [71]:
rfc = RandomForestClassifier(n_estimators=2000)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [72]:
rfc.score(X_train,y_train)

0.58607575287177893

In [73]:
rfc.score(X_test,y_test)

0.299669135024421

In [74]:
rfc.feature_importances_

array([ 0.07480473,  0.0931184 ,  0.06586885,  0.11728261,  0.26696416,
        0.19466715,  0.13781637,  0.04947773])

In [70]:
df.corr()

Unnamed: 0,CONDITIONS_LUMINEUSES,CONDITIONS_METEO,ETAT_ROUTE,GENRE_ROUTE,JOUR,MONTH,DayTime,SEASON,GROUPE_ACCIDENT
CONDITIONS_LUMINEUSES,1.0,0.132046,0.014162,0.014796,0.002971,0.019448,0.186541,0.034377,0.167807
CONDITIONS_METEO,0.132046,1.0,-0.404329,0.00646,0.001113,0.048104,0.051848,0.020762,0.115117
ETAT_ROUTE,0.014162,-0.404329,1.0,0.002157,0.004708,-0.014485,-0.069672,-0.073904,0.001007
GENRE_ROUTE,0.014796,0.00646,0.002157,1.0,-0.004487,0.002523,0.002162,-0.008198,-0.05671
JOUR,0.002971,0.001113,0.004708,-0.004487,1.0,-0.012623,-0.022983,-0.004524,-0.016962
MONTH,0.019448,0.048104,-0.014485,0.002523,-0.012623,1.0,-0.0049,-0.529236,-0.030464
DayTime,0.186541,0.051848,-0.069672,0.002162,-0.022983,-0.0049,1.0,0.017886,0.136257
SEASON,0.034377,0.020762,-0.073904,-0.008198,-0.004524,-0.529236,0.017886,1.0,0.024539
GROUPE_ACCIDENT,0.167807,0.115117,0.001007,-0.05671,-0.016962,-0.030464,0.136257,0.024539,1.0
