# Installs & imports

In [None]:
# Dataset download: accident records (Excel) and municipality borders (GeoJSON).
# -N makes wget skip the download when the local copy is already up to date.
# NOTE(review): the first URL ends in "?raw=true" and the loading cell reads
# '/content/TF_ACCIDENTS_2019.xlsx?raw=true', so the saved file name presumably
# keeps that query string — confirm before renaming either side.
!wget -N https://github.com/Joffreybvn/road-accident-belgium-analysis/blob/master/datasets/raw/TF_ACCIDENTS_2019.xlsx?raw=true
!wget -N https://raw.githubusercontent.com/Joffreybvn/road-accident-belgium-analysis/master/datasets/raw/BELGIUM-Municipalities.geojson

## Packages install

In [None]:
# Install the latest Seaborn version (the analysis below calls sns.displot).
!pip install seaborn --upgrade

In [None]:
# Geopandas, geoplot and folium install

%%time

try:
  import geopandas as gpd
  import geoplot as gplt
  import folium

except:
  !pip install folium
  !pip install git+git://github.com/geopandas/geopandas.git
  !apt install proj-bin libproj-dev libgeos-dev
  !pip install git+git://github.com/ResidentMario/geoplot.git

## Packages imports

In [None]:
# Display the result of every top-level expression in a cell,
# not only the last one (IPython's default is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import geopandas as gpd
import geoplot as gplt
#import folium
#from scipy import stats

from datetime import datetime
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

%matplotlib inline

## Useful functions

In [None]:
def movecol(df, cols_to_move=None, ref_col='', place='After'):
    """Return a view of ``df`` with some columns repositioned next to ``ref_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered. It is not modified.
    cols_to_move : str or iterable of column labels, optional
        Column(s) to move, in the order they should appear. ``None`` (the
        default) moves nothing.
    ref_col : column label
        Existing column used as the anchor.
    place : {'After', 'Before'}
        Whether the moved columns go right after or right before ``ref_col``.

    Returns
    -------
    pandas.DataFrame
        ``df`` indexed with the reordered column list.

    Raises
    ------
    ValueError
        If ``place`` is not 'After' or 'Before', or if ``ref_col`` is not a
        column of ``df``.
    """
    if place not in ('After', 'Before'):
        # Previously an unknown value fell through both branches and failed
        # later with an unrelated UnboundLocalError.
        raise ValueError(f"place must be 'After' or 'Before', got {place!r}")

    # Normalise the selection to a fresh list so that tuples, Index objects
    # and a single label all work, and the caller's object is never shared.
    if cols_to_move is None:
        moved = []
    elif isinstance(cols_to_move, str):
        moved = [cols_to_move]
    else:
        moved = list(cols_to_move)

    cols = df.columns.tolist()
    ref_pos = cols.index(ref_col)  # raises ValueError when ref_col is absent

    if place == 'After':
        seg1 = cols[:ref_pos + 1]
        seg2 = moved
    else:
        seg1 = cols[:ref_pos]
        seg2 = moved + [ref_col]

    # Drop moved columns from the leading segment, then append whatever is left.
    seg1 = [col for col in seg1 if col not in seg2]
    seg3 = [col for col in cols if col not in seg1 + seg2]

    return df[seg1 + seg2 + seg3]

## Datasets creation

In [None]:
# Load the accidents workbook fetched by the download cell.
# NOTE(review): the absolute '/content/...' path presumably targets Google Colab,
# and the '?raw=true' suffix mirrors the download URL — confirm before reuse elsewhere.
df = pd.read_excel('/content/TF_ACCIDENTS_2019.xlsx?raw=true')

# Per variable cleaning

In [None]:
# Non-null count per column, to spot which columns have missing values.
df.count()

## Light condition

In [None]:
# Summary statistics of the 'light_condition' column.
df['light_condition'].describe()

1600 entries have no light condition ('Non disponible'); drop them.

In [None]:
# Number of rows for each distinct light condition.
df['light_condition'].value_counts()

In [None]:
# Keep only the rows whose light condition is not the 'Non disponible' placeholder.
df = df[df['light_condition'] != 'Non disponible']

Create a "light intensity index" variable, thanks to: https://fr.wikipedia.org/wiki/Lumi%C3%A8re_du_jour

From most light to least light:
*   Plein jour
*   Aube, crépuscule
*   Nuit, éclairage public
*   Nuit, sans éclairage public



In [None]:
def light_condition_to_index(string):
  """Translate a light-condition label into a light index.

  Returns 3 for 'Plein jour', 2 for 'Aube, crépuscule', 1 for
  'Nuit, éclairage public allumé' and 0 for any other value.
  """
  # Known labels paired with their index, from most to least light.
  ranked_labels = (
    ('Plein jour', 3),
    ('Aube, crépuscule', 2),
    ('Nuit, éclairage public allumé', 1),
  )

  for label, index in ranked_labels:
    if string == label:
      return index

  # Everything else falls into the lowest index.
  return 0

In [None]:
# Translate each 'light_condition' label into its light index.
# Mapping the single column avoids a row-wise apply over the whole frame.
df_light_index = df['light_condition'].map(light_condition_to_index).rename('light_index')

# Attach it as the 'light_index' column. assign() replaces an existing column of
# the same name, so re-running this cell does not create a duplicate column
# (the previous concat + rename of column 0 did).
df = df.assign(light_index=df_light_index)

In [None]:
# Number of rows for each light index value.
df['light_index'].value_counts()

## Road type

In [None]:
# Summary statistics of the 'road_type' column.
df['road_type'].describe()

Some road types are unknown ('Inconnu'); drop those rows.

In [None]:
# Number of rows for each distinct road type.
df['road_type'].value_counts()

In [None]:
# Keep only the rows whose road type is not the 'Inconnu' placeholder.
df = df[df['road_type'] != 'Inconnu']

## Incident place

In [None]:
# Summary statistics of the 'incident_place' column.
df['incident_place'].describe()

Drop the 'Non disponible' entries

In [None]:
# Number of rows for each distinct incident place.
df['incident_place'].value_counts()

In [None]:
# Keep only the rows whose incident place is not the 'Non disponible' placeholder.
df = df[df['incident_place'] != 'Non disponible']

## Max velocity

Based on road type and incident place, create a "max velocity" variable

In [None]:
def to_max_velocity(road_type, incident_place):
  """Pick a maximum velocity from the road type and the incident place.

  Returns 120 when road_type is 'Autoroute'; otherwise 50 when
  incident_place is 'En agglomération', and 90 in every other case.
  """
  # The road type takes precedence over the incident place.
  on_motorway = road_type == 'Autoroute'
  if on_motorway:
    return 120

  in_built_up_area = incident_place == 'En agglomération'
  return 50 if in_built_up_area else 90

In [None]:
# Create a "max_velocity" series based on "road_type" and "incident_place".
# Iterating over the two columns avoids a row-wise apply over the whole frame,
# and also yields a proper (empty) series when the frame has no rows.
max_velocity = pd.Series(
    [
        to_max_velocity(road_type, incident_place)
        for road_type, incident_place in zip(df['road_type'], df['incident_place'])
    ],
    index=df.index,
    name='max_velocity',
)

# Attach it as the 'max_velocity' column. assign() replaces an existing column of
# the same name, so re-running this cell does not create a duplicate column
# (the previous concat + rename of column 0 did).
df = df.assign(max_velocity=max_velocity)

In [None]:
# Preview the first rows, including the newly added columns.
df.head(3)

## Collision type

In [None]:
# Number of rows for each distinct collision type.
df['collision_type'].value_counts()

Collision type has a lot of missing values. However, it is only an informative variable: we won't use it in any computation, so we keep the rows where it is missing.

In [None]:
# Summary statistics of the 'collision_type' column.
df['collision_type'].describe()

## Province

It seems that the entries with null/NaN values come from the Brussels municipalities: the Brussels region has no province.

In [None]:
# Rows that contain at least one null value in any column.
missing = df[df.isnull().any(axis=1)]
missing.head(2)

In [None]:
# Which municipalities the rows with null values belong to.
missing['municipality_name'].value_counts()

Copy the region code and name into the province columns for the Brussels municipalities:

In [None]:
# Fill the missing province code/name with fixed values for the Brussels region.
# The fill is done on the frame itself and re-assigned: an in-place fillna on
# df['col'] is chained assignment, which warns in recent pandas and silently
# leaves df unchanged under Copy-on-Write.
df = df.fillna({
    'province_ins': 4000,
    'province_name': "Région de Bruxelles-Capitale",
})

In [None]:
# Non-null count per column after cleaning.
df.count()

# Data analysis

## Univariate: Death - The target

*Dead total 30 days* variable's distribution

In [None]:
# Monthly totals of the numeric columns.
# Only numeric columns are summed: adding up text columns (names, labels) is
# meaningless and, depending on the pandas version, either concatenates the
# strings or raises.
# NOTE(review): resample() requires a DatetimeIndex, and no visible cell sets
# one on df — confirm that the date column is set as index before this point.
by_month = df.select_dtypes(include='number').resample('M').sum()
by_month.head()

In [None]:
# One bar per month with the summed 'dead_total_30_days' values.
fig, ax = plt.subplots(figsize = (20,12))
# barplot returns the Axes it drew on (the same `ax`), so its result must not
# be stored in `fig`, which would shadow the Figure created above.
sns.barplot(x=by_month.index, y="dead_total_30_days", data=by_month, ax=ax)

# Build the tick labels from the plotted index itself, so there is exactly one
# label per bar, in bar order.
x_dates = by_month.index.strftime('%m-%B')
ax.set_xticklabels(labels=x_dates, rotation=45, ha='right')

In [None]:
# Compute the mean, median and mode of the 'dead_total_30_days' column.
mean = df['dead_total_30_days'].mean()
median = df['dead_total_30_days'].median()
# mode() returns every most-frequent value; keep the first one.
mode = df['dead_total_30_days'].mode()[0]

In [None]:
# Show the graph.
# displot is a figure-level function: it always creates its own figure, so the
# size is set through height/aspect (10 x 2 -> 20x10 inches) and the labels are
# applied to the returned grid. Calling plt.figure()/plt.xlabel() beforehand
# only produced a separate, empty figure.
# NOTE(review): the statistics printed below are computed on
# 'dead_total_30_days' while the plot uses 'dead_injured' — confirm which
# column is intended.
g = sns.displot(data=df, x="dead_injured", kde=True, kde_kws={'bw_adjust': 1}, discrete=True,
                height=10, aspect=2)

# Plain (non-scientific) tick labels on the x axis
g.ax.ticklabel_format(style='plain', axis='x')

# Label the axis
g.set_axis_labels('Total death', 'Incidents')

# Optional reference lines; the legend is only meaningful once they are enabled.
#g.ax.axvline(mean, color='m', linewidth=3,label='Mean')
#g.ax.axvline(median, color='b', linestyle='dashed', linewidth=3,label='Median')
#g.ax.axvline(mode, color='g', linestyle='dashed', linewidth=3,label='Mode')
#g.ax.legend()

print(f"Mean: {round(mean)} - Median: {round(median)} - Mode: {round(mode)}")

# g.savefig('quality.svg', dpi=300, bbox_inches='tight')
