In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from chart_studio import plotly
from plotly.offline import init_notebook_mode,iplot
import plotly.graph_objs as go
from plotly import tools
import string,os,_random
from plotly.offline import init_notebook_mode, iplot
import calendar
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Palette of hex colour strings (written without a leading '#').
# NOTE(review): not referenced by any cell visible in this part of the notebook — confirm it is still needed.
globalcolors = ['092a35', 'a2738c', '645c84', '427996', '658525', '404b69', '0f4471', '0f4471', '0f4471', '0f4471']
# Initialise plotly's offline notebook mode before any iplot() call below.
init_notebook_mode(connected=True)
# ASCII punctuation characters from the standard `string` module.
punc = string.punctuation

In [None]:
# Load the raw input tables. File names are relative to the notebook's
# working directory.
# characteristics.csv is read as latin-1; the other files use the default encoding.
df1 = pd.read_csv(
    'characteristics.csv',
    encoding='latin-1',
    low_memory=False,
)
df2 = pd.read_csv('vehicles.csv', low_memory=False)
df3 = pd.read_csv('places.csv', low_memory=False)
df4 = pd.read_csv('users.csv', low_memory=False)
# NOTE(review): df5 (holidays) is loaded but not merged or used in the cells
# visible here — confirm whether a later section needs it.
df5 = pd.read_csv('holidays.csv', low_memory=False)

In [None]:
from functools import reduce

# Join characteristics, vehicles, places and users on the accident id.
# DataFrame.merge defaults to an inner join, so only Num_Acc values present
# in all four tables survive.
# NOTE(review): if vehicles.csv and users.csv each hold several rows per
# Num_Acc, joining on Num_Acc alone pairs every vehicle row with every user
# row of the same accident, so row counts below would not be accident counts —
# confirm this is intended.
accidents = (
    df1
    .merge(df2, on="Num_Acc")
    .merge(df3, on="Num_Acc")
    .merge(df4, on="Num_Acc")
)

In [None]:
# Report the size of the merged frame.
n_rows, n_cols = accidents.shape
print("Rows: ", n_rows, "Columns: ", n_cols)

In [None]:
# Preview the first rows of the merged frame.
accidents.head()

In [None]:
# List the column names of the merged frame.
accidents.columns.values

In [None]:
def missing_data(data):
    """Summarise missing values per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with two columns:
        'Total NaN Values' (count of nulls) and 'Percentage of NaN Values'
        (nulls as a percentage of the row count). Each column is sorted in
        descending order before being combined.
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask equals the number of rows in every column.
    null_share = null_counts / null_mask.count() * 100

    sorted_counts = null_counts.sort_values(ascending=False)
    sorted_share = null_share.sort_values(ascending=False)
    return pd.concat(
        [sorted_counts, sorted_share],
        axis=1,
        keys=['Total NaN Values', 'Percentage of NaN Values'],
    )

# Show the NaN count and percentage for every column of the merged frame.
missing_data(accidents)

In [None]:
# Remove the columns selected for dropping after the missing-value report above.
# errors='ignore' keeps this cell re-runnable: `accidents` is rebound in place,
# so on a second execution these columns are already gone and a plain drop()
# would raise KeyError.
accidents = accidents.drop(
    columns=['v2', 'v1', 'long', 'lat', 'pr1', 'pr', 'gps'],
    errors='ignore',
)
missing_data(accidents)

# 1. Exploration based on date of accidents

Is the number of Accidents per year decreasing? (from 2005 to 2016)

In [None]:
def create_stack_bar_data(col, df):
    """Return the distinct values of ``df[col]`` and how often each occurs.

    Parameters
    ----------
    col : str
        Column to tabulate.
    df : pandas.DataFrame
        Frame containing ``col``.

    Returns
    -------
    tuple[list, list]
        ``(values, counts)``, both ordered by ascending value. Nulls are not
        counted (``value_counts`` default).
    """
    counts_by_value = df[col].value_counts().sort_index()
    return counts_by_value.index.tolist(), counts_by_value.tolist()
# Bar chart: number of rows per year.
x1, y1 = create_stack_bar_data('an', accidents)
# 'an' stores the year as an offset from 2000; shift to four-digit years for the axis.
x1 = [year_offset + 2000 for year_offset in x1]
# NOTE(review): color1/color2 are built but not passed to the trace, which uses
# a fixed 'red' marker — confirm whether they are still wanted.
color2 = ['a2738c'] * 3
color1 = ['092a35'] * 9 + color2
trace1 = go.Bar(
    x=x1,
    y=y1,
    name="year count",
    opacity=0.75,
    marker={'color': 'red'},
)
layout = {
    'height': 400,
    'title': 'Number OF Accidents In France Per Year',
    'legend': {'orientation': "h"},
    'xaxis': {'title': 'Year Of Accidents'},
    'yaxis': {'title': 'Number of Accidents'},
}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)


Key Points:

The number of accidents in France declined from 2005 to 2013, but rose again between 2013 and 2016.

According to the dataset provided, the highest number of accidents occurred in 2005, with a total of 374,561 accidents.

# Which months have higher frequency of Accidents ?

In [None]:
# Bar chart: number of rows per calendar month, aggregated over all years.
x2, y2 = create_stack_bar_data('mois', accidents)
# Translate month numbers into English month names for the x axis.
xn = [calendar.month_name[int(month_no)] for month_no in x2]
vn = y2
trace1 = go.Bar(
    x=xn,
    y=vn,
    name="month",
    opacity=0.75,
    marker={'color': 'blue'},
)
layout = {
    'height': 400,
    'title': 'Number Of Accidents Per Month In France',
    'legend': {'orientation': "h"},
}
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig, filename='stacked-bar')

Key Points:

June, July, September, October have the highest number of accidents, while February has the lowest.

On average, about 296,164 accidents fall in each calendar month when all years from 2005 to 2016 are added together.

October has more accidents (about 334,884 incidents) than any other month.

The weather in France during September and October is cold and wet, whereas June and July form the peak tourist season.

# Which Day-of-the-Month is most safe to drive ?

In [None]:
# Bar chart: number of rows per day of the month (1-31), aggregated over all
# months and years.
x1, y1 = create_stack_bar_data('jour', accidents)
trace1 = go.Bar(x=x1, y=y1, opacity=0.75, name="monthday", marker=dict(color='midnightblue'))
# Title typo fixed ('Nummber' -> 'Number') and axis titles added so the figure
# can be read on its own.
layout = dict(
    height=400,
    title='Number of Accidents In France Per Day',
    legend=dict(orientation="h"),
    xaxis=dict(title='Day of the Month'),
    yaxis=dict(title='Number of Accidents'),
)
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig, filename='stacked-bar')


Key Points:

The number of accidents per day of the month is mostly uniform. The 31st is lowest because only 7 months have 31 days.

On average, about 114,644 accidents fall on each day of the month when all months and years from 2005 to 2016 are added together.

# Time series of all accidents from 2005 to 2016

In [None]:
# Daily time series over the whole period.
# Build one timestamp per row from the year offset, month and day columns.
# The +2000 shift is applied to a temporary Series instead of mutating
# accidents.an in place: if to_datetime raised on an invalid date, the old
# "+= 2000 ... -= 2000" pattern left the shared frame with a shifted year
# column for every other cell.
dates = pd.to_datetime(
    (accidents.an + 2000) * 10000 + accidents.mois * 100 + accidents.jour,
    format='%Y%m%d',
)
aggregated = dates.value_counts().sort_index()
x_values = aggregated.index.tolist()
y_values = aggregated.values.tolist()
x1, y1 = x_values, y_values
trace1 = go.Scatter(x=x1, y=y1, opacity=0.75, name="monthday", marker=dict(color='firebrick'), line=dict(width=0.6))
layout = dict(height=400, title='Time Series of Accidents from 2005 to 2016', legend=dict(orientation="h"))
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig, filename='stacked-bar')


# Time series for all accidents in each year

In [None]:
# One line per year: number of rows per month.
# The year shift is computed on a temporary Series so accidents.an is never
# mutated; a failure in to_datetime therefore cannot corrupt the shared frame.
dates = pd.to_datetime(
    (accidents.an + 2000) * 10000 + accidents.mois * 100 + accidents.jour,
    format='%Y%m%d',
)
month_names = list(calendar.month_name)[1:]
# The layout does not depend on the loop, so build it once up front. This also
# keeps it defined when the frame is empty.
# The explicit category order keeps months in calendar order even if the first
# year plotted has no rows for some month.
layout = dict(
    height=400,
    title='Time Series of Accidents for each Year',
    legend=dict(orientation="h"),
    xaxis=dict(categoryorder='array', categoryarray=month_names),
)
traces = []
for key, grp in dates.groupby(dates.dt.year):
    aggregated = grp.dt.month.value_counts().sort_index()
    x1 = [calendar.month_name[int(month_no)] for month_no in aggregated.index.tolist()]
    y1 = aggregated.values.tolist()
    trace1 = go.Scatter(x=x1, y=y1, opacity=0.75, line=dict(width=1.5), name=str(key), mode='lines', text=str(key))
    traces.append(trace1)
fig = go.Figure(data=traces, layout=layout)
iplot(fig, filename='stacked-bar')

Key Points:

A sharp rise is observed in the months of June, July, September and October.

Sharp drops are observed in February and August.

December, 2006 has the highest number of accidents at 36,648.

February, 2013 has the lowest number of accidents at 15,605.

# 2. Exploration based on roads where accidents occurred

Which types of roads are high risk?

In [None]:
# Three stacked bar charts: road category, traffic flow and road gradient.
#
# Labels are looked up by code instead of being assigned by position. With
# positional lists, a code missing from the data (or an unexpected extra one)
# silently shifted every following label onto the wrong bar.
# The maps reproduce the label lists this cell used before.
# NOTE(review): assumes catr uses codes 1-6 and 9, and circ/prof use 0-4 with
# 0 meaning unknown — verify against the data dictionary.
catr_labels = {1: 'Highway', 2: 'National Road', 3: 'Departmental Road', 4: 'Communal Way',
               5: 'Off-Public Network', 6: 'Parking Lot', 9: 'Other'}
circ_labels = {0: 'Unknown', 1: 'One Way', 2: 'Bidirectional', 3: 'Separated Carriageways',
               4: 'Variable Assignment Channels'}
prof_labels = {0: 'Unknown', 1: 'Dish', 2: 'Slope', 3: 'Hill-Top', 4: 'Hill-Bottom'}

x1, y1 = create_stack_bar_data('catr', accidents)
# Unknown codes fall back to their raw value so they stay visible.
x1 = [catr_labels.get(code, str(code)) for code in x1]
trace1 = go.Bar(x=x1, y=y1, opacity=0.75, name="Category", marker=dict(color='Blue'))

x2, y2 = create_stack_bar_data('circ', accidents)
x2 = [circ_labels.get(code, str(code)) for code in x2]
trace2 = go.Bar(x=x2, y=y2, opacity=0.75, marker=dict(color='Red'), name="Traffic Flow")

x3, y3 = create_stack_bar_data('prof', accidents)
x3 = [prof_labels.get(code, str(code)) for code in x3]
trace3 = go.Bar(x=x3, y=y3, opacity=0.75, marker=dict(color='Magenta'), name="Road Gradient")

fig = tools.make_subplots(rows=3, cols=1)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 2, 1)
fig.append_trace(trace3, 3, 1)
layout = dict(height=900, title='Accidents Based On Type Of Road')
fig.layout.update(layout)
iplot(fig, filename='stacked-bar')


Key Points:

Communal Ways and Departmental Roads are riskiest with 1.6 million and 1.1 million accidents each.

Bidirectional roads are by far the riskiest, with over 2.1 million accidents.

# Which type of road gradient is high risk?

In [None]:
# Grouped bars: traffic-flow regime (x) for each road category (one trace each).
keydict = {1: 'Highway', 2: 'National Road', 3: 'Departmental Road', 4: 'Communal Way',
           5: 'Off-Public Network', 6: 'Parking Lot', 9: 'Other'}
# NOTE(review): assumes circ uses codes 0-4 with 0 meaning unknown, matching the
# label order this cell used before — verify against the data dictionary.
circ_labels = {0: 'Unknown', 1: 'One Way', 2: 'Bidirectional', 3: 'Separated Carriageways',
               4: 'Variable Assignment Channels'}
roadtype = accidents[['catr', 'circ']]
# Loop-invariant, so built once. This also keeps it defined if there are no groups.
layout = dict(height=400, title='Distribution of Accidents based on Type of Road', legend=dict(orientation="h"))
traces = []
for key, grp in roadtype.groupby(roadtype.catr):
    # value_counts() only returns codes observed in this group. Looking each
    # code up explicitly (0 when absent) stops counts from sliding under the
    # wrong label when a road category lacks one of the flow regimes.
    # Codes outside circ_labels are not plotted.
    counts = grp.circ.value_counts().to_dict()
    x1 = list(circ_labels.values())
    y1 = [counts.get(code, 0) for code in circ_labels]
    # Fall back to the raw code instead of raising KeyError on an unmapped category.
    trace1 = go.Bar(x=x1, y=y1, opacity=0.75, name=keydict.get(key, str(key)))
    traces.append(trace1)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# 3 Exploration based on people involved in the accidents

What was the condition of the people after the accident?

In [None]:
# Grouped bars: injury severity (x) for each user category (one trace each).
keydict = {1: 'Driver', 2: 'Passenger', 3: 'Pedestrian', 4: 'Pedestrian in Motion'}
# NOTE(review): assumes grav uses codes 1-4 in this label order, as the
# positional labels this cell used before implied — verify against the data dictionary.
grav_labels = {1: 'Unscathed', 2: 'Killed', 3: 'Hospitalized', 4: 'Light Injury'}
people = accidents[['catu', 'grav']]
# Loop-invariant, so built once. This also keeps it defined if there are no groups.
layout = dict(height=400, title='Condition of People involved in the Accidents', legend=dict(orientation="h"))
traces = []
for key, grp in people.groupby(people.catu):
    # Look every severity code up explicitly (0 when absent) so that a user
    # category with no rows for some severity does not shift the remaining
    # counts under the wrong label. Codes outside grav_labels are not plotted.
    counts = grp.grav.value_counts().to_dict()
    x1 = list(grav_labels.values())
    y1 = [counts.get(code, 0) for code in grav_labels]
    # Fall back to the raw code instead of raising KeyError on an unmapped category.
    trace1 = go.Bar(x=x1, y=y1, opacity=0.75, name=keydict.get(key, str(key)))
    traces.append(trace1)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# What was the age distribution of the people involved?

In [None]:
# Overlaid age histograms for drivers, passengers and pedestrians.
keydict = {1: 'Driver', 2: 'Passenger', 3: 'Pedestrian', 4: 'Pedestrian in Motion'}
# Age is measured at the year of the accident ('an' is an offset from 2000).
# The previous "2016 - an_nais" aged everyone to 2016, overstating ages by up
# to 11 years for the earliest accidents.
# assign() builds a new frame, so no column is written onto a slice of
# `accidents` (which triggered SettingWithCopyWarning).
ageusers = accidents[['an_nais', 'catu']].assign(
    age=(accidents.an + 2000) - accidents.an_nais
)
# Loop-invariant, so built once; previously it was only defined if at least one
# group passed the filter. Title typo fixed ('Ae' -> 'Age').
layout = dict(height=400, title='Age Distribution Of People Involved In The Accident',
              legend=dict(orientation="h"), barmode='overlay')
traces = []
for key, grp in ageusers.groupby(ageusers.catu):
    # Category 4 ('Pedestrian in Motion') is deliberately left out of the plot.
    if key < 4:
        x1 = grp.age.values
        trace1 = go.Histogram(x=x1, opacity=0.5, name=keydict.get(key, str(key)))
        traces.append(trace1)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# What was the sex distribution of the people involved?

In [None]:
# Grouped bars: user category (x) for each sex (one trace each).
keydict = {1: 'Male', 2: 'Female'}
# NOTE(review): assumes catu uses codes 1-4 in this label order, as the
# positional labels this cell used before implied — verify against the data dictionary.
catu_labels = {1: 'Driver', 2: 'Passenger', 3: 'Pedestrian', 4: 'Pedestrian in Motion'}
people = accidents[['catu', 'sexe']]
# Loop-invariant, so built once. This also keeps it defined if there are no groups.
layout = dict(height=400, title='Distribution of people involved in accidents by Sex', legend=dict(orientation="h"))
traces = []
for key, grp in people.groupby(people.sexe):
    # Look every user-category code up explicitly (0 when absent) so counts
    # stay under the right label even if one sex has no rows for a category.
    # Codes outside catu_labels are not plotted.
    counts = grp.catu.value_counts().to_dict()
    x1 = list(catu_labels.values())
    y1 = [counts.get(code, 0) for code in catu_labels]
    # Fall back to the raw code instead of raising KeyError on an unmapped value.
    trace1 = go.Bar(x=x1, y=y1, opacity=0.75, name=keydict.get(key, str(key)))
    traces.append(trace1)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)

# 4 Exploration based on use of safety equipment

What was the distribution of Safety Equipment used?

In [None]:
# Bar chart: which kind of safety equipment is recorded.
# 'secu' is treated as a two-digit code: the tens digit is the equipment type
# and the units digit is kept in 'secu' for the next cell.
safety = accidents[['secu', 'grav']].dropna()
equipment_code = (safety.secu / 10).astype(int)
# assign() returns a new frame, so nothing is written onto a derived slice.
# The resulting columns are secu (units digit), grav and equipment (tens digit).
safety = safety.assign(
    equipment=equipment_code,
    secu=(safety.secu - equipment_code * 10).astype(int),
)
# Labels are looked up by code instead of position, so a missing or unexpected
# code cannot shift labels onto the wrong bar. Unknown codes are shown with
# their raw value.
# NOTE(review): assumes equipment codes 1-4 and 9 in the label order this cell
# used before — verify against the data dictionary.
equipment_labels = {1: 'Belt', 2: 'Helmet', 3: "Children's Device", 4: 'Reflective Equipment', 9: "Other"}
x1, y1 = create_stack_bar_data('equipment', safety)
x1 = [equipment_labels.get(code, str(code)) for code in x1]
trace1 = go.Bar(x=x1, y=y1, opacity=0.75)
layout = dict(height=400, title='Distribution of Safety Equipment', legend=dict(orientation="h"))
fig = go.Figure(data=[trace1], layout=layout)
iplot(fig, filename='stacked-bar')

# Did use of Safety Equipment impact condition of people after the accident?

In [None]:
# Stacked bars: for each equipment-usage status, the percentage of rows that
# fall into each severity class.
keydict = {1: 'Unscathed', 2: 'Killed', 3: 'Hospitalized', 4: 'Light Injury'}
# NOTE(review): the previous version dropped the first sorted code and labelled
# the next three by position, i.e. it assumed usage codes 0, 1, 2, 3 with 0
# skipped — verify against the data dictionary.
usage_labels = {1: 'Equipment Present', 2: 'Equipment Absent', 3: 'Not Determined'}
# Totals per usage code over all severities. Loop-invariant, so computed once.
usage_totals = safety.secu.value_counts().to_dict()
layout = dict(height=530, title='Relationship between Safety Equipment and Severity of Accident',
              legend=dict(orientation="h"), barmode='stack', yaxis=dict(title='Percentage'),
              xaxis=dict(title='Safety Equipment'))
traces = []
for key, grp in safety.groupby(safety.grav):
    if key == 0:
        continue
    # Divide by code rather than by array position. The positional division
    # broke (or silently misaligned) whenever a severity group lacked one of
    # the usage codes.
    group_counts = grp.secu.value_counts().to_dict()
    x1 = list(usage_labels.values())
    y1 = [
        group_counts.get(code, 0) / usage_totals[code] * 100 if usage_totals.get(code) else 0.0
        for code in usage_labels
    ]
    # Fall back to the raw code instead of raising KeyError on an unmapped severity.
    trace1 = go.Bar(x=x1, y=y1, opacity=0.75, name=keydict.get(key, str(key)))
    traces.append(trace1)
fig = go.Figure(data=traces, layout=layout)
iplot(fig)
