In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from datasist.structdata import detect_outliers

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

import seaborn as sns
import matplotlib.pyplot as plt


pio.templates.default = "plotly_dark"
sns.set(rc={'figure.figsize': [10,10]}, font_scale=1.3)



# Dataset column descriptions:

1-ID - This is a unique identifier of the accident record.

2-Severity - Shows the severity of the accident, a number between 1 and 4, where 1 indicates the least impact on traffic (i.e., short delay as a result of the accident) and 4 indicates a significant impact on traffic (i.e., long delay).

3-Start_Time - Shows start time of the accident in local time zone.

4-End_Time - Shows end time of the accident in local time zone. End time here refers to when the impact of accident on traffic flow was dismissed.

5-Start_Lat - Shows latitude in GPS coordinate of the start point.

6-Start_Lng - Shows longitude in GPS coordinate of the start point.

7-Distance(mi) - The length of the road extent affected by the accident.

8-Description - Shows natural language description of the accident.

9-Number - Shows the street number in address record.

10-Street - Shows the street name in address record.

11-Side - Shows the relative side of the street (Right/Left) in address record.

12-City - Shows the city in address record.

13-County - Shows the county in address record.

14-State - Shows the state in address record.

15-Country - Shows the country in address record.

16-Timezone - Shows timezone based on the location of the accident (eastern, central, etc.)

17-Temperature(F) - Shows the temperature (in Fahrenheit).

18-Wind_Chill(F) - Shows the wind chill (in Fahrenheit).

19-Humidity(%) - Shows the humidity (in percentage).

20-Pressure(in) - Shows the air pressure (in inches).

21-Visibility(mi) - Shows visibility (in miles).

22-Wind_Direction - Shows wind direction.

23-Wind_Speed(mph) - Shows wind speed (in miles per hour).

24-Precipitation(in) - Shows precipitation amount in inches, if there is any.

25-Weather_Condition - Shows the weather condition (rain, snow, thunderstorm, fog, etc.)

26-Amenity - A POI annotation which indicates presence of amenity in a nearby location.

27-Bump - A POI annotation which indicates presence of speed bump or hump in a nearby location.

28-Crossing - A POI annotation which indicates presence of crossing in a nearby location.

29-Give_Way - A POI annotation which indicates presence of give_way in a nearby location.

30-Junction - A POI annotation which indicates presence of junction in a nearby location.

31-No_Exit - A POI annotation which indicates presence of no_exit in a nearby location.

32-Railway - A POI annotation which indicates presence of railway in a nearby location.

33-Roundabout - A POI annotation which indicates presence of roundabout in a nearby location.

34-Station - A POI annotation which indicates presence of station in a nearby location.

35-Stop - A POI annotation which indicates presence of stop in a nearby location.

36-Traffic_Calming - A POI annotation which indicates presence of traffic_calming in a nearby location.

37-Traffic_Signal - A POI annotation which indicates presence of traffic_signal in a nearby location.

38-Turning_Loop - A POI annotation which indicates presence of turning_loop in a nearby location.

39-Sunrise_Sunset - Shows the period of day (i.e. day or night) based on sunrise/sunset.

40-Civil_Twilight - Shows the period of day (i.e. day or night) based on civil twilight.

41-Nautical_Twilight - Shows the period of day (i.e. day or night) based on nautical twilight.

42-Astronomical_Twilight - Shows the period of day (i.e. day or night) based on astronomical twilight.

 # Some Questions:


 1 - Which state has the most accidents and Why ?


 2 - Are there specific months or seasons with higher accident rates?


 3 - What factors contribute most to severe accidents?


 4 - Are there trends over the years (e.g., increasing or decreasing accident rates)?
 

In [2]:
# read the dataset
df = pd.read_csv('USA_accidents.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

# Data cleaning

### 1- preparation

In [None]:
df.columns

In [None]:
df.columns = df.columns.str.lower()
df.columns

In [None]:
# check the duplicates
duplicates = df.duplicated().sum()
print(f"The duplicates row = {duplicates} ")

### 2- Missing Values

In [None]:
# Percentage of missing values per column, highest first
null_counts = df.isna().sum().sort_values(ascending=False)
missing_values = (
    null_counts.div(len(df))
    .mul(100)
    .rename_axis('columns')
    .reset_index(name='Percentages_of_missing')
)
missing_values

In [None]:
# Chart the first 20 rows of `missing_values`:
# x axis = 'Percentages_of_missing', y axis = 'columns' (the column names).
px.histogram(missing_values[:20],x = 'Percentages_of_missing',y='columns'
             ,height=700,title='Percentages of missing values'
             ,color_discrete_sequence= px.colors.sequential.deep )

In [11]:
# Drop columns with more than 50% missing values.
# Per the original analysis these were:
#   wind_chill(f)     = 57.8
#   precipitation(in) = 63.3
# A boolean mask with .loc selects rows by condition, so the result does not
# depend on `missing_values` carrying a default 0..n-1 index (the previous
# `.iloc[<index labels>]` form was only correct under that assumption).
mostly_missing = missing_values['Percentages_of_missing'] > 50
droped_columns = missing_values.loc[mostly_missing, 'columns']
df.drop(columns=droped_columns, inplace=True)

In [12]:
# drop unnamed: 0	
df.drop(columns = "unnamed: 0",inplace=True)

In [13]:
# Impute the numeric weather columns with their column median
median_fill_cols = ['pressure(in)', 'wind_speed(mph)', 'visibility(mi)', 'humidity(%)', 'temperature(f)']
imputer = SimpleImputer(strategy='median')
df[median_fill_cols] = imputer.fit_transform(df[median_fill_cols])


In [None]:
# the filling values of columns median
imputer.statistics_

In [15]:
# Impute the categorical columns with their most frequent value (mode)
mode_fill_cols = [
    "weather_condition", "wind_direction", "timezone",
    "astronomical_twilight", "nautical_twilight", "civil_twilight",
    "sunrise_sunset", "city", "street", "description",
]
imputer = SimpleImputer(strategy='most_frequent')
df[mode_fill_cols] = imputer.fit_transform(df[mode_fill_cols])


In [None]:
# the filling values of columns mode
imputer.statistics_

In [None]:
# check the missing values again
df.isna().sum()

### 3- Check the outlier

In [None]:
# Numeric columns to screen for outliers.
# severity is stored as a number but is treated as categorical, so it is excluded.
numeric_columns = df.select_dtypes(include=['number']).columns.drop('severity')
numeric_columns



In [None]:
# Box plots to visualise outliers in four of the numeric columns

# (column, title) for each panel, in row-major order of the 2x2 grid
boxplot_panels = [
    ('start_lat', 'Box Plot:  start_lat'),
    ('start_lng', 'Box Plot: start_lng'),  # fixed: title previously had a stray ")"
    ('distance(mi)', 'Box plot: distance(mi)'),
    ('wind_speed(mph)', 'Box Plot: wind_speed(mph)'),
]

# Create a figure and a grid of subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# axes.flat walks the grid as [0,0], [0,1], [1,0], [1,1]
for ax, (column, title) in zip(axes.flat, boxplot_panels):
    sns.boxplot(x=column, data=df, ax=ax)
    ax.set_title(title)

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Box plots to visualise outliers in the remaining weather columns

# (column, title) for each panel, in row-major order of the 2x2 grid
boxplot_panels = [
    ('pressure(in)', 'Box Plot:  pressure(in)'),  # fixed: title was missing its closing ")"
    ('humidity(%)', 'Box Plot: humidity(%)'),
    ('visibility(mi)', 'Box plot: visibility(mi)'),
    ('temperature(f)', 'Box Plot: temperature(f)'),
]

# Create a figure and a grid of subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# axes.flat walks the grid as [0,0], [0,1], [1,0], [1,1]
for ax, (column, title) in zip(axes.flat, boxplot_panels):
    sns.boxplot(x=column, data=df, ax=ax)
    ax.set_title(title)

# Adjust layout to prevent overlapping
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Report, per numeric column, how many rows fall outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] and what share of the data that is

total_rows = len(df)
for col in numeric_columns:
    Q1, Q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    # plain int so the printed numbers are formatted like len(...) results
    outlier_count = int(is_outlier.sum())

    print(f"Column: {col}")
    print(f"Number of outliers: {outlier_count}")
    print(f'percentage of outlier = {(outlier_count/total_rows)*100}')
    print("-" * 20)

In [None]:
# Collect the row labels that datasist's detect_outliers flags for
# temperature(f) and wind_speed(mph); the following cell drops them from df.
# NOTE(review): the second argument (0) is presumably the number of outlying
# features a row must exceed to be flagged — confirm against the datasist docs.
# NOTE(review): the third argument is a DataFrame slice; presumably only its
# column names are used — confirm.

outliers_indices = detect_outliers(df, 0, df[['temperature(f)','wind_speed(mph)']])
len(outliers_indices)


In [23]:
df.drop(outliers_indices, inplace=True)


In [None]:
# Replace IQR-rule outliers in every numeric column with that column's median.
# The median is used on both sides: it is not pulled towards the extreme
# values, whereas the mean (previously used for low outliers only) is computed
# from the very outliers being replaced.
# NOTE(review): numeric_columns presumably still contains start_lat/start_lng;
# overwriting coordinates with a median may not be meaningful — confirm.
for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    median_value = df[col].median()
    # One mask built from the untouched column, so both tails are judged
    # against the same data
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    df[col] = np.where(is_outlier, median_value, df[col])
    print(f"{col} handled !")


In [None]:

# Box plots of the same four columns, drawn again from the current df

# (column, title) for each panel, in row-major order of the 2x2 grid
panels = (
    ('start_lat', 'Box Plot:  start_lat'),
    ('start_lng', 'Box Plot: start_lng'),
    ('distance(mi)', 'Box plot: distance(mi)'),
    ('wind_speed(mph)', 'Box Plot: wind_speed(mph)'),
)

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# axes.flat walks the grid as [0,0], [0,1], [1,0], [1,1]
for panel_ax, (column, panel_title) in zip(axes.flat, panels):
    sns.boxplot(x=column, data=df, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Keep titles and tick labels from overlapping, then render
plt.tight_layout()
plt.show()

In [None]:

# Box plots of the weather columns, drawn again from the current df

# (column, title) for each panel, in row-major order of the 2x2 grid
panels = (
    ('pressure(in)', 'Box Plot:  pressure(in)'),
    ('humidity(%)', 'Box Plot: humidity(%)'),
    ('visibility(mi)', 'Box plot: visibility(mi)'),
    ('temperature(f)', 'Box Plot: temperature(f)'),
)

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# axes.flat walks the grid as [0,0], [0,1], [1,0], [1,1]
for panel_ax, (column, panel_title) in zip(axes.flat, panels):
    sns.boxplot(x=column, data=df, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Keep titles and tick labels from overlapping, then render
plt.tight_layout()
plt.show()

### 4- handle date time data

In [None]:
# Drop the end_time column and parse start_time into datetime64
df.drop(columns='end_time', inplace=True)
df['start_time'] = pd.to_datetime(df['start_time'], format='mixed')

# Derive calendar features from start_time
df['year'] = df.start_time.dt.year            # calendar year
df['month'] = df.start_time.dt.month          # month number
df['day'] = df.start_time.dt.day              # day of month
df['day_name'] = df.start_time.dt.day_name()  # weekday name
df['hour'] = df.start_time.dt.hour            # hour of day (old comment wrongly said "year")

# Preview the first rows only instead of rendering the whole frame
df.head()

In [28]:
# Helper used to build the season column from the month number
def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3-5 -> 'Spring'; 6-8 -> 'Summer'; 9-11 -> 'Autumn'.
    Any other value yields None.
    """
    seasons = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Autumn', (9, 10, 11)),
    )
    for season_name, season_months in seasons:
        if month in season_months:
            return season_name
    return None


In [None]:
df['season'] = df['month'].apply(get_season) # apply the function 
df['season']

# Exploratory Analysis and Visualization

## 1.Univariate Analysis

In [None]:
# describe num columns
df.describe().T

In [None]:
# describe cat columns
df.describe(include='O').T

### - Count of incidents per 'city', 'state', 'county' and 'street'

In [32]:
# Bar chart of the most frequent values of one column
def show_fig(col):
    """Show a bar chart of how often each value of ``df[col]`` occurs.

    Reads the notebook-level ``df``. Only the first 15 rows of the
    value-count table (the most frequent values) are plotted, with the value
    on the y axis and its count on the x axis.
    """
    value_table = df[col].value_counts().reset_index()
    top_values = value_table.head(15)
    fig = px.bar(
        top_values,
        y=col,
        x='count',
        color_discrete_sequence=px.colors.sequential.deep,
    )
    fig.show()

In [None]:
# count of state accidents
show_fig("state")

In [None]:
# county
show_fig("county")

In [None]:
# street
show_fig("street")

In [None]:
# Timezone
show_fig('timezone')

### - Trends of year , season, Month , Day of month , Day name and Hour 

In [37]:
# function to check trends time 


def time_fig(col):
    """Show a pie chart and a bar chart of row counts per value of ``df[col]``.

    Reads the notebook-level ``df``. Counts are ordered by the value itself
    (sorted index), not by frequency. The pie is placed in the left subplot
    and the bar chart in the right one.
    """
    counts_by_value = df[col].value_counts().sort_index().reset_index()
    counts_by_value.columns = [col, 'count']
    palette = px.colors.sequential.deep

    # Build each chart with plotly express, then reuse its single trace
    pie_chart = px.pie(counts_by_value, names=col, values='count',
                       color_discrete_sequence=palette)
    bar_chart = px.bar(counts_by_value, x=col, y='count',
                       color_discrete_sequence=palette)

    # One row, two columns: pie on the left, bars on the right
    fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'pie'}, {'type': 'bar'}]])
    fig.add_trace(pie_chart.data[0], row=1, col=1)
    fig.add_trace(bar_chart.data[0], row=1, col=2)

    fig.update_layout(height=600, width=1000,
                      title_text=f"Distribution of {col}", title_x=0.5)
    fig.show()

In [None]:
# Incidents by year
time_fig('year')

In [None]:
# Incidents by season
time_fig('season')

In [None]:
# Incidents by month
time_fig('month')

In [None]:
# Incidents by day_name
time_fig("day_name")


In [None]:
# Incidents by hour
time_fig("hour")


### -Compare incidents during different 'sunrise_sunset' phases or 'twilight' categories.

In [None]:
# Incidents by sunrise_sunset phase
# value_counts().reset_index() yields one row per phase together with its row count.
# NOTE(review): the plot reads the count column as 'count' — presumably this
# relies on pandas >= 2.0 naming; confirm against the installed version.
sunrise_sunset_trends = df['sunrise_sunset'].value_counts().reset_index()
px.bar(sunrise_sunset_trends,x='sunrise_sunset',y = 'count',barmode="group", title= 'sunrise_sunset_trends',
       color_discrete_sequence=px.colors.sequential.deep)

In [None]:
# Incidents by twilight definition (civil / nautical / astronomical).
# melt() stacks the three columns into `variable` (which twilight definition)
# and `value` (the recorded period). Counting per (variable, value) keeps the
# three definitions apart; counting `value` alone pooled them into one total
# in which every accident was counted three times.
twilight_trends = df[['civil_twilight', 'nautical_twilight', 'astronomical_twilight']].melt()
twilight_counts = (
    twilight_trends.groupby(['variable', 'value'])
    .size()
    .reset_index(name='count')
)
# color='variable' gives barmode="group" one bar per definition to place side by side
px.bar(twilight_counts, x='value', y='count', color='variable', barmode="group",
       title='twilight_counts',
       color_discrete_sequence=px.colors.sequential.deep)


### - Road Features

In [None]:
df_bool = df.select_dtypes(include='bool')
df_bool.head()

In [None]:
# Count how many accidents carry each nearby road-feature flag.
# NOTE(review): 'amenity' is not in this list — confirm that is intentional.
road_features = ['bump', 'crossing', 'give_way', 'junction', 'no_exit', 'railway',
                 'roundabout', 'station', 'stop', 'traffic_calming',
                 'traffic_signal', 'turning_loop']

# One vectorised sum over all feature columns replaces twelve separate
# per-column sums. The columns are presumably boolean flags, so the sum is the
# number of True rows.
road_df = (
    df[road_features]
    .sum()
    .rename_axis('columns')
    .reset_index(name='sum')
)

# Create subplots: pie on the left, histogram-type trace on the right
fig = make_subplots(rows=1, cols=2, specs=[[{'type': 'pie'}, {'type': 'histogram'}]])

# Add pie chart trace
pie_trace = px.pie(road_df, names='columns', values='sum',
                   color_discrete_sequence=px.colors.sequential.deep).data[0]
fig.add_trace(pie_trace, row=1, col=1)

# Add histogram chart trace (x = feature name, y = its total)
histogram_trace = px.histogram(road_df, x='columns', y='sum',
                               color_discrete_sequence=px.colors.sequential.deep).data[0]
fig.add_trace(histogram_trace, row=1, col=2)

# Update layout
fig.update_layout(
    height=600,
    width=1400,
    title_text="Distribution of road features",
    title_x=0.5
)

# Show the figure
fig.show()


In [None]:
# Distribution Severity levels.
px.histogram(df,x = 'severity',title='Distribution Severity levels',
             color_discrete_sequence= px.colors.sequential.deep)

In [None]:
show_fig('weather_condition')

## 2- Bi/multi-variate Analysis 

### - Distribution Severity levels.

In [None]:
# Correlation matrix of severity with temperature(f), humidity(%), pressure(in) and wind_speed(mph),
# shown as a heatmap
df_weather = df[['temperature(f)','humidity(%)', 'pressure(in)', 'wind_speed(mph)','severity']]
px.imshow(df_weather.corr()
                   ,color_continuous_scale=px.colors.sequential.deep)


In [50]:
# Scatter of accident counts for every (column value, severity) pair
def severity_analysis(col):
    """Plot the number of rows per value of ``df[col]``, coloured by severity.

    Reads the notebook-level ``df``, counts rows for each (``col``, severity)
    combination and shows them as a plotly scatter with ``col`` on the x axis
    and the count on the y axis.
    """
    pair_counts = (
        df.groupby([col, 'severity'])
        .size()
        .reset_index(name='count_of_severity')
    )
    # Same column order as before: severity, col, count
    data_accident = pair_counts[['severity', col, 'count_of_severity']]

    fig = px.scatter(
        data_accident,
        x=col,
        y='count_of_severity',
        color='severity',
        title=f'Correlation of {col} and severity',
    )
    fig.show()



In [None]:
#'humidity(%)'
severity_analysis('humidity(%)')

In [52]:
# Show how accident counts per severity level change over the years for one value of a column
def severity_line(col, value):
    """Plot yearly accident counts per severity level for rows where ``df[col] == value``.

    Parameters
    ----------
    col : str
        Column of the notebook-level ``df`` to filter on (e.g. "state").
    value
        The value of ``col`` to keep (e.g. 'CA').

    Shows a plotly line chart with one line per severity level, year on the
    x axis and the row count on the y axis.
    """
    # Filter first so the group-by only touches the matching rows, instead of
    # counting every value of `col` and then discarding all but one.
    subset = df[df[col] == value]
    data_accident = (
        subset.groupby(['severity', 'year'])
        .size()
        .reset_index(name='count_of_severity')
        # year first so each line is drawn left to right; severity as a
        # tie-breaker makes the row order deterministic
        .sort_values(by=['year', 'severity'])
    )
    fig = px.line(data_accident, y='count_of_severity', x='year', color='severity',
                  title=f'Accidents per year by severity ({col} = {value})')
    fig.show()

In [None]:
# state
severity_line("state",'CA')

In [None]:
# county
severity_line("county",'Los Angeles')

In [None]:
# street
severity_line('street','I-95 S')

In [None]:
severity_line('timezone','US/Eastern')

In [None]:
# Hour of day vs. number of severity-4 accidents
df_hs = (
    df.groupby(['hour', 'severity'])
    .size()
    .reset_index(name='count_of_severity')
    [['severity', 'hour', 'count_of_severity']]
)
# keep only the rows for severity level 4
severity_four_by_hour = df_hs[df_hs['severity'] == 4]
px.bar(severity_four_by_hour, x='hour', y='count_of_severity',
       color='severity', barmode="group",
       color_continuous_scale=px.colors.sequential.deep)

In [58]:
# insights
# Top State of accident is CA = 510.564K
# Top City of accidents is Houston = 74.991K
# Top street of accident is I-5 N = 28.273K
# Top county of accidents is Los Angeles = 173.195K
# Top Timezone of accident is US/Eastern
# Top year of accidents is 2019 
# Accidents are increasing every year.
# Top season of accidents is Autumn = 719.795K
# The number of accidents is close every day of the month except for the 31st, which is slightly less.
# The number of accidents is similar on every day of the week except for the two weekend days.
# Accidents on the weekend (Saturday and Sunday) = 142.979 + 126.797 = 269.776 K
# The weekday with the most accidents is Tuesday = 445.602K
# The top hours of accidents are 7 and 8 = 238.465K and 243.991K
# The largest number of accidents at the beginning of working hours
# There are many accidents at sunrise.
# There are many accidents at twilight in day .
# The 3 road features most often present near accidents are traffic_signal, crossing and junction
# Severity levels 2 = 1.652M Top count
# Severity levels 4 = 74K
# Most accidents occur in clear weather.
# There is no strong Correlation between temperature and severity
# There is a good Correlation between humidity and severity.
# Accidents are Increasing by years in I-95 S street
# Accident with severity = 4 are decreasing by year
# The county with the most severity-4 accidents is Los Angeles = 1857
# The state with the most severity-4 accidents is FL = 6459
# The street with the most severity-4 accidents is I-95 S = 1018
# The hour with the most severity-4 accidents is 17