In [None]:
# Filtering warnings
# Silence every Python warning for the rest of the session so that library
# notices do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the plotting calls further down; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

## Mounting Drive and Importing Libraries

In [None]:
# Mount Google Drive under /content/drive so that the dataset stored there can
# be read with ordinary file paths. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import folium
import math

In [None]:
# Apply seaborn's "darkgrid" theme to the figures drawn in this notebook.
sns.set_style("darkgrid")

# US Accidents EDA
[Link to dataset](https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents?select=US_Accidents_Dec21_updated.csv)

##Importing Dataset from Google Drive

In [None]:
# Read the raw accidents CSV from the mounted Drive folder into df0.
accidents_csv_path = '/content/drive/MyDrive/Analytics_Project/US_Accidents_Dec21_updated.csv'
df0 = pd.read_csv(accidents_csv_path)

##Perform Data Preparation and Cleaning using Numpy and Pandas

1. Load information about the data and its columns.
2. Fix missing (null) values, incorrect values, and duplicate data.
3. Fix the data types of variables.


###Information of Data

In [None]:
# Preview the first rows of the raw accidents table.
df0.head()

In [None]:
# Report the total cell count, the (rows, columns) shape and the column names,
# one item per line.
print(df0.size, df0.shape, df0.columns, sep='\n')

In [None]:
# Print the column names, non-null counts and dtypes of df0.
df0.info()

In [None]:
# Summary statistics for the columns of df0.
df0.describe()

In [None]:
# Dtypes that count as "numeric" for this analysis.
numerics = ['int16','int32','int64','float16','float32','float64']

# Keep only the columns stored with one of those dtypes and show how many
# such columns exist.
numeric_df = df0.select_dtypes(include=numerics)
numeric_df.shape[1]

###Percentage of missing Values

In [None]:
# Count missing entries per column, then list the columns from most to least
# affected.
missing_per_column = df0.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Missing entries per column, expressed as a percentage of all rows.
null_percent = df0.isna().sum() * 100 / len(df0)
null_percent_df = null_percent.to_frame(name='percent_missing')

# Keep only the columns that have at least one missing value.
has_missing = null_percent_df['percent_missing'] != 0
null_percent_df_not_zero = null_percent_df[has_missing]

# Show the affected columns, worst first.
null_percent_df_not_zero['percent_missing'].sort_values(ascending=False)

In [None]:
# Missing-value percentages as a plain list, largest first, and how many
# columns are affected.
sorted_missing = null_percent_df_not_zero['percent_missing'].sort_values(ascending=False)
null_percent_df_not_zero_list = sorted_missing.tolist()
len(null_percent_df_not_zero_list)

In [None]:
# Colour-code every column that has missing values: 'Yellow' when fewer than
# 20 percent of its entries are missing, 'Red' otherwise. The order follows
# null_percent_df_not_zero_list (largest percentage first).
#
# The list is built from however many entries exist instead of a hard-coded
# count of 20, so it no longer raises IndexError when fewer columns have
# missing data. The unused import from the private numpy.lib.function_base
# module was dropped.
colors = ['Yellow' if pct < 20 else 'Red' for pct in null_percent_df_not_zero_list]
colors

In [None]:
# Horizontal bar chart of the missing-value percentage of each affected column.
ax = null_percent_df_not_zero.plot.barh(
    figsize=(25,15),
    title='Percentage of Null in Each Column',
    color='Green',
    legend=True,
    label=True,
)

# Write the chart to disk.
plt.savefig('Percentage of Null in Each Column.png')

###Duplicate values

In [None]:
# Independent copy of the rows that pandas flags as duplicates.
is_duplicate = df0.duplicated()
temp_df0 = df0.loc[is_duplicate].copy()

In [None]:
# Number of cells (rows x columns) in the duplicated-rows frame; 0 means df0
# contains no duplicated rows.
temp_df0.size

### Datatype Correction
  1. End_Time 
  2. Start_Time

In [None]:
# Dtypes of the two timestamp columns before conversion.
df0[['Start_Time','End_Time']].dtypes

In [None]:
# Parse Start_Time into pandas datetimes so the .dt accessors used later work.
df0['Start_Time'] = pd.to_datetime(df0['Start_Time'])

In [None]:
# Parse End_Time into pandas datetimes as well.
df0['End_Time'] = pd.to_datetime(df0['End_Time'])

In [None]:
# Dtypes of the two timestamp columns after conversion.
df0[['Start_Time','End_Time']].dtypes

## Perform EDA and Visualisation using Matplotlib and Seaborn

In [None]:
# Column names available for the analysis below.
df0.columns

### Geographical Distribution of Accidents

####City

In [None]:
# Start the summary table with the distinct city names, ordered as
# value_counts() returns them.
city_names = df0['City'].value_counts().index.tolist()
su1 = pd.DataFrame({'Unique Values': city_names})
print(su1)

In [None]:
# Attach the number of accidents recorded for each city.
city_totals = df0['City'].value_counts()
su1['Count'] = city_totals.tolist()
print(su1)

In [None]:
# (rows, columns) of the city summary table.
print(su1.shape)

In [None]:
# First ten rows of the city summary table.
su1.iloc[:10]

In [None]:
# Names of the first ten cities in the table (x axis of the bar chart).
x = su1['Unique Values'].iloc[:10].tolist()
x

In [None]:
# Accident counts of the first ten cities in the table (bar heights).
y = su1['Count'].iloc[:10].tolist()
y

In [None]:
def addlabels(x,y):
    """Write each bar's value above the bar on the current matplotlib axes.

    Parameters
    ----------
    x : sequence
        Bar categories; only its length is used, bar ``i`` sits at x = ``i``.
    y : sequence
        Bar heights; ``y[i]`` is both the vertical position and the text drawn.
    """
    for i in range(len(x)):
        # The Text keyword is lowercase ``bbox``; the capitalised ``Bbox``
        # relied on case-insensitive property names, which matplotlib
        # deprecated in 3.3 and later removed.
        plt.text(i, y[i], y[i], ha = 'center', fontsize = 20,
                 bbox = dict(facecolor = 'white', alpha = .5))

In [None]:
# Bar chart of the ten cities held in x / y, drawn on an extra-large canvas.
plt.figure(figsize=(40, 20))
plt.bar(x, y, color='#c36468')

# Annotate every bar with its accident count.
addlabels(x, y)

# Title and axis captions share one colour.
caption_colour = '#820e25'
plt.title("Top 10 cities by their count of accidents", fontsize=40, color=caption_colour)
plt.xlabel("Cities", fontsize=28, color=caption_colour)
plt.ylabel("Counts of Accidents", fontsize=28, color=caption_colour)

# Same tick-label size on both axes.
plt.tick_params(axis='both', labelsize=20)

# Write the chart to disk.
plt.savefig('Top 10 Cities by Highest Counts of Accidents.jpeg')

####States

In [None]:
# Rebuild the summary table for states: distinct state codes, ordered as
# value_counts() returns them.
state_codes = df0['State'].value_counts().index.tolist()
su1 = pd.DataFrame({'Unique Values': state_codes})

In [None]:
# Attach the number of accidents recorded for each state.
state_totals = df0['State'].value_counts()
su1['Count'] = state_totals.tolist()

In [None]:
def addlabels(x,y):
    """Write each bar's value above the bar on the current matplotlib axes.

    Parameters
    ----------
    x : sequence
        Bar categories; only its length is used, bar ``i`` sits at x = ``i``.
    y : sequence
        Bar heights; ``y[i]`` is both the vertical position and the text drawn.
    """
    for i in range(len(x)):
        # The Text keyword is lowercase ``bbox``; the capitalised ``Bbox``
        # relied on case-insensitive property names, which matplotlib
        # deprecated in 3.3 and later removed.
        plt.text(i, y[i], y[i], ha = 'center', fontsize = 20,
                 bbox = dict(facecolor = 'white', alpha = .5))

In [None]:
# First ten states in the summary table and their accident counts.
x = su1['Unique Values'].iloc[:10].tolist()
y = su1['Count'].iloc[:10].tolist()

# Bar chart on an extra-large canvas.
plt.figure(figsize=(40, 20))
plt.bar(x, y, color='#dc9b3b')

# Annotate every bar with its accident count.
addlabels(x, y)

# Title and axis captions share one colour.
caption_colour = '#820e25'
plt.title("Top 10 State by their count of accidents", fontsize=40, color=caption_colour)
plt.xlabel("State", fontsize=28, color=caption_colour)
plt.ylabel("Counts of Accidents", fontsize=28, color=caption_colour)

# Same tick-label size on both axes.
plt.tick_params(axis='both', labelsize=20)

# Write the chart to disk.
plt.savefig('Top 10 State by Highest Counts of Accidents.jpeg')

#### Map Plot of 0.001 percent of dataframe

In [None]:
# 0.001 percent of data frame
sampl_df = df0.sample(math.floor((len(df0)*0.00001)))

map = folium.Map()

locations = sampl_df[['Start_Lat', 'Start_Lng']]
locationlist = locations.values.tolist()
len(locationlist)
locationlist[7]

for point in range(0, len(locationlist)):
    folium.Marker(locationlist[point]).add_to(map)
map

###Time dependence of Accidents

####Start_Time --- Time dependence of Accidents

Extract 
  - year, month, Weekday, 
  - hour, minute and seconds

In [None]:
# Calendar year in which each accident started (same .dt accessor as the
# weekday and hour columns).
df0['Start_Year'] = df0['Start_Time'].dt.year

plt.figure(figsize=(30,20))
# discrete=True draws one unit-wide bar centred on every year, so no explicit
# bin count is passed. stat='percent' scales the bars to percentages.
sns.histplot(df0['Start_Year'], stat='percent', discrete = True, color = '#d7d34b', edgecolor='black')

plt.title("Yearly Distribution of Counts of Accidents", fontsize = 40, color = '#820e25')
plt.xlabel("Years", fontsize = 40, color = '#820e25')
# The bars show percentages, so the axis caption says so (matches the hourly plots).
plt.ylabel("% - Counts of Accidents", fontsize = 28, color = '#820e25')

plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)

#saving plot
plt.savefig('Yearly Distribution of Counts of Accidents.jpeg')

In [None]:
# Calendar month (1-12) in which each accident started.
df0['Start_Month'] = df0['Start_Time'].dt.month

# Distinct months present in the data, ascending.
month = np.sort(df0['Start_Month'].unique())

plt.figure(figsize=(30, 20))
# One unit-wide bin centred on every month number, so the bar for month m
# spans [m - 0.5, m + 0.5). This replaces tick positions computed from the
# private Rectangle attributes `_x0` / `_x1`, which are not public API.
month_edges = np.arange(0.5, 13.5, 1.0)
n, bins, patches = plt.hist(df0['Start_Month'], color='#569be0', bins = month_edges, edgecolor='black')

# Tick at the centre of every bar, labelled with the month number.
plt.xticks(np.arange(1, 13))

# giving title to the plot
plt.title("Monthly Distribution of Counts of Accidents", fontsize = 40, color = '#820e25')

# giving X and Y labels
plt.xlabel("Month", fontsize = 40, color = '#820e25')
plt.ylabel("Counts of Accidents", fontsize = 28, color = '#820e25')

plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)

#saving plot
plt.savefig('Monthly Distribution of Counts of Accidents.jpeg')

In [None]:
# Day of week on which each accident started, as an integer code 0-6
# (the cells below treat 5 and 6 as the weekend and 4 as Friday).
df0['Start_Time_weekday'] = df0['Start_Time'].dt.dayofweek

# Distinct weekday codes present in the data, ascending.
Start_Time_weekday = np.sort(df0['Start_Time_weekday'].unique())

plt.figure(figsize=(30, 20))
# One unit-wide bin centred on every weekday code, so the bar for day d spans
# [d - 0.5, d + 0.5). This replaces tick positions computed from the private
# Rectangle attributes `_x0` / `_x1`, which are not public API.
weekday_edges = np.arange(-0.5, 7.5, 1.0)
n, bins, patches = plt.hist(df0['Start_Time_weekday'], color='#90be76', bins = weekday_edges, edgecolor='white')

# Tick at the centre of every bar, labelled with the weekday code.
plt.xticks(np.arange(7))

# giving title to the plot
plt.title("Weekly Distribution of Counts of Accidents", fontsize = 40, color = '#820e25')

# giving X and Y labels
plt.xlabel("Weekday", fontsize = 40, color = '#820e25')
plt.ylabel("Counts of Accidents", fontsize = 28, color = '#820e25')

plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)

#saving plot
plt.savefig('Weekly Distribution of Counts of Accidents.jpeg')

In [None]:
#@title
# Hour of day (0-23) at which each accident started, plus the sorted distinct
# hours and the tick positions reused by the hourly charts.
df0['Start_Hour'] = df0['Start_Time'].dt.hour
Start_Hour = np.sort(df0['Start_Hour'].unique())
list1 = list(range(0,24))

# Percentage of all accidents per starting hour.
plt.figure(figsize=(30, 20))
sns.histplot(
    df0['Start_Hour'],
    stat='percent',
    bins=24,
    discrete=True,
    color='#e9c64d',
    edgecolor='black',
)

# Title and axis captions share one colour.
caption_colour = '#820e25'
plt.title("Hourly Distribution of Counts of Accidents (Overall)", fontsize=40, color=caption_colour)
plt.xlabel("Hour", fontsize=40, color=caption_colour)
plt.ylabel("% - Counts of Accidents", fontsize=28, color=caption_colour)

# One tick per hour, same label size on both axes.
plt.xticks(ticks=list1)
plt.tick_params(axis='both', labelsize=20)

# Write the chart to disk.
plt.savefig('Hourly Distribution of Counts of Accidents (Overall).jpeg')

####Weekdays vs Weekends Trend in Accident

#####Weekend

In [None]:
# Accidents whose weekday code is 5 or 6 (the weekend subset).
weekend_codes = [5,6]
sunday_df = df0[df0['Start_Time_weekday'].isin(weekend_codes)]

# Percentage of weekend accidents per starting hour.
plt.figure(figsize=(30, 20))
sns.histplot(
    sunday_df['Start_Hour'],
    stat='percent',
    bins=24,
    kde=False,
    discrete=True,
    color='#42c4c7',
)

# Title and axis captions share one colour.
caption_colour = '#820e25'
plt.title("Hourly Distribution of Counts of Accidents (Weekends)", fontsize=40, color=caption_colour)
plt.xlabel("Hour", fontsize=40, color=caption_colour)
plt.ylabel("% - Counts of Accidents", fontsize=28, color=caption_colour)

# One tick per hour, same label size on both axes.
plt.xticks(ticks=list1)
plt.tick_params(axis='both', labelsize=20)

# Write the chart to disk.
plt.savefig("Hourly Distribution of Counts of Accidents (Weekends).jpeg")

#####Weekdays

In [None]:
# Accidents whose weekday code is 0-4 (the working-week subset).
working_day_codes = [0,1,2,3,4]
weekdays = df0[df0['Start_Time_weekday'].isin(working_day_codes)]

# Percentage of working-week accidents per starting hour.
plt.figure(figsize=(30, 20))
sns.histplot(
    weekdays['Start_Hour'],
    stat='percent',
    bins=24,
    kde=False,
    discrete=True,
    color='#ffb78e',
)

# Title and axis captions share one colour.
caption_colour = '#820e25'
plt.title("Hourly Distribution of Counts of Accidents (Weekdays)", fontsize=40, color=caption_colour)
plt.xlabel("Hour", fontsize=40, color=caption_colour)
plt.ylabel("% - Counts of Accidents", fontsize=28, color=caption_colour)

# One tick per hour, same label size on both axes.
plt.xticks(ticks=list1)
plt.tick_params(axis='both', labelsize=20)

# Write the chart to disk.
plt.savefig("Hourly Distribution of Counts of Accidents (Weekdays).jpeg")

##### Friday had the most accidents

In [None]:
# Accidents with weekday code 4 (the Friday subset) and the sorted distinct
# hours that occur in it.
is_friday = df0['Start_Time_weekday'] == 4
Fri_df = df0[is_friday]
Start_Hour = np.sort(Fri_df['Start_Hour'].unique())

# Percentage of Friday accidents per starting hour.
plt.figure(figsize=(30, 20))
sns.histplot(
    Fri_df['Start_Hour'],
    stat='percent',
    bins=24,
    discrete=True,
    color='#7984cb',
    edgecolor='black',
)

# Title and axis captions share one colour.
caption_colour = '#820e25'
plt.title("Friday’s Analysis", fontsize=40, color=caption_colour)
plt.xlabel("Hour", fontsize=40, color=caption_colour)
plt.ylabel("% - Counts of Accidents", fontsize=28, color=caption_colour)

# One tick per hour, same label size on both axes.
plt.xticks(ticks=list1)
plt.tick_params(axis='both', labelsize=20)

# Write the chart to disk.
plt.savefig('Friday’s Analysis.jpeg')

###Physical conditions

#### Temperature --- Physical Conditions' Impact on Accidents

In [None]:
#@title
# Integers 0 through 124 as a plain list, echoed for inspection.
list2 = [*range(0, 125)]

print(list2)

In [None]:
# Rows whose recorded temperature lies strictly between 32 and 100 degrees F.
above_lower_bound = df0['Temperature(F)'] > 32
below_upper_bound = df0['Temperature(F)'] < 100
temp_range = df0.loc[above_lower_bound & below_upper_bound]

In [None]:
plt.figure(figsize=(20,10))
# Density histogram with a KDE curve on top. This is the replacement seaborn
# recommends for the deprecated `distplot`; the extra keywords reproduce
# distplot's default look.
sns.histplot(
    df0['Temperature(F)'],
    stat='density',
    kde=True,
    kde_kws=dict(cut=3),
    color='#64bde9',
    alpha=.4,
    edgecolor=(1, 1, 1, .4),
)

# giving title to the plot
plt.title("Temperature Distribution during the accidents", fontsize = 40, color = '#820e25')

# giving X and Y labels
plt.xlabel("Temperature", fontsize = 28, color = '#820e25')
# The bars are normalised to a density, not a percentage of counts.
plt.ylabel("Density", fontsize = 28, color = '#820e25')

#saving plot
plt.savefig('Temperature Distribution during the accidents.jpeg')

#### Natural Lighting Conditions
  'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight', 'Start_Time_weekday'

In [None]:
plt.figure(figsize=(30,15))

# Accident counts per Sunrise_Sunset category. The wedge labels are taken from
# the counted categories themselves so a label can never be attached to the
# wrong wedge, whatever order value_counts() returns.
lighting_counts = df0['Sunrise_Sunset'].value_counts()
labels1 = lighting_counts.index.tolist()

# Fixed colour per category; any unexpected category falls back to grey.
lighting_palette = {'Day': '#e2b273', 'Night': '#aab8e2'}
wedge_colors = [lighting_palette.get(label, '#cccccc') for label in labels1]

plt.rcParams['font.size'] = 9.0
plt.pie(lighting_counts,
      labels = labels1,
      autopct='%.2f',
      colors = wedge_colors,
      wedgeprops={'linewidth': 5.0, 'edgecolor': 'white'},
      textprops={'size': 15})

# giving title to the plot
plt.title("Natural Lighting Conditions during the accidents", fontsize = 30, color = '#820e25')

# saving plot -- only under this chart's own file name; the earlier extra
# savefig overwrote the weekdays histogram image with this pie chart.
plt.savefig('Natural Lighting Conditions during the accidents.jpeg')