# Project: US Accidents Analysis

Import the libraries needed for exploratory data analysis.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Read the data using pandas and check its size.

In [None]:
# Location of the US Accidents CSV inside the Kaggle input directory
DATA_PATH = '/kaggle/input/us-accidents/US_Accidents_March23.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Display the loaded DataFrame as this cell's output
df

In [None]:
# Print pandas' summary of the DataFrame
df.info()

In [None]:
# Descriptive statistics for the DataFrame's columns
df.describe()

In [None]:
# How many numeric columns do we have in the dataset?
# Only the dtypes listed here are treated as numeric.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_df = df.select_dtypes(include=numerics)

# Column count of the numeric-only frame
numeric_df.shape[1]

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Share of missing values per column, largest first.
# Note: values are fractions of the row count (0-1), not 0-100 percentages.
missing_counts = df.isna().sum()
missing_percentages = missing_counts.sort_values(ascending=False).div(len(df))
missing_percentages

In [None]:
# Check what kind of object the missing-value computation produced
type(missing_percentages)

In [None]:
# Horizontal bar chart of the columns that have at least one missing value
has_missing = missing_percentages.ne(0)
missing_percentages[has_missing].plot.barh()

## Exploratory Analysis and Visualization

Columns we'll analyze:

1. City
2. Start Time
3. Start Lat, Start Lng
4. Temperature
5. Weather Condition

In [None]:
# Inspect the City column
df.City

In [None]:
# Distinct values found in the City column, and how many there are
cities = df['City'].unique()
len(cities)

In [None]:
# Peek at the first 100 entries of the unique-city array
cities[:100]

In [None]:
# Number of accident records per city, most frequent first
cities_by_accident = df['City'].value_counts()
cities_by_accident

In [None]:
# The 20 cities with the most accident records
cities_by_accident.head(20)

In [None]:
# All accident records whose City is exactly 'New York'
is_new_york = df['City'].eq('New York')
df.loc[is_new_york]

In [None]:
# Accident count recorded for New York
cities_by_accident.loc['New York']

In [None]:
# Horizontal bar chart of the 20 cities with the most accident records
cities_by_accident.head(20).plot.barh()

In [None]:
import seaborn as sns
# Use seaborn's "darkgrid" style for the plots that follow
sns.set_style("darkgrid")

In [None]:
# Distribution of per-city accident counts, drawn on a log scale
sns.histplot(data=cities_by_accident, log_scale=True)

In [None]:
# Cities that appear in exactly one accident record
cities_by_accident.loc[cities_by_accident.eq(1)]

### Start Time

In [None]:
# Inspect the Start_Time column as loaded
df.Start_Time

In [None]:
# Convert Start_Time from its loaded (object) dtype to pandas datetimes
df['Start_Time'] = pd.to_datetime(df['Start_Time'])

In [None]:
# Re-display Start_Time to check the result of the datetime conversion
df.Start_Time

In [None]:
# Normalised distribution of accidents by hour of day.
# sns.distplot is deprecated; histplot(stat='density') is the documented
# replacement for distplot(kde=False, norm_hist=True).
sns.histplot(df.Start_Time.dt.hour, bins=24, stat='density')

In [None]:
# Raw accident counts by hour of day, using 24 bins
start_hours = df['Start_Time'].dt.hour
sns.histplot(start_hours, bins=24)

- A high percentage of accidents occur between 6 am and 10 am (probably people in a hurry to get to work).
- The next highest percentage is between 3 pm and 6 pm.

In [None]:
# Normalised distribution of accidents by day of week (pandas: Monday=0 ... Sunday=6).
# sns.distplot is deprecated; histplot(stat='density') is the documented
# replacement for distplot(kde=False, norm_hist=True).
sns.histplot(df.Start_Time.dt.dayofweek, bins=7, stat='density')

Is the distribution of accidents by hour the same on weekends as on weekdays?

In [None]:
# Hourly distribution restricted to Sundays (pandas dayofweek: Monday=0 ... Sunday=6).
sundays_start_time = df.Start_Time[df.Start_Time.dt.dayofweek == 6]
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True)
sns.histplot(sundays_start_time.dt.hour, bins=24, stat='density')

In [None]:
# Hourly distribution restricted to Mondays (pandas dayofweek: Monday=0 ... Sunday=6).
monday_start_time = df.Start_Time[df.Start_Time.dt.dayofweek == 0]
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True)
sns.histplot(monday_start_time.dt.hour, bins=24, stat='density')

On Sundays, the peak occurs between 10 am and 3 pm, unlike on weekdays.

In [None]:
# Year component of each record's Start_Time
df.Start_Time.dt.year

In [None]:
# Monthly distribution of accidents for a single year.
# NOTE(review): the variable is named df_2019 but the filter selects year 2022 --
# confirm which year is intended and align the name with the filter.
df_2019 = df[df.Start_Time.dt.year == 2022]
# NOTE(review): named "_Bing" but filters on Source == 'MapQuest', and the plot
# below does not use it. Kept only in case other cells reference it.
df_2019_Bing = df_2019[df_2019.Source == 'MapQuest']
# histplot(stat='density') replaces the deprecated distplot(kde=False, norm_hist=True)
sns.histplot(df_2019.Start_Time.dt.month, bins=12, stat='density')

In [None]:
# Pie chart of how many records each data source contributed
source_counts = df['Source'].value_counts()
source_counts.plot.pie()

### Start Latitude & Longitude

In [None]:
# Inspect the Start_Lat column
df.Start_Lat

In [None]:
# Inspect the Start_Lng column
df.Start_Lng

In [None]:
# Random 10% sample of the rows, used to keep the scatter plot tractable.
# random_state pins the draw so Restart & Run All reproduces the same figure.
sample_df = df.sample(n=int(0.1 * len(df)), random_state=42)

In [None]:
# Scatter of accident locations (longitude on x, latitude on y).
# `size=` in seaborn is a semantic grouping variable, not a marker size; the
# marker area is set with matplotlib's `s` keyword, which scatterplot forwards.
sns.scatterplot(x=sample_df.Start_Lng, y=sample_df.Start_Lat, s=1)

In [None]:
import folium

In [None]:
# Coordinates of the row labelled 0, shown as a (lat, lon) tuple
lat = df.loc[0, 'Start_Lat']
lon = df.loc[0, 'Start_Lng']
lat, lon

In [None]:
# Print each coordinate column of a 100-row random sample.
# DataFrame.items() yields (column_name, column_Series) pairs; iteritems() was
# its deprecated alias and no longer exists in pandas 2.0+.
for _column_name, column in df[['Start_Lat', 'Start_Lng']].sample(100).items():
    print(column)

In [None]:
# Pair every latitude with its longitude; the cell output is the zip object itself
zip(df['Start_Lat'].tolist(), df['Start_Lng'].tolist())

In [None]:
from folium.plugins import HeatMap

In [None]:
# Random 0.1% sample of the rows for the heat map (rebinds sample_df).
# random_state pins the draw so Restart & Run All reproduces the same map.
sample_df = df.sample(n=int(0.001 * len(df)), random_state=42)

# Build (lat, lon) tuples, skipping rows with a missing coordinate.
# NOTE(review): HeatMap is expected to reject NaN locations -- confirm against folium docs.
heatmap_coords = sample_df[['Start_Lat', 'Start_Lng']].dropna()
lat_lon_pairs = list(zip(heatmap_coords['Start_Lat'], heatmap_coords['Start_Lng']))

In [None]:
# Render the sampled accident locations as a heat map layer on a folium map.
# Named accident_map rather than `map` so the builtin map() is not shadowed
# for the rest of the kernel session.
accident_map = folium.Map()
HeatMap(lat_lon_pairs).add_to(accident_map)
accident_map