In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Accidents_US.csv")


In [3]:
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-102,Source3,2,2016-02-11 08:34:14,2016-02-11 09:04:14,40.11195,-83.016663,,,0.48,...,False,False,False,False,False,False,Day,Day,Day,Day
1,A-121,Source3,3,2016-02-11 19:22:00,2016-02-11 21:00:00,39.995369,-82.985085,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-242,Source3,3,2016-02-18 07:50:11,2016-02-18 08:20:11,39.819839,-84.189087,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
3,A-243,Source3,2,2016-02-18 08:01:09,2016-02-18 08:31:09,39.745735,-84.120644,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day
4,A-257,Source3,3,2016-02-18 09:25:10,2016-02-18 09:55:10,39.993843,-82.985054,,,0.01,...,False,False,False,False,False,False,Day,Day,Day,Day


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97389 entries, 0 to 97388
Data columns (total 46 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     97389 non-null  object 
 1   Source                 97389 non-null  object 
 2   Severity               97389 non-null  int64  
 3   Start_Time             97389 non-null  object 
 4   End_Time               97389 non-null  object 
 5   Start_Lat              97389 non-null  float64
 6   Start_Lng              97389 non-null  float64
 7   End_Lat                0 non-null      float64
 8   End_Lng                0 non-null      float64
 9   Distance(mi)           97389 non-null  float64
 10  Description            97389 non-null  object 
 11  Street                 97300 non-null  object 
 12  City                   97388 non-null  object 
 13  County                 97389 non-null  object 
 14  State                  97389 non-null  object 
 15  Zi

## Check the dataframe How many rows and columns

In [27]:
rows, cols = df.shape
print(f"The DataFrame has rows - {rows:,} and columns - {cols}")

The DataFrame has rows - 96,004 and columns - 44


## Converting to the correct data types


In [33]:
'''Converting the Start_Time & End_Time column data type from object to datetime64[ns]'''
cols = df.columns[3:5]


df[cols] = df[cols].apply(pd.to_datetime, errors='coerce')

<class 'pandas.core.frame.DataFrame'>
Index: 96004 entries, 0 to 97388
Data columns (total 44 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   ID                     96004 non-null  object        
 1   Source                 96004 non-null  object        
 2   Severity               96004 non-null  int64         
 3   Start_Time             96004 non-null  datetime64[ns]
 4   End_Time               96004 non-null  datetime64[ns]
 5   Start_Lat              96004 non-null  float64       
 6   Start_Lng              96004 non-null  float64       
 7   Distance(mi)           96004 non-null  float64       
 8   Description            96004 non-null  object        
 9   Street                 96004 non-null  object        
 10  City                   96003 non-null  object        
 11  County                 96004 non-null  object        
 12  State                  96004 non-null  object        
 13  Zipcod

## Dropping columns which has more than 40% none value

In [5]:
df.isna().mean()*100

ID                         0.000000
Source                     0.000000
Severity                   0.000000
Start_Time                 0.000000
End_Time                   0.000000
Start_Lat                  0.000000
Start_Lng                  0.000000
End_Lat                  100.000000
End_Lng                  100.000000
Distance(mi)               0.000000
Description                0.000000
Street                     0.091386
City                       0.001027
County                     0.000000
State                      0.000000
Zipcode                    0.000000
Country                    0.000000
Timezone                   0.000000
Airport_Code               0.053394
Weather_Timestamp          1.422132
Temperature(F)             2.120363
Wind_Chill(F)             22.011726
Humidity(%)                2.233312
Pressure(in)               1.695263
Visibility(mi)             2.019735
Wind_Direction             2.208668
Wind_Speed(mph)            6.411402
Precipitation(in)         26

In [6]:
df= df.drop(columns=["End_Lat", "End_Lng"])

In [7]:
df.isna().mean().tail(25)*100

Wind_Chill(F)            22.011726
Humidity(%)               2.233312
Pressure(in)              1.695263
Visibility(mi)            2.019735
Wind_Direction            2.208668
Wind_Speed(mph)           6.411402
Precipitation(in)        26.229862
Weather_Condition         1.977636
Amenity                   0.000000
Bump                      0.000000
Crossing                  0.000000
Give_Way                  0.000000
Junction                  0.000000
No_Exit                   0.000000
Railway                   0.000000
Roundabout                0.000000
Station                   0.000000
Stop                      0.000000
Traffic_Calming           0.000000
Traffic_Signal            0.000000
Turning_Loop              0.000000
Sunrise_Sunset            0.137593
Civil_Twilight            0.137593
Nautical_Twilight         0.137593
Astronomical_Twilight     0.137593
dtype: float64

## Replacing None Values with Median or Mode of the column

In [12]:
df=df.dropna(subset=["Weather_Timestamp"])

In [13]:
df["Temperature(F)"]=df["Temperature(F)"].fillna(df["Temperature(F)"].median())

In [14]:
df["Wind_Chill(F)"]=df["Wind_Chill(F)"].fillna(df["Wind_Chill(F)"].median())

In [15]:
df["Humidity(%)"]=df["Humidity(%)"].fillna(df["Humidity(%)"].median())

In [16]:
df["Pressure(in)"]=df["Pressure(in)"].fillna(df["Pressure(in)"].median())

In [17]:
df["Visibility(mi)"]=df["Visibility(mi)"].fillna(df["Visibility(mi)"].median())

In [18]:
df["Wind_Direction"]=df["Wind_Direction"].fillna(df["Wind_Direction"].mode())

In [19]:
df["Wind_Speed(mph)"]=df["Wind_Speed(mph)"].fillna(df["Wind_Speed(mph)"].median())

In [20]:
df["Precipitation(in)"]=df["Precipitation(in)"].fillna(df["Precipitation(in)"].median())

In [21]:
df["Weather_Condition"]=df["Weather_Condition"].fillna(df["Weather_Condition"].mode())

In [22]:
df["Weather_Condition"]=df["Weather_Condition"].fillna(df["Weather_Condition"].mode())

In [39]:
df["Sunrise_Sunset"]=df["Sunrise_Sunset"].fillna(df["Sunrise_Sunset"].mode()[0])

In [40]:
df["Civil_Twilight"]=df["Civil_Twilight"].fillna(df["Civil_Twilight"].mode()[0])

In [41]:
df["Nautical_Twilight"]=df["Nautical_Twilight"].fillna(df["Nautical_Twilight"].mode()[0])

In [42]:
df["Astronomical_Twilight"]=df["Astronomical_Twilight"].fillna(df["Astronomical_Twilight"].mode()[0])

In [44]:
df.isna().mean()

ID                       0.000000
Source                   0.000000
Severity                 0.000000
Start_Time               0.000000
End_Time                 0.000000
Start_Lat                0.000000
Start_Lng                0.000000
Distance(mi)             0.000000
Description              0.000000
Street                   0.000000
City                     0.000010
County                   0.000000
State                    0.000000
Zipcode                  0.000000
Country                  0.000000
Timezone                 0.000000
Airport_Code             0.000000
Weather_Timestamp        0.000000
Temperature(F)           0.000000
Wind_Chill(F)            0.000000
Humidity(%)              0.000000
Pressure(in)             0.000000
Visibility(mi)           0.000000
Wind_Direction           0.000000
Wind_Speed(mph)          0.000000
Precipitation(in)        0.000000
Weather_Condition        0.005635
Amenity                  0.000000
Bump                     0.000000
Crossing      

In [24]:
df["Wind_Direction"]=df["Wind_Direction"].fillna(df["Wind_Direction"].mode()[0])

In [25]:
df["Street"]=df["Street"].fillna(df["Street"].mode()[0])

In [26]:
df.dtypes

ID                        object
Source                    object
Severity                   int64
Start_Time                object
End_Time                  object
Start_Lat                float64
Start_Lng                float64
Distance(mi)             float64
Description               object
Street                    object
City                      object
County                    object
State                     object
Zipcode                   object
Country                   object
Timezone                  object
Airport_Code              object
Weather_Timestamp         object
Temperature(F)           float64
Wind_Chill(F)            float64
Humidity(%)              float64
Pressure(in)             float64
Visibility(mi)           float64
Wind_Direction            object
Wind_Speed(mph)          float64
Precipitation(in)        float64
Weather_Condition         object
Amenity                     bool
Bump                        bool
Crossing                    bool
Give_Way  

In [45]:
df.to_csv("Us_accidents_Cleaned.csv")