### Handling Missing Data on Dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
dataset_path = '/content/drive/MyDrive/RoadSafety_Nov25/data/raw/US_Accidents_March23.csv'
us_accidents_df = pd.read_csv(dataset_path)

In [4]:
us_accidents_df.isnull()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7728389,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7728390,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7728391,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7728392,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
us_accidents_df.isnull().sum()

Unnamed: 0,0
ID,0
Source,0
Severity,0
Start_Time,0
End_Time,0
Start_Lat,0
Start_Lng,0
End_Lat,3402762
End_Lng,3402762
Distance(mi),0


In [6]:
(us_accidents_df.isnull().sum()/us_accidents_df.shape[0])*100

Unnamed: 0,0
ID,0.0
Source,0.0
Severity,0.0
Start_Time,0.0
End_Time,0.0
Start_Lat,0.0
Start_Lng,0.0
End_Lat,44.029355
End_Lng,44.029355
Distance(mi),0.0


In [7]:
# Drop the columns of 'End_Lat' & 'End_Lng' from this dataset.
us_accidents_df.drop(['End_Lat', 'End_Lng'], axis=1, inplace=True)

In [8]:
us_accidents_df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,Distance(mi),Description,Street,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.01,Right lane blocked due to accident on I-70 Eas...,I-70 E,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.01,Accident on Brice Rd at Tussing Rd. Expect del...,Brice Rd,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.01,Accident on OH-32 State Route 32 Westbound at ...,State Route 32,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.01,Accident on I-75 Southbound at Exits 52 52B US...,I-75 S,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.01,Accident on McEwen Rd at OH-725 Miamisburg Cen...,Miamisburg Centerville Rd,...,False,False,False,False,True,False,Day,Day,Day,Day


In [9]:
# Handling the missing data of 'Wind_Speed(mph)' & 'Precipication(in)'
temp_df =   us_accidents_df[['Wind_Speed(mph)', 'Precipitation(in)']]
temp_df.head()

Unnamed: 0,Wind_Speed(mph),Precipitation(in)
0,,0.02
1,,0.0
2,3.5,
3,4.6,
4,3.5,


- `IterativeImputer` uses regression models to impute missing values in each column as a function of other columns.

- It is more flexible than simple mean/median imputation and can capture inter-feature relationships.

- You can adjust estimator type via the estimator argument for custom regression models if desired (default is `BayesianRidge`).

In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Step-1: Randomly sample 100,000 rows from the dataset
sample_df = temp_df.sample(n=100000, random_state=42)

# Step-2: Perform regression imputation on the sample using IterativeImputer
reg_imputer = IterativeImputer(random_state=42)
sample_imputed = reg_imputer.fit_transform(sample_df)
sample_imputed_df = pd.DataFrame(sample_imputed, columns=sample_df.columns)

# Step-3: This sample_imputed_df is the imputed subset
sample_imputed_df.head()

Unnamed: 0,Wind_Speed(mph),Precipitation(in)
0,13.0,0.01
1,9.0,0.0
2,10.4,0.009937
3,3.0,0.0
4,6.0,0.0


### TASK: Impute the columns of 'Wind_Speed(mph)' & 'Weather_Conditions'

In [12]:
print("Missing values in Weather_Condition:", us_accidents_df['Weather_Condition'].isnull().sum())
print("\nMost common weather conditions:")
print(us_accidents_df['Weather_Condition'].value_counts().head(10))
most_frequent_weather = us_accidents_df['Weather_Condition'].mode()[0]
print(f"Most frequent weather condition: {most_frequent_weather}")
us_accidents_df['Weather_Condition'].fillna(most_frequent_weather, inplace=True)
print("Missing values after imputation:", us_accidents_df['Weather_Condition'].isnull().sum())
print("\nFirst few rows of Weather_Condition:")
print(us_accidents_df['Weather_Condition'].head())


Missing values in Weather_Condition: 173459

Most common weather conditions:
Weather_Condition
Fair                2560802
Mostly Cloudy       1016195
Cloudy               817082
Clear                808743
Partly Cloudy        698972
Overcast             382866
Light Rain           352957
Scattered Clouds     204829
Light Snow           128680
Fog                   99238
Name: count, dtype: int64
Most frequent weather condition: Fair


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  us_accidents_df['Weather_Condition'].fillna(most_frequent_weather, inplace=True)


Missing values after imputation: 0

First few rows of Weather_Condition:
0       Light Rain
1       Light Rain
2         Overcast
3    Mostly Cloudy
4    Mostly Cloudy
Name: Weather_Condition, dtype: object
