In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
# by path (this cell relies on the google.colab module).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the accidents CSV on the mounted Drive.
dataset_path = "/content/drive/MyDrive/US_Accidents_March23.csv"

# Read the whole CSV into a single DataFrame.
us_accidents_df = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
# Boolean mask with the same shape as the frame: True wherever a value is missing.
us_accidents_df.isna()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7728389,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7728390,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7728391,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7728392,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Number of missing values in each column.
us_accidents_df.isna().sum()

Unnamed: 0,0
ID,0
Source,0
Severity,0
Start_Time,0
End_Time,0
Start_Lat,0
Start_Lng,0
End_Lat,3402762
End_Lng,3402762
Distance(mi),0


In [None]:
# Share of missing values per column, as a percentage of the total row count.
us_accidents_df.isna().sum().div(len(us_accidents_df)).mul(100)

Unnamed: 0,0
ID,0.0
Source,0.0
Severity,0.0
Start_Time,0.0
End_Time,0.0
Start_Lat,0.0
Start_Lng,0.0
End_Lat,44.029355
End_Lng,44.029355
Distance(mi),0.0


In [None]:
# Drop the 'End_Lat' and 'End_Lng' columns: roughly 44% of their values are missing.
# The result is assigned back (instead of inplace=True) and errors='ignore' is set
# so that re-running this cell after the columns are gone does not raise KeyError.
us_accidents_df = us_accidents_df.drop(columns=['End_Lat', 'End_Lng'], errors='ignore')

In [None]:
# Missing-data handling for 'Wind_Speed(mph)' and 'Precipitation(in)':
# keep only those two columns, restricted to the first 1,000,000 rows.
temp_df = (
    us_accidents_df
    .loc[:, ['Wind_Speed(mph)', 'Precipitation(in)']]
    .iloc[:1_000_000]
)
temp_df.head()

Unnamed: 0,Wind_Speed(mph),Precipitation(in)
0,,0.02
1,,0.0
2,3.5,
3,4.6,
4,3.5,


- `IterativeImputer` uses regression models to impute missing values in each column as a function of other columns.

- It is more flexible than simple mean/median imputation and can capture inter-feature relationships.

- To use a different regression model, pass it through the `estimator` argument (the default is `BayesianRidge`).

In [None]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Step 1: Randomly sample 100,000 rows from the dataset (seeded so the draw is repeatable)
sample_df = temp_df.sample(n=100000, random_state=42)

# Step 2: Perform regression imputation on the sample using IterativeImputer
reg_imputer = IterativeImputer(random_state=42)
sample_imputed = reg_imputer.fit_transform(sample_df)

# Rebuild the DataFrame with both the column labels and the sampled row labels.
# Without index= the rows would be renumbered 0..n-1 and could no longer be
# matched back to the rows of temp_df they were drawn from.
sample_imputed_df = pd.DataFrame(
    sample_imputed,
    columns=sample_df.columns,
    index=sample_df.index,
)

# Step 3: This sample_imputed_df is your imputed subset
print(sample_imputed_df.head())

   Wind_Speed(mph)  Precipitation(in)
0         13.00000           0.000000
1         11.50000           0.018010
2         10.00000           0.000000
3         10.40000           0.020000
4          7.76949           0.014144


### TASK: Impute the columns 'Wind_Speed(mph)' & 'Weather_Condition'

In [None]:
# enable_iterative_imputer has to be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# Impute 'Weather_Condition' with the mode.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place call
# emits a pandas FutureWarning and, per that warning, stops updating the
# DataFrame in pandas 3.0.
most_frequent_weather = us_accidents_df['Weather_Condition'].mode()[0]
us_accidents_df['Weather_Condition'] = (
    us_accidents_df['Weather_Condition'].fillna(most_frequent_weather)
)

# Define numerical columns with missing values for IterativeImputer
numerical_cols_to_impute = [
    'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
    'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)'
]

# Create a copy of the subset of the DataFrame to impute
df_numerical_subset = us_accidents_df[numerical_cols_to_impute].copy()

# Initialize IterativeImputer (seeded so repeated runs give the same imputations)
reg_imputer = IterativeImputer(random_state=42)

# Fit and transform the numerical subset
imputed_numerical_data = reg_imputer.fit_transform(df_numerical_subset)

# Convert the imputed array back to a DataFrame, reusing the original row index
# so the assignment below lines up row for row.
imputed_numerical_df = pd.DataFrame(
    imputed_numerical_data,
    columns=numerical_cols_to_impute,
    index=df_numerical_subset.index,
)

# Update the original DataFrame with the imputed values
us_accidents_df[numerical_cols_to_impute] = imputed_numerical_df

# Verify that missing values have been handled for these columns
print("Missing values after imputation:")
print(us_accidents_df[numerical_cols_to_impute + ['Weather_Condition']].isnull().sum())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  us_accidents_df['Weather_Condition'].fillna(most_frequent_weather, inplace=True)


Missing values after imputation:
Temperature(F)       0
Wind_Chill(F)        0
Humidity(%)          0
Pressure(in)         0
Visibility(mi)       0
Wind_Speed(mph)      0
Precipitation(in)    0
Weather_Condition    0
dtype: int64
