In [None]:
# Load relevant imports here
import pandas as pd
import numpy as np

df = pd.read_csv("data/US_Accidents_March23.csv")

df.head()
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)       

In [3]:
# Convert 'Start_Time' column to datetime format

df["Start_Time"] = pd.to_datetime(df["Start_Time"], errors="coerce")


In [4]:
# Extract Year, Month, Hour, and Day of the Week from 'Start_Time'

df["Year"] = df["Start_Time"].dt.year
df["Month"] = df["Start_Time"].dt.month
df["Hour"] = df["Start_Time"].dt.hour
df["Day_of_Week"] = df["Start_Time"].dt.day_name()


In [5]:
# Create 'Day_Night' column based on 'Sunrise_Sunset' values

df["Day_Night"] = df["Sunrise_Sunset"]


In [6]:
# Create 'Is_Severe' column based on 'Severity' values

df["Is_Severe"] = df["Severity"].isin([3, 4])


In [7]:
# Classify roads into categories 

def classify_road(street):
    if pd.isna(street):
        return "Unknown"
    street = street.upper()
    if "I-" in street or "INTERSTATE" in street:
        return "Interstate"
    elif "US-" in street or "US ROUTE" in street:
        return "US Highway"
    elif "STATE ROUTE" in street or "SR-" in street:
        return "State Highway"
    else:
        return "Local / Other"

df["Road_Type"] = df["Street"].apply(classify_road)


In [8]:
# Create a reduced DataFrame with selected columns

reduced_df = df[
    [
        "Severity",
        "Is_Severe",
        "Year",
        "Month",
        "Hour",
        "Day_of_Week",
        "State",
        "Weather_Condition",
        "Road_Type",
        "Day_Night"
    ]
]


In [9]:
# Display the first few rows and info of the reduced DataFrame

reduced_df.head()
reduced_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Is_Severe          bool   
 2   Year               float64
 3   Month              float64
 4   Hour               float64
 5   Day_of_Week        object 
 6   State              object 
 7   Weather_Condition  object 
 8   Road_Type          object 
 9   Day_Night          object 
dtypes: bool(1), float64(3), int64(1), object(5)
memory usage: 538.0+ MB


In [10]:
reduced_df["Accident_Count"] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_df["Accident_Count"] = 1


In [11]:
# Remove rows with missing values in critical columns

reduced_df = reduced_df.dropna(
    subset=["State", "Weather_Condition", "Year", "Month"]
)
reduced_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6829766 entries, 0 to 7728393
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   Severity           int64  
 1   Is_Severe          bool   
 2   Year               float64
 3   Month              float64
 4   Hour               float64
 5   Day_of_Week        object 
 6   State              object 
 7   Weather_Condition  object 
 8   Road_Type          object 
 9   Day_Night          object 
 10  Accident_Count     int64  
dtypes: bool(1), float64(3), int64(2), object(5)
memory usage: 579.7+ MB


In [12]:
# Save the reduced DataFrame to a new CSV file

reduced_df.to_csv("us_accidents_2016_2023.csv", index=False)


In [13]:
# Ensure correct data types for certain columns

reduced_df["Year"] = reduced_df["Year"].astype("int64")
reduced_df["Month"] = reduced_df["Month"].astype("int64")
reduced_df["Hour"] = reduced_df["Hour"].astype("int64")


In [14]:
# Check data types

reduced_df.dtypes


Severity              int64
Is_Severe              bool
Year                  int64
Month                 int64
Hour                  int64
Day_of_Week          object
State                object
Weather_Condition    object
Road_Type            object
Day_Night            object
Accident_Count        int64
dtype: object

In [15]:
# Save the reduced DataFrame to a new CSV file

reduced_df.to_csv("us_accidents_2016_2023_.csv", index=False)
