In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pathlib import Path

In [26]:
# Load the raw (bronze-layer) Toronto collisions snapshot into a DataFrame.
bronze_csv = '../../data/bronze/toronto_collisions/toronto_collisions_20260131_180442.csv'
df = pd.read_csv(bronze_csv)

In [27]:
# Profile the raw frame: dimensions, per-column dtypes, and the columns
# that contain at least one null (with their null counts).
null_counts = df.isnull().sum()
print(f"Initial Shape: {df.shape}")
print("\n--- Data Types ---", df.dtypes, sep="\n")
print("\n--- Missing Values ---", null_counts[null_counts > 0], sep="\n")

Initial Shape: (772516, 21)

--- Data Types ---
OBJECTID               int64
EVENT_UNIQUE_ID       object
OCC_DATE               int64
OCC_MONTH             object
OCC_DOW               object
OCC_YEAR               int64
OCC_HOUR               int64
DIVISION              object
FATALITIES             int64
INJURY_COLLISIONS     object
FTR_COLLISIONS        object
PD_COLLISIONS         object
HOOD_158              object
NEIGHBOURHOOD_158     object
LONG_WGS84           float64
LAT_WGS84            float64
AUTOMOBILE            object
MOTORCYCLE            object
PASSENGER             object
BICYCLE               object
PEDESTRIAN            object
dtype: object

--- Missing Values ---
INJURY_COLLISIONS    4
FTR_COLLISIONS       4
PD_COLLISIONS        4
AUTOMOBILE           4
MOTORCYCLE           4
PASSENGER            4
BICYCLE              4
PEDESTRIAN           4
dtype: int64


In [28]:
# Categorical flag columns (vehicle/person involvement and collision type).
# They are inspected and then normalized to plain 'YES'/'NO' strings further
# down in this notebook.
flag_cols = [
    'AUTOMOBILE',
    'MOTORCYCLE',
    'PASSENGER',
    'BICYCLE',
    'PEDESTRIAN',
    'INJURY_COLLISIONS',
    'FTR_COLLISIONS',
    'PD_COLLISIONS',
]

In [29]:
# OCC_DATE is an integer column; interpret it as milliseconds since the
# Unix epoch and keep the parsed timestamp in a new column.
occ_date_ms = df['OCC_DATE']
df['clean_date'] = pd.to_datetime(occ_date_ms, unit='ms')

# Show the distinct raw values of every flag column that is actually present,
# so the normalization rules can be checked against real data.
print("\n--- Unique Values in Flag Columns (Before Clean) ---")
present_flags = [name for name in flag_cols if name in df.columns]
for col in present_flags:
    print(f"{col}: {df[col].unique()}")


--- Unique Values in Flag Columns (Before Clean) ---
AUTOMOBILE: ['YES' 'N/R' 'NO' nan]
MOTORCYCLE: ['NO' 'N/R' 'YES' nan]
PASSENGER: ['NO' 'YES' 'N/R' nan]
BICYCLE: ['NO' 'N/R' 'YES' nan]
PEDESTRIAN: ['NO' 'N/R' 'YES' nan]
INJURY_COLLISIONS: ['NO' 'YES' nan]
FTR_COLLISIONS: ['YES' 'NO' nan]
PD_COLLISIONS: ['NO' 'YES' nan]


In [30]:
# Collapse every flag column to 'YES'/'NO': 'N/R' becomes 'NO', and anything
# not in the mapping (including missing values) maps to NaN and is then
# filled with 'NO'.
flag_value_map = {'YES': 'YES', 'NO': 'NO', 'N/R': 'NO'}
for flag in flag_cols:
    mapped = df[flag].map(flag_value_map)
    df[flag] = mapped.fillna('NO')


In [31]:
# Latitude / longitude columns that get the zero-coordinate cleanup.
geo_cols = [
    'LAT_WGS84',
    'LONG_WGS84',
]

In [32]:
# Treat coordinates that are (numerically) zero as missing: each geo column
# is checked on its own and matching cells are replaced with NaN.
for coord in geo_cols:
    if coord not in df.columns:
        continue
    is_zero = np.isclose(df[coord], 0)
    df.loc[is_zero, coord] = np.nan

In [33]:
# Report how many rows ended up without a latitude after the zero masking.
coord_report = f"Coordinates cleaned. {df['LAT_WGS84'].isna().sum()} records now have NaN coordinates (likely NSA)."
print(coord_report)

Coordinates cleaned. 126069 records now have NaN coordinates (likely NSA).


In [34]:
# Fold Division 54 into Division 55 (amalgamation).
# One exact-match mask drives both the reported count and the rewrite, so the
# message can never claim records were normalized when nothing was replaced
# (previously the count used a substring match on '54' while the replacement
# only touched values equal to 'D54').
# NOTE(review): assumes divisions are encoded exactly as 'D54' / 'D55' —
# confirm against the actual DIVISION values.
if 'DIVISION' in df.columns:
    d54_mask = df['DIVISION'].eq('D54')
    d54_count = int(d54_mask.sum())
    if d54_count > 0:
        print(f"\n Normalizing {d54_count} records from D54 to D55 (Amalgamation).")
        df.loc[d54_mask, 'DIVISION'] = 'D55'

In [35]:
# Profile the frame again after cleaning: dimensions, per-column dtypes, and
# the columns that still contain nulls. The label says "Cleaned" because this
# runs after the transformations above (it previously reused "Initial Shape"
# from the first summary cell).
remaining_nulls = df.isnull().sum()
print(f"Cleaned Shape: {df.shape}")
print("\n--- Data Types ---")
print(df.dtypes)
print("\n--- Missing Values ---")
print(remaining_nulls[remaining_nulls > 0])

Initial Shape: (772516, 22)

--- Data Types ---
OBJECTID                      int64
EVENT_UNIQUE_ID              object
OCC_DATE                      int64
OCC_MONTH                    object
OCC_DOW                      object
OCC_YEAR                      int64
OCC_HOUR                      int64
DIVISION                     object
FATALITIES                    int64
INJURY_COLLISIONS            object
FTR_COLLISIONS               object
PD_COLLISIONS                object
HOOD_158                     object
NEIGHBOURHOOD_158            object
LONG_WGS84                  float64
LAT_WGS84                   float64
AUTOMOBILE                   object
MOTORCYCLE                   object
PASSENGER                    object
BICYCLE                      object
PEDESTRIAN                   object
clean_date           datetime64[ns]
dtype: object

--- Missing Values ---
LONG_WGS84    126069
LAT_WGS84     126069
dtype: int64


In [36]:
# Resolve the silver-layer output directory to an absolute path, create it
# (and any missing parents) if needed, and build the target CSV path inside it.
save_dir = Path("../../data/silver/toronto_collisions").resolve()
save_dir.mkdir(exist_ok=True, parents=True)
output_file = Path(save_dir, "toronto_collisions_silver.csv")

In [37]:
# Persist the cleaned frame as CSV, leaving out the DataFrame index.
df.to_csv(path_or_buf=output_file, index=False)