In [5]:
import requests
import pandas as pd
from pathlib import Path
from datetime import datetime
import logging
import time
import json

In [6]:
# Module-level logger shared by every cell in this notebook.
logger = logging.getLogger(__name__)

# Install a default console configuration only when nothing is configured yet.
# ``hasHandlers()`` also inspects ancestor loggers (including the root logger,
# which is the one ``basicConfig`` sets up); checking ``logger.handlers`` alone
# only sees handlers attached directly to this named logger.
if not logger.hasHandlers():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

In [None]:
class TorontoCollisionsExtractor:
    """Extract Toronto Police traffic collision data via ArcGIS REST API.

    Records are downloaded one year at a time from the feature-layer
    ``query`` endpoint, paging with ``resultOffset`` / ``resultRecordCount``.
    The combined result can be written as a timestamped CSV file under
    ``bronze_path``.
    """

    BASE_URL = "https://services.arcgis.com/S9th0jAJ7bqgIRjw/arcgis/rest/services/Traffic_Collisions_Open_Data/FeatureServer/0/query"

    # Rows requested per page (sent as ``resultRecordCount``).
    PAGE_SIZE = 2000
    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Pause between consecutive page requests, in seconds.
    PAGE_DELAY_SECONDS = 0.5

    def __init__(self, bronze_path="data/bronze/toronto_collisions"):
        """Create the extractor and make sure the output directory exists.

        Args:
            bronze_path: Directory that receives the raw CSV snapshots.
                Accepts a string or ``Path``; created if missing.
        """
        self.bronze_path = Path(bronze_path)
        self.bronze_path.mkdir(parents=True, exist_ok=True)

    def fetch_collisions(self, start_year: int = 2014, end_year: int = None) -> pd.DataFrame:
        """Download collision records for every year in an inclusive range.

        The fetch is best-effort: a year that fails part-way is logged and
        the rows already received for it are kept, then the next year is
        attempted. An HTTP 400 response aborts the whole run and returns
        whatever has been collected so far.

        Args:
            start_year: First year to request.
            end_year: Last year to request (inclusive). Defaults to the
                current calendar year when ``None``.

        Returns:
            A DataFrame with one row per feature's ``attributes`` mapping;
            empty when nothing could be fetched.
        """
        if end_year is None:
            end_year = datetime.now().year

        logger.info(f"Fetching Toronto collision data: {start_year}-{end_year}")

        all_records = []

        for year in range(start_year, end_year + 1):
            logger.info(f"  Fetching year {year}...")
            year_records, rejected = self._fetch_year(year)
            # Keep whatever was received, even if the year ended early.
            all_records.extend(year_records)

            if rejected:
                # The service refused the request outright; further years
                # would hit the same URL, so stop here.
                return pd.DataFrame(all_records)

            logger.info(f"   Year {year}: {len(year_records):,} total records")

        df = pd.DataFrame(all_records)
        if not df.empty:
            logger.info(f"\n Total records fetched: {len(df):,}")
        return df

    def _fetch_year(self, year: int) -> tuple:
        """Fetch every page of records for one year.

        Returns:
            ``(records, rejected)`` where ``records`` is the list of attribute
            dicts received and ``rejected`` is ``True`` only when the service
            answered with HTTP 400.
        """
        records = []
        offset = 0

        while True:
            params = {
                # NOTE(review): OCC_YEAR is compared against a quoted value,
                # which presumes a text field -- confirm against the layer schema.
                'where': f"OCC_YEAR = '{year}'",
                'outFields': '*',
                'returnGeometry': 'false',
                'f': 'json',
                'resultRecordCount': self.PAGE_SIZE,
                'resultOffset': offset
            }

            try:
                response = requests.get(
                    self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
                )

                if response.status_code == 400:
                    logger.error(f" API Rejected URL: {self.BASE_URL}")
                    logger.error("Please verify the Service URL on the Toronto Police Open Data Portal.")
                    return records, True

                response.raise_for_status()
                data = response.json()

                # ArcGIS can report a failed query with HTTP 200 and an
                # ``error`` object in the body; without this check it would
                # be indistinguishable from "no rows for this year".
                if 'error' in data:
                    logger.error(f"    Error fetching year {year}: {data['error']}")
                    break

                features = data.get('features')
                if not features:
                    break

                page = [feature['attributes'] for feature in features]
                records.extend(page)

                # A short page normally means the last page, unless the
                # server capped the page below PAGE_SIZE, in which case it
                # sets ``exceededTransferLimit`` to signal more rows remain.
                if len(page) < self.PAGE_SIZE and not data.get('exceededTransferLimit'):
                    break

                # Advance by what was actually received so a server-side cap
                # smaller than PAGE_SIZE does not skip rows.
                offset += len(page)
                time.sleep(self.PAGE_DELAY_SECONDS)

            except Exception as e:
                # Deliberate best-effort: log the failure and stop this year,
                # keeping the rows gathered before the error.
                logger.error(f"    Error fetching year {year}: {e}")
                break

        return records, False

    def save_raw_data(self, df: pd.DataFrame):
        """Write ``df`` to a timestamped CSV file inside ``bronze_path``.

        Args:
            df: Records to persist. An empty frame is skipped with a warning.

        Returns:
            The ``Path`` of the file written, or ``None`` when nothing was saved.
        """
        if df.empty:
            logger.warning("DataFrame is empty. Skipping save.")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = self.bronze_path / f"toronto_collisions_{timestamp}.csv"
        # The directory may have been removed since construction.
        self.bronze_path.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_file, index=False)
        logger.info(f"Raw data saved to: {output_file}")
        return output_file

In [11]:
if __name__ == "__main__":
    # Download every collision record from 2014 onward and persist the
    # result as a raw CSV snapshot. `extractor` and `df` stay in the
    # notebook namespace for later inspection.
    extractor = TorontoCollisionsExtractor()
    df = extractor.fetch_collisions(2014)
    extractor.save_raw_data(df)

2026-01-30 14:56:06,953 - __main__ - INFO - Fetching Toronto collision data: 2014-2026
2026-01-30 14:56:06,953 - __main__ - INFO -   Fetching year 2014...
2026-01-30 14:56:42,132 - __main__ - INFO -   ✅ Year 2014: 64,596 total records
2026-01-30 14:56:42,133 - __main__ - INFO -   Fetching year 2015...
2026-01-30 14:57:19,381 - __main__ - INFO -   ✅ Year 2015: 67,265 total records
2026-01-30 14:57:19,383 - __main__ - INFO -   Fetching year 2016...
2026-01-30 14:57:57,888 - __main__ - INFO -   ✅ Year 2016: 69,669 total records
2026-01-30 14:57:57,890 - __main__ - INFO -   Fetching year 2017...
2026-01-30 14:58:41,335 - __main__ - INFO -   ✅ Year 2017: 74,209 total records
2026-01-30 14:58:41,337 - __main__ - INFO -   Fetching year 2018...
2026-01-30 14:59:29,019 - __main__ - INFO -   ✅ Year 2018: 79,271 total records
2026-01-30 14:59:29,021 - __main__ - INFO -   Fetching year 2019...
2026-01-30 15:00:19,982 - __main__ - INFO -   ✅ Year 2019: 82,831 total records
2026-01-30 15:00:19,984 -