In [13]:
import requests
import pandas as pd
from pathlib import Path
import logging
import re

logging.basicConfig(level=logging.INFO, format='%(message)s')
logger = logging.getLogger(__name__)
bronze_path = Path("../../data/bronze/NCDB")
bronze_path.mkdir(parents=True, exist_ok=True)

# Script for Downloading the NCDB Dataset

This script downloads the NCDB (National Collision Database) datasets for all available years, from 1999 to the most recent release.

The data is retrieved from the Open Canada API. Upon verification, the API provides datasets in either CSV or XLSX format. To standardize the data for downstream Transformation and Load cycles, all files are converted into a consistent CSV format.

The standardized CSV files are stored in the Bronze layer and will later be migrated to the Silver layer after cleaning and merging, following the Medallion Architecture — a widely used design pattern in data engineering.


In [14]:
def download_and_standardize_ncdb():
    api_url = "https://open.canada.ca/data/api/3/action/package_show?id=1eb9eba7-71d1-4b30-9fb1-30cbdab7e63a"
    
    print("1. Fetching file list from Open Canada...")
    try:
        response = requests.get(api_url)
        resources = response.json()['result']['resources']
    except Exception as e:
        print(f"❌ API Error: {e}")
        return

    # Regex to find year (e.g., 1999, 2020)
    year_pattern = re.compile(r'(19|20)\d{2}')

    for r in resources:
        name = r.get('name', '')
        url = r.get('url', '')
        fmt = r.get('format', '').upper()
        
        # FILTER 1: Skip French and Dictionaries
        if 'french' in name.lower() or 'dictionary' in name.lower() or 'dictionnaire' in name.lower():
            continue
            
        # FILTER 2: Find a Year
        match = year_pattern.search(name)
        if not match:
            continue
            
        year = match.group(0)
        
        # Special handling for "99_TO_2001" -> Skip it if we can get individual years instead
        if "99_TO" in name:
            print(f"   Skipping aggregate file {name} (preferring individual years)")
            continue

        # Target Filename (Everything becomes .csv in the end)
        final_file = bronze_path / f"ncdb_{year}.csv"
        
        if final_file.exists():
            print(f"    {year}: Already exists ({final_file.name})")
            continue

        print(f"    Downloading {year} ({fmt})...")
        
        try:
            # Download content to memory
            r = requests.get(url)
            
            if fmt == 'CSV':
                # Direct save for CSV
                with open(final_file, 'wb') as f:
                    f.write(r.content)
                print(f"      Saved as {final_file.name}")
                
            elif fmt == 'XLSX':
                # CONVERT Excel to CSV
                print(f"       Converting XLSX to CSV...")
                # Load Excel into Pandas
                df_temp = pd.read_excel(r.content)
                # Save as CSV
                df_temp.to_csv(final_file, index=False, encoding='latin1')
                print(f"      Converted & Saved as {final_file.name}")
                
        except Exception as e:
            print(f"     Failed to process {year}: {e}")

    print("All years downloaded and standardized to CSV.")

In [15]:
download_and_standardize_ncdb()

1. Fetching file list from Open Canada...
    Downloading 2019 (CSV)...
      Saved as ncdb_2019.csv
    2019: Already exists (ncdb_2019.csv)
   Skipping aggregate file National Collision Database (99_TO_2001) (preferring individual years)
   Skipping aggregate file National Collision Database (99_TO_2001) (preferring individual years)
    Downloading 1999 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_1999.csv
    1999: Already exists (ncdb_1999.csv)
    Downloading 2000 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2000.csv
    2000: Already exists (ncdb_2000.csv)
    Downloading 2001 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2001.csv
    2001: Already exists (ncdb_2001.csv)
    Downloading 2002 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2002.csv
    2002: Already exists (ncdb_2002.csv)
    Downloading 2003 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2003.csv
    2003: Already exists (ncdb_2003.csv)
    Downloading 2004 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2004.csv
    2004: Already exists (ncdb_2004.csv)
    Downloading 2005 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2005.csv
    2005: Already exists (ncdb_2005.csv)
    Downloading 2006 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2006.csv
    2006: Already exists (ncdb_2006.csv)
    Downloading 2007 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2007.csv
    2007: Already exists (ncdb_2007.csv)
    Downloading 2008 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2008.csv
    2008: Already exists (ncdb_2008.csv)
    Downloading 2009 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2009.csv
    2009: Already exists (ncdb_2009.csv)
    Downloading 2010 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2010.csv
    2010: Already exists (ncdb_2010.csv)
    Downloading 2011 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2011.csv
    2011: Already exists (ncdb_2011.csv)
    Downloading 2012 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2012.csv
    2012: Already exists (ncdb_2012.csv)
    Downloading 2013 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2013.csv
    2013: Already exists (ncdb_2013.csv)
    Downloading 2014 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2014.csv
    2014: Already exists (ncdb_2014.csv)
    Downloading 2015 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2015.csv
    2015: Already exists (ncdb_2015.csv)
    Downloading 2016 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2016.csv
    2016: Already exists (ncdb_2016.csv)
    Downloading 2017 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2017.csv
    2017: Already exists (ncdb_2017.csv)
    Downloading 2018 (XLSX)...
       Converting XLSX to CSV...


  df_temp = pd.read_excel(r.content)


      Converted & Saved as ncdb_2018.csv
    2018: Already exists (ncdb_2018.csv)
    2019: Already exists (ncdb_2019.csv)
    2019: Already exists (ncdb_2019.csv)
    Downloading 2020 (CSV)...
      Saved as ncdb_2020.csv
    2020: Already exists (ncdb_2020.csv)
    Downloading 2021 (CSV)...
     Failed to process 2021: Invalid URL '': No scheme supplied. Perhaps you meant https://?
    Downloading 2021 (CSV)...
      Saved as ncdb_2021.csv
All years downloaded and standardized to CSV.
