**Some vehicles stopped in violation are exempt from fines due to business reasons. For vehicles that are exempt, are there repeat offenders? Where are exempt vehicles frequently in violation?**

*Recommendation - This problem is great for a mapping visualization! Choose a CUNY bus route that you are familiar with and plot the longitude and latitude of where vehicles are in violation, as well as the overall bus route.*

## Data Extraction via API

### ACE Dataset

In [8]:
import requests
import random
import logging
import time
import csv
import pandas as pd
import os
import io

# Raw and processed data folders: relative paths two levels above the working directory.
DATA_DIR_RAW, DATA_DIR_PROCESSED = (
    os.path.join("..", "..", "data", stage) for stage in ("raw", "processed")
)

# Logging setup: INFO level, each message prefixed with timestamp and level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [9]:
def _ensure_parent_dir(path):
    """Create the folder that will contain ``path`` if it does not exist yet."""
    parent_dir = os.path.dirname(path)
    # A bare file name has no parent component; nothing to create in that case.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)


def write_to_csv(data, output_file, replace=False):
    """Write tabular data to a CSV file, skipping the write if the file exists.

    Args:
        data: A pandas DataFrame, or a list of dicts (one dict per row). For a
            list of dicts the header is the sorted union of all keys; keys
            missing from a row are written as empty cells.
        output_file: Destination path of the CSV file. Missing parent folders
            are created right before writing.
        replace: When False (default) an existing file is left untouched and
            the call returns without writing. When True it is overwritten.

    Returns:
        None. Every outcome (success, skip, empty input, unsupported type,
        write failure) is reported through ``logging`` rather than raised.
    """
    if os.path.exists(output_file) and not replace:
        logging.info(f"☑️ File '{output_file}' already exists. Skipping write.")
        return

    try:
        if isinstance(data, pd.DataFrame):
            # Handle DataFrame directly
            if data.empty:
                logging.warning("⚠️ No data to write (DataFrame is empty).")
                return
            _ensure_parent_dir(output_file)
            data.to_csv(output_file, index=False, encoding="utf-8")
            logging.info(f"📄 Successfully wrote {len(data)} rows (DataFrame) to '{output_file}'")

        elif isinstance(data, list):
            # Handle list of dicts
            if not data:
                logging.warning("⚠️ No data to write (list is empty).")
                return
            # Rows may carry different keys, so the header is the union of them all.
            fieldnames = sorted(set().union(*(d.keys() for d in data)))
            _ensure_parent_dir(output_file)
            with open(output_file, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(data)
            logging.info(f"📄 Successfully wrote {len(data)} rows (list of dicts) to '{output_file}'")

        else:
            logging.error(f"❌ Unsupported data type: {type(data)}. Expected DataFrame or list of dicts.")

    except Exception as e:
        # Deliberate best-effort: a failed write is logged, not raised, so the
        # notebook keeps running.
        logging.error(f"❌ Failed to write CSV: {e}")

In [None]:
# Download every ACE violation row whose status is one of the exempt categories,
# paging through the CSV endpoint 50,000 rows at a time.
ACE_endpoint_url = "https://data.ny.gov/resource/kh8p-hcbm.csv"

# SoQL list literal of the violation_status values treated as exempt.
exempt_types = (
    "'EXEMPT - BUS/PARATRANSIT', "
    "'EXEMPT - OTHER', "
    "'EXEMPT - EMERGENCY VEHICLE', "
    "'EXEMPT - COMMERCIAL UNDER 20'"
)

# Ask the API how many rows match so the paging loop knows when to stop.
count_query = f"""
    SELECT count(*)
    WHERE violation_status IN ({exempt_types})
"""
response = requests.get(ACE_endpoint_url, params={"$query": count_query}, timeout=120)
# Fail loudly on an HTTP error instead of parsing the error body as CSV.
response.raise_for_status()

# The count comes back as a one-row, one-column CSV.
count_df = pd.read_csv(io.StringIO(response.text))
max_rows = int(count_df.iloc[0, 0])
print(f"Total exempt rows available: {max_rows}")

all_data = []
limit = 50000
offset = 0

while offset < max_rows:
    print(f"Fetching rows {offset + 1} to {min(offset + limit, max_rows)}")

    # ORDER BY :id pins a stable row order; without it LIMIT/OFFSET pages are
    # not guaranteed to be consistent and may repeat or skip rows.
    soql_query = f"""
        SELECT *
        WHERE violation_status IN ({exempt_types})
        ORDER BY :id
        LIMIT {limit}
        OFFSET {offset}
    """

    response = requests.get(ACE_endpoint_url, params={"$query": soql_query}, timeout=120)
    response.raise_for_status()

    try:
        chunk_data = pd.read_csv(io.StringIO(response.text))
    except pd.errors.EmptyDataError:
        break
    # A page with no rows means the data is exhausted; stop requesting more.
    if chunk_data.empty:
        break
    all_data.append(chunk_data)

    offset += limit
    time.sleep(1)  # pause between requests to go easy on the API

# pd.concat raises on an empty list, so fall back to an empty frame.
ACE_exempt_violations = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
print(f"Total rows collected: {len(ACE_exempt_violations)}")
if len(ACE_exempt_violations) != max_rows:
    print(f"Warning: expected {max_rows} rows but collected {len(ACE_exempt_violations)}")

Total exempt rows available: 870810
Fetching rows 1 to 50000
Fetching rows 50001 to 100000
Fetching rows 100001 to 150000
Fetching rows 150001 to 200000
Fetching rows 200001 to 250000
Fetching rows 250001 to 300000
Fetching rows 300001 to 350000
Fetching rows 350001 to 400000
Fetching rows 400001 to 450000
Fetching rows 450001 to 500000
Fetching rows 500001 to 550000
Fetching rows 550001 to 600000
Fetching rows 600001 to 650000
Fetching rows 650001 to 700000
Fetching rows 700001 to 750000
Fetching rows 750001 to 800000
Fetching rows 800001 to 850000
Fetching rows 850001 to 870810
Total rows collected: 870810


In [12]:
# Save the exempt-violations frame under the raw data folder. write_to_csv is
# called with its default replace=False, so an existing file is left untouched.
OUTPUT_ACE = os.path.join(DATA_DIR_RAW, "ACE_exempt_violations.csv")
write_to_csv(ACE_exempt_violations, OUTPUT_ACE)

2025-09-21 19:00:04,590 - INFO - 📄 Successfully wrote 870810 rows (DataFrame) to '..\..\data\raw\ACE_exempt_violations.csv'


In [11]:
# Download every ACE violation row whose status is 'VIOLATION ISSUED'
# (the non-exempt case), paging through the CSV endpoint 50,000 rows at a time.
ACE_endpoint_url = "https://data.ny.gov/resource/kh8p-hcbm.csv"

# SoQL list literal of the violation_status values treated as non-exempt.
non_exempt_types = (
    "'VIOLATION ISSUED'"
)

# Ask the API how many rows match so the paging loop knows when to stop.
count_query = f"""
    SELECT count(*)
    WHERE violation_status IN ({non_exempt_types})
"""
response = requests.get(ACE_endpoint_url, params={"$query": count_query}, timeout=120)
# Fail loudly on an HTTP error instead of parsing the error body as CSV.
response.raise_for_status()

# The count comes back as a one-row, one-column CSV.
count_df = pd.read_csv(io.StringIO(response.text))
max_rows = int(count_df.iloc[0, 0])
# Message corrected: this query counts non-exempt rows, not exempt ones.
print(f"Total non-exempt rows available: {max_rows}")

all_data = []
limit = 50000
offset = 0

while offset < max_rows:
    print(f"Fetching rows {offset + 1} to {min(offset + limit, max_rows)}")

    # ORDER BY :id pins a stable row order; without it LIMIT/OFFSET pages are
    # not guaranteed to be consistent and may repeat or skip rows.
    soql_query = f"""
        SELECT *
        WHERE violation_status IN ({non_exempt_types})
        ORDER BY :id
        LIMIT {limit}
        OFFSET {offset}
    """

    response = requests.get(ACE_endpoint_url, params={"$query": soql_query}, timeout=120)
    response.raise_for_status()

    try:
        chunk_data = pd.read_csv(io.StringIO(response.text))
    except pd.errors.EmptyDataError:
        break
    # A page with no rows means the data is exhausted; stop requesting more.
    if chunk_data.empty:
        break
    all_data.append(chunk_data)

    offset += limit
    time.sleep(1)  # pause between requests to go easy on the API

# pd.concat raises on an empty list, so fall back to an empty frame.
ACE_non_exempt_violations = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
print(f"Total rows collected: {len(ACE_non_exempt_violations)}")
if len(ACE_non_exempt_violations) != max_rows:
    print(f"Warning: expected {max_rows} rows but collected {len(ACE_non_exempt_violations)}")

Total exempt rows available: 2312878
Fetching rows 1 to 50000
Fetching rows 50001 to 100000
Fetching rows 100001 to 150000
Fetching rows 150001 to 200000
Fetching rows 200001 to 250000
Fetching rows 250001 to 300000
Fetching rows 300001 to 350000
Fetching rows 350001 to 400000
Fetching rows 400001 to 450000
Fetching rows 450001 to 500000
Fetching rows 500001 to 550000
Fetching rows 550001 to 600000
Fetching rows 600001 to 650000
Fetching rows 650001 to 700000
Fetching rows 700001 to 750000
Fetching rows 750001 to 800000
Fetching rows 800001 to 850000
Fetching rows 850001 to 900000
Fetching rows 900001 to 950000
Fetching rows 950001 to 1000000
Fetching rows 1000001 to 1050000
Fetching rows 1050001 to 1100000
Fetching rows 1100001 to 1150000
Fetching rows 1150001 to 1200000
Fetching rows 1200001 to 1250000
Fetching rows 1250001 to 1300000
Fetching rows 1300001 to 1350000
Fetching rows 1350001 to 1400000
Fetching rows 1400001 to 1450000
Fetching rows 1450001 to 1500000
Fetching rows 15000

In [12]:
# Save the non-exempt-violations frame under the raw data folder. write_to_csv
# is called with its default replace=False, so an existing file is left untouched.
OUTPUT_ACE_NON = os.path.join(DATA_DIR_RAW, "ACE_non_exempt_violations.csv")
write_to_csv(ACE_non_exempt_violations, OUTPUT_ACE_NON)

2025-09-22 17:31:30,538 - INFO - 📄 Successfully wrote 2312878 rows (DataFrame) to '..\..\data\raw\ACE_non_exempt_violations.csv'


### Events Dataset

In [4]:
import requests
import pandas as pd
import io
import time

# Download every Manhattan event row whose start_date_time falls in 2024,
# paging through the CSV endpoint 50,000 rows at a time.
events_endpoint_url = "https://data.cityofnewyork.us/resource/bkfu-528j.csv"

# get total number of rows for Manhattan events in 2024
count_query = """
    SELECT count(*)
    WHERE event_borough = 'Manhattan'
    AND date_extract_y(start_date_time) = 2024
"""
response = requests.get(events_endpoint_url, params={"$query": count_query}, timeout=120)
# Fail loudly on an HTTP error instead of parsing the error body as CSV.
response.raise_for_status()

# The count comes back as a one-row, one-column CSV.
count_df = pd.read_csv(io.StringIO(response.text))
max_rows = int(count_df.iloc[0, 0])
print(f"Total Manhattan events in 2024 available: {max_rows}")

all_data = []
limit = 50000
offset = 0

while offset < max_rows:
    print(f"Fetching rows {offset + 1} to {min(offset + limit, max_rows)}")

    # ORDER BY :id pins a stable row order; without it LIMIT/OFFSET pages are
    # not guaranteed to be consistent and may repeat or skip rows.
    soql_query = f"""
        SELECT *
        WHERE event_borough = 'Manhattan'
        AND date_extract_y(start_date_time) = 2024
        ORDER BY :id
        LIMIT {limit}
        OFFSET {offset}
    """

    response = requests.get(events_endpoint_url, params={"$query": soql_query}, timeout=120)
    response.raise_for_status()

    try:
        chunk_data = pd.read_csv(io.StringIO(response.text))
    except pd.errors.EmptyDataError:
        break
    # A page with no rows means the data is exhausted; stop requesting more.
    if chunk_data.empty:
        break
    all_data.append(chunk_data)

    offset += limit
    time.sleep(1)  # pause between requests to go easy on the API

# pd.concat raises on an empty list, so fall back to an empty frame.
events_manhattan_2024 = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()

print(f"Total Manhattan events in 2024 collected: {len(events_manhattan_2024)}")
if len(events_manhattan_2024) != max_rows:
    print(f"Warning: expected {max_rows} rows but collected {len(events_manhattan_2024)}")
print(events_manhattan_2024.head())


Total Manhattan events in 2024 available: 1419452
Fetching rows 1 to 50000
Fetching rows 50001 to 100000
Fetching rows 100001 to 150000
Fetching rows 150001 to 200000
Fetching rows 200001 to 250000
Fetching rows 250001 to 300000
Fetching rows 300001 to 350000
Fetching rows 350001 to 400000
Fetching rows 400001 to 450000
Fetching rows 450001 to 500000
Fetching rows 500001 to 550000
Fetching rows 550001 to 600000
Fetching rows 600001 to 650000
Fetching rows 650001 to 700000
Fetching rows 700001 to 750000
Fetching rows 750001 to 800000
Fetching rows 800001 to 850000
Fetching rows 850001 to 900000
Fetching rows 900001 to 950000
Fetching rows 950001 to 1000000
Fetching rows 1000001 to 1050000
Fetching rows 1050001 to 1100000
Fetching rows 1100001 to 1150000
Fetching rows 1150001 to 1200000
Fetching rows 1200001 to 1250000
Fetching rows 1250001 to 1300000
Fetching rows 1300001 to 1350000
Fetching rows 1350001 to 1400000
Fetching rows 1400001 to 1419452
Total Manhattan events in 2024 collecte

In [5]:
# Show the collected events frame as the cell output (the notebook renders a
# truncated first/last-rows preview rather than every row).
events_manhattan_2024

Unnamed: 0,event_id,event_name,start_date_time,end_date_time,event_agency,event_type,event_borough,event_location,event_street_side,street_closure_type,community_board,police_precinct
0,788839,Soccer - Non Regulation,2024-10-20T09:00:00.000,2024-10-20T11:00:00.000,Parks Department,Sport - Adult,Manhattan,Robert Moses Playground: Soccer-01,,,06,17
1,785235,Baseball - 13 and Older (Little League),2024-09-22T08:00:00.000,2024-09-22T19:30:00.000,Parks Department,Sport - Youth,Manhattan,Randall's Island Park: Bronx Shore Fields-Base...,,,11,25
2,739258,Saturday Greeter Shift and Guided Tour,2024-09-14T12:00:00.000,2024-09-14T14:00:00.000,Parks Department,Special Event,Manhattan,Washington Square Park: Washington Square Park,,,2,6
3,804514,Party,2024-09-22T16:30:00.000,2024-09-22T18:30:00.000,Parks Department,Special Event,Manhattan,Central Park: Wild West Playground Lawn,,,64,22
4,783690,Baseball - 12 and Under (Little League),2024-11-09T12:00:00.000,2024-11-09T17:30:00.000,Parks Department,Sport - Youth,Manhattan,Randall's Island Park: Bronx Shore Fields-Soft...,,,11,25
...,...,...,...,...,...,...,...,...,...,...,...,...
1419447,796129,Closure,2024-06-30T08:00:00.000,2026-12-31T23:00:00.000,Parks Department,Special Event,Manhattan,Augustus St. Gaudens Playground: Basketball-01,,,06,13
1419448,796129,Closure,2024-06-30T08:00:00.000,2026-12-31T23:00:00.000,Parks Department,Special Event,Manhattan,Augustus St. Gaudens Playground: Basketball-01,,,06,13
1419449,796129,Closure,2024-06-30T08:00:00.000,2026-12-31T23:00:00.000,Parks Department,Special Event,Manhattan,Augustus St. Gaudens Playground: Basketball-01,,,06,13
1419450,796129,Closure,2024-06-30T08:00:00.000,2026-12-31T23:00:00.000,Parks Department,Special Event,Manhattan,Augustus St. Gaudens Playground: Basketball-01,,,06,13


In [7]:
# Save the Manhattan 2024 events frame under the raw data folder. write_to_csv
# is called with its default replace=False, so an existing file is left untouched.
OUTPUT_events = os.path.join(DATA_DIR_RAW, "events_manhattan_2024.csv")
write_to_csv(events_manhattan_2024, OUTPUT_events)

2025-09-22 12:43:43,021 - INFO - 📄 Successfully wrote 1419452 rows (DataFrame) to '..\..\data\raw\events_manhattan_2024.csv'
