In [2]:
import requests
import pandas as pd

In [29]:
# Base URL of the JSON endpoint to query
url = "https://data.ny.gov/resource/kh8p-hcbm.json"

# Query parameters (optional)
params = {
    "$limit": 10000,        # how many rows to fetch
    "$offset": 0,          # for pagination
    # "$select": "col1, col2, col3",  # only necessary columns
    # "$where": "some_column = 'some_value'",  # filtering
}

# Make the request. The timeout keeps a stalled connection from hanging the
# kernel indefinitely (requests has no default timeout).
response = requests.get(url, params=params, timeout=60)

# Fail loudly on an HTTP error instead of printing and carrying on: otherwise
# `df` would be left undefined (or, worse, stale from a previous run) and the
# cells below would fail or silently analyse old data.
response.raise_for_status()

# The endpoint returns a JSON array of records; one record becomes one row.
data = response.json()
df = pd.DataFrame(data)


In [30]:
# Preview the first rows to sanity-check that the load produced the expected columns.
df.head()

Unnamed: 0,violation_id,vehicle_id,first_occurrence,last_occurrence,violation_status,violation_type,bus_route_id,violation_latitude,violation_longitude,stop_id,stop_name,bus_stop_latitude,bus_stop_longitude,violation_georeference,bus_stop_georeference
0,489749182,c5ae1411153b52556a1e648cc80d718aa519a4bdd189ab...,2025-08-20T23:12:08.000,2025-08-21T00:24:08.000,TECHNICAL ISSUE/OTHER,MOBILE BUS STOP,BX36,40.840509,-73.881189,102498,EAST TREMONT AV/VYSE AV,40.841076,-73.882483,"{'type': 'Point', 'coordinates': [-73.881189, ...","{'type': 'Point', 'coordinates': [-73.882483, ..."
1,489744714,df9044acf85cf55488aea4cd3ce1d0e17ef050551726b6...,2025-08-20T23:48:59.000,2025-08-20T23:54:47.000,EXEMPT - BUS/PARATRANSIT,MOBILE BUS STOP,BX28,40.874017,-73.890646,100080,PAUL AV/BEDFORD PARK BLVD,40.874629,-73.891539,"{'type': 'Point', 'coordinates': [-73.890646, ...","{'type': 'Point', 'coordinates': [-73.891539, ..."
2,489743631,eb5a337966ba65f66ab1db8e169d2446a4fb429b0efc63...,2025-08-20T22:33:13.000,2025-08-20T23:56:02.000,TECHNICAL ISSUE/OTHER,MOBILE DOUBLE PARKED,Q53+,40.721971,-73.867136,550473,WOODHAVEN BLVD/PENELOPE AV,40.722487,-73.867736,"{'type': 'Point', 'coordinates': [-73.867136, ...","{'type': 'Point', 'coordinates': [-73.867736, ..."
3,489741945,3f877f70d9b253515a945be807c9c62d5814949f810310...,2025-08-20T22:50:45.000,2025-08-20T23:32:43.000,EXEMPT - OTHER,MOBILE BUS STOP,Q44+,40.762529,-73.831728,501140,UNION ST/35 AV,40.765422,-73.827944,"{'type': 'Point', 'coordinates': [-73.831728, ...","{'type': 'Point', 'coordinates': [-73.827944, ..."
4,489741940,7feac037b62d591ffb1214e356157f3dd197fc22fee5bb...,2025-08-20T10:52:57.000,2025-08-20T11:16:57.000,EXEMPT - EMERGENCY VEHICLE,MOBILE BUS STOP,M101,40.815113,-73.95504,401458,AMSTERDAM AV/W 131 ST,40.816009,-73.954424,"{'type': 'Point', 'coordinates': [-73.95504, 4...","{'type': 'Point', 'coordinates': [-73.954424, ..."


In [31]:
# (rows, columns) of the fetched frame; the row count should match the "$limit" requested above.
df.shape

(10000, 15)

**Data Cleaning**

In [11]:
# Count missing values per column
df.isna().sum()

violation_id              0
vehicle_id                8
first_occurrence          0
last_occurrence           0
violation_status          0
violation_type            0
bus_route_id              0
violation_latitude        0
violation_longitude       0
stop_id                   0
stop_name                 0
bus_stop_latitude         0
bus_stop_longitude        0
violation_georeference    0
bus_stop_georeference     0
dtype: int64

In [21]:
# Column dtypes and non-null counts; every column arrives as `object`
# (strings), so numeric/datetime columns need explicit conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   violation_id         10000 non-null  object
 1   vehicle_id           9992 non-null   object
 2   first_occurrence     10000 non-null  object
 3   last_occurrence      10000 non-null  object
 4   violation_status     10000 non-null  object
 5   violation_type       10000 non-null  object
 6   bus_route_id         10000 non-null  object
 7   violation_latitude   10000 non-null  object
 8   violation_longitude  10000 non-null  object
 9   stop_id              10000 non-null  object
 10  stop_name            10000 non-null  object
 11  bus_stop_latitude    10000 non-null  object
 12  bus_stop_longitude   10000 non-null  object
dtypes: object(13)
memory usage: 1015.8+ KB


**`violation_georeference` and `bus_stop_georeference` are not needed: they duplicate the violation and bus-stop latitude/longitude columns.**

In [19]:
# Drop the GeoJSON point columns; the same coordinates are already available
# in the separate latitude/longitude columns.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError.
df = df.drop(
    columns=["violation_georeference", "bus_stop_georeference"],
    errors="ignore",
)

In [22]:
# Parse the two occurrence timestamp columns from strings into datetimes
for ts_col in ("first_occurrence", "last_occurrence"):
    df[ts_col] = pd.to_datetime(df[ts_col])

In [24]:
# Count rows that are exact duplicates of an earlier row (all columns compared)
df.duplicated().sum()

0

In [26]:
# List the distinct violation_status categories present in the sample
df["violation_status"].unique()

array(['TECHNICAL ISSUE/OTHER', 'EXEMPT - BUS/PARATRANSIT',
       'EXEMPT - OTHER', 'EXEMPT - EMERGENCY VEHICLE',
       'DRIVER/VEHICLE INFO MISSING', 'EXEMPT - COMMERCIAL UNDER 20'],
      dtype=object)

In [27]:
# List the distinct violation_type categories present in the sample
df['violation_type'].unique()

array(['MOBILE BUS STOP', 'MOBILE DOUBLE PARKED', 'MOBILE BUS LANE'],
      dtype=object)

Unnamed: 0,violation_id,vehicle_id,first_occurrence,last_occurrence,violation_status,violation_type,bus_route_id,violation_latitude,violation_longitude,stop_id,stop_name,bus_stop_latitude,bus_stop_longitude
0,489749182,c5ae1411153b52556a1e648cc80d718aa519a4bdd189ab...,2025-08-20 23:12:08,2025-08-21 00:24:08,TECHNICAL ISSUE/OTHER,MOBILE BUS STOP,BX36,40.840509,-73.881189,102498,EAST TREMONT AV/VYSE AV,40.841076,-73.882483
1,489744714,df9044acf85cf55488aea4cd3ce1d0e17ef050551726b6...,2025-08-20 23:48:59,2025-08-20 23:54:47,EXEMPT - BUS/PARATRANSIT,MOBILE BUS STOP,BX28,40.874017,-73.890646,100080,PAUL AV/BEDFORD PARK BLVD,40.874629,-73.891539
2,489743631,eb5a337966ba65f66ab1db8e169d2446a4fb429b0efc63...,2025-08-20 22:33:13,2025-08-20 23:56:02,TECHNICAL ISSUE/OTHER,MOBILE DOUBLE PARKED,Q53+,40.721971,-73.867136,550473,WOODHAVEN BLVD/PENELOPE AV,40.722487,-73.867736
3,489741945,3f877f70d9b253515a945be807c9c62d5814949f810310...,2025-08-20 22:50:45,2025-08-20 23:32:43,EXEMPT - OTHER,MOBILE BUS STOP,Q44+,40.762529,-73.831728,501140,UNION ST/35 AV,40.765422,-73.827944
4,489741940,7feac037b62d591ffb1214e356157f3dd197fc22fee5bb...,2025-08-20 10:52:57,2025-08-20 11:16:57,EXEMPT - EMERGENCY VEHICLE,MOBILE BUS STOP,M101,40.815113,-73.95504,401458,AMSTERDAM AV/W 131 ST,40.816009,-73.954424
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,488739171,1265d2ab9c9d365f5832bd8bd076bf03f595e033274108...,2025-08-14 09:39:29,2025-08-14 09:46:56,EXEMPT - COMMERCIAL UNDER 20,MOBILE DOUBLE PARKED,BX19,40.8125,-73.90547,101366,SOUTHERN BLVD/E 149 ST,40.812224,-73.903953
9996,488739166,a381f100984b123817a50d43ba05c1596f3d8fcf49aa91...,2025-08-14 09:39:10,2025-08-14 09:46:58,TECHNICAL ISSUE/OTHER,MOBILE BUS LANE,BX12+,40.862704,-73.902186,100796,WEST FORDHAM RD/UNIVERSITY AV,40.862703,-73.904256
9997,488739139,ce9d34c60b7b4650b0f5267e61825053f5257c23558069...,2025-08-14 09:09:56,2025-08-14 09:45:39,TECHNICAL ISSUE/OTHER,MOBILE DOUBLE PARKED,Q54,40.712829,-73.90343,504256,METROPOLITAN AV/FOREST AV,40.712821,-73.905623
9998,488739046,51eb04a496d2075f74521c08c368302188ee9ce1b5d136...,2025-08-14 09:37:49,2025-08-14 09:44:38,EXEMPT - BUS/PARATRANSIT,MOBILE BUS LANE,M101,40.782448,-73.953448,402521,AMSTERDAM AV/W 145 ST,40.825615,-73.947231
