# **Prepare Data**

In [None]:
!pip install pyproj geopandas

In [6]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as mdates

In [None]:
violations_url = 'https://data.ny.gov/resource/kh8p-hcbm.json?$limit=50000&$offset=0'
violations_data = requests.get(violations_url).json()
violations_2025 = pd.DataFrame(violations_data)

In [None]:
violations_2025['last_occurrence'].min() # too recent

'2025-07-02T06:37:04.000'

In [None]:
import pandas as pd

base = "https://data.ny.gov/resource/kh8p-hcbm.json"
limit = 50000
offset = 0
chunks = []

for _ in range(30):
  url = f"{base}?$limit={limit}&$offset={offset}"
  df = pd.read_json(url)
  if df.empty:
      break
  chunks.append(df)
  offset += limit

violations_df = pd.concat(chunks, ignore_index=True)
print(len(violations_df))

In [None]:
for _ in range(5):
  url = f"{base}?$limit={limit}&$offset={offset}"
  df = pd.read_json(url)
  if df.empty:
      break
  chunks.append(df)
  offset += limit

violations_df = pd.concat(chunks, ignore_index=True)
print(len(violations_df))

1750000


In [None]:
violations_df['first_occurrence'].max()

'2025-08-21T19:40:47.000'

In [None]:
violations_2025 = violations_df[violations_df['first_occurrence'].astype(str).str[:4] == '2025'].copy()

In [None]:
violations_2025['first_occurrence'].min()

'2025-01-01T00:00:17.000'

In [None]:
violations_2025.to_csv('violations_2025.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

violations_2025.to_csv('/content/drive/MyDrive/datasets/violations_2025.csv')

Mounted at /content/drive


# **Campus Priority Gaps (CUNY) - Data Organization**

- Draw a (2 km) circle around each CUNY campus and compute violations per 100 trips inside each circle, split by month.

- Hotspot Mapping: Put those circles on a map and rank campuses. A lightweight classifier can label each campus as **low/high medium/high risk** using features like day-of-week mix, burstiness, and number of nearby routes.

- Gives a clean "which campuses need help most?" story.

## Load Data

In [7]:
from google.colab import drive
drive.mount('/content/drive')

violations_path = '/content/drive/MyDrive/datathon/datasets/violations_2025.csv'
violations_2025 = pd.read_csv(violations_path).drop(columns=['Unnamed: 0'], axis=1)

Mounted at /content/drive


## Organize CUNY GeoData

In [8]:
# get info of each CUNY
cuny = [
    {'campus': 'Baruch College', 'lat': 40.7404, 'lon': -73.9832},
    {'campus': 'Borough of Manhattan Community College', 'lat':40.7179, 'lon':-74.0120},
    {'campus': 'Bronx Community College', 'lat':40.8575, 'lon':-73.9129},
    {'campus': 'Brooklyn College', 'lat':40.6309, 'lon':-73.9515},
    {'campus': 'College of Staten Island', 'lat':40.6022, 'lon':-74.1504},
    {'campus': 'Craig Newmark Graduate School of Journalism', 'lat':40.755, 'lon': -73.989},
    {'campus': 'CUNY Graduate Center', 'lat':40.7486, 'lon':-73.9840},
    {'campus': 'CUNY Graduate School of Public Health and Health Policy', 'lat':40.8075, 'lon':-73.9441},
    {'campus': 'CUNY School of Labor and Urban Studies', 'lat':40.7551, 'lon':-73.9817},
    {'campus': 'CUNY School of Law', 'lat':40.7479, 'lon':-73.9440},
    {'campus': 'CUNY School of Professional Studies', 'lat':40.7484, 'lon':-73.9900},
    {'campus': 'Guttman Community College', 'lat':40.7529, 'lon':-73.9841},
    {'campus': 'Hostos Community College', 'lat':40.8174, 'lon':-73.9272},
    {'campus': 'Hunter College', 'lat':40.7678, 'lon':-73.9645},
    {'campus': 'John Jay College of Criminal Justice', 'lat':40.7707, 'lon':-73.9892},
    {'campus': 'Kingsborough Community College', 'lat':40.5787, 'lon':-73.9351},
    {'campus': 'LaGuardia Community College', 'lat':40.7438, 'lon':-73.9351},
    {'campus': 'Lehman College', 'lat':40.8729, 'lon':-73.8945},
    {'campus': 'Macaulay Honors College', 'lat':40.7740, 'lon':-73.9802},
    {'campus': 'Medgar Evers College', 'lat':40.6664, 'lon':-73.9571},
    {'campus': 'New York City College of Technology (City Tech)', 'lat':40.6954, 'lon':-73.9875},
    {'campus': 'Queens College', 'lat':40.7367, 'lon':-73.8203},
    {'campus': 'Queensborough Community College', 'lat':40.7554, 'lon':-73.7574},
    {'campus': 'The City College of New York', 'lat':40.8200, 'lon':-73.9493},
    {'campus': 'York College', 'lat':40.7010, 'lon':-73.7961},
]

cuny_data = pd.DataFrame(cuny)

In [9]:
# create CUNY buffers
import geopandas as gpd
from shapely.geometry import Point, LineString

cuny_geo = gpd.GeoDataFrame(
    cuny_data,
    geometry=gpd.points_from_xy(cuny_data['lon'], cuny_data['lat']),
    crs=4326
)

radius = 2000
min_trips = 5
feet_per_meter = 3.28084

cuny_geo['buffer'] = cuny_geo.to_crs(2263).buffer(radius * feet_per_meter)

## Parse & Clean Violations Dataset

In [10]:
# parse violations info
violations_2025['first_occurrence'] = pd.to_datetime(violations_2025['first_occurrence'], errors='coerce', utc=True)

In [11]:
time_local = violations_2025['first_occurrence'].dt.tz_convert('America/New_York')
violations_2025['month'] = (time_local.dt.tz_localize(None).dt.to_period('M').dt.to_timestamp())
violations_2025['day'] = time_local.dt.floor('D')

In [12]:
violations_2025['bus_route_id'].unique()

array(['BX36', 'BX28', 'Q53+', 'Q44+', 'M101', 'B46+', 'Q69', 'BX38',
       'M42', 'M60+', 'M2', 'BX6+', 'B35', 'BX35', 'M4', 'B82+', 'M34+',
       'M15+', 'BX19', 'BX41+', 'M23+', 'M100', 'BX12+', 'Q43', 'Q54',
       'B41', 'B44+', 'M79+', 'Q58', 'M14+', 'B25', 'B62', 'M86+', 'B26',
       'Q5', 'B42', 'S79+', 'BX5', 'BX28-BX38', 'S46'], dtype=object)

## Form Clean Dataset with Geocoordinates of Violations

In [13]:
violations_2025['violation_latitude'] = violations_2025['violation_latitude'].astype(float)
violations_2025['violation_longitude'] = violations_2025['violation_longitude'].astype(float)

violations_geo = gpd.GeoDataFrame(
    violations_2025,
    geometry=gpd.points_from_xy(violations_2025['violation_longitude'], violations_2025['violation_latitude']), crs=4326
)

In [14]:
violations_geo

Unnamed: 0,violation_id,vehicle_id,first_occurrence,last_occurrence,violation_status,violation_type,bus_route_id,violation_latitude,violation_longitude,stop_id,stop_name,bus_stop_latitude,bus_stop_longitude,violation_georeference,bus_stop_georeference,month,day,geometry
0,489749182,c5ae1411153b52556a1e648cc80d718aa519a4bdd189ab...,2025-08-20 23:12:08+00:00,2025-08-21T00:24:08.000,TECHNICAL ISSUE/OTHER,MOBILE BUS STOP,BX36,40.840509,-73.881189,102498,EAST TREMONT AV/VYSE AV,40.841076,-73.882483,"{'type': 'Point', 'coordinates': [-73.881189, ...","{'type': 'Point', 'coordinates': [-73.882483, ...",2025-08-01,2025-08-20 00:00:00-04:00,POINT (-73.88119 40.84051)
1,489744714,df9044acf85cf55488aea4cd3ce1d0e17ef050551726b6...,2025-08-20 23:48:59+00:00,2025-08-20T23:54:47.000,EXEMPT - BUS/PARATRANSIT,MOBILE BUS STOP,BX28,40.874017,-73.890646,100080,PAUL AV/BEDFORD PARK BLVD,40.874629,-73.891539,"{'type': 'Point', 'coordinates': [-73.890646, ...","{'type': 'Point', 'coordinates': [-73.891539, ...",2025-08-01,2025-08-20 00:00:00-04:00,POINT (-73.89065 40.87402)
2,489743631,eb5a337966ba65f66ab1db8e169d2446a4fb429b0efc63...,2025-08-20 22:33:13+00:00,2025-08-20T23:56:02.000,TECHNICAL ISSUE/OTHER,MOBILE DOUBLE PARKED,Q53+,40.721971,-73.867136,550473,WOODHAVEN BLVD/PENELOPE AV,40.722487,-73.867736,"{'type': 'Point', 'coordinates': [-73.867136, ...","{'type': 'Point', 'coordinates': [-73.867736, ...",2025-08-01,2025-08-20 00:00:00-04:00,POINT (-73.86714 40.72197)
3,489741945,3f877f70d9b253515a945be807c9c62d5814949f810310...,2025-08-20 22:50:45+00:00,2025-08-20T23:32:43.000,EXEMPT - OTHER,MOBILE BUS STOP,Q44+,40.762529,-73.831728,501140,UNION ST/35 AV,40.765422,-73.827944,"{'type': 'Point', 'coordinates': [-73.831728, ...","{'type': 'Point', 'coordinates': [-73.827944, ...",2025-08-01,2025-08-20 00:00:00-04:00,POINT (-73.83173 40.76253)
4,489741940,7feac037b62d591ffb1214e356157f3dd197fc22fee5bb...,2025-08-20 10:52:57+00:00,2025-08-20T11:16:57.000,EXEMPT - EMERGENCY VEHICLE,MOBILE BUS STOP,M101,40.815113,-73.955040,401458,AMSTERDAM AV/W 131 ST,40.816009,-73.954424,"{'type': 'Point', 'coordinates': [-73.95504, 4...","{'type': 'Point', 'coordinates': [-73.954424, ...",2025-08-01,2025-08-20 00:00:00-04:00,POINT (-73.95504 40.81511)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1667472,452367899,1d8c6ce135f099d5deadeb7cc6a353b628f35d215610ad...,2025-01-01 02:51:01+00:00,2025-01-01T02:54:07.000,TECHNICAL ISSUE/OTHER,MOBILE BUS STOP,M15+,40.753589,-73.966315,402662,E 125 ST/PARK AV,40.804799,-73.938583,"{'type': 'Point', 'coordinates': [-73.966315, ...","{'type': 'Point', 'coordinates': [-73.938583, ...",2024-12-01,2024-12-31 00:00:00-05:00,POINT (-73.96632 40.75359)
1667473,452367897,6155256f62b774268185e5198ef8d5346f4053e03d7c92...,2025-01-01 02:37:34+00:00,2025-01-01T05:13:31.000,DRIVER/VEHICLE INFO MISSING,MOBILE BUS STOP,M15+,40.800004,-73.932451,402679,3 AV/E 42 ST,40.751085,-73.974122,"{'type': 'Point', 'coordinates': [-73.932451, ...","{'type': 'Point', 'coordinates': [-73.974122, ...",2024-12-01,2024-12-31 00:00:00-05:00,POINT (-73.93245 40.8)
1667474,452367896,408df659d91a7f0493c2d92adcd03dee7638fde103f46d...,2025-01-01 01:01:35+00:00,2025-01-01T01:31:28.000,VIOLATION ISSUED,MOBILE BUS STOP,M15+,40.727621,-73.985253,404949,AMSTERDAM AV/W 161 ST,40.835536,-73.940181,"{'type': 'Point', 'coordinates': [-73.985253, ...","{'type': 'Point', 'coordinates': [-73.940181, ...",2024-12-01,2024-12-31 00:00:00-05:00,POINT (-73.98525 40.72762)
1667475,452367885,a214b44bc6edbca4dec27c984aba6934a78e95bd6b6e67...,2025-01-01 01:01:14+00:00,2025-01-01T01:05:40.000,TECHNICAL ISSUE/OTHER,MOBILE BUS STOP,B82+,40.647886,-73.881956,303358,ROCKAWAY PKWY/AVENUE M,40.637079,-73.893199,"{'type': 'Point', 'coordinates': [-73.881956, ...","{'type': 'Point', 'coordinates': [-73.893199, ...",2024-12-01,2024-12-31 00:00:00-05:00,POINT (-73.88196 40.64789)


## Get Violation and Trip Data Within Campus Buffers

In [15]:
import time, requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [16]:
retry = Retry(
    total=5,
    backoff_factor=0.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retry))

In [17]:
vgeo = violations_geo # create copy to not tamper with original
if "month" not in vgeo.columns:
  v_ts = (pd.to_datetime(vgeo["first_occurrence"], errors="coerce", utc=True)
    .dt.tz_convert("America/New_York")
    .dt.tz_localize(None))
  vgeo["month"] = v_ts.dt.to_period("M").dt.to_timestamp()

months_all = (pd.period_range(vgeo["month"].min(), vgeo["month"].max(), freq="M")
                .to_timestamp())

BASE = "https://data.ny.gov/resource/kufs-yh3x.json"
RADIUS_M = 2000
PAD_LAT, PAD_LON = 0.008, 0.010

In [19]:
all_rows = []

for _, row in cuny_data.iterrows():
  campus, lat, lon = row["campus"], row["lat"], row["lon"]
  print(f"\n{campus}")

  pt_wgs = gpd.GeoSeries([Point(lon, lat)], crs=4326)
  buf_2263 = pt_wgs.to_crs(2263).buffer(RADIUS_M).iloc[0]
  buf_wgs  = gpd.GeoSeries([buf_2263], crs=2263).to_crs(4326).iloc[0]
  minx, miny, maxx, maxy = gpd.GeoSeries([buf_wgs], crs=4326).total_bounds
  west, south, east, north = minx - PAD_LON, miny - PAD_LAT, maxx + PAD_LON, maxy + PAD_LAT

  where = (
    "("
    f"(timepoint_stop_longitude between {west} and {east} AND "
    f" timepoint_stop_latitude  between {south} and {north}) "
    "OR "
    f"(next_timepoint_stop_longitude between {west} and {east} AND "
    f" next_timepoint_stop_latitude  between {south} and {north})"
    ")"
)
  params = {
    "$select": ("route_id, direction, timestamp, bus_trip_count, "
                "timepoint_stop_latitude, timepoint_stop_longitude, "
                "next_timepoint_stop_latitude, next_timepoint_stop_longitude"),
    "$where": where,
    "$order": "route_id, direction, timestamp",
    "$limit": 50000
  }

  seg_chunks = []
  for m in months_all:
    start = m.replace(day=1)
    end   = (start + pd.offsets.MonthEnd(0) + pd.Timedelta(hours=23, minutes=59, seconds=59))

    where = (
        "("
        f"(timepoint_stop_longitude between {west} and {east} AND "
        f" timepoint_stop_latitude  between {south} and {north}) "
        "OR "
        f"(next_timepoint_stop_longitude between {west} and {east} AND "
        f" next_timepoint_stop_latitude  between {south} and {north})"
        f") AND timestamp between '{start:%Y-%m-%dT%H:%M:%S}' and '{end:%Y-%m-%dT%H:%M:%S}'"
      )

    params = {
      "$select": ("route_id, direction, timestamp, bus_trip_count, "
                  "timepoint_stop_latitude, timepoint_stop_longitude, "
                  "next_timepoint_stop_latitude, next_timepoint_stop_longitude"),
      "$where": where,
      "$limit": 50000
    }

    offset = 0
    while True:
      q = dict(params, **{"$offset": offset})
      try:
        r = session.get(BASE, params=q, timeout=180)
        r.raise_for_status()
      except requests.exceptions.ReadTimeout:
        break

      rows_ = r.json()
      if not rows_:
         break

      df = pd.DataFrame(rows_)
      if not df.empty:
        for c in ["timepoint_stop_latitude","timepoint_stop_longitude",
                "next_timepoint_stop_latitude","next_timepoint_stop_longitude",
                "bus_trip_count"]:
          df[c] = pd.to_numeric(df[c], errors="coerce")
        df = df.dropna(subset=[
          "timepoint_stop_latitude","timepoint_stop_longitude",
          "next_timepoint_stop_latitude","next_timepoint_stop_longitude"
        ])
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
        seg_chunks.append(df)

      if len(rows_) < 50000:
        break
      offset += 50000
      time.sleep(0.2)
  segment = (pd.concat(seg_chunks, ignore_index=True) if seg_chunks else pd.DataFrame(columns=["route_id","direction","timestamp","bus_trip_count",
            "timepoint_stop_latitude","timepoint_stop_longitude", "next_timepoint_stop_latitude","next_timepoint_stop_longitude"]))
  print("segment rows:", len(segment))

  if not segment.empty:
    p1 = gpd.GeoSeries(gpd.points_from_xy(segment["timepoint_stop_longitude"], segment["timepoint_stop_latitude"]), crs=4326).to_crs(2263)
    p2 = gpd.GeoSeries(gpd.points_from_xy(segment["next_timepoint_stop_longitude"], segment["next_timepoint_stop_latitude"]), crs=4326).to_crs(2263)
    inside = p1.within(buf_2263) | p2.within(buf_2263)
    seg_in  = segment.loc[inside].copy()
    seg_in["month"] = seg_in["timestamp"].dt.to_period("M").dt.to_timestamp()
    seg_in["bus_trip_count"] = pd.to_numeric(seg_in["bus_trip_count"], errors="coerce").fillna(0)
    trips_monthly = (seg_in.groupby("month", as_index=False)
                            .agg(trips_in_buffer=("bus_trip_count","sum")))
  else:
    trips_monthly = pd.DataFrame(columns=["month","trips_in_buffer"])

  vmask = vgeo.geometry.to_crs(2263).within(buf_2263)
  viol_in = vgeo.loc[vmask].copy()
  violations_monthly = (viol_in.groupby("month", as_index=False)
                                .agg(violations=("violation_id","count")))

  for d in (trips_monthly, violations_monthly):
    if not d.empty:
      d["month"] = pd.to_datetime(d["month"], errors="coerce").dt.to_period("M").dt.to_timestamp()

  monthly = (trips_monthly.merge(violations_monthly, on="month", how="outer")
                          .fillna({"trips_in_buffer":0, "violations":0})
                          .sort_values("month").reset_index(drop=True))
  monthly["viol_per_100_trips"] = 100 * monthly["violations"] / monthly["trips_in_buffer"]
  monthly.insert(0, "campus", campus)

  all_rows.append(monthly)

campus_monthly = pd.concat(all_rows, ignore_index=True)

rank_base = campus_monthly.copy()
rank = (rank_base.dropna(subset=["trips_in_buffer"])
                  .groupby("campus", as_index=False)
                  .agg(violations=("violations","sum"),
                       trips=("trips_in_buffer","sum")))
rank["viol_per_100_trips_pooled"] = 100 * rank["violations"] / rank["trips"]
rank = rank.sort_values("viol_per_100_trips_pooled", ascending=False).reset_index(drop=True)

display(campus_monthly.head(12))
display(rank.head(15))


Baruch College
segment rows: 214180

Borough of Manhattan Community College
segment rows: 84828

Bronx Community College
segment rows: 110334

Brooklyn College
segment rows: 90712

College of Staten Island
segment rows: 29620

Craig Newmark Graduate School of Journalism
segment rows: 218475

CUNY Graduate Center
segment rows: 243353

CUNY Graduate School of Public Health and Health Policy
segment rows: 155287

CUNY School of Labor and Urban Studies
segment rows: 253149

CUNY School of Law
segment rows: 50180

CUNY School of Professional Studies
segment rows: 241388

Guttman Community College
segment rows: 249852

Hostos Community College
segment rows: 99282

Hunter College
segment rows: 199669

John Jay College of Criminal Justice
segment rows: 120355

Kingsborough Community College
segment rows: 16195

LaGuardia Community College
segment rows: 50237

Lehman College
segment rows: 130815

Macaulay Honors College
segment rows: 167953

Medgar Evers College
segment rows: 54814

New York C

Unnamed: 0,campus,month,trips_in_buffer,violations,viol_per_100_trips
0,Baruch College,2024-12-01,0.0,5.0,inf
1,Baruch College,2025-01-01,131001.0,3493.0,2.666392
2,Baruch College,2025-02-01,101032.0,2942.0,2.911949
3,Baruch College,2025-03-01,128032.0,3945.0,3.081261
4,Baruch College,2025-04-01,127454.0,4290.0,3.36592
5,Baruch College,2025-05-01,123816.0,4374.0,3.532661
6,Baruch College,2025-06-01,118660.0,3829.0,3.226867
7,Baruch College,2025-07-01,124492.0,3690.0,2.964046
8,Baruch College,2025-08-01,0.0,802.0,inf
9,Borough of Manhattan Community College,2025-01-01,22782.0,0.0,0.0


Unnamed: 0,campus,violations,trips,viol_per_100_trips_pooled
0,CUNY Graduate School of Public Health and Heal...,43140.0,277688.0,15.535421
1,Lehman College,6963.0,83304.0,8.358542
2,The City College of New York,21595.0,267822.0,8.063191
3,Hostos Community College,5861.0,124576.0,4.704759
4,Baruch College,27370.0,854487.0,3.203091
5,Hunter College,17721.0,635274.0,2.789505
6,Bronx Community College,4229.0,155252.0,2.723958
7,New York City College of Technology (City Tech),17382.0,675464.0,2.573342
8,Brooklyn College,16041.0,715628.0,2.241528
9,York College,15630.0,1460115.0,1.070464


In [None]:
display(campus_monthly)

rank

Unnamed: 0,campus,month,trips_in_buffer,violations,viol_per_100_trips
0,Baruch College,2024-12-01,,5.0,
1,Baruch College,2025-01-01,131001.0,3493.0,2.666392
2,Baruch College,2025-02-01,101032.0,2942.0,2.911949
3,Baruch College,2025-03-01,128032.0,3945.0,3.081261
4,Baruch College,2025-04-01,127454.0,4290.0,3.365920
...,...,...,...,...,...
197,York College,2025-04-01,217167.0,2210.0,1.017650
198,York College,2025-05-01,219043.0,2221.0,1.013956
199,York College,2025-06-01,209920.0,2133.0,1.016101
200,York College,2025-07-01,181596.0,2108.0,1.160819


Unnamed: 0,campus,violations,trips,viol_per_100_trips_pooled
0,CUNY Graduate School of Public Health and Heal...,41453.0,277688.0,14.927905
1,The City College of New York,20967.0,267822.0,7.828707
2,Lehman College,6446.0,83304.0,7.737924
3,Hostos Community College,5545.0,124576.0,4.451098
4,Baruch College,26563.0,854487.0,3.108649
5,Bronx Community College,4165.0,155252.0,2.682735
6,Hunter College,16999.0,635274.0,2.675853
7,New York City College of Technology (City Tech),16718.0,675464.0,2.475039
8,Brooklyn College,15650.0,715628.0,2.18689
9,York College,14870.0,1460115.0,1.018413


# Classifier

**What we want**

	1.	Combine multiple signals optimally
Not just severity (viol/100 trips), but also exposure (trips), trend (slope), volatility, weekday mix, route count, etc. Thresholds on one metric can miss interactions (e.g., medium severity + exploding trend).

	2.	Consistency across places/time
A model can learn different “normal” levels by borough/corridor and be less brittle than fixed cutoffs when distributions shift (e.g., a month with system-wide campaigns).

	3.	Calibrated probabilities (not just bins)
“CCNY has a 62% chance of being High next month” is a stronger prioritization tool than “High/Medium/Low.”

	4.	Forecasting next month (actually useful!)
Train on month t features → label whether month t+1 was “High.” Now you can say which campuses are about to be hot, not just which are hot now.

	5.	Feature importance
You get a ranked list of drivers to explain your map (“severity and rising trend drove 80% of the risk”).

## Feature Engineering

**Features**


*   Severity (pooled) [`severity = 100 * sum(violations) / sum(trips_in_buffer)`]
*   Exposire (pooled trips) [`exposure = sum(trips_in_buffer)`]
* Trend (slope): rate change per month (Fit a line to monthly `viol_per_100_trips` vs time in months. `trend_slope` (positive = getting worse, negative = improving))
* Volatility (`rate_std` (std dev of monthly viol_per_100_trips), `rate_max` (peak rate) — catches spikes even if average is modest)
* Service mix (%weekday = `weekday trips / total`, `route_count` = distinct routes with endpoints inside circle (Feb–May))
* Context: `borough` (one-hot)

**Classifier**: Logistic Regression

In [22]:
cm = campus_monthly
cm['month'] = pd.to_datetime(cm['month'], errors='coerce')
for c in ['trips_in_buffer','violations','viol_per_100_trips']:
  if c in cm.columns:
    cm[c] = pd.to_numeric(cm[c], errors='coerce')


print("Rows:", len(cm), "\n Campuses:", cm['campus'].nunique(), "\n Months:", sorted(cm['month'].dt.strftime('%Y-%m').unique()))
cm

Rows: 202 
 Campuses: 25 
 Months: ['2024-12', '2025-01', '2025-02', '2025-03', '2025-04', '2025-05', '2025-06', '2025-07', '2025-08']


Unnamed: 0,campus,month,trips_in_buffer,violations,viol_per_100_trips
0,Baruch College,2024-12-01,,5.0,
1,Baruch College,2025-01-01,131001.0,3493.0,2.666392
2,Baruch College,2025-02-01,101032.0,2942.0,2.911949
3,Baruch College,2025-03-01,128032.0,3945.0,3.081261
4,Baruch College,2025-04-01,127454.0,4290.0,3.365920
...,...,...,...,...,...
197,York College,2025-04-01,217167.0,2210.0,1.017650
198,York College,2025-05-01,219043.0,2221.0,1.013956
199,York College,2025-06-01,209920.0,2133.0,1.016101
200,York College,2025-07-01,181596.0,2108.0,1.160819


In [25]:
def slope_per_group(g):
  x = (g['month'] - g['month'].min()).dt.days.values / 30.0
  y = g['viol_per_100_trips'].astype(float).values
  ok = ~np.isnan(y)
  if ok.sum() < 2:
      return np.nan
  return np.polyfit(x[ok], y[ok], 1)[0]

agg1 = (cm.groupby('campus', as_index=False)
      .agg(violations=('violations','sum'),
            trips=('trips_in_buffer','sum'),
            months_valid=('viol_per_100_trips','count'),
            rate_mean=('viol_per_100_trips','mean'),
            rate_std =('viol_per_100_trips','std'),
            rate_max =('viol_per_100_trips','max')))

trend = (cm.groupby('campus', as_index=False)
          .apply(slope_per_group)
          .rename(columns={0:'trend_slope'}))

feat_core = agg1.merge(trend, on='campus', how='left')
feat_core['severity'] = 100.0 * feat_core['violations'] / feat_core['trips']

print(feat_core.shape)
feat_core.head()

(25, 9)


  .apply(slope_per_group)


Unnamed: 0,campus,violations,trips,months_valid,rate_mean,rate_std,rate_max,None,severity
0,Baruch College,27370.0,854487.0,7,3.107014,0.293222,3.532661,0.069606,3.203091
1,Borough of Manhattan Community College,0.0,150310.0,7,0.0,0.0,0.0,0.0,0.0
2,Bronx Community College,4229.0,155252.0,7,2.685051,0.362955,3.167045,-0.05899,2.723958
3,Brooklyn College,16041.0,715628.0,7,2.188421,0.461767,2.888255,-0.204129,2.241528
4,CUNY Graduate Center,7036.0,726034.0,7,0.888219,0.290851,1.277332,0.122815,0.969101


In [None]:
vgeo = violations_geo.copy()


for need in ['month','day','hour','dow']:
  if need not in vgeo.columns:
    vgeo['_local'] = pd.to_datetime(vgeo['first_occurrence'], errors='coerce', utc=True).dt.tz_convert('America/New_York')
    vgeo['month']  = vgeo['_local'].dt.to_period('M').dt.to_timestamp()
    vgeo['day']    = vgeo['_local'].dt.floor('D')
    vgeo['hour']   = vgeo['_local'].dt.hour
    vgeo['dow']    = vgeo['_local'].dt.dayofweek

RADIUS_M = 2000
campus_points = gpd.GeoSeries(
    [Point(lon, lat) for lat, lon in zip(cuny_data['lat'], cuny_data['lon'])],
    crs=4326
)
campus_buffers_2263 = campus_points.to_crs(2263).buffer(RADIUS_M)
campus_buffers = gpd.GeoSeries(campus_buffers_2263, crs=2263).to_crs(4326)

campus_shapes = gpd.GeoDataFrame(
    cuny_data[['campus']].copy(),
    geometry=campus_buffers,
    crs=4326
)

v_in = gpd.sjoin(vgeo[['violation_id','bus_route_id','month','day','is_weekday','is_am','is_pm','geometry']],
                 campus_shapes[['campus','geometry']],
                 how='inner', predicate='within')

In [None]:
def p95_over_median(x):
    x = x.dropna()
    if len(x)==0 or x.median()==0:
        return np.nan
    return np.percentile(x,95) / x.median()

In [24]:

daily = (v_in.groupby(['campus','day'], as_index=False)
          .agg(day_violations=('violation_id','count')))

burst = (daily.groupby('campus', as_index=False)
            .agg(burstiness=('day_violations', p95_over_median)))

extra = (v_in.groupby('campus', as_index=False)
        .agg(route_count=('bus_route_id', lambda s: s.dropna().nunique()),
            weekday_share=('is_weekday', 'mean'),
            am_share=('is_am', 'mean'),
            pm_share=('is_pm', 'mean')))

feat_extra = extra.merge(burst, on='campus', how='left')

print("Extra features from violations (in buffers):", feat_extra.shape)
feat_extra.head()

  vgeo['month']  = vgeo['_local'].dt.to_period('M').dt.to_timestamp()


Extra features from violations (in buffers): (20, 6)


Unnamed: 0,campus,route_count,weekday_share,am_share,pm_share,burstiness
0,Baruch College,5,0.85612,0.440994,0.086774,1.469847
1,Bronx Community College,1,0.759045,0.354694,0.157957,1.694737
2,Brooklyn College,2,0.723895,0.31594,0.072315,1.625714
3,CUNY Graduate Center,5,0.847214,0.419983,0.096788,2.144828
4,CUNY Graduate School of Public Health and Heal...,3,0.71159,0.352874,0.105749,1.607527


In [None]:
feat_extra

Unnamed: 0,campus,route_count,weekday_share,am_share,pm_share,burstiness
0,Baruch College,5,0.85612,0.440994,0.086774,1.469847
1,Bronx Community College,1,0.759045,0.354694,0.157957,1.694737
2,Brooklyn College,2,0.723895,0.31594,0.072315,1.625714
3,CUNY Graduate Center,5,0.847214,0.419983,0.096788,2.144828
4,CUNY Graduate School of Public Health and Heal...,3,0.71159,0.352874,0.105749,1.607527
5,CUNY School of Labor and Urban Studies,3,0.794862,0.340968,0.088429,5.431034
6,CUNY School of Law,2,0.784601,0.344638,0.069203,2.111111
7,CUNY School of Professional Studies,3,0.859826,0.345991,0.1327,2.585
8,Craig Newmark Graduate School of Journalism,2,0.81073,0.318298,0.09441,4.457895
9,Guttman Community College,3,0.802427,0.333538,0.114611,3.973684


In [None]:
feat_campus = (feat_core
               .merge(feat_extra, on='campus', how='left'))

for c in ['weekday_share','am_share','pm_share']:
  if c in feat_campus.columns:
    feat_campus[c] = feat_campus[c].fillna(0.0)
if 'route_count' in feat_campus.columns:
  feat_campus['route_count'] = feat_campus['route_count'].fillna(0).astype(int)

cols = ['campus','severity','exposure','trend_slope','rate_std','rate_max',
        'violations','trips','months_valid',
        'route_count','weekday_share','am_share','pm_share','burstiness']
feat_campus = feat_campus[[c for c in cols if c in feat_campus.columns]].copy()

print("Final per-campus features:", feat_campus.shape)
feat_campus.sort_values('severity', ascending=False).head(10)

Final per-campus features: (25, 12)


Unnamed: 0,campus,severity,rate_std,rate_max,violations,trips,months_valid,route_count,weekday_share,am_share,pm_share,burstiness
5,CUNY Graduate School of Public Health and Heal...,15.535421,1.362129,16.190499,43140.0,277688.0,7,3,0.71159,0.352874,0.105749,1.607527
17,Lehman College,8.358542,1.49771,9.23143,6963.0,83304.0,7,1,0.798363,0.30619,0.139164,2.089091
23,The City College of New York,8.063191,1.373843,10.078008,21595.0,267822.0,7,3,0.80125,0.357629,0.138782,1.587629
12,Hostos Community College,4.704759,0.394523,4.940328,5861.0,124576.0,7,1,0.789626,0.426548,0.077461,1.615385
0,Baruch College,3.203091,0.293222,3.532661,27370.0,854487.0,7,5,0.85612,0.440994,0.086774,1.469847
13,Hunter College,2.789505,0.358907,3.361571,17721.0,635274.0,7,3,0.86152,0.433723,0.065798,1.550299
2,Bronx Community College,2.723958,0.362955,3.167045,4229.0,155252.0,7,1,0.759045,0.354694,0.157957,1.694737
20,New York City College of Technology (City Tech),2.573342,0.182144,2.610751,17382.0,675464.0,7,4,0.89725,0.506156,0.044529,1.541071
3,Brooklyn College,2.241528,0.461767,2.888255,16041.0,715628.0,7,2,0.723895,0.31594,0.072315,1.625714
24,York College,1.070464,0.069686,1.160819,15630.0,1460115.0,7,3,0.820601,0.345745,0.102175,1.498592


In [None]:
feat_campus.rename(columns={'trips': 'exposure'}, inplace=True)

In [None]:
q75_sev = feat_campus['severity'].quantile(0.75)
med_exp = feat_campus['exposure'].median()

In [26]:
def classify_row(r):
  base = ('high' if (r['severity'] >= q75_sev and r['exposure'] >= med_exp)
          else 'medium' if (r['severity'] >= q75_sev or r['exposure'] >= med_exp)
          else 'low')
  slope = r.get('trend_slope', np.nan)
  if pd.notna(slope):
    if slope >= 0.25:     # rising fast
      base = {'low':'medium','medium':'high','high':'high'}.get(base, base)
    elif slope <= -0.25:  # improving fast
      base = {'high':'medium','medium':'low','low':'low'}.get(base, base)
  return base

In [None]:
feat_campus['risk_label'] = feat_campus.apply(classify_row, axis=1)

risk_board = (feat_campus.sort_values(['risk_label','severity','exposure'],
                                      ascending=[True, False, False])
                         .reset_index(drop=True))
risk_board.head(15)

Unnamed: 0,campus,severity,rate_std,rate_max,violations,exposure,months_valid,route_count,weekday_share,am_share,pm_share,burstiness,risk_label
0,Baruch College,3.203091,0.293222,3.532661,27370.0,854487.0,7,5,0.85612,0.440994,0.086774,1.469847,high
1,Hunter College,2.789505,0.358907,3.361571,17721.0,635274.0,7,3,0.86152,0.433723,0.065798,1.550299,high
2,Queens College,0.936004,0.275504,1.326798,1861.0,198824.0,7,1,0.845782,0.454057,0.094035,3.092857,low
3,Kingsborough Community College,0.344268,0.155829,0.677327,350.0,101665.0,7,1,0.697143,0.26,0.145714,3.0,low
4,LaGuardia Community College,0.193493,0.093832,0.341048,227.0,117317.0,7,1,0.797357,0.387665,0.0837,5.0,low
5,John Jay College of Criminal Justice,0.0,0.0,0.0,0.0,197634.0,7,0,0.0,0.0,0.0,,low
6,Borough of Manhattan Community College,0.0,0.0,0.0,0.0,150310.0,7,0,0.0,0.0,0.0,,low
7,Queensborough Community College,0.0,0.0,0.0,0.0,122462.0,7,0,0.0,0.0,0.0,,low
8,College of Staten Island,0.0,0.0,0.0,0.0,18936.0,7,0,0.0,0.0,0.0,,low
9,CUNY Graduate School of Public Health and Heal...,15.535421,1.362129,16.190499,43140.0,277688.0,7,3,0.71159,0.352874,0.105749,1.607527,medium


In [None]:
cm = campus_monthly.copy()
cm['month'] = pd.to_datetime(cm['month'], errors='coerce')
for c in ['trips_in_buffer','violations','viol_per_100_trips']:
  if c in cm.columns:
    cm[c] = pd.to_numeric(cm[c], errors='coerce')

q75_next = (cm.groupby('month')['viol_per_100_trips']
              .quantile(0.75)
              .rename('q75_next'))

# make t+1 month
cm['month_next'] = cm['month'] + pd.offsets.MonthBegin(1)

rate_next = (cm[['campus','month','viol_per_100_trips']]
               .rename(columns={'month':'month_next',
                                'viol_per_100_trips':'rate_next'}))
cm = cm.merge(rate_next, on=['campus','month_next'], how='left')

cm = cm.merge(q75_next, left_on='month_next', right_index=True, how='left')

cm['y_next_high'] = (cm['rate_next'] >= cm['q75_next']).astype('Int64')

cm['rate_now']  = cm['viol_per_100_trips']
cm['exposure']  = cm['trips_in_buffer']
cm['rate_prev'] = cm.groupby('campus')['rate_now'].shift(1)
cm['expo_prev'] = cm.groupby('campus')['exposure'].shift(1)
cm['delta_rate'] = cm['rate_now'] - cm['rate_prev']
cm['delta_expo'] = cm['exposure'] - cm['expo_prev']
cm['rate_std3'] = (cm.groupby('campus')['rate_now']
                     .rolling(3, min_periods=2).std()
                     .reset_index(level=0, drop=True))

if 'route_count' in feat_campus.columns:
    cm = cm.merge(feat_campus[['campus','route_count']], on='campus', how='left')

feat_cols = ['rate_now','exposure','rate_prev','expo_prev','delta_rate','delta_expo','rate_std3']
if 'route_count' in cm.columns:
    feat_cols.append('route_count')

train_df = cm[cm['y_next_high'].notna()].copy()

train_df = train_df.dropna(subset=['rate_now','exposure'], how='any')

X = train_df[feat_cols].fillna(0.0)
y = train_df['y_next_high'].astype(int).values
groups = train_df['campus'].values

print("Train rows:", len(train_df),
      "| campuses:", train_df['campus'].nunique(),
      "| positives:", int(y.sum()),
      "| negatives:", int((1-y).sum()))
train_df[['campus','month','month_next','rate_now','exposure','rate_prev','expo_prev','y_next_high']]

Train rows: 175 | campuses: 25 | positives: 42 | negatives: 133


Unnamed: 0,campus,month,month_next,rate_now,exposure,rate_prev,expo_prev,y_next_high
1,Baruch College,2025-01-01,2025-02-01,2.666392,131001.0,,,1
2,Baruch College,2025-02-01,2025-03-01,2.911949,101032.0,2.666392,131001.0,1
3,Baruch College,2025-03-01,2025-04-01,3.081261,128032.0,2.911949,101032.0,1
4,Baruch College,2025-04-01,2025-05-01,3.365920,127454.0,3.081261,128032.0,1
5,Baruch College,2025-05-01,2025-06-01,3.532661,123816.0,3.365920,127454.0,1
...,...,...,...,...,...,...,...,...
196,York College,2025-03-01,2025-04-01,1.023720,219396.0,0.989839,190738.0,0
197,York College,2025-04-01,2025-05-01,1.017650,217167.0,1.023720,219396.0,0
198,York College,2025-05-01,2025-06-01,1.013956,219043.0,1.017650,217167.0,0
199,York College,2025-06-01,2025-07-01,1.016101,209920.0,1.013956,219043.0,0


In [None]:
train_df['no_prev'] = (train_df['rate_prev'].isna() | train_df['expo_prev'].isna()).astype(int)

train_df['rate_prev'] = train_df['rate_prev'].fillna(train_df['rate_now'])
train_df['expo_prev'] = train_df['expo_prev'].fillna(train_df['exposure'])

train_df['delta_rate'] = (train_df['rate_now'] - train_df['rate_prev']).fillna(0.0)
train_df['delta_expo'] = (train_df['exposure'] - train_df['expo_prev']).fillna(0.0)
train_df['rate_std3']  = train_df['rate_std3'].fillna(0.0)

feat_cols = ['rate_now','exposure','rate_prev','expo_prev','delta_rate','delta_expo','rate_std3','no_prev']

In [None]:
train_df = train_df.sort_values(['campus','month']).copy()

train_df['rate_ma3'] = (train_df
    .groupby('campus')['rate_now']
    .rolling(3, min_periods=2).mean()
    .reset_index(level=0, drop=True))

train_df['rate_sem_avg'] = train_df.groupby('campus')['rate_now'].transform('mean')

feat_cols = [
    'rate_now','exposure','rate_prev','expo_prev',
    'delta_rate','delta_expo','rate_std3','no_prev',
    'rate_ma3','rate_sem_avg'
]

X = train_df[feat_cols].astype(float).values
y = train_df['y_next_high'].astype(int).values
groups = train_df['campus'].values  # GroupKFold by campus

In [None]:
train_df

Unnamed: 0,campus,month,trips_in_buffer,violations,viol_per_100_trips,month_next,rate_next,q75_next,y_next_high,rate_now,exposure,rate_prev,expo_prev,delta_rate,delta_expo,rate_std3,route_count,no_prev,rate_ma3,rate_sem_avg
1,Baruch College,2025-01-01,131001.0,3493.0,2.666392,2025-02-01,2.911949,2.748180,1,2.666392,131001.0,2.666392,131001.0,0.000000,0.0,0.000000,5,1,,3.107014
2,Baruch College,2025-02-01,101032.0,2942.0,2.911949,2025-03-01,3.081261,2.590518,1,2.911949,101032.0,2.666392,131001.0,0.245557,-29969.0,0.173635,5,0,2.789170,3.107014
3,Baruch College,2025-03-01,128032.0,3945.0,3.081261,2025-04-01,3.365920,2.697047,1,3.081261,128032.0,2.911949,101032.0,0.169312,27000.0,0.208599,5,0,2.886534,3.107014
4,Baruch College,2025-04-01,127454.0,4290.0,3.365920,2025-05-01,3.532661,2.933496,1,3.365920,127454.0,3.081261,128032.0,0.284659,-578.0,0.229415,5,0,3.119710,3.107014
5,Baruch College,2025-05-01,123816.0,4374.0,3.532661,2025-06-01,3.226867,2.355915,1,3.532661,123816.0,3.365920,127454.0,0.166741,-3638.0,0.228253,5,0,3.326614,3.107014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,York College,2025-03-01,219396.0,2246.0,1.023720,2025-04-01,1.017650,2.697047,0,1.023720,219396.0,0.989839,190738.0,0.033880,28658.0,0.048177,3,0,0.980741,1.021535
197,York College,2025-04-01,217167.0,2210.0,1.017650,2025-05-01,1.013956,2.933496,0,1.017650,217167.0,1.023720,219396.0,-0.006070,-2229.0,0.018065,3,0,1.010403,1.021535
198,York College,2025-05-01,219043.0,2221.0,1.013956,2025-06-01,1.016101,2.355915,0,1.013956,219043.0,1.017650,217167.0,-0.003694,1876.0,0.004930,3,0,1.018442,1.021535
199,York College,2025-06-01,209920.0,2133.0,1.016101,2025-07-01,1.160819,2.215482,0,1.016101,209920.0,1.013956,219043.0,0.002145,-9123.0,0.001855,3,0,1.015903,1.021535


In [None]:
train_df['month'].unique()

<DatetimeArray>
['2025-01-01 00:00:00', '2025-02-01 00:00:00', '2025-03-01 00:00:00',
 '2025-04-01 00:00:00', '2025-05-01 00:00:00', '2025-06-01 00:00:00',
 '2025-07-01 00:00:00']
Length: 7, dtype: datetime64[ns]

In [None]:
# Build features for all campus-month rows from your campus_monthly table
cm_all = campus_monthly.sort_values(['campus','month']).copy()

# Base features
cm_all['rate_now']  = pd.to_numeric(cm_all['viol_per_100_trips'], errors='coerce')
cm_all['exposure']  = pd.to_numeric(cm_all['trips_in_buffer'],      errors='coerce')
cm_all['rate_prev'] = cm_all.groupby('campus')['rate_now'].shift(1)
cm_all['expo_prev'] = cm_all.groupby('campus')['exposure'].shift(1)
cm_all['delta_rate'] = cm_all['rate_now'] - cm_all['rate_prev']
cm_all['delta_expo'] = cm_all['exposure'] - cm_all['expo_prev']
cm_all['rate_std3']  = (cm_all.groupby('campus')['rate_now']
                           .rolling(3, min_periods=2).std()
                           .reset_index(level=0, drop=True))
cm_all['no_prev']    = cm_all['rate_prev'].isna().astype(int)

# New context features (match training)
cm_all['rate_ma3'] = (cm_all.groupby('campus')['rate_now']
                         .rolling(3, min_periods=2).mean()
                         .reset_index(level=0, drop=True))
cm_all['rate_sem_avg'] = cm_all.groupby('campus')['rate_now'].transform('mean')

# Latest row per campus to predict next-month risk
last_idx = cm_all.groupby('campus')['month'].idxmax()
latest_feats = cm_all.loc[last_idx, ['campus','month'] + feat_cols].copy()

## Training Pipeline

In [None]:
FEATS = [
    'rate_now','exposure','rate_prev','expo_prev',
    'delta_rate','delta_expo','rate_std3','rate_ma3','no_prev'
]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [None]:
pipe = Pipeline([
  ('imputer', SimpleImputer(strategy='median')),
  ('scaler',  StandardScaler()),
  ('clf',     LogisticRegression(
      penalty='elasticnet', l1_ratio=0.4, solver='saga',
      class_weight='balanced', max_iter=4000, C=1.0, random_state=0
  ))
])

gkf = GroupKFold(n_splits=min(5, len(np.unique(groups))))
oof_proba = cross_val_predict(pipe, X, y, groups=groups, cv=gkf, method='predict_proba')[:,1]

cal_lr = LogisticRegression(max_iter=2000)
cal_lr.fit(oof_proba.reshape(-1,1), y)

pipe.fit(X, y)

In [None]:
X_score = (latest_feats
           .reindex(columns=FEATS)
           .apply(pd.to_numeric, errors='coerce'))
p_raw  = pipe.predict_proba(X_score)[:, 1]
p_next = cal_lr.predict_proba(p_raw.reshape(-1,1))[:, 1]  # calibrated probs

roc = roc_auc_score(y, oof_proba)
pr  = average_precision_score(y, oof_proba)
prec, rec, thr = precision_recall_curve(y, oof_proba)
f1  = 2*prec*rec/(prec+rec+1e-9)
best_thr = thr[np.nanargmax(f1[:-1])]
print(f"Group CV — ROC-AUC={roc:.3f} | PR-AUC={pr:.3f} | best_thr≈{best_thr:.2f}")

In [None]:
hi_cut  = np.quantile(oof_proba, 0.75)
med_cut = np.quantile(oof_proba, 0.50)
to_tier = lambda p: 'high' if p >= hi_cut else ('medium' if p >= med_cut else 'low')

rank_view = latest_feats[['campus']].copy()
rank_view['p_high_next']   = p_next
rank_view['risk_next_tier'] = rank_view['p_high_next'].apply(to_tier)
rank_view['rate_last']     = latest_feats['rate_now']
rank_view['rate_sem_avg']  = latest_feats['rate_sem_avg']
rank_view['expo_last']     = latest_feats['exposure']
rank_view = rank_view.sort_values('p_high_next', ascending=False).reset_index(drop=True)
rank_view

Train rows: 150 | campuses: 25 | positives: 42 | negatives: 108
Group CV — ROC-AUC=0.981 | PR-AUC=0.955 | best_thr≈0.28


Unnamed: 0,campus,p_high_next,risk_next_tier,rate_last,rate_sem_avg,expo_last
0,CUNY Graduate School of Public Health and Heal...,0.839581,high,,14.958259,
1,The City College of New York,0.839514,high,,7.820948,
2,Lehman College,0.839445,high,,7.719023,
3,Hostos Community College,0.74255,high,,4.453736,
4,Baruch College,0.297739,medium,,3.107014,
5,Hunter College,0.214583,medium,,2.684077,
6,Bronx Community College,0.190011,medium,,2.685051,
7,New York City College of Technology (City Tech),0.12784,medium,,2.476408,
8,Craig Newmark Graduate School of Journalism,0.089797,medium,,0.771868,
9,Brooklyn College,0.089411,medium,,2.188421,


In [None]:
rank_view.to_csv('predictions.csv')
campus_monthly.to_csv('campus_monthly.csv')
rank.to_csv('campus_pooled.csv')
risk_board.to_csv('campus_additional_stats.csv')

In [None]:
campus_points_gdf = gpd.GeoDataFrame(
    cuny_geo[['campus','lon','lat']],
    geometry=gpd.points_from_xy(cuny_geo['lon'], cuny_geo['lat']),
    crs=4326
)
campus_points_gdf[['campus','geometry']].to_file(
    "campus_points.geojson", driver="GeoJSON")

In [None]:
import joblib

model_bundle = {
    "type": "elasticnet_logit_pipeline",
    "pipeline": pipe,
    "feat_cols": FEATS,
    "created_utc": str(pd.Timestamp.utcnow()),
}

joblib.dump(model_bundle, 'model_bundle.pkl')

['model_bundle.pkl']

**Sources**

- List of colleges: https://www.cuny.edu/about/colleges/
- AM/PM Peak: https://www.mta.info/fares-tolls/subway-bus/reduced-fare
- Coordinates: Google Search