In [None]:
#Load in the Stadium list and then find the lat and long of each stadium
import requests

In [2]:
df.to_csv('Stadium_list_export.csv', index=False)

In [6]:
# Load the saved export into a new DataFrame
stadiums = pd.read_csv('Stadium_list_final.csv', encoding='cp1252')
stadiums

Unnamed: 0,Image,Stadium,City,State,Team,Azimuth,Lat,Long,geo_precision,Conference,Capacity,Opened,Lat.1,Long.1,City_clean,State_clean,Stadium_clean,geo_precision.1,geo_query
0,,Varsity Field,Albany,NY,Albany,,42.678600,-73.823700,stadium,America East,500,c.1980,41.958990,-73.996978,Albany,NY,Varsity Field,stadium,"Varsity Field, NY, USA"
1,,Binghamton Baseball Stadium Complex,Vestal[a],NY,Binghamton,,42.085070,-76.053813,city,America East,1906,2022,42.085070,-76.053813,Vestal,NY,Binghamton Baseball Stadium Complex,city,"Vestal, NY, USA"
2,,Conaty Park,Smithfield,RI,Bryant,,41.924742,-71.539686,stadium,America East,500,2012,41.924742,-71.539686,Smithfield,RI,Conaty Park,stadium,"Conaty Park, Smithfield, RI, USA"
3,,Mahaney Diamond,Orono,ME,Maine,,44.905177,-68.669979,stadium,America East,4400,Early 1980s,44.905177,-68.669979,Orono,ME,Mahaney Diamond,stadium,"Mahaney Diamond, Orono, ME, USA"
4,,Yogi Berra Stadium,Little Falls,NJ,NJIT,,40.868894,-74.195335,stadium,America East,5000,1998,40.868894,-74.195335,Little Falls,NJ,Yogi Berra Stadium,stadium,"Yogi Berra Stadium, Little Falls, NJ, USA"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,,John Smith Field,Sacramento,CA,Sacramento State,,38.581061,-121.493895,city,Western Athletic(Big West in 2027),1200,1953,38.581061,-121.493895,Sacramento,CA,John Smith Field,city,"Sacramento, CA, USA"
304,,Cecil Ballow Baseball Complex,Stephenville,TX,Tarleton,,32.220139,-98.204539,city,Western Athletic[c],750[58],1988,32.220139,-98.204539,Stephenville,TX,Cecil Ballow Baseball Complex,city,"Stephenville, TX, USA"
305,,Bruce Hurst Field,St. George,UT,Utah Tech[60],,37.098632,-113.565178,stadium,Western Athletic(MW in 2027),"2,500[61]",1994,37.098632,-113.565178,St. George,UT,Bruce Hurst Field,stadium,"Bruce Hurst Field, St. George, UT, USA"
306,,UCCU Ballpark,Orem,UT,Utah Valley,,40.276732,-111.717129,stadium,Western Athletic(Big West in 2027),5000,2005,40.276732,-111.717129,Orem,UT,UCCU Ballpark,stadium,"UCCU Ballpark, Orem, UT, USA"


In [2]:
import pandas as pd
import numpy as np
import math
import requests
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

# Load stadiums with azimuth
stadiums = pd.read_csv('Stadium_list_final.csv', encoding='cp1252')
cols = list(stadiums.columns)

# Normalize and locate lat/long immediately after Azimuth column
if 'Azimuth' not in stadiums.columns:
    raise ValueError('Azimuth column not found in Stadium_list_final.csv')

az_idx = cols.index('Azimuth')
lat_col = None
lon_col = None
for i in range(az_idx + 1, len(cols)):
    if cols[i].strip().lower() == 'lat' and lat_col is None:
        lat_col = cols[i]
    elif cols[i].strip().lower() == 'long' and lon_col is None:
        lon_col = cols[i]
    if lat_col and lon_col:
        break

if lat_col is None or lon_col is None:
    raise ValueError('Could not find Lat/Long columns following Azimuth')

# Clean azimuth values (strip non-numeric)
def parse_azimuth(x):
    if pd.isna(x):
        return np.nan
    s = str(x)
    s = ''.join(ch for ch in s if ch.isdigit() or ch == '.')
    try:
        val = float(s)
        return val % 360.0
    except Exception:
        return np.nan

stadiums['Azimuth_deg'] = stadiums['Azimuth'].apply(parse_azimuth)
valid = stadiums.dropna(subset=['Azimuth_deg', lat_col, lon_col]).copy()

# Open-Meteo hourly forecast function (timezone auto)
OPEN_METEO_URL = 'https://api.open-meteo.com/v1/forecast'
SESSION = requests.Session()
cache = {}

def get_tomorrow_noon(lat, lon):
    key = (round(float(lat), 4), round(float(lon), 4))
    if key in cache:
        return cache[key]
    params = {
        'latitude': float(lat),
        'longitude': float(lon),
        'hourly': 'wind_speed_10m,wind_direction_10m',
        'timezone': 'auto',
        'windspeed_unit': 'ms'
    }
    r = SESSION.get(OPEN_METEO_URL, params=params, timeout=20)
    r.raise_for_status()
    data = r.json()
    tzname = data.get('timezone', 'UTC')
    times = data['hourly']['time']
    speeds = data['hourly']['wind_speed_10m']
    dirs_from = data['hourly']['wind_direction_10m']

    now_local = datetime.now(ZoneInfo(tzname))
    tomorrow_date = (now_local + timedelta(days=1)).date()

    # Robust match for local tomorrow at 12:00
    idx = None
    for i, t in enumerate(times):
        dt = datetime.fromisoformat(t).replace(tzinfo=ZoneInfo(tzname))
        if dt.date() == tomorrow_date and dt.hour == 12:
            idx = i
            break
    if idx is None:
        # Fallback: choose the time closest to 12:00 tomorrow
        target_dt = datetime.combine(tomorrow_date, datetime.min.time()).replace(hour=12, tzinfo=ZoneInfo(tzname))
        diffs = []
        for i, t in enumerate(times):
            dt = datetime.fromisoformat(t).replace(tzinfo=ZoneInfo(tzname))
            diffs.append((abs((dt - target_dt).total_seconds()), i))
        idx = min(diffs)[1]

    result = {'time': times[idx], 'timezone': tzname, 'ws': float(speeds[idx]), 'wd_from': float(dirs_from[idx])}
    cache[key] = result
    return result

# Vector math helpers
RAD = math.pi / 180.0

def wind_components(ws, wd_from_deg, az_deg):
    if ws is None or wd_from_deg is None or az_deg is None:
        return (np.nan, np.nan, np.nan)
    # Convert wind direction (meteorological FROM) to TOWARD
    wt_deg = (wd_from_deg + 180.0) % 360.0
    wt = wt_deg * RAD
    az = az_deg * RAD
    # Unit vectors: east-north frame
    w_ex = math.sin(wt)
    w_ny = math.cos(wt)
    a_ex = math.sin(az)
    a_ny = math.cos(az)
    # Component along azimuth axis (positive means blowing toward azimuth direction)
    comp_along_az = ws * (w_ex * a_ex + w_ny * a_ny)
    # Pure north-south component (north positive, south negative)
    comp_ns = ws * w_ny
    # Pure east-west component (east positive, west negative) for reference
    comp_ew = ws * w_ex
    return (comp_along_az, comp_ns, comp_ew)

rows = []
for i, row in valid.iterrows():
    lat = row[lat_col]
    lon = row[lon_col]
    az = row['Azimuth_deg']
    fc = get_tomorrow_noon(lat, lon)
    comp_along_az, comp_ns, comp_ew = wind_components(fc['ws'], fc['wd_from'], az)
    rows.append({
        'Stadium': row.get('Stadium', ''),
        'Team': row.get('Team', ''),
        'Lat': lat,
        'Long': lon,
        'Azimuth_deg': az,
        'Forecast_Time_Local': fc['time'],
        'Timezone': fc['timezone'],
        'Wind_Speed_10m_ms': fc['ws'],
        'Wind_Direction_From_deg': fc['wd_from'],
        'Wind_Component_Azimuth_ms': comp_along_az,
        'Wind_Component_NorthSouth_ms': comp_ns,
        'Wind_Component_EastWest_ms': comp_ew
    })

result_df = pd.DataFrame(rows)
result_df.to_csv('Stadium_wind_components.csv', index=False)

# Show a preview
result_df.head(12)

Unnamed: 0,Stadium,Team,Lat,Long,Azimuth_deg,Forecast_Time_Local,Timezone,Wind_Speed_10m_ms,Wind_Direction_From_deg,Wind_Component_Azimuth_ms,Wind_Component_NorthSouth_ms,Wind_Component_EastWest_ms
0,Varsity Field,Albany,42.678326,-73.823569,153.0,2026-01-17T12:00,America/New_York,4.44,157.0,-4.429184,4.087042,-1.734846
1,Binghamton Baseball Stadium Complex,Binghamton,42.094446,-75.974086,30.4,2026-01-17T12:00,America/New_York,3.67,209.0,3.668904,3.209854,1.779251
2,Conaty Park,Bryant,41.925751,-71.540025,29.11,2026-01-17T12:00,America/New_York,4.92,193.0,4.726795,4.793901,1.106759
3,Mahaney Diamond,Maine,44.905148,-68.670092,307.97,2026-01-17T12:00,America/New_York,2.75,199.0,0.893951,2.600176,0.895312
4,Yogi Berra Stadium,NJIT,40.869285,-74.195209,358.1,2026-01-17T12:00,America/New_York,3.69,193.0,3.565928,3.595426,0.830069
5,Edward A. LeLacheur Park,UMass Lowell,42.65384,-71.317609,30.66,2026-01-17T12:00,America/New_York,2.25,201.0,2.218097,2.100556,0.806328
6,The Baseball Factory Field at UMBC,UMBC,39.249481,-76.708378,180.5,2026-01-17T12:00,America/New_York,5.3,216.0,-4.314812,4.28779,3.115262
7,Robert and Mariam Hayes Stadium,Charlotte,35.308094,-80.738954,139.58,2026-01-17T12:00,America/New_York,5.16,224.0,-0.501735,3.711793,3.584437
8,Clarke LeClair Stadium,East Carolina,35.593396,-77.367029,301.45,2026-01-17T12:00,America/New_York,6.56,218.0,-0.748301,5.169351,4.038739
9,FAU Baseball Stadium,Florida Atlantic,26.358688,-80.083098,179.52,2026-01-17T12:00,America/New_York,2.16,146.0,-1.800777,1.790721,-1.207857


In [15]:
# Recommended: fetch full 2025 D1 baseball schedule via year endpoint (with fallback)
import requests
import json
import sys
import time
from typing import Dict, Any, Optional

DEFAULT_BASE_URL = "http://localhost:3000"
DEFAULT_SPORT = "baseball"
DEFAULT_DIVISION = "d1"
DEFAULT_YEAR = 2025
OUTPUT_JSON = f"schedules_{DEFAULT_YEAR}_d1_baseball.json"
MIN_SLEEP_BETWEEN_REQUESTS = 0.25  # safety if adding more calls later


def safe_get(url: str, headers: Dict[str, str], max_retries: int = 5) -> Optional[Dict[str, Any]]:
    """GET with simple backoff for 429/5xx and JSON check."""
    backoff = 0.5
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=20)
        except requests.RequestException as e:
            print(f"[attempt {attempt}] network error: {e}", file=sys.stderr)
            time.sleep(backoff)
            backoff *= 2
            continue

        if resp.status_code == 200:
            try:
                return resp.json()
            except ValueError:
                print(f"[attempt {attempt}] non-JSON response from {url}", file=sys.stderr)
                return None
        elif resp.status_code in (429,) or 500 <= resp.status_code < 600:
            print(f"[attempt {attempt}] got {resp.status_code} from {url}, backing off {backoff}s", file=sys.stderr)
            time.sleep(backoff)
            backoff *= 2
            continue
        elif resp.status_code == 404:
            print(f"[attempt {attempt}] 404 for {url}", file=sys.stderr)
            return None
        else:
            print(f"[attempt {attempt}] unexpected status {resp.status_code} from {url}: {resp.text[:200]}", file=sys.stderr)
            return None
    print(f"Exceeded retries for {url}", file=sys.stderr)
    return None


def fetch_year_schedule(base_url: str = DEFAULT_BASE_URL,
                        sport: str = DEFAULT_SPORT,
                        division: str = DEFAULT_DIVISION,
                        year: int = DEFAULT_YEAR,
                        key: Optional[str] = None) -> Dict[str, Any]:
    headers: Dict[str, str] = {}
    if key:
        headers["x-ncaa-key"] = key

    url_primary = f"{base_url.rstrip('/')}/schedule/{sport}/{division}/{year}"
    print(f"Requesting {url_primary}", file=sys.stderr)
    data = safe_get(url_primary, headers=headers)
    if not data:
        url_alt = f"{base_url.rstrip('/')}/schedule-alt/{sport}/{division}/{year}"
        print(f"Primary failed/empty; trying {url_alt}", file=sys.stderr)
        data = safe_get(url_alt, headers=headers)

    if not data:
        raise RuntimeError("Failed to fetch schedule from both primary and alt endpoints")

    game_dates = data.get("gameDates")
    total = len(game_dates) if isinstance(game_dates, list) else 0
    print(f"Fetched {total} contest dates for {sport}/{division}/{year}")

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Wrote merged schedule JSON to {OUTPUT_JSON}")
    return data


# Execute and preview
schedule_2025 = fetch_year_schedule()
# Preview first 10 dates (contest_date strings if present)
preview = []
if isinstance(schedule_2025.get("gameDates"), list):
    preview = [gd.get("contest_date") for gd in schedule_2025["gameDates"][:10]]
print("Preview contest dates:", preview)

Requesting http://localhost:3000/schedule/baseball/d1/2025
[attempt 1] 404 for http://localhost:3000/schedule/baseball/d1/2025
Primary failed/empty; trying http://localhost:3000/schedule-alt/baseball/d1/2025


Fetched 0 contest dates for baseball/d1/2025
Wrote merged schedule JSON to schedules_2025_d1_baseball.json
Preview contest dates: []


In [16]:
# Fallback: fetch and merge by months (02-08) when year endpoint is empty
import argparse
import json
import sys
import time
from typing import Dict, List, Any, Optional
import requests

FALLBACK_OUTPUT_JSON = "schedules_2025_d1_baseball_monthly.json"


def safe_get_month(url: str, headers: Dict[str, str], max_retries: int = 6) -> Optional[Dict[str, Any]]:
    backoff = 0.5
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=20)
        except requests.RequestException as e:
            print(f"[attempt {attempt}] network error: {e}", file=sys.stderr)
            time.sleep(backoff)
            backoff *= 2
            continue
        if resp.status_code == 200:
            try:
                return resp.json()
            except ValueError:
                print(f"[attempt {attempt}] non-JSON response from {url}", file=sys.stderr)
                return None
        elif resp.status_code in (429,) or 500 <= resp.status_code < 600:
            print(f"[attempt {attempt}] got {resp.status_code} from {url}, backing off {backoff}s", file=sys.stderr)
            time.sleep(backoff)
            backoff *= 2
            continue
        elif resp.status_code == 404:
            print(f"[attempt {attempt}] 404 for {url} (skip)", file=sys.stderr)
            return None
        else:
            print(f"[attempt {attempt}] unexpected status {resp.status_code} from {url}: {resp.text[:200]}", file=sys.stderr)
            return None
    print(f"Exceeded retries for {url}", file=sys.stderr)
    return None


def merge_game_dates(all_game_dates: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    by_date: Dict[str, Dict[str, Any]] = {}
    ordered: List[str] = []
    for month_list in all_game_dates:
        for entry in month_list:
            cdate = entry.get("contest_date")
            key = cdate if cdate else json.dumps(entry, sort_keys=True)
            if key not in by_date:
                ordered.append(key)
            by_date[key] = entry
    return [by_date[k] for k in ordered]


def fetch_schedule_for_year_months(base_url: str, sport: str, division: str, year: int, months: List[int], key: Optional[str]) -> Dict[str, Any]:
    headers: Dict[str, str] = {}
    if key:
        headers["x-ncaa-key"] = key
    all_game_dates_months: List[List[Dict[str, Any]]] = []
    fetched_months: List[str] = []
    for m in months:
        month_str = f"{m:02d}"
        url = f"{base_url.rstrip('/')}/schedule/{sport}/{division}/{year}/{month_str}"
        print(f"Requesting {url}", file=sys.stderr)
        data = safe_get_month(url, headers=headers)
        time.sleep(0.25)
        if not data:
            continue
        game_dates = data.get("gameDates")
        if isinstance(game_dates, list) and game_dates:
            all_game_dates_months.append(game_dates)
            fetched_months.append(month_str)
            print(f"  -> got {len(game_dates)} contest dates for {year}/{month_str}", file=sys.stderr)
    merged = merge_game_dates(all_game_dates_months)
    result = {
        "sport": sport,
        "division": division,
        "year": year,
        "base_url": base_url,
        "fetched_months": fetched_months,
        "total_contest_dates": len(merged),
        "gameDates": merged,
    }
    with open(FALLBACK_OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"Wrote merged monthly schedule to {FALLBACK_OUTPUT_JSON} (total contest dates: {result['total_contest_dates']})")
    return result


# Execute fallback monthly aggregation
months = [2, 3, 4, 5, 6, 7, 8]
sched_monthly = fetch_schedule_for_year_months(
    base_url=DEFAULT_BASE_URL,
    sport=DEFAULT_SPORT,
    division=DEFAULT_DIVISION,
    year=DEFAULT_YEAR,
    months=months,
    key=None,
)
# Preview first 10 dates
gd = sched_monthly.get("gameDates", [])
print("Preview contest dates (monthly merge):", [d.get("contest_date") for d in gd[:10]])

Requesting http://localhost:3000/schedule/baseball/d1/2025/02
  -> got 28 contest dates for 2025/02
Requesting http://localhost:3000/schedule/baseball/d1/2025/03
  -> got 31 contest dates for 2025/03
Requesting http://localhost:3000/schedule/baseball/d1/2025/04
  -> got 30 contest dates for 2025/04
Requesting http://localhost:3000/schedule/baseball/d1/2025/05
  -> got 31 contest dates for 2025/05
Requesting http://localhost:3000/schedule/baseball/d1/2025/06
  -> got 30 contest dates for 2025/06
Requesting http://localhost:3000/schedule/baseball/d1/2025/07
[attempt 1] 404 for http://localhost:3000/schedule/baseball/d1/2025/07 (skip)
Requesting http://localhost:3000/schedule/baseball/d1/2025/08
[attempt 1] 404 for http://localhost:3000/schedule/baseball/d1/2025/08 (skip)


Wrote merged monthly schedule to schedules_2025_d1_baseball_monthly.json (total contest dates: 150)
Preview contest dates (monthly merge): ['2-1-2025', '2-2-2025', '2-3-2025', '2-4-2025', '2-5-2025', '2-6-2025', '2-7-2025', '2-8-2025', '2-9-2025', '2-10-2025']


In [17]:
# Last-resort fallback: scan daily scoreboards (Feb–Aug 2025) to assemble schedule
import requests
import sys
import time
import json
from datetime import date, timedelta
from typing import Dict, Any, List

SCOREBOARD_OUTPUT_JSON = "schedules_2025_d1_baseball_scoreboard.json"
BASE_URL = DEFAULT_BASE_URL
SLEEP = 0.22  # keep under ~5 req/sec
DRY_RUN_LIMIT = 5  # short preview; set to 0 for full run


def iter_dates(start: date, end: date):
    d = start
    while d <= end:
        yield d
        d = d + timedelta(days=1)


def fetch_scoreboard_day(base_url: str, sport: str, division: str, d: date) -> Dict[str, Any]:
    url = f"{base_url.rstrip('/')}/scoreboard/{sport}/{division}/{d.year}/{d.month:02d}/{d.day:02d}"
    try:
        resp = requests.get(url, timeout=20)
        if resp.status_code == 200:
            return resp.json()
        elif resp.status_code == 404:
            return {}
        else:
            print(f"Unexpected {resp.status_code} from {url}", file=sys.stderr)
            return {}
    except requests.RequestException as e:
        print(f"Network error for {url}: {e}", file=sys.stderr)
        return {}


def assemble_schedule_via_scoreboard(base_url: str, sport: str, division: str, start: date, end: date) -> Dict[str, Any]:
    records: List[Dict[str, Any]] = []
    checked_days: List[str] = []
    for i, d in enumerate(iter_dates(start, end), start=1):
        data = fetch_scoreboard_day(base_url, sport, division, d)
        checked_days.append(d.isoformat())
        games = data.get("games", []) if isinstance(data, dict) else []
        for g in games:
            info = g.get("game", {})
            rec = {
                "contest_date": info.get("gameDate") or d.isoformat(),
                "id": info.get("id"),
                "away": info.get("away", {}).get("names", {}).get("short"),
                "home": info.get("home", {}).get("names", {}).get("short"),
                "location": info.get("location"),
                "status": info.get("status"),
                "start": info.get("start") or info.get("startTime")
            }
            records.append(rec)
        if DRY_RUN_LIMIT and i >= DRY_RUN_LIMIT:
            break
        time.sleep(SLEEP)
    result = {
        "sport": sport,
        "division": division,
        "year": start.year,
        "method": "scoreboard-scan",
        "checked_days": checked_days,
        "total_games": len(records),
        "games": records,
    }
    with open(SCOREBOARD_OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    print(f"Wrote scoreboard-assembled schedule to {SCOREBOARD_OUTPUT_JSON} (games: {len(records)})")
    return result


# Execute preview scan
sched_scan = assemble_schedule_via_scoreboard(
    base_url=BASE_URL,
    sport=DEFAULT_SPORT,
    division=DEFAULT_DIVISION,
    start=date(2025, 2, 1),
    end=date(2025, 8, 31),
)
print("Preview first 5 records:", (sched_scan.get("games", [])[:5]))

Wrote scoreboard-assembled schedule to schedules_2025_d1_baseball_scoreboard.json (games: 0)
Preview first 5 records: []


In [None]:
# Switch to local ncaa-api instance for subsequent calls
DEFAULT_BASE_URL = "http://localhost:3000"
BASE_URL = DEFAULT_BASE_URL
print("Using local API:", DEFAULT_BASE_URL)

In [18]:
# Aggregate 2025 games by contest_date using monthly schedule
import json
import time
import sys
import requests
from typing import Dict, Any, List, Optional
from pathlib import Path

BASE_URL = DEFAULT_BASE_URL
SPORT = DEFAULT_SPORT
DIVISION = DEFAULT_DIVISION
INPUT_MONTHLY_JSON = "schedules_2025_d1_baseball_monthly.json"
OUTPUT_GAMES_JSON = "schedules_2025_d1_baseball_games.json"
SLEEP = 0.22  # keep under ~5 req/sec
NCAA_KEY: Optional[str] = None  # set if your instance requires x-ncaa-key

def parse_contest_date(s: str) -> Optional[Dict[str, int]]:
    """Parse contest_date like '2-1-2025' or '02-01-2025' to {year, month, day}."""
    if not s or not isinstance(s, str):
        return None
    parts = s.split("-")
    if len(parts) != 3:
        return None
    try:
        # input format appears to be M-D-YYYY (no leading zeros sometimes)
        m = int(parts[0])
        d = int(parts[1])
        y = int(parts[2])
        return {"year": y, "month": m, "day": d}
    except ValueError:
        return None

def fetch_scoreboard_day(base_url: str, sport: str, division: str, y: int, m: int, d: int, key: Optional[str]) -> Dict[str, Any]:
    path = f"{division}/{y}/{m:02d}/{d:02d}"
    url = f"{base_url.rstrip('/')}/scoreboard/{sport}/{path}"
    headers: Dict[str, str] = {}
    if key:
        headers["x-ncaa-key"] = key
    try:
        resp = requests.get(url, headers=headers, timeout=20)
        if resp.status_code == 200:
            try:
                return resp.json()
            except ValueError:
                print(f"Non-JSON from {url}", file=sys.stderr)
                return {}
        elif resp.status_code == 404:
            return {}
        else:
            print(f"Unexpected {resp.status_code} from {url}", file=sys.stderr)
            return {}
    except requests.RequestException as e:
        print(f"Network error for {url}: {e}", file=sys.stderr)
        return {}

# Load monthly schedule
if not Path(INPUT_MONTHLY_JSON).exists():
    print(f"Monthly schedule JSON not found: {INPUT_MONTHLY_JSON}. Run the monthly schedule cell first.")
else:
    with open(INPUT_MONTHLY_JSON, "r", encoding="utf-8") as f:
        monthly = json.load(f)
    game_dates = monthly.get("gameDates", []) if isinstance(monthly, dict) else []
    if not game_dates:
        print("No gameDates found in monthly schedule JSON.")
    else:
        records: List[Dict[str, Any]] = []
        checked: List[str] = []
        for gd in game_dates:
            s = gd.get("contest_date")
            dt = parse_contest_date(s)
            if not dt:
                continue
            checked.append(s)
            data = fetch_scoreboard_day(BASE_URL, SPORT, DIVISION, dt["year"], dt["month"], dt["day"], NCAA_KEY)
            games = data.get("games", []) if isinstance(data, dict) else []
            for g in games:
                info = g.get("game", {})
                rec = {
                    "contest_date": s,
                    "id": info.get("id"),
                    "away": info.get("away", {}).get("names", {}).get("short"),
                    "home": info.get("home", {}).get("names", {}).get("short"),
                    "location": info.get("location"),
                    "status": info.get("status"),
                    "start": info.get("start") or info.get("startTime"),
                    "network": info.get("network"),
                }
                records.append(rec)
            time.sleep(SLEEP)
        result = {
            "sport": SPORT,
            "division": DIVISION,
            "year": 2025,
            "base_url": BASE_URL,
            "dates_checked": checked,
            "total_game_records": len(records),
            "games": records,
        }
        with open(OUTPUT_GAMES_JSON, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"Wrote aggregated games to {OUTPUT_GAMES_JSON} (records: {len(records)})")

Wrote aggregated games to schedules_2025_d1_baseball_games.json (records: 8615)


In [19]:
# Convert aggregated 2025 games JSON to CSV for analysis
import json
import pandas as pd
from pathlib import Path

INPUT_GAMES_JSON = "schedules_2025_d1_baseball_games.json"
OUTPUT_GAMES_CSV = "schedules_2025_d1_baseball_games.csv"

if not Path(INPUT_GAMES_JSON).exists():
    print(f"Games JSON not found: {INPUT_GAMES_JSON}. Run the aggregation cell first.")
else:
    with open(INPUT_GAMES_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)
    games = data.get("games", [])
    df_games = pd.DataFrame(games)
    df_games.to_csv(OUTPUT_GAMES_CSV, index=False)
    print(f"Wrote CSV: {OUTPUT_GAMES_CSV} (rows: {len(df_games)})")
    # quick preview
    print(df_games.head(10))

Wrote CSV: schedules_2025_d1_baseball_games.csv (rows: 8615)
  contest_date    id         away               home location status start  \
0   02-14-2025  None     Illinois  Abilene Christian     None   None   TBA   
1   02-14-2025  None   Miami (OH)     Charleston So.     None   None   TBA   
2   02-14-2025  None    Manhattan    Mississippi St.     None   None   TBA   
3   02-14-2025  None   Holy Cross             Auburn     None   None   TBA   
4   02-14-2025  None      Bradley            Alabama     None   None   TBA   
5   02-14-2025  None         Iona         Coppin St.     None   None   TBA   
6   02-14-2025  None  Utah Valley       UC Riverside     None   None   TBA   
7   02-14-2025  None   Wright St.           Longwood     None   None   TBA   
8   02-14-2025  None        Maine     Louisiana Tech     None   None   TBA   
9   02-14-2025  None     Kentucky           Lipscomb     None   None   TBA   

  network  
0          
1          
2          
3          
4          
5       

In [20]:
# Filter Stanford 2025 schedule from aggregated games
import pandas as pd
from pathlib import Path

GAMES_CSV = "schedules_2025_d1_baseball_games.csv"
TEAM = "Stanford"
OUTPUT_TEAM_CSV = "stanford_2025_schedule.csv"

if not Path(GAMES_CSV).exists():
    print(f"Games CSV not found: {GAMES_CSV}. Run the JSON→CSV cell first.")
else:
    df = pd.read_csv(GAMES_CSV)
    # Normalize team names for matching
    def norm(x):
        return str(x).strip().lower() if pd.notna(x) else ""
    df["home_norm"] = df["home"].apply(norm)
    df["away_norm"] = df["away"].apply(norm)
    team_norm = TEAM.lower()
    df_team = df[(df["home_norm"] == team_norm) | (df["away_norm"] == team_norm)].copy()
    # Derive opponent and home/away
    def opponent(row):
        return row["away"] if row["home_norm"] == team_norm else row["home"]
    def ha(row):
        return "Home" if row["home_norm"] == team_norm else "Away"
    df_team["opponent"] = df_team.apply(opponent, axis=1)
    df_team["home_away"] = df_team.apply(ha, axis=1)
    # Parse date and sort
    df_team["date"] = pd.to_datetime(df_team["contest_date"], errors="coerce")
    df_team = df_team.sort_values(["date", "start"], ascending=[True, True])
    # Select columns for output
    cols = ["date", "home_away", "opponent", "start", "location", "network", "status"]
    df_out = df_team[cols]
    df_out.to_csv(OUTPUT_TEAM_CSV, index=False)
    print(f"Saved: {OUTPUT_TEAM_CSV} (rows: {len(df_out)})")
    # Show first 20 games
    print(df_out.head(20))

Saved: stanford_2025_schedule.csv (rows: 52)
           date home_away           opponent start  location network  status
309  2025-02-15      Away  Cal St. Fullerton   TBA       NaN     NaN     NaN
310  2025-02-15      Away  Cal St. Fullerton   TBA       NaN     NaN     NaN
433  2025-02-16      Away  Cal St. Fullerton   TBA       NaN     NaN     NaN
469  2025-02-17      Away  Cal St. Fullerton   TBA       NaN     NaN     NaN
647  2025-02-21      Home         Washington   TBA       NaN     NaN     NaN
799  2025-02-22      Home         Washington   TBA       NaN     NaN     NaN
964  2025-02-23      Home         Washington   TBA       NaN     NaN     NaN
1042 2025-02-24      Home         Washington   TBA       NaN     NaN     NaN
1299 2025-02-28      Home             Xavier   TBA       NaN     NaN     NaN
1473 2025-03-01      Home             Xavier   TBA       NaN     NaN     NaN
1474 2025-03-01      Home             Xavier   TBA       NaN     NaN     NaN
1638 2025-03-02      Home      

In [23]:
# ESPN College Baseball: full-season scrape (dates=YYYYMMDD)
import requests
import json
import time
import pandas as pd
from datetime import date, timedelta
from typing import List, Dict, Any

ESPN_URL = "https://site.api.espn.com/apis/site/v2/sports/baseball/college-baseball/scoreboard"
YEAR = 2026  # change to 2026 when season data is available
START = date(YEAR, 2, 1)
END = date(YEAR, 8, 31)
ESPN_SLEEP = 0.15  # be polite; adjust if needed

def iter_dates(start: date, end: date):
    d = start
    while d <= end:
        yield d
        d = d + timedelta(days=1)

def yyyymmdd(d: date) -> str:
    return f"{d.year}{d.month:02d}{d.day:02d}"

def safe_get_espx(params: Dict[str, str], retries: int = 3) -> Dict[str, Any]:
    backoff = 0.5
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(ESPN_URL, params=params, timeout=20)
        except requests.RequestException as e:
            time.sleep(backoff)
            backoff *= 2
            continue
        if r.status_code == 200:
            try:
                return r.json()
            except ValueError:
                return {}
        elif r.status_code in (429,) or 500 <= r.status_code < 600:
            time.sleep(backoff)
            backoff *= 2
            continue
        elif r.status_code == 404:
            return {}
        else:
            return {}
    return {}

def extract_games(day_json: Dict[str, Any]) -> List[Dict[str, Any]]:
    events = day_json.get("events", []) if isinstance(day_json, dict) else []
    out: List[Dict[str, Any]] = []
    for ev in events:
        ev_id = ev.get("id")
        ev_date = ev.get("date")
        status = (ev.get("status") or {}).get("type", {}).get("description")
        competitions = ev.get("competitions", []) or []
        comp = competitions[0] if competitions else {}
        competitors = comp.get("competitors", []) or []
        home_name, away_name = None, None
        for c in competitors:
            side = c.get("homeAway")
            team = (c.get("team") or {})
            name = team.get("shortDisplayName") or team.get("displayName") or team.get("name")
            if side == "home":
                home_name = name
            elif side == "away":
                away_name = name
        venue = (comp.get("venue") or {})
        venue_full = None
        if venue:
            vn = venue.get("fullName")
            loc = venue.get("address") or {}
            city = loc.get("city")
            state = loc.get("state")
            venue_full = vn or None
            loc_str = ", ".join(x for x in [city, state] if x)
        else:
            loc_str = None
        out.append({
            "event_id": ev_id,
            "event_date": ev_date,
            "status": status,
            "home": home_name,
            "away": away_name,
            "venue": venue_full,
            "location": loc_str,
        })
    return out

# Scrape season range
agg: List[Dict[str, Any]] = []
checked: List[str] = []
for d in iter_dates(START, END):
    ds = yyyymmdd(d)
    checked.append(ds)
    day_json = safe_get_espx({"dates": ds})
    games = extract_games(day_json)
    agg.extend(games)
    time.sleep(ESPN_SLEEP)

espn_json_out = f"espn_{YEAR}_college_baseball_games.json"
espn_csv_out = f"espn_{YEAR}_college_baseball_games.csv"
with open(espn_json_out, "w", encoding="utf-8") as f:
    json.dump({"year": YEAR, "checked_dates": checked, "total": len(agg), "games": agg}, f, indent=2, ensure_ascii=False)
df = pd.DataFrame(agg)
df.to_csv(espn_csv_out, index=False)
print(f"ESPN scrape complete: {len(agg)} games. JSON: {espn_json_out}, CSV: {espn_csv_out}")
print(df.head(12))

ESPN scrape complete: 4110 games. JSON: espn_2026_college_baseball_games.json, CSV: espn_2026_college_baseball_games.csv
     event_id         event_date     status            home            away  \
0   401847153  2026-02-13T05:00Z  Scheduled          Auburn   Youngstown St   
1   401852986  2026-02-13T05:00Z  Scheduled             FIU       Villanova   
2   401852735  2026-02-13T05:00Z  Scheduled             USC      Pepperdine   
3   401852240  2026-02-13T05:00Z  Scheduled    Saint Mary's       Creighton   
4   401850139  2026-02-13T05:00Z  Scheduled   South Florida        Illinois   
5   401848608  2026-02-13T05:00Z  Scheduled       Jax State      Cincinnati   
6   401848577  2026-02-13T05:00Z  Scheduled   Virginia Tech  William & Mary   
7   401847692  2026-02-13T05:00Z  Scheduled  Boston College      Seton Hall   
8   401847426  2026-02-13T05:00Z  Scheduled  Mount St Marys        Missouri   
9   401848667  2026-02-13T14:30Z  Scheduled     Wake Forest         Houston   
10  401853

In [1]:
# Timezone-aware game times using venue/location and stadium lat/lon
import pandas as pd
from datetime import datetime, timezone
from zoneinfo import ZoneInfo
from timezonefinder import TimezoneFinder

# Inputs: choose source CSV (ESPN or NCAA aggregated)
SOURCE_CSV = "espn_2025_college_baseball_games.csv"  # change if needed
STADIUMS_CSV = "Stadium_list_geocoded.csv"

# Load games
games = pd.read_csv(SOURCE_CSV)
# Expect columns: event_date (ISO UTC), home, away, venue, location
if "event_date" not in games.columns:
    # If using NCAA pipeline, adapt to its date fields
    raise ValueError("Expected 'event_date' column from ESPN dataset.")

# Parse UTC datetimes
def parse_utc(ts: str):
    try:
        # ESPN date like '2025-02-14T22:00Z'
        return datetime.strptime(ts, "%Y-%m-%dT%H:%MZ").replace(tzinfo=timezone.utc)
    except Exception:
        try:
            return datetime.fromisoformat(ts.replace("Z", "+00:00"))
        except Exception:
            return None

games["event_dt_utc"] = games["event_date"].apply(parse_utc)

# Load stadiums with lat/long
stadiums = pd.read_csv(STADIUMS_CSV)
stadiums = stadiums.rename(columns={"Lat": "lat", "Long": "lon", "Team": "team", "Stadium": "stadium"})
stadiums = stadiums[["team", "stadium", "lat", "lon", "City", "State"]].dropna(subset=["lat", "lon"])

# Heuristic: map ESPN home team names to stadium rows by team name normalization
def norm_team(name: str) -> str:
    if not isinstance(name, str):
        return ""
    return name.strip().lower().replace("(nc)", "").replace("(sc)", "").replace("  ", " ")

stadiums["team_norm"] = stadiums["team"].apply(norm_team)
games["home_norm"] = games["home"].apply(norm_team)

# Join on team_norm to obtain lat/lon for home venue
games_geo = games.merge(stadiums[["team_norm", "lat", "lon", "stadium", "City", "State"]],
                          left_on="home_norm", right_on="team_norm", how="left", suffixes=("", "_stadium"))

# Resolve timezone via lat/lon
tf = TimezoneFinder()
def latlon_to_tz(lat, lon):
    if pd.isna(lat) or pd.isna(lon):
        return None
    try:
        return tf.timezone_at(lng=float(lon), lat=float(lat))
    except Exception:
        return None

games_geo["tz_name"] = games_geo.apply(lambda r: latlon_to_tz(r.get("lat"), r.get("lon")), axis=1)

# Convert to local time where timezone available
def to_local(dt_utc, tz_name):
    if dt_utc is None or pd.isna(tz_name):
        return None
    try:
        return dt_utc.astimezone(ZoneInfo(tz_name))
    except Exception:
        return None

games_geo["event_dt_local"] = games_geo.apply(lambda r: to_local(r.get("event_dt_utc"), r.get("tz_name")), axis=1)

# Derive date/time components for forecast lookup
games_geo["local_date"] = games_geo["event_dt_local"].apply(lambda x: x.date().isoformat() if pd.notna(x) else None)
games_geo["local_time"] = games_geo["event_dt_local"].apply(lambda x: x.time().isoformat(timespec="minutes") if pd.notna(x) else None)

# Output enriched CSV
out_csv = f"{SOURCE_CSV.replace('.csv', '')}_tz_enriched.csv"
cols = [
    "event_id","home","away","stadium","City","State","lat","lon",
    "event_date","event_dt_utc","tz_name","event_dt_local","local_date","local_time"
]
existing_cols = [c for c in cols if c in games_geo.columns]
games_geo[existing_cols].to_csv(out_csv, index=False)
print(f"Wrote timezone-enriched CSV: {out_csv}. Rows: {len(games_geo)}")
print(games_geo[existing_cols].head(12))

Wrote timezone-enriched CSV: espn_2025_college_baseball_games_tz_enriched.csv. Rows: 3116
     event_id            home           away  \
0   401750024       Texas A&M           Elon   
1   401750514        Michigan       Virginia   
2   401750049             LSU      Purdue FW   
3   401749975       Tennessee        Hofstra   
4   401746173        Arkansas  Washington St   
5   401746175        Arkansas  Washington St   
6   401750424  North Carolina     Texas Tech   
7   401750425  North Carolina     Texas Tech   
8   401751736       Oregon St         Xavier   
9   401748395         Georgia     Quinnipiac   
10  401750311      Florida St  James Madison   
11  401748349         Florida      Air Force   

                                 stadium             City State        lat  \
0          Olsen Field at Blue Bell Park  College Station    TX  30.618394   
1                     Ray Fisher Stadium        Ann Arbor    MI  42.267464   
2   Alex Box Stadium, Skip Bertman Field      Baton

In [2]:
from pathlib import Path

# GitHub Copilot

# Build a match report between ESPN venues and known stadiums

# Prefer 2026 ESPN CSV if present, otherwise fall back to SOURCE_CSV
espn_csv_2026 = "espn_2026_college_baseball_games.csv"
INPUT_ESPN = espn_csv_2026 if Path(espn_csv_2026).exists() else SOURCE_CSV

# Load ESPN games and extract unique venues
games_espx = pd.read_csv(INPUT_ESPN)
venue_series = games_espx["venue"].dropna().astype(str)
unique_venues = sorted(set(v.strip() for v in venue_series if v.strip()))

# Normalize names for matching
def norm_name(s: str) -> str:
    s = s.strip().lower()
    s = (
        s.replace("—", "-")
         .replace("–", "-")
         .replace("’", "'")
         .replace("“", '"')
         .replace("”", '"')
    )
    # collapse whitespace
    s = " ".join(s.split())
    return s

# Prepare stadium lookup from already loaded `stadiums` DataFrame
stadiums["stadium_norm"] = stadiums["stadium"].astype(str).apply(norm_name)

# Map normalized stadium name -> first matching stadium row
stad_map = {}
for _, r in stadiums.iterrows():
    key = r["stadium_norm"]
    if key not in stad_map:
        stad_map[key] = {
            "stadium": r["stadium"],
            "City": r.get("City"),
            "State": r.get("State"),
            "lat": r.get("lat"),
            "lon": r.get("lon"),
        }

# Build match report
rows = []
for v in unique_venues:
    vn = norm_name(v)
    match = vn in stad_map
    info = stad_map.get(vn, {})
    rows.append({
        "espn_venue": v,
        "espn_venue_norm": vn,
        "match": bool(match),
        "matched_stadium": info.get("stadium"),
        "City": info.get("City"),
        "State": info.get("State"),
        "lat": info.get("lat"),
        "lon": info.get("lon"),
    })

df_match = pd.DataFrame(rows).sort_values(["match", "espn_venue"], ascending=[False, True])

# Save report
out_match_csv = f"{Path(INPUT_ESPN).stem}_venue_match_report.csv"
df_match.to_csv(out_match_csv, index=False)

# Summary and preview
matched_count = int(df_match["match"].sum())
total = len(df_match)
print(f"Input ESPN CSV: {INPUT_ESPN}")
print(f"Unique venues: {total}, matched: {matched_count}, unmatched: {total - matched_count}")
print(f"Wrote match report: {out_match_csv}")
print(df_match.head(20))

Input ESPN CSV: espn_2026_college_baseball_games.csv
Unique venues: 291, matched: 166, unmatched: 125
Wrote match report: espn_2026_college_baseball_games_venue_match_report.csv
                                      espn_venue  \
0           Alex Box Stadium, Skip Bertman Field   
1        Alex Rodriguez Park at Mark Light Field   
2                                Alexander Field   
3                                    Allen Field   
4                        Alumni Baseball Diamond   
8                           Bailey-Brayton Field   
9                                  Bainton Field   
11                               Bannerwood Park   
12                            Bart Kaufman Field   
13                           Baum-Walker Stadium   
14                                  Bear Stadium   
16  Beaver Field at Jim and Bettie Smith Stadium   
17      Ben Meyer Diamond at Ray E. Didier Field   
19                    Bill Aker Baseball Complex   
20                               Bill Beck

In [3]:
# Build noon-wind dataset from stadium_match and stadium list (testing mode)
import pandas as pd
import numpy as np
import math
import requests
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from pathlib import Path
import glob

# 1) Locate stadium match CSV
match_candidates = []
match_candidates += glob.glob("*venue_match_report.csv")
match_candidates += glob.glob("*stadium*match*.csv")
STADIUM_MATCH_CSV = match_candidates[0] if match_candidates else None
if STADIUM_MATCH_CSV is None:
    raise FileNotFoundError("No stadium match CSV found. Expected '*venue_match_report.csv' or '*stadium*match*.csv' in working directory.")

print(f"Using stadium match file: {STADIUM_MATCH_CSV}")
match_df = pd.read_csv(STADIUM_MATCH_CSV)

# Helper: get existing column by candidates (case-insensitive)
def pick_col(df, candidates):
    cols_lower = {c.lower(): c for c in df.columns}
    for cand in candidates:
        if cand.lower() in cols_lower:
            return cols_lower[cand.lower()]
    return None

# Identify the stadium name column
stad_col = pick_col(match_df, ["stadium final", "stadium_final", "matched_stadium", "stadium"])
if stad_col is None:
    raise KeyError("Could not find a stadium column in match file (looked for 'stadium final', 'matched_stadium', or 'stadium').")

# Unique stadiums
stadiums_unique = (
    match_df[[stad_col]]
    .dropna()
    .rename(columns={stad_col: "stadium_final"})
    .drop_duplicates()
)
print(f"Unique stadiums found: {len(stadiums_unique)}")

# 2) Load the canonical stadium list to get Azimuth and Lat/Long
stad_list_path = "Stadium_list_final.csv"
if not Path(stad_list_path).exists():
    raise FileNotFoundError(f"Missing {stad_list_path}")
base_stads = pd.read_csv(stad_list_path, encoding="cp1252")
cols = list(base_stads.columns)

# Find Lat/Long columns following Azimuth
if "Azimuth" not in base_stads.columns:
    raise ValueError("Azimuth column not found in Stadium_list_final.csv")
az_idx = cols.index("Azimuth")
lat_col = None
lon_col = None
for i in range(az_idx + 1, len(cols)):
    name = str(cols[i]).strip().lower()
    if name in ("lat", "latitude") and lat_col is None:
        lat_col = cols[i]
    elif name in ("long", "longitude", "lon") and lon_col is None:
        lon_col = cols[i]
    if lat_col and lon_col:
        break
if lat_col is None or lon_col is None:
    # Fallback: try generic names anywhere
    for c in base_stads.columns:
        n = str(c).strip().lower()
        if n in ("lat", "latitude") and lat_col is None:
            lat_col = c
        elif n in ("long", "longitude", "lon") and lon_col is None:
            lon_col = c
    if lat_col is None or lon_col is None:
        raise ValueError("Could not find latitude/longitude columns in Stadium_list_final.csv")

# Normalize azimuth to degrees
def parse_azimuth(x):
    if pd.isna(x):
        return np.nan
    s = str(x)
    s = "".join(ch for ch in s if ch.isdigit() or ch == ".")
    try:
        val = float(s)
        return val % 360.0
    except Exception:
        return np.nan

base_stads["Azimuth_deg"] = base_stads["Azimuth"].apply(parse_azimuth)

# Normalize stadium names for join

def norm_name(s: str) -> str:
    s = str(s) if pd.notna(s) else ""
    s = s.strip().lower()
    s = (
        s.replace("—", "-")
         .replace("–", "-")
         .replace("’", "'")
         .replace("“", '"')
         .replace("”", '"')
    )
    s = " ".join(s.split())
    return s

base_stads["stadium_norm"] = base_stads.get("Stadium", base_stads.get("stadium", "")).astype(str).apply(norm_name)
stadiums_unique["stadium_norm"] = stadiums_unique["stadium_final"].astype(str).apply(norm_name)

# Join to get azimuth and coords
merged = stadiums_unique.merge(
    base_stads[["stadium_norm", "Stadium", "Azimuth_deg", lat_col, lon_col]].rename(
        columns={lat_col: "latitude", lon_col: "longitude"}
    ),
    on="stadium_norm",
    how="left",
)

# 3) Compute noon-tomorrow wind per stadium
OPEN_METEO_URL = "https://api.open-meteo.com/v1/forecast"
SESSION = requests.Session()
cache = {}


def get_tomorrow_noon(lat, lon):
    key = (round(float(lat), 4), round(float(lon), 4))
    if key in cache:
        return cache[key]
    params = {
        "latitude": float(lat),
        "longitude": float(lon),
        "hourly": "wind_speed_10m,wind_direction_10m",
        "timezone": "auto",
        "windspeed_unit": "ms",
    }
    r = SESSION.get(OPEN_METEO_URL, params=params, timeout=20)
    r.raise_for_status()
    data = r.json()
    tzname = data.get("timezone", "UTC")
    times = data["hourly"]["time"]
    speeds = data["hourly"]["wind_speed_10m"]
    dirs_from = data["hourly"]["wind_direction_10m"]

    now_local = datetime.now(ZoneInfo(tzname))
    tomorrow_date = (now_local + timedelta(days=1)).date()

    idx = None
    for i, t in enumerate(times):
        dt = datetime.fromisoformat(t).replace(tzinfo=ZoneInfo(tzname))
        if dt.date() == tomorrow_date and dt.hour == 12:
            idx = i
            break
    if idx is None:
        target_dt = datetime.combine(tomorrow_date, datetime.min.time()).replace(hour=12, tzinfo=ZoneInfo(tzname))
        diffs = []
        for i, t in enumerate(times):
            dt = datetime.fromisoformat(t).replace(tzinfo=ZoneInfo(tzname))
            diffs.append((abs((dt - target_dt).total_seconds()), i))
        idx = min(diffs)[1]

    result = {
        "time": times[idx],
        "timezone": tzname,
        "ws_ms": float(speeds[idx]),
        "wd_from_deg": float(dirs_from[idx]),
    }
    cache[key] = result
    return result

RAD = math.pi / 180.0


def wind_components(ws, wd_from_deg, az_deg):
    if ws is None or pd.isna(ws) or wd_from_deg is None or pd.isna(wd_from_deg) or az_deg is None or pd.isna(az_deg):
        return (np.nan, np.nan, np.nan)
    wt_deg = (wd_from_deg + 180.0) % 360.0
    wt = wt_deg * RAD
    az = az_deg * RAD
    w_ex = math.sin(wt)
    w_ny = math.cos(wt)
    a_ex = math.sin(az)
    a_ny = math.cos(az)
    comp_along_az = ws * (w_ex * a_ex + w_ny * a_ny)
    comp_ns = ws * w_ny
    comp_ew = ws * w_ex
    return (comp_along_az, comp_ns, comp_ew)

# Drop missing basics and compute
work = merged.dropna(subset=["latitude", "longitude", "Azimuth_deg"]).copy()
rows = []
for _, r in work.iterrows():
    lat = r["latitude"]
    lon = r["longitude"]
    az = r["Azimuth_deg"]
    fc = get_tomorrow_noon(lat, lon)
    comp_az, comp_ns, comp_ew = wind_components(fc["ws_ms"], fc["wd_from_deg"], az)
    rows.append({
        "stadium_final": r["stadium_final"],
        "Stadium": r.get("Stadium"),
        "latitude": lat,
        "longitude": lon,
        "Azimuth_deg": az,
        "Forecast_Time_Local": fc["time"],
        "Timezone": fc["timezone"],
        "Wind_Speed_10m_ms": fc["ws_ms"],
        "Wind_Speed_10m_mph": fc["ws_ms"] * 2.23694,
        "Wind_Direction_From_deg": fc["wd_from_deg"],
        "Wind_Component_Azimuth_ms": comp_az,
        "Wind_Component_Azimuth_mph": comp_az * 2.23694,
        "Wind_Component_NorthSouth_ms": comp_ns,
        "Wind_Component_EastWest_ms": comp_ew,
        "Component_Along_Azimuth_abs_ms": abs(comp_az),
    })

out_df = pd.DataFrame(rows)
out_path = "stadium_wind_testing.csv"
out_df.to_csv(out_path, index=False)
print(f"Wrote testing-mode stadium wind CSV: {out_path} (rows: {len(out_df)})")
print(out_df.head(12))

Using stadium match file: espn_2026_college_baseball_games_venue_match_report.csv
Unique stadiums found: 166
Wrote testing-mode stadium wind CSV: stadium_wind_testing.csv (rows: 166)
                                   stadium_final  \
0           Alex Box Stadium, Skip Bertman Field   
1        Alex Rodriguez Park at Mark Light Field   
2                                Alexander Field   
3                                    Allen Field   
4                        Alumni Baseball Diamond   
5                           Bailey–Brayton Field   
6                                  Bainton Field   
7                                Bannerwood Park   
8                             Bart Kaufman Field   
9                            Baum–Walker Stadium   
10                                  Bear Stadium   
11  Beaver Field at Jim and Bettie Smith Stadium   

                                         Stadium   latitude   longitude  \
0           Alex Box Stadium, Skip Bertman Field  30.406735  -91.

In [4]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Demo mode: build wind dataset for first games (2026-02-13) using noon-tomorrow forecasts
import pandas as pd
import numpy as np
import math
import requests
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
from pathlib import Path
import glob

# 1) Load ESPN season CSV and filter first day of season
ESPN_2026_CSV = "espn_2026_college_baseball_games.csv"
if not Path(ESPN_2026_CSV).exists():
    raise FileNotFoundError(f"Missing {ESPN_2026_CSV}. Run the ESPN scrape cell or place the file in the workspace.")

games = pd.read_csv(ESPN_2026_CSV)
if "event_date" not in games.columns:
    raise KeyError("ESPN CSV missing 'event_date' column.")
games["event_dt_utc"] = pd.to_datetime(games["event_date"], utc=True, errors="coerce")
games["event_date_only"] = games["event_dt_utc"].dt.date.astype(str)
first_day = "2026-02-13"
games_day = games[games["event_date_only"] == first_day].copy()
if games_day.empty:
    print(f"No games found for {first_day} in {ESPN_2026_CSV}.")
    display(games.head(12))
else:
    print(f"Games on {first_day}: {len(games_day)}")

    # 2) Venue match report to map ESPN venue -> stadium name
    match_candidates = []
    match_candidates += glob.glob("*venue_match_report.csv")
    match_candidates += glob.glob("*stadium*match*.csv")
    STADIUM_MATCH_CSV = match_candidates[0] if match_candidates else None
    if STADIUM_MATCH_CSV is None:
        raise FileNotFoundError("No stadium match CSV found. Expected '*venue_match_report.csv' or '*stadium*match*.csv'.")
    match_df = pd.read_csv(STADIUM_MATCH_CSV)

    def norm_name(s: str) -> str:
        s = str(s) if pd.notna(s) else ""
        s = s.strip().lower()
        s = (s.replace("—", "-")
              .replace("–", "-")
              .replace("’", "'")
              .replace("“", '"')
              .replace("”", '"'))
        s = " ".join(s.split())
        return s

    # Normalize for join
    games_day["venue_norm"] = games_day["venue"].astype(str).apply(norm_name)
    match_df["espn_venue_norm"] = match_df.get("espn_venue_norm", match_df.get("espn_venue", "")).astype(str).apply(norm_name)

    # Stadium column can be any of these
    def pick_col(df, candidates):
        cols_lower = {c.lower(): c for c in df.columns}
        for cand in candidates:
            if cand.lower() in cols_lower:
                return cols_lower[cand.lower()]
        return None

    stad_col = pick_col(match_df, ["stadium final", "stadium_final", "matched_stadium", "stadium"])
    if stad_col is None:
        raise KeyError("Could not find a stadium column in match file (looked for 'stadium final', 'matched_stadium', or 'stadium').")

    # Join games to matched stadiums
    dfj = games_day.merge(match_df[["espn_venue_norm", stad_col]], left_on="venue_norm", right_on="espn_venue_norm", how="left")
    dfj = dfj.rename(columns={stad_col: "stadium_final"})
    dfj = dfj.dropna(subset=["stadium_final"]).copy()
    print(f"Matched stadiums for {first_day}: {dfj['stadium_final'].nunique()} / {len(dfj)} games")

    # 3) Load canonical stadium list to get Azimuth + Lat/Lon
    stad_list_path = "Stadium_list_final.csv"
    if not Path(stad_list_path).exists():
        raise FileNotFoundError(f"Missing {stad_list_path}")
    base = pd.read_csv(stad_list_path, encoding="cp1252")
    cols = list(base.columns)
    if "Azimuth" not in base.columns:
        raise ValueError("Azimuth column not found in Stadium_list_final.csv")
    az_idx = cols.index("Azimuth")
    lat_col = None
    lon_col = None
    for i in range(az_idx + 1, len(cols)):
        nm = str(cols[i]).strip().lower()
        if nm in ("lat", "latitude") and lat_col is None:
            lat_col = cols[i]
        elif nm in ("long", "longitude", "lon") and lon_col is None:
            lon_col = cols[i]
        if lat_col and lon_col:
            break
    if lat_col is None or lon_col is None:
        for c in base.columns:
            nm = str(c).strip().lower()
            if nm in ("lat", "latitude") and lat_col is None:
                lat_col = c
            elif nm in ("long", "longitude", "lon") and lon_col is None:
                lon_col = c
    if lat_col is None or lon_col is None:
        raise ValueError("Could not find latitude/longitude columns in Stadium_list_final.csv")

    # Normalize azimuth to degrees
    def parse_azimuth(x):
        if pd.isna(x):
            return np.nan
        s = str(x)
        s = "".join(ch for ch in s if ch.isdigit() or ch == ".")
        try:
            return float(s) % 360.0
        except Exception:
            return np.nan

    base["Azimuth_deg"] = base["Azimuth"].apply(parse_azimuth)
    base["stadium_norm"] = base.get("Stadium", base.get("stadium", "")).astype(str).apply(norm_name)
    dfj["stadium_norm"] = dfj["stadium_final"].astype(str).apply(norm_name)

    merged = (dfj[["stadium_final", "stadium_norm"]]\
              .drop_duplicates()\
              .merge(base[["stadium_norm", "Stadium", "Azimuth_deg", lat_col, lon_col]]\
                        .rename(columns={lat_col: "latitude", lon_col: "longitude"}),
                     on="stadium_norm", how="left"))

    work = merged.dropna(subset=["latitude", "longitude", "Azimuth_deg"]).copy()
    if work.empty:
        print("No matched stadiums with azimuth + coordinates found.")
        display(merged.head(12))
    else:
        # 4) Forecast noon tomorrow at local time via Open-Meteo
        OPEN_METEO_URL = "https://api.open-meteo.com/v1/forecast"
        SESSION = requests.Session()
        cache = {}

        def get_tomorrow_noon(lat, lon):
            key = (round(float(lat), 4), round(float(lon), 4))
            if key in cache:
                return cache[key]
            params = {
                "latitude": float(lat),
                "longitude": float(lon),
                "hourly": "wind_speed_10m,wind_direction_10m",
                "timezone": "auto",
                "windspeed_unit": "ms",
            }
            r = SESSION.get(OPEN_METEO_URL, params=params, timeout=20)
            r.raise_for_status()
            data = r.json()
            tzname = data.get("timezone", "UTC")
            times = data["hourly"]["time"]
            speeds = data["hourly"]["wind_speed_10m"]
            dirs_from = data["hourly"]["wind_direction_10m"]
            now_local = datetime.now(ZoneInfo(tzname))
            tomorrow_date = (now_local + timedelta(days=1)).date()
            idx = None
            for i, t in enumerate(times):
                dt = datetime.fromisoformat(t).replace(tzinfo=ZoneInfo(tzname))
                if dt.date() == tomorrow_date and dt.hour == 12:
                    idx = i
                    break
            if idx is None:
                target_dt = datetime.combine(tomorrow_date, datetime.min.time()).replace(hour=12, tzinfo=ZoneInfo(tzname))
                diffs = []
                for i, t in enumerate(times):
                    dt = datetime.fromisoformat(t).replace(tzinfo=ZoneInfo(tzname))
                    diffs.append((abs((dt - target_dt).total_seconds()), i))
                idx = min(diffs)[1]
            result = {
                "time": times[idx],
                "timezone": tzname,
                "ws_ms": float(speeds[idx]),
                "wd_from_deg": float(dirs_from[idx]),
            }
            cache[key] = result
            return result

        RAD = math.pi / 180.0

        def wind_components(ws, wd_from_deg, az_deg):
            if ws is None or pd.isna(ws) or wd_from_deg is None or pd.isna(wd_from_deg) or az_deg is None or pd.isna(az_deg):
                return (np.nan, np.nan, np.nan)
            wt_deg = (wd_from_deg + 180.0) % 360.0
            wt = wt_deg * RAD
            az = az_deg * RAD
            w_ex = math.sin(wt)
            w_ny = math.cos(wt)
            a_ex = math.sin(az)
            a_ny = math.cos(az)
            comp_along_az = ws * (w_ex * a_ex + w_ny * a_ny)
            comp_ns = ws * w_ny
            comp_ew = ws * w_ex
            return (comp_along_az, comp_ns, comp_ew)

        rows = []
        for _, r in work.iterrows():
            lat = r["latitude"]
            lon = r["longitude"]
            az = r["Azimuth_deg"]
            fc = get_tomorrow_noon(lat, lon)
            comp_az, comp_ns, comp_ew = wind_components(fc["ws_ms"], fc["wd_from_deg"], az)
            rows.append({
                "stadium_final": r["stadium_final"],
                "Stadium": r.get("Stadium"),
                "latitude": lat,
                "longitude": lon,
                "Azimuth_deg": az,
                "Forecast_Time_Local": fc["time"],
                "Timezone": fc["timezone"],
                "Wind_Speed_10m_ms": fc["ws_ms"],
                "Wind_Speed_10m_mph": fc["ws_ms"] * 2.23694,
                "Wind_Direction_From_deg": fc["wd_from_deg"],
                "Wind_Component_Azimuth_ms": comp_az,
                "Wind_Component_Azimuth_mph": comp_az * 2.23694,
                "Wind_Component_NorthSouth_ms": comp_ns,
                "Wind_Component_EastWest_ms": comp_ew,
                "Component_Along_Azimuth_abs_ms": abs(comp_az),
            })

        demo_df = pd.DataFrame(rows)
        out_demo = f"stadium_wind_demo_{first_day.replace('-', '')}.csv"
        demo_df.to_csv(out_demo, index=False)
        print(f"Wrote demo-mode stadium wind CSV: {out_demo} (rows: {len(demo_df)})")
        # Show top wind-aligned stadiums
        top = demo_df.sort_values("Component_Along_Azimuth_abs_ms", ascending=False).copy()
        top["Azimuth_mph_abs"] = top["Component_Along_Azimuth_abs_ms"] * 2.23694
        print(top[["stadium_final","Wind_Speed_10m_mph","Azimuth_mph_abs","Forecast_Time_Local","Timezone"]].head(12))