## Open-Meteo Historical Weather Data Collection

This script downloads **hourly historical weather data** from the Open-Meteo Historical Forecast API to construct **non-accident baseline observations** for Michigan.

**Coverage**

* 30 Michigan cities (urban, rural, lake-effect, northern, and Upper Peninsula)
* January 2016 to March 2023
* Hourly resolution

**Variables**

* Temperature (2m, °F)
* Weather code
* Relative humidity (2m)
* Surface pressure
* Visibility
* Wind speed (10m, mph)
* Precipitation (inches)

**Implementation details**

* Data pulled in **monthly chunks** to manage API limits and connection stability.
* Local caching and retry logic enabled.
* Results appended incrementally to a single CSV with deduplication on
  `time + latitude + longitude`.

**Data status**

* **1,757,232 total observations collected**
* Full coverage (2016–Mar 2023) for most cities
* **12 cities missing ~2 years of data** due to daily API limits during initial full-range pulls

**Notes**

* Missing city-year ranges can be backfilled with targeted date-range queries.
* Weekly chunking may be required if monthly requests fail.

In [None]:
import openmeteo_requests
import pandas as pd
import requests_cache
import os
import time
from retry_requests import retry

# --- Open-Meteo client ---
# Endpoint queried by every request in this script.
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"

# HTTP session cached under the name ".cache"; entries expire after 3600 s.
cache_session = requests_cache.CachedSession(
    ".cache",
    expire_after=3600,
)

# Wrap the cached session so failed requests are retried (5 retries,
# backoff factor 0.6) before an error reaches the caller.
retry_session = retry(
    cache_session,
    retries=5,
    backoff_factor=0.6,
)

openmeteo = openmeteo_requests.Client(session=retry_session)

# One row per location: (label, latitude, longitude). Keeping both
# coordinates on the same row guarantees the two request lists below stay
# aligned index-for-index.
# NOTE(review): some latitudes (e.g. Holland, Jackson) look roughly 1 degree
# away from the labelled city -- verify against a gazetteer before relying
# on the labels.
_CITY_COORDS = [
    ("Detroit",          42.3314, -83.0458),
    ("Ann Arbor",        42.2808, -83.7430),
    ("Lansing",          42.7325, -84.5555),
    ("Grand Rapids",     42.9634, -85.6681),
    ("Flint",            43.0125, -83.6875),
    ("Kalamazoo",        42.2917, -85.5872),
    ("Traverse City",    44.7631, -85.6206),
    ("Marquette",        46.5436, -87.3954),
    ("Sault Ste. Marie", 46.4953, -84.3453),
    ("Mount Pleasant",   43.5978, -84.7675),
    ("St. Johns",        43.0006, -84.5590),
    ("Ionia",            42.8722, -84.8986),
    ("Bad Axe",          43.8014, -83.0008),
    ("Saginaw",          43.4195, -83.9508),
    ("Bay City",         43.5945, -83.8889),
    ("Ludington",        44.2519, -86.3240),
    ("Manistee",         44.0974, -86.2044),
    ("Grand Haven",      43.0631, -86.2284),
    ("Iron Mountain",    45.8203, -88.0659),
    ("Charlevoix",       44.9731, -85.1973),
    ("Cheboygan",        45.7875, -84.7272),
    ("Holland",          43.7223, -86.1056),
    ("Big Rapids",       43.9553, -85.4839),
    ("Alpena",           44.2444, -83.3308),
    ("Petoskey",         45.3733, -84.9553),
    ("Jackson",          43.2917, -84.4014),
    ("Gaylord",          44.6610, -84.7147),
    ("Ironwood",         46.7867, -90.1710),
    ("Escanaba",         46.4110, -86.6345),
    ("Mackinaw City",    45.0275, -84.7278),
]

# Parallel lists in the order above, as passed to the API request.
latitudes = [_lat for _label, _lat, _lon in _CITY_COORDS]
longitudes = [_lon for _label, _lat, _lon in _CITY_COORDS]

# Hourly variables requested from the API. The order matters: the download
# loop reads the response variables by their position in this list.
hourly_vars = [
    "temperature_2m",
    "weather_code",
    "relative_humidity_2m",
    "surface_pressure",
    "visibility",
    "wind_speed_10m",
    "precipitation",
]

# Every chunk is appended to this single CSV; create its folder up front.
OUTFILE = "weather_outputs/mi_hourly_2016_2023.csv"
os.makedirs(os.path.dirname(OUTFILE), exist_ok=True)

def append_dedup(df: pd.DataFrame, path: str) -> int:
    """Append the rows of *df* to the CSV at *path*, skipping stored rows.

    A row is identified by the key ``(time, latitude, longitude)``.
    Duplicates inside *df* are dropped first, then any row whose key is
    already present in the file is dropped as well. The ``time`` column is
    normalised to timezone-aware UTC before comparing and writing.

    Args:
        df: Frame containing at least ``time``, ``latitude`` and
            ``longitude`` columns. The caller's frame is left unmodified.
        path: Target CSV. A header row is written only when the file is
            missing or empty; otherwise rows are appended without a header,
            so *df* must use the same column order as the existing file.

    Returns:
        The number of rows actually appended.
    """
    keys = ["time", "latitude", "longitude"]

    # assign() returns a new frame, so the caller's ``time`` column is not
    # overwritten as a side effect of the dtype normalisation.
    df = df.assign(time=pd.to_datetime(df["time"], utc=True))
    df = df.drop_duplicates(subset=keys)

    # A zero-byte file has no header yet and cannot be parsed by read_csv;
    # treat it like a missing file.
    has_rows = os.path.exists(path) and os.path.getsize(path) > 0

    # Only read the (potentially large) existing file when there is
    # something to compare against it.
    if has_rows and not df.empty:
        old = pd.read_csv(path, usecols=keys)
        old["time"] = pd.to_datetime(old["time"], utc=True)

        # Anti-join: keep only rows whose key is absent from the file.
        merged = df.merge(old, on=keys, how="left", indicator=True)
        df = merged[merged["_merge"] == "left_only"].drop(columns="_merge")

    if not df.empty:
        df.to_csv(path, mode="a", header=not has_rows, index=False)

    return len(df)


# --- monthly download loop ---
# One request per calendar month covers every location at once. Each chunk
# is appended to OUTFILE as soon as it arrives, so a rerun only adds rows
# that are not stored yet.
start = pd.Timestamp("2016-01-01")
end   = pd.Timestamp("2023-03-31")

MAX_ATTEMPTS = 3     # requests per chunk before the chunk is given up on
RETRY_SLEEP_S = 90   # pause after a failed request
CHUNK_SLEEP_S = 2.0  # gentle spacing between successful chunks


def _hourly_frame(response) -> pd.DataFrame:
    """Turn one per-location API response into an hourly DataFrame.

    Columns are ``time``, then every entry of ``hourly_vars`` in request
    order, then ``latitude`` and ``longitude``. The order must stay fixed
    because chunks are appended to the CSV without a header.
    """
    hourly = response.Hourly()

    # Timestamps are rebuilt as UTC from the epoch bounds and the interval.
    # NOTE(review): the request sets timezone=America/Detroit, yet the stored
    # times are UTC -- confirm this is what downstream code expects.
    times = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )

    columns = {"time": times}
    # Response variables come back in the same order as requested.
    for i, name in enumerate(hourly_vars):
        columns[name] = hourly.Variables(i).ValuesAsNumpy()

    # Coordinates are taken from the response, not from the request lists.
    # NOTE(review): presumably these are the coordinates the API resolved the
    # request to; nearby cities could then share a key -- confirm.
    columns["latitude"] = round(response.Latitude(), 4)
    columns["longitude"] = round(response.Longitude(), 4)
    return pd.DataFrame(columns)


failed_chunks = []  # "start to end" labels of chunks that never succeeded

chunk_starts = pd.date_range(start=start, end=end, freq="MS")  # Month Start
for cs in chunk_starts:
    # last day of the month, clipped to the overall end date
    ce = min((cs + pd.offsets.MonthBegin(1)) - pd.Timedelta(days=1), end)

    params = {
        "latitude": latitudes,
        "longitude": longitudes,
        "start_date": cs.strftime("%Y-%m-%d"),
        "end_date": ce.strftime("%Y-%m-%d"),
        "hourly": hourly_vars,
        "wind_speed_unit": "mph",
        "temperature_unit": "fahrenheit",
        "precipitation_unit": "inch",
        "timezone": "America/Detroit",
    }
    label = f"{params['start_date']} to {params['end_date']}"

    # Retry the same chunk a bounded number of times; skipping it after a
    # single failure would leave a month-long hole in the output.
    responses = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            responses = openmeteo.weather_api(url, params=params)
        except Exception as e:
            print(f"Chunk {label} failed (attempt {attempt}/{MAX_ATTEMPTS}): {e}")
            # Sleep after every failure, including the last one, so the next
            # chunk does not hit the API immediately either.
            print(f"Sleeping {RETRY_SLEEP_S}s then continuing...")
            time.sleep(RETRY_SLEEP_S)
        else:
            break

    # None: every attempt raised. Empty list: nothing to concatenate.
    if not responses:
        failed_chunks.append(label)
        print(f"Giving up on chunk {label}; backfill it with a targeted rerun.")
        continue

    chunk_df = pd.concat(
        [_hourly_frame(response) for response in responses],
        ignore_index=True,
    )
    n_new = append_dedup(chunk_df, OUTFILE)
    print(f"Saved {n_new} new rows for {label}")

    # gentle spacing between chunks
    time.sleep(CHUNK_SLEEP_S)

if failed_chunks:
    print(f"{len(failed_chunks)} chunk(s) could not be downloaded:")
    for label in failed_chunks:
        print(f"  {label}")

print("Done.")
