In [5]:
import os
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import requests
import numpy as np
import re

In [18]:
def weather_to_dataframe(data: dict) -> pd.DataFrame:
    """
    Flatten a timeline-style weather payload into one row per hour.

    Walks ``data["days"][*]["hours"][*]`` and keeps a fixed set of hourly
    fields. A field that is absent from an hour entry becomes ``None``;
    a missing ``"days"`` or ``"hours"`` key is treated as an empty list.

    Returns a DataFrame with one row per hour entry (an empty, column-less
    DataFrame when the payload contains no hours).
    """
    # Column order of the resulting frame follows this tuple.
    hourly_fields = (
        "datetime",
        "datetimeEpoch",
        "temp",
        "feelslike",
        "humidity",
        "dew",
        "precip",
        "windgust",
        "windspeed",
        "winddir",
        "pressure",
        "cloudcover",
        "visibility",
        "uvindex",
        "conditions",
        "icon",
        "source",
    )

    records = [
        {field: hour_entry.get(field) for field in hourly_fields}
        for day_entry in data.get("days", [])
        for hour_entry in day_entry.get("hours", [])
    ]

    return pd.DataFrame(records)

def fetch_weather(
        start_date: str,
        end_date: str,
        location: str = "KLAX",
        # NOTE(review): a credential is hard-coded as the default. It is kept so
        # existing calls keep working; move it to an environment variable / secret
        # store and rotate the key.
        api_key: str = "35UKBH4MZ8E34ABXX5872T7AJ",
        timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Fetch hourly weather data day-by-day to avoid cost overruns.

    One HTTP request is issued per calendar day in ``[start_date, end_date)``;
    ``end_date`` is exclusive.

    Parameters
    ----------
    start_date, end_date : str
        ISO-format dates (anything ``datetime.fromisoformat`` accepts).
    location : str
        Location identifier placed in the request URL.
    api_key : str
        Visual Crossing API key sent as the ``key`` query parameter.
    timeout : float
        Seconds to wait on each HTTP request before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pd.DataFrame
        Hourly rows for all requested days concatenated in request order.
        An empty DataFrame when the range contains no days
        (``start_date >= end_date``).

    Raises
    ------
    RuntimeError
        If the API answers with HTTP 429 (quota exceeded).
    requests.HTTPError
        For any other non-success HTTP status.
    """

    start = datetime.fromisoformat(start_date)
    end = datetime.fromisoformat(end_date)

    # Query parameters are identical for every day, so build them once.
    params = {
        "unitGroup": "metric",
        "contentType": "json",
        "key": api_key,
        "include": "hours",
    }

    daily_frames = []

    current = start
    while current < end:
        day_str = current.strftime("%Y-%m-%d")

        url = (
            "https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/"
            f"timeline/{location}/{day_str}"
        )

        response = requests.get(url, params=params, timeout=timeout)

        # Surface quota exhaustion with an actionable message instead of a bare HTTPError.
        if response.status_code == 429:
            raise RuntimeError(
                "Visual Crossing API quota exceeded. "
                "Wait until tomorrow or reduce request volume."
            )

        response.raise_for_status()

        daily_frames.append(weather_to_dataframe(response.json()))

        current += timedelta(days=1)

    # pd.concat raises on an empty list; an empty date range is not an error.
    if not daily_frames:
        return pd.DataFrame()

    return pd.concat(daily_frames, ignore_index=True)

In [7]:
def generate_weekly_queries(
        start_date="2024-09-01",
        end_date="2024-12-31"
):
    """
    Split ``[start_date, end_date]`` into consecutive 7-day query windows.

    Returns a list of ``(window_start, window_end)`` tuples formatted as
    ``YYYY-MM-DD`` strings. Each window ends where the next one begins; the
    final window ends one day after ``end_date``. An empty list is returned
    when ``start_date`` is after ``end_date``.
    """
    first_day = pd.to_datetime(start_date)
    last_day = pd.to_datetime(end_date)

    # Window starts, one every 7 days, never later than last_day.
    window_starts = pd.date_range(start=first_day, end=last_day, freq="7D")

    # Every window closes at the next start; the last one closes the day after last_day.
    window_ends = list(window_starts[1:]) + [last_day + pd.Timedelta(days=1)]

    return [
        (opening.strftime("%Y-%m-%d"), closing.strftime("%Y-%m-%d"))
        for opening, closing in zip(window_starts, window_ends)
    ]

In [8]:
# Per-column noise magnitude used when up-sampling hourly weather rows.
# The up-sampling functions in this notebook pass each value to rng.normal
# as a standard deviation (and one tenth of it for the per-step jitter).
# Entry order matters: noise is drawn column by column in this order from a
# seeded generator, so reordering changes the generated values.
# NOTE(review): values are presumably in the column's own (metric) units — confirm.
NOISE_SCALES = dict(
    temp=0.25,
    feelslike=0.25,
    humidity=1.2,
    dew=0.2,
    precip=0.05,
    windgust=0.5,
    windspeed=0.4,
    winddir=2.8,
    pressure=0.5,
    cloudcover=1.9,
    visibility=0.2,
    uvindex=0.1,
)

In [9]:
def hourly_to_minutely_weather(df, noise_scales=None):
    """
    Up-sample hourly weather rows to 1-minute resolution with seeded synthetic noise.

    Every input row is expanded into 60 rows, one per minute starting at the
    row's ``weather_time``. For each column listed in the noise table that is
    present in ``df``, the hourly value is perturbed by

    * a linear drift: one normal draw per hour and column, scaled by the
      fraction of the hour elapsed, so the drift is smooth within the hour;
    * a small white-noise jitter (one tenth of the column's scale), redrawn
      at every step.

    The generator is seeded from the row's ``datetimeEpoch``, so the output is
    reproducible. Missing values (``None``/``NaN``) are passed through untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Hourly rows. Must contain ``weather_time`` (anything ``pd.to_datetime``
        accepts) and a non-negative integer-like ``datetimeEpoch``.
    noise_scales : dict, optional
        Mapping ``column -> standard deviation``. Defaults to the module-level
        ``NOISE_SCALES``.

    Returns
    -------
    pd.DataFrame
        60 rows per input row, same columns, with ``weather_time`` and
        ``datetimeEpoch`` advanced per minute. Each output row keeps the index
        label of the hourly row it came from. Empty input yields an empty frame.

    NOTE(review): ``datetimeEpoch`` is recomputed from ``weather_time``. If
    ``weather_time`` is a timezone-naive local time, the result is the
    wall-clock value expressed in seconds, not a true Unix epoch — confirm
    this is intended.
    """
    scales = NOISE_SCALES if noise_scales is None else noise_scales
    steps_per_hour = 60

    # Physical limits enforced after noise is added (winddir is wrapped instead).
    lower_bounds = {
        "humidity": 0.0,
        "cloudcover": 0.0,
        "uvindex": 0.0,
        "visibility": 0.0,
        "precip": 0.0,
        "windgust": 0.0,
        "windspeed": 0.0,
    }
    upper_bounds = {"humidity": 100.0, "cloudcover": 100.0}

    df = df.copy()

    # Ensure proper datetime
    df["weather_time"] = pd.to_datetime(df["weather_time"])

    # Only columns that exist can be perturbed; resolve that once, not per step.
    noisy_cols = [(col, scale) for col, scale in scales.items() if col in df.columns]

    records = []
    labels = []

    for label, row in df.iterrows():
        hour_start = row["weather_time"]
        hour_values = row.to_dict()

        # Stable seed per hour
        rng = np.random.default_rng(int(row["datetimeEpoch"]))

        # One drift target per column and hour: scaling it by the elapsed
        # fraction of the hour gives a smooth, low-frequency ramp.
        drift_targets = {col: rng.normal(0, scale) for col, scale in noisy_cols}

        for minute in range(steps_per_hour):
            ts = hour_start + timedelta(minutes=minute)
            record = dict(hour_values)
            record["weather_time"] = ts
            # Whole seconds of the new timestamp; Timestamp.value is always in
            # nanoseconds, independent of the column's datetime64 unit.
            record["datetimeEpoch"] = ts.value // 10**9

            phase = minute / steps_per_hour

            for col, scale in noisy_cols:
                # Draw before the missing-value check so the random stream does
                # not depend on which cells happen to be missing.
                jitter = rng.normal(0, scale * 0.1)

                base = hour_values[col]
                if pd.isna(base):
                    continue

                value = base + drift_targets[col] * phase + jitter

                # Physical constraints
                if col == "winddir":
                    # Compass bearing: wrap instead of clamping.
                    value = value % 360
                else:
                    if col in lower_bounds:
                        value = max(value, lower_bounds[col])
                    if col in upper_bounds:
                        value = min(value, upper_bounds[col])

                value = round(value, 2)
                if col == "winddir":
                    # Rounding can land exactly on 360.0; fold it back to 0.0.
                    value = value % 360

                record[col] = value

            records.append(record)
            labels.append(label)

    if not records:
        # Nothing to expand: hand back the (empty) frame with its columns intact.
        return df

    return pd.DataFrame(records, index=labels)

In [10]:
def hourly_to_30s_weather(df, noise_scales=None):
    """
    Up-sample hourly weather rows to 30-second resolution with seeded synthetic noise.

    Every input row is expanded into 120 rows, one per 30 seconds starting at
    the row's ``weather_time``. For each column listed in the noise table that
    is present in ``df``, the hourly value is perturbed by

    * a linear drift: one normal draw per hour and column, scaled by the
      fraction of the hour elapsed, so the drift is smooth within the hour;
    * a small white-noise jitter (one tenth of the column's scale), redrawn
      at every step.

    The generator is seeded from the row's ``datetimeEpoch``, so the output is
    reproducible. Missing values (``None``/``NaN``) are passed through untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Hourly rows. Must contain ``weather_time`` (anything ``pd.to_datetime``
        accepts) and a non-negative integer-like ``datetimeEpoch``.
    noise_scales : dict, optional
        Mapping ``column -> standard deviation``. Defaults to the module-level
        ``NOISE_SCALES``.

    Returns
    -------
    pd.DataFrame
        120 rows per input row, same columns, with ``weather_time`` and
        ``datetimeEpoch`` advanced per step. Each output row keeps the index
        label of the hourly row it came from. Empty input yields an empty frame.

    NOTE(review): ``datetimeEpoch`` is recomputed from ``weather_time``. If
    ``weather_time`` is a timezone-naive local time, the result is the
    wall-clock value expressed in seconds, not a true Unix epoch — confirm
    this is intended.
    """
    scales = NOISE_SCALES if noise_scales is None else noise_scales
    steps_per_hour = 120
    seconds_per_step = 30

    # Physical limits enforced after noise is added (winddir is wrapped instead).
    lower_bounds = {
        "humidity": 0.0,
        "cloudcover": 0.0,
        "uvindex": 0.0,
        "visibility": 0.0,
        "precip": 0.0,
        "windgust": 0.0,
        "windspeed": 0.0,
    }
    upper_bounds = {"humidity": 100.0, "cloudcover": 100.0}

    df = df.copy()

    # Ensure proper datetime
    df["weather_time"] = pd.to_datetime(df["weather_time"])

    # Only columns that exist can be perturbed; resolve that once, not per step.
    noisy_cols = [(col, scale) for col, scale in scales.items() if col in df.columns]

    records = []
    labels = []

    for label, row in df.iterrows():
        hour_start = row["weather_time"]
        hour_values = row.to_dict()

        # Stable seed per hour
        rng = np.random.default_rng(int(row["datetimeEpoch"]))

        # One drift target per column and hour: scaling it by the elapsed
        # fraction of the hour gives a smooth, low-frequency ramp.
        drift_targets = {col: rng.normal(0, scale) for col, scale in noisy_cols}

        for step in range(steps_per_hour):
            ts = hour_start + timedelta(seconds=seconds_per_step * step)
            record = dict(hour_values)
            record["weather_time"] = ts
            # Whole seconds of the new timestamp; Timestamp.value is always in
            # nanoseconds, independent of the column's datetime64 unit.
            record["datetimeEpoch"] = ts.value // 10**9

            phase = step / steps_per_hour

            for col, scale in noisy_cols:
                # Draw before the missing-value check so the random stream does
                # not depend on which cells happen to be missing.
                jitter = rng.normal(0, scale * 0.1)

                base = hour_values[col]
                if pd.isna(base):
                    continue

                value = base + drift_targets[col] * phase + jitter

                # Physical constraints
                if col == "winddir":
                    # Compass bearing: wrap instead of clamping.
                    value = value % 360
                else:
                    if col in lower_bounds:
                        value = max(value, lower_bounds[col])
                    if col in upper_bounds:
                        value = min(value, upper_bounds[col])

                value = round(value, 2)
                if col == "winddir":
                    # Rounding can land exactly on 360.0; fold it back to 0.0.
                    value = value % 360

                record[col] = value

            records.append(record)
            labels.append(label)

    if not records:
        # Nothing to expand: hand back the (empty) frame with its columns intact.
        return df

    return pd.DataFrame(records, index=labels)

In [23]:
# Download up to 10 further days of weather, one file per day, continuing
# from the newest weather_YYYY-MM-DD.csv already present in the output folder.
output_file = "weather_"
output_dir = Path("weather")
output_dir.mkdir(parents=True, exist_ok=True)

# Only names that are exactly <prefix><YYYY-MM-DD>.csv count as daily files;
# range exports such as weather_<a>_to_<b>.csv are ignored.
date_pattern = re.compile(rf"{re.escape(output_file)}(\d{{4}}-\d{{2}}-\d{{2}})\.csv")

for _ in range(10):
    # Re-scan on every pass so the file written by the previous pass is picked up.
    existing_dates = []
    for file in output_dir.glob(f"{output_file}*.csv"):
        match = date_pattern.fullmatch(file.name)
        if match:
            existing_dates.append(datetime.strptime(match.group(1), "%Y-%m-%d").date())

    if not existing_dates:
        # Without a previous file there is no date to continue from; fail loudly
        # instead of relying on variables left over from an earlier kernel run.
        raise RuntimeError(
            f"No {output_file}YYYY-MM-DD.csv files found in '{output_dir}'; "
            "add at least one daily file so the next date to fetch can be derived."
        )

    latest_date = max(existing_dates)
    next_date = latest_date + timedelta(days=1)
    end_date = next_date + timedelta(days=1)  # exclusive bound: fetch exactly one day

    weather = fetch_weather(next_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d'))

    # Epoch seconds -> UTC -> Los Angeles wall-clock time with the tz info dropped.
    weather["weather_time"] = pd.to_datetime(
        weather["datetimeEpoch"], unit="s"
    )
    weather["weather_time"] = weather["weather_time"].dt.tz_localize("UTC").dt.tz_convert("America/Los_Angeles").dt.tz_localize(None)
    weather = weather.sort_values("weather_time")
    weather = hourly_to_30s_weather(weather)

    # Write through output_dir / output_file so the name matches what the scan above looks for.
    weather.to_csv(output_dir / f"{output_file}{next_date}.csv", index=False)

RuntimeError: Visual Crossing API quota exceeded. Wait until tomorrow or reduce request volume.

In [53]:
import pandas as pd
from pathlib import Path

def split_weather_by_day(input_csv, output_dir):
    """
    Split one weather CSV into one CSV per calendar day.

    Reads ``input_csv``, parses its ``weather_time`` column, and writes the
    rows of each calendar date to ``<output_dir>/weather_<YYYY-MM-DD>.csv``
    (without the index). ``output_dir`` is created if it does not exist.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    frame = pd.read_csv(input_csv)
    frame["weather_time"] = pd.to_datetime(frame["weather_time"])

    # Helper column used only as the grouping key; removed again before writing.
    frame["date"] = frame["weather_time"].dt.date

    for day, day_rows in frame.groupby("date"):
        destination = target_dir / f"weather_{day}.csv"
        day_rows.drop(columns="date").to_csv(destination, index=False)

In [61]:
# Break the multi-day historic export into one CSV per calendar day under "weather".
split_weather_by_day(
    input_csv="data_historic/weather_2024-10-20_to_2024-10-27.csv",
    output_dir="weather",
)