In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas

In [0]:
'''
The purpose of this notebook is to fetch the previous day's weather and, for now, save the results to a Databricks table. A staging table may be added later, with its contents pushed to a MySQL database from another notebook.
'''

from pyspark.sql.functions import *

# Load the lookup tables from the idasky schema.
# NOTE(review): `spark` is not defined in this notebook; it is assumed to be
# the session object injected by the notebook runtime.
cities = spark.table('workspace.idasky.idaho_cities')
# City -> grid lookup; later cells read its 'grid_lat' / 'grid_long' columns.
connection = spark.table('workspace.idasky.city_grid_lookup')
# connection.printSchema()

In [0]:
# Compute the current and previous calendar dates in Idaho local time. Only fully completed days (yesterday and earlier) should be fetched.

from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

# America/Boise is the zone that covers most of Idaho.
idaho_tz = ZoneInfo("America/Boise")
now_idaho = datetime.now(idaho_tz)

# Calendar dates formatted as YYYY-MM-DD for the current and the previous
# local day.
today = now_idaho.date().isoformat()
yesterday = (now_idaho.date() - timedelta(days=1)).isoformat()

print(f"Today: {today}")
print(f"Yesterday: {yesterday}")

In [0]:
import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame as SparkDataFrame
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

# Setup the Open-Meteo API client with cache and retry on error.
# HTTP responses are cached under the name '.cache'.
# NOTE(review): expire_after=-1 presumably means cached responses never
# expire (requests-cache convention) — confirm; if so, re-running for the same
# date and grid is served from the cache instead of hitting the API again.
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
# Wrap the cached session so failed requests are retried (5 retries, backoff
# factor 0.2) before an error is surfaced.
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
# Module-level client used by get_yesterday_weather_for_grids.
openmeteo = openmeteo_requests.Client(session=retry_session)


def get_yesterday_weather_for_grids(grid_locations_df: SparkDataFrame, hourly_vars: list = None):
    """
    Fetch yesterday's hourly weather for every unique grid location.

    "Yesterday" is the previous calendar day in Idaho local time
    (America/Boise). The same time zone is sent to the Open-Meteo archive API
    so that the returned hours cover that local day; without it the API
    falls back to its default GMT day, which is shifted by several hours
    relative to Idaho.

    Parameters:
    -----------
    grid_locations_df : SparkDataFrame
        DataFrame with 'grid_lat' and 'grid_long' columns
    hourly_vars : list, optional
        Names of the hourly variables to request. When omitted, a built-in
        list (temperature, humidity, precipitation, wind, snow, ...) is used.

    Returns:
    --------
    SparkDataFrame
        One row per grid location and hour with the columns 'date'
        (timezone-aware UTC timestamp), 'grid_lat', 'grid_long' and one column
        per requested hourly variable. Temperatures are requested in
        Fahrenheit, wind speed in mph and precipitation in inches.

    Raises:
    -------
    ValueError
        If no grid location could be fetched successfully.
    """

    # One time zone name drives both the choice of "yesterday" and the API
    # request, so the requested date and the returned 24-hour window agree.
    tz_name = "America/Boise"
    now_idaho = datetime.now(tz=ZoneInfo(tz_name))
    yesterday = (now_idaho - timedelta(days=1)).strftime("%Y-%m-%d")

    print(f"Fetching weather data for: {yesterday} (Idaho time)")

    if hourly_vars is None:
        hourly_vars = [
            "temperature_2m",
            "relative_humidity_2m",
            "apparent_temperature",
            "dew_point_2m",
            "precipitation",
            "weather_code",
            "pressure_msl",
            "cloud_cover",
            "wind_speed_10m",
            "rain",
            "snow_depth",
            "snowfall"
        ]

    # NOTE(review): the archive endpoint may lag behind real time — confirm it
    # already serves complete values for the previous day.
    url = "https://archive-api.open-meteo.com/v1/archive"

    # Request settings shared by every grid location (hoisted out of the loop).
    base_params = {
        "start_date": yesterday,
        "end_date": yesterday,
        "hourly": hourly_vars,
        "temperature_unit": "fahrenheit",
        "wind_speed_unit": "mph",
        "precipitation_unit": "inch",
        # Makes start_date/end_date refer to the Idaho local day instead of
        # the API's default GMT day.
        "timezone": tz_name,
    }

    # Get unique grid locations (collected to the driver; one API call each)
    locations_pandas = grid_locations_df.select("grid_lat", "grid_long").distinct().toPandas()
    total_grids = len(locations_pandas)

    all_data = []

    # Process each grid location
    for idx, row in locations_pandas.iterrows():
        grid_lat = row['grid_lat']
        grid_long = row['grid_long']

        params = {"latitude": grid_lat, "longitude": grid_long, **base_params}

        try:
            responses = openmeteo.weather_api(url, params=params)
            response = responses[0]

            print(f"Processing grid {idx+1}/{total_grids}: ({grid_lat:.4f}, {grid_long:.4f})")

            # Process hourly data
            hourly = response.Hourly()

            # Time()/TimeEnd() are treated as Unix seconds and turned into a
            # UTC timestamp range; inclusive="left" drops the end boundary.
            # NOTE(review): the timestamps are expected to remain absolute
            # (GMT-based) epochs even with a time zone set — confirm against
            # the Open-Meteo documentation.
            hourly_data = {
                "date": pd.date_range(
                    start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                    end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                    freq=pd.Timedelta(seconds=hourly.Interval()),
                    inclusive="left"
                )
            }

            hourly_data["grid_lat"] = grid_lat
            hourly_data["grid_long"] = grid_long

            # Extract all hourly variables; the response lists them in the
            # same order as they were requested, so the index is the key.
            for i, var_name in enumerate(hourly_vars):
                hourly_data[var_name] = hourly.Variables(i).ValuesAsNumpy()

            all_data.append(pd.DataFrame(data=hourly_data))

        except Exception as e:
            # Best effort: a failing grid is reported and skipped so the
            # remaining grids are still fetched.
            print(f"Error processing grid ({grid_lat}, {grid_long}): {str(e)}")
            continue

    if not all_data:
        raise ValueError("No data was successfully retrieved")

    combined_pandas_df = pd.concat(all_data, ignore_index=True)
    spark = SparkSession.builder.getOrCreate()
    spark_df = spark.createDataFrame(combined_pandas_df)

    print(f"\n{'='*60}")
    print(f"Successfully processed: {len(all_data)}/{total_grids} grids")
    print(f"Total hourly records: {spark_df.count()}")
    print(f"Date: {yesterday}")
    print(f"{'='*60}")

    return spark_df




In [0]:
# Usage:
# Get the unique grid cells from the city -> grid lookup table
connection = spark.table('workspace.idasky.city_grid_lookup')
unique_grids = connection.select("grid_lat", "grid_long").distinct()

# Fetch yesterday's hourly weather for those grid cells (one API call per grid)
yesterday_weather = get_yesterday_weather_for_grids(unique_grids)

# yesterday_weather.show(20)

In [0]:
# Add the calendar parts of the hourly 'date' timestamp as separate columns;
# they are part of the merge key used when writing to the history table.
# NOTE(review): the parts are presumably evaluated in the Spark session time
# zone — confirm that matches the intended (Idaho) calendar day.
date_parts = {"year": year, "month": month, "day": dayofmonth}
for part_name, extractor in date_parts.items():
    yesterday_weather = yesterday_weather.withColumn(part_name, extractor("date"))
# display(yesterday_weather)

# daily_agg = yesterday_weather.groupBy('year', 'month', 'day', 'grid_lat', 'grid_long').agg(
#     max('temperature_2m').alias('temp_high'),
#     min('temperature_2m').alias('temp_low'),
#     max('relative_humidity_2m').alias('humidity_high'),
#     min('relative_humidity_2m').alias('humidity_low'),
#     sum('rain').alias('total_rain'),
#     sum('snowfall').alias('total_snow')
# )

# display(daily_agg)

In [0]:
from delta.tables import DeltaTable
from pyspark.sql.functions import year, month, dayofmonth

# Add Idaho time columns if needed
from zoneinfo import ZoneInfo
from datetime import datetime, timedelta

# Delta table that accumulates the hourly history
table_name = "workspace.idasky.idaho_historic"

# A source row counts as already stored when the calendar parts, the grid
# cell and the hourly timestamp all match an existing row.
merge_condition = """
        target.year = source.year AND
        target.month = source.month AND
        target.day = source.day AND
        target.grid_lat = source.grid_lat AND
        target.grid_long = source.grid_long AND
        target.date = source.date
        """

if spark.catalog.tableExists(table_name):
    # Table already exists: insert only the rows that have no match, so
    # re-running the notebook does not create duplicates. Matched rows are
    # left untouched.
    delta_table = DeltaTable.forName(spark, table_name)
    (
        delta_table.alias("target")
        .merge(yesterday_weather.alias("source"), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )
else:
    # First run: create the table from the current batch
    yesterday_weather.write.format("delta").saveAsTable(table_name)

In [0]:
# Read the history table back and show its total row count as a quick check
# after the write above.
historic = spark.table(table_name)
historic.count()

In [0]:
# # Table name
# table_name = "workspace.idasky.idaho_historic"

# # Drop the table if it exists
# if spark.catalog.tableExists(table_name):
#     spark.sql(f"DROP TABLE {table_name}")
#     print(f"Table {table_name} has been dropped.")
# else:
#     print(f"Table {table_name} does not exist.")
