In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas

In [0]:


import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry

# Open-Meteo client built on an HTTP session that caches responses in ".cache"
# and retries failed requests.
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Request definition. Every weather variable needed later must be listed here.
# The order of the "hourly" and "daily" lists matters: the response returns the
# variables positionally, in the order they were requested.
# NOTE(review): no "timezone" parameter is sent, so daily aggregates are
# presumably computed on GMT days rather than local days - confirm this is intended.
url = "https://historical-forecast-api.open-meteo.com/v1/forecast"
params = {
    "latitude": 43.826,
    "longitude": -111.7897,
    "start_date": "2025-01-01",
    "end_date": "2025-12-07",
    "daily": [
        "weather_code",
        "temperature_2m_max",
        "temperature_2m_min",
    ],
    "hourly": [
        "relative_humidity_2m",
        "temperature_2m",
        "dew_point_2m",
        "apparent_temperature",
        "precipitation_probability",
        "precipitation",
        "showers",
        "rain",
        "weather_code",
        "surface_pressure",
        "visibility",
        "cloud_cover",
        "pressure_msl",
        "wind_speed_10m",
        "wind_speed_80m",
        "wind_speed_120m",
        "wind_speed_180m",
        "temperature_80m",
        "uv_index",
        "is_day",
        "sunshine_duration",
    ],
    "wind_speed_unit": "mph",
    "temperature_unit": "fahrenheit",
    "precipitation_unit": "inch",
}
responses = openmeteo.weather_api(url, params=params)

# A single location was requested, so only the first response is used.
# Loop over `responses` to handle several locations or weather models.
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

# Process hourly data.
# The response returns variables positionally, in the order they were requested,
# so the column names are taken straight from params["hourly"]. That keeps the
# request list as the single source of truth: adding, removing or reordering a
# variable there can no longer leave a hand-numbered index pointing at the
# wrong column.
hourly = response.Hourly()
hourly_variable_names = params["hourly"]

# Fail loudly instead of mislabelling columns if the response does not carry
# exactly one series per requested variable.
if hourly.VariablesLength() != len(hourly_variable_names):
    raise ValueError(
        f"Expected {len(hourly_variable_names)} hourly variables, "
        f"got {hourly.VariablesLength()}"
    )

# One timestamp per interval, from Time() up to but excluding TimeEnd(), in UTC.
# location_id is a scalar and is broadcast to every row by the DataFrame constructor.
hourly_data = {
    "date": pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    ),
    "location_id": 1,
}
for position, variable_name in enumerate(hourly_variable_names):
    hourly_data[variable_name] = hourly.Variables(position).ValuesAsNumpy()

hourly_dataframe = pd.DataFrame(data=hourly_data)
print("\nHourly data\n", hourly_dataframe)

# Process daily data.
# The response returns variables positionally, in the order they were requested,
# so the column names are taken straight from params["daily"] rather than from
# hand-numbered indices that must be kept in sync with the request list.
daily = response.Daily()
daily_variable_names = params["daily"]

# Fail loudly instead of mislabelling columns if the response does not carry
# exactly one series per requested variable.
if daily.VariablesLength() != len(daily_variable_names):
    raise ValueError(
        f"Expected {len(daily_variable_names)} daily variables, "
        f"got {daily.VariablesLength()}"
    )

# One timestamp per interval, from Time() up to but excluding TimeEnd(), in UTC.
# location_id is a scalar and is broadcast to every row by the DataFrame constructor.
daily_data = {
    "date": pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    ),
    "location_id": 1,
}
for position, variable_name in enumerate(daily_variable_names):
    daily_data[variable_name] = daily.Variables(position).ValuesAsNumpy()

daily_dataframe = pd.DataFrame(data=daily_data)
print("\nDaily data\n", daily_dataframe)


In [0]:
%sql
-- Create the idasky schema if it is missing; the tables saved later in this notebook are written into it.
CREATE SCHEMA IF NOT EXISTS idasky;






In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql import Row

# Schema of the location dimension table: one nullable field per (name, type) pair.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("location_id", IntegerType()),
        ("city", StringType()),
        ("state", StringType()),
        ("lat", DoubleType()),
        ("lon", DoubleType()),
        ("elevation", DoubleType()),
    )
])

# The single location covered so far: Rexburg, ID.
# lat/lon repeat the coordinates sent in the weather API request;
# elevation is an approximate value in meters above sea level.
rexburg_row = Row(
    location_id=1,
    city="Rexburg",
    state="ID",
    lat=43.826,
    lon=-111.7897,
    elevation=1480.0,
)

# One-row Spark DataFrame for the location table.
location_df = spark.createDataFrame([rexburg_row], schema=schema)

In [0]:
# Convert the pandas frames built earlier in the notebook into Spark DataFrames.
hourly_df = spark.createDataFrame(hourly_dataframe)
daily_df = spark.createDataFrame(daily_dataframe)

# Remove any existing copies of the three tables before writing them again.
spark.sql("DROP TABLE IF EXISTS idasky.daily_table")
spark.sql("DROP TABLE IF EXISTS idasky.hourly_table")
spark.sql("DROP TABLE IF EXISTS idasky.location")

# Persist each DataFrame as a table in the idasky schema.
# The daily table is written from daily_df; the name `spark_df` used here
# before is never defined in the notebook and raised a NameError after the
# tables above had already been dropped.
daily_df.write.mode("overwrite").saveAsTable("idasky.daily_table")
hourly_df.write.mode("overwrite").saveAsTable("idasky.hourly_table")
location_df.write.mode("overwrite").saveAsTable("idasky.location")