In [5]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime, timedelta

In [6]:
# Inputs are validated before any request is made. Errors raised by the
# Open-Meteo client itself (network/API failures) still propagate to the caller.
def _parse_iso_date(value, name):
    """Parse `value` as a 'YYYY-MM-DD' string, raising ValueError (mentioning `name`) otherwise."""
    date_format = "%Y-%m-%d"
    try:
        parsed = datetime.strptime(value, date_format)
    except (TypeError, ValueError) as exc:
        raise ValueError(f"{name} must be a 'YYYY-MM-DD' string, got {value!r}") from exc
    # strptime tolerates unpadded fields such as '2000-1-1'; insist on the canonical form.
    if parsed.strftime(date_format) != value:
        raise ValueError(f"{name} must be a zero-padded 'YYYY-MM-DD' string, got {value!r}")
    return parsed


def fetch_weather(latitudes, longitudes, start_date, end_date):
    """
    Get hourly weather data for (latitudes, longitudes) from start_date to end_date.

    Requests temperature_2m, rain, precipitation and relative_humidity_2m from the
    Open-Meteo archive endpoint with "timezone" set to "auto", going through a
    cached session that retries on error.

    Args:
        latitudes (iterable): Latitudes [1st_lat, 2nd_lat, ...]; converted with list().
        longitudes (iterable): Longitudes corresponding to the latitudes [1st_long, 2nd_long, ...]
        start_date (str): Starting date in the ISO 8601 format (i.e. YYYY-MM-DD)
        end_date (str): Ending date in the same format as start_date

    Returns:
        The value returned by openmeteo.weather_api(). In this notebook that is a
        list of openmeteo_sdk WeatherApiResponse objects, one per coordinate pair.

    Raises:
        ValueError: If the coordinate lists are empty or differ in length, if a
            date is not a 'YYYY-MM-DD' string, or if start_date is after end_date.
    """
    latitudes = list(latitudes)
    longitudes = list(longitudes)

    # Fail fast on bad input instead of spending a (retried) API round trip on it.
    if not latitudes or not longitudes:
        raise ValueError("latitudes and longitudes must not be empty")
    if len(latitudes) != len(longitudes):
        raise ValueError(
            f"latitudes and longitudes must have the same length, "
            f"got {len(latitudes)} and {len(longitudes)}"
        )

    start = _parse_iso_date(start_date, "start_date")
    end = _parse_iso_date(end_date, "end_date")
    if start > end:
        raise ValueError(f"start_date {start_date} is after end_date {end_date}")

    url = "https://archive-api.open-meteo.com/v1/archive"
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)

    params = {
        "latitude": latitudes,
        "longitude": longitudes,
        "start_date": start_date,
        "end_date": end_date,
        # The order of this list fixes the positions used with hourly.Variables(i) later on.
        "hourly": ["temperature_2m", "rain", "precipitation", "relative_humidity_2m"],
        "timezone": "auto",
    }

    return openmeteo.weather_api(url, params=params)

In [7]:
# Load the suburb table; its Latitude/Longitude columns supply the coordinates queried below.
df = pd.read_csv('Datasets/Suburb.csv')
df.head(2)

Unnamed: 0,OfficialNameSuburb,OfficialNameState,OfficialCodeLocalGovernmentArea,OfficialCodeState,Latitude,Longitude,GeoShape
0,Adaminaby,New South Wales,17040,1,-36.011932,148.78632,"{""coordinates"": [[[148.71675360000006, -36.060..."
1,Albury,New South Wales,10050,1,-36.073698,146.913468,"{""coordinates"": [[[146.92431042300007, -36.086..."


In [27]:
# Smoke-test fetch_weather on the first n suburbs
n = 5
latitudes = df["Latitude"][:n]
longitudes = df["Longitude"][:n]

start_date = '2000-01-01'
# Shorter window for quick runs (3 days ago):
#   start_date = (datetime.today() - timedelta(3)).strftime('%Y-%m-%d')
end_date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')  # yesterday

res = fetch_weather(latitudes, longitudes, start_date, end_date)

In [28]:
# Inspect the raw return value of fetch_weather
res

[<openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x208a002cd00>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x208a002ca60>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x208a002cf40>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x208a002cb20>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x208a00c9220>]

In [29]:
# Take the first response to explore what a single response object exposes
res1 = res[0]
# help(res1)  # uncomment to list the methods available on the response object

In [32]:
# Location metadata reported by the first response
latitude, longitude = res1.Latitude(), res1.Longitude()
timezone = res1.Timezone()
timezone_diff = res1.UtcOffsetSeconds()  # Timezone difference to GMT+0

# Hourly block: rebuild the timestamp axis (in UTC) from its start, end and step.
# The end bound is excluded (inclusive="left").
hourly = res1.Hourly()
hourly_index = pd.date_range(
    start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
    end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
    freq=pd.Timedelta(seconds=hourly.Interval()),
    inclusive="left",
)

# Variables are read by position 0..3, in the same order as the "hourly" list
# requested in fetch_weather.
hourly_temperature_2m, rain, precipitation, relative_humidity_2m = (
    hourly.Variables(position).ValuesAsNumpy() for position in range(4)
)

In [37]:
# Number of hourly timestamps generated for this one location
print("Number of rows:", hourly_index.size)

Number of rows: 215448


In [34]:
# Gather the time axis and the four hourly series as DataFrame columns
hourly_data = dict(
    index=hourly_index,
    temperature=hourly_temperature_2m,
    rain=rain,
    precipitation=precipitation,
    relative_humidity=relative_humidity_2m,
)

hourly_df = pd.DataFrame(hourly_data)

In [36]:
# Column dtypes, non-null counts and memory footprint of the single-location frame
hourly_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215448 entries, 0 to 215447
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   index              215448 non-null  datetime64[ns, UTC]
 1   temperature        215411 non-null  float32            
 2   rain               215411 non-null  float32            
 3   precipitation      215411 non-null  float32            
 4   relative_humidity  215411 non-null  float32            
dtypes: datetime64[ns, UTC](1), float32(4)
memory usage: 4.9 MB


Estimated memory for all suburbs: 4.9 MB per suburb × 15,000 suburbs ≈ 73.5 GB.

1. We can reduce the size by clustering coordinates that are close together and fetching one weather series per cluster. Each suburb is then just assigned a cluster ID.

In [None]:
# Function for processing the responses
def process_responses(
    responses,
    variable_names=("temperature", "rain", "precipitation", "relative_humidity"),
):
    """
    Convert Open-Meteo responses into one long-format hourly DataFrame.

    Applies to every response the same extraction that the notebook performs by
    hand on a single response: rebuild the UTC timestamp axis from the hourly
    block's Time()/TimeEnd()/Interval(), then read each hourly variable by position.

    Args:
        responses (iterable): Response objects exposing Latitude(), Longitude() and
            Hourly(), e.g. the return value of fetch_weather.
        variable_names (sequence of str): Column name for the hourly variable at each
            position. The default assumes the variables were requested in the order
            temperature_2m, rain, precipitation, relative_humidity_2m. Verify this
            against the "hourly" list in fetch_weather if that list changes.

    Returns:
        pandas.DataFrame: One row per (response, hour) with columns
            "response_index" (position of the response in `responses`),
            "latitude", "longitude" (as reported by the response; these may differ
            from the requested coordinates, TODO confirm), "index" (UTC timestamp)
            and one column per entry of `variable_names`. Empty input yields an
            empty frame with the same columns.

    Note:
        Everything is held in memory at once, so process responses in batches
        when the number of locations is large.
    """
    variable_names = list(variable_names)
    columns = ["response_index", "latitude", "longitude", "index", *variable_names]

    frames = []
    for response_index, response in enumerate(responses):
        hourly = response.Hourly()
        hourly_index = pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )

        # Scalars are broadcast against the timestamp axis by the DataFrame constructor.
        data = {
            "response_index": response_index,
            "latitude": response.Latitude(),
            "longitude": response.Longitude(),
            "index": hourly_index,
        }
        for position, name in enumerate(variable_names):
            data[name] = hourly.Variables(position).ValuesAsNumpy()

        frames.append(pd.DataFrame(data, columns=columns))

    if not frames:
        return pd.DataFrame(columns=columns)
    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

Actually, we will probably have to use MySQL for this.