In [1]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime, timedelta

In [2]:
# TODO: Add error handling
def fetch_weather(latitudes, longitudes, start_date, end_date):
    """
    Fetch hourly historical weather from the Open-Meteo archive API.

    For every (latitude, longitude) pair the hourly variables
    temperature_2m, rain and relative_humidity_2m are requested, in that
    order, with "timezone" set to "auto".

    Args:
        latitudes (iterable): Latitudes [1st_lat, 2nd_lat, ...]. Any iterable
            (list, pandas Series, ...) is accepted and converted to a list.
        longitudes (iterable): Longitudes corresponding to the latitudes
            [1st_long, 2nd_long, ...]. Must have the same length as latitudes.
        start_date (str): Starting date in the ISO 8601 format (i.e. YYYY-MM-DD)
        end_date (str): Ending date in the same format as start_date

    Returns:
        list: The response objects returned by the Open-Meteo client
            (one per requested coordinate pair in the test run below).
            An empty list is returned, without contacting the API, when no
            coordinates are given.

    Raises:
        ValueError: If latitudes and longitudes differ in length.
    """

    # Materialise the inputs first so that they can be measured and validated
    # before any session is created or any request is sent.
    latitudes = list(latitudes)
    longitudes = list(longitudes)

    if len(latitudes) != len(longitudes):
        raise ValueError(
            "latitudes and longitudes must have the same length "
            f"(got {len(latitudes)} and {len(longitudes)})"
        )
    if not latitudes:
        # Nothing to look up, so there is nothing to ask the API for.
        return []

    url = "https://archive-api.open-meteo.com/v1/archive"
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)

    params = {
        "latitude": latitudes,
        "longitude": longitudes,
        "start_date": start_date,
        "end_date": end_date,
        # The order matters to callers that read the variables by position.
        "hourly": ["temperature_2m", "rain", "relative_humidity_2m"],
        "timezone": "auto"
    }

    return openmeteo.weather_api(url, params=params)

In [3]:
# Load the suburb table; its Latitude / Longitude columns feed the weather requests.
df = pd.read_csv(filepath_or_buffer='../Datasets/Suburb.csv')
# Peek at the first two rows to confirm the file parsed as expected.
df.head(n=2)

Unnamed: 0,OfficialNameSuburb,OfficialNameState,OfficialCodeLocalGovernmentArea,OfficialCodeState,Latitude,Longitude,GeoShape
0,Adaminaby,New South Wales,17040,1,-36.011932,148.78632,"{""coordinates"": [[[148.71675360000006, -36.060..."
1,Albury,New South Wales,10050,1,-36.073698,146.913468,"{""coordinates"": [[[146.92431042300007, -36.086..."


In [4]:
# Smoke-test fetch_weather on the first few suburbs of the table
n = 5
latitudes = df["Latitude"].head(n)
longitudes = df["Longitude"].head(n)
start_date = '2000-01-01'
# For a short recent window instead, build start_date the same way as end_date
# below but with timedelta(3), i.e. three days ago.
end_date = (datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d')  # yesterday's date

res = fetch_weather(latitudes, longitudes, start_date, end_date)

In [5]:
# Display the raw value returned by fetch_weather for the test request above.
res

[<openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x2b8d08cdc70>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x2b8d08cdca0>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x2b8d08cdbb0>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x2b8d08cdfd0>,
 <openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x2b8d08cdd30>]

In [6]:
# Prototype the extraction steps on the first response only.
res1 = res[0]
# help(res1)  # uncomment to list the methods available on the response object

In [7]:
# --- Location metadata reported by the response ---
latitude, longitude = res1.Latitude(), res1.Longitude()
timezone = res1.Timezone()
timezone_diff = res1.UtcOffsetSeconds()  # offset from UTC (GMT+0), in seconds

# --- Hourly block ---
hourly = res1.Hourly()

# Rebuild the timestamp of every hourly value from the block's first timestamp,
# its end timestamp (excluded via inclusive="left") and its step in seconds.
# NOTE(review): the index is built in UTC and timezone_diff is not applied in
# any of the visible cells, although the request asks for "timezone": "auto" —
# confirm whether later per-day grouping should use local time instead.
first_timestamp = pd.to_datetime(hourly.Time(), unit = "s", utc = True)
end_timestamp = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True)
step = pd.Timedelta(seconds = hourly.Interval())
hourly_index = pd.date_range(
    start = first_timestamp,
    end = end_timestamp,
    freq = step,
    inclusive = "left"
)

# Variables are read by position, in the order they were requested in
# fetch_weather: temperature_2m, rain, relative_humidity_2m.
hourly_temperature_2m, rain, relative_humidity_2m = (
    hourly.Variables(position).ValuesAsNumpy() for position in range(3)
)

In [8]:
# Sanity check: number of hourly timestamps generated for the requested date range.
print("Number of rows:", len(hourly_index))

Number of rows: 215448


In [9]:
# Assemble the hourly series into one table: a timestamp column plus one
# column per weather variable.
hourly_data = dict(
    index=hourly_index,
    temperature=hourly_temperature_2m,
    rain=rain,
    relative_humidity=relative_humidity_2m,
)

hourly_df = pd.DataFrame(data=hourly_data)

In [10]:
# Spot-check a few random rows (no random_state is set, so the rows shown differ between runs).
hourly_df.sample(5)

Unnamed: 0,index,temperature,rain,relative_humidity
60768,2006-12-06 14:00:00+00:00,11.966001,0.0,90.538605
15656,2001-10-13 22:00:00+00:00,6.766,0.2,88.914581
192999,2022-01-06 05:00:00+00:00,21.09,0.1,63.891418
9495,2001-01-30 05:00:00+00:00,19.016001,0.3,67.514183
92035,2010-07-01 09:00:00+00:00,1.516,0.0,89.429184


In [11]:
# Check dtypes, missing values and memory footprint of the hourly table.
hourly_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215448 entries, 0 to 215447
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   index              215448 non-null  datetime64[ns, UTC]
 1   temperature        215435 non-null  float32            
 2   rain               215435 non-null  float32            
 3   relative_humidity  215435 non-null  float32            
dtypes: datetime64[ns, UTC](1), float32(3)
memory usage: 4.1 MB


4.1 MB × 15,000 suburbs ≈ 61.5 GB of hourly data

This can be reduced by aggregating the hourly data into daily data.

In [14]:
# Collapse the hourly table to one row per day: mean / max / min for
# temperature and humidity, total for rain.
agglist = ['mean', 'max', 'min']
column_aggs = {
    'temperature': agglist,
    'rain': ['sum'],
    'relative_humidity': agglist,
}

# Named aggregation yields flat "<column>_<function>" column names directly.
named_aggs = {
    f'{column}_{func}': (column, func)
    for column, funcs in column_aggs.items()
    for func in funcs
}

# NOTE(review): the 'index' column is in UTC, so the daily bins are UTC
# calendar days — confirm whether local days are wanted instead.
daily_df = (
    hourly_df
    .groupby(pd.Grouper(key='index', freq='D'))
    .agg(**named_aggs)
    .reset_index()
)

In [15]:
# Check the row count and memory footprint of the daily table for comparison with the hourly one.
daily_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8978 entries, 0 to 8977
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   index                   8978 non-null   datetime64[ns, UTC]
 1   temperature_mean        8978 non-null   float32            
 2   temperature_max         8978 non-null   float32            
 3   temperature_min         8978 non-null   float32            
 4   rain_sum                8978 non-null   float32            
 5   relative_humidity_mean  8978 non-null   float32            
 6   relative_humidity_max   8978 non-null   float32            
 7   relative_humidity_min   8978 non-null   float32            
dtypes: datetime64[ns, UTC](1), float32(7)
memory usage: 315.8 KB


0.3 MB × 15,000 suburbs ≈ 4.5 GB of daily data

This can be further reduced by clustering suburbs that are close to each other.

In [15]:
# Function for processing the responses
def process_responses(responses):
    """
    Turn Open-Meteo archive responses into one table of daily aggregates.

    Applies to every response the steps prototyped on a single response in
    the cells above: rebuild the hourly UTC timestamps, read the three hourly
    variables by position, then aggregate per day.

    Args:
        responses (iterable): Response objects as returned by fetch_weather.
            The hourly variables are read by position, so they must have been
            requested in the order temperature_2m, rain, relative_humidity_2m.

    Returns:
        pandas.DataFrame: One row per response and day with the columns
            latitude, longitude, index (start of the day, UTC),
            temperature_mean/max/min, rain_sum and
            relative_humidity_mean/max/min. The frame has these columns and
            no rows when responses is empty.
    """
    agglist = ['mean', 'max', 'min']
    aggregations = {
        'temperature': agglist,
        'rain': ['sum'],
        'relative_humidity': agglist,
    }
    output_columns = ['latitude', 'longitude', 'index'] + [
        f'{column}_{func}'
        for column, funcs in aggregations.items()
        for func in funcs
    ]

    daily_frames = []
    for response in responses:
        hourly = response.Hourly()
        hourly_df = pd.DataFrame({
            # One timestamp per hourly value; the end timestamp is excluded.
            "index": pd.date_range(
                start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
                end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=hourly.Interval()),
                inclusive="left",
            ),
            "temperature": hourly.Variables(0).ValuesAsNumpy(),
            "rain": hourly.Variables(1).ValuesAsNumpy(),
            "relative_humidity": hourly.Variables(2).ValuesAsNumpy(),
        })

        daily_df = hourly_df.groupby(pd.Grouper(key='index', freq='D')).agg(aggregations)
        # Flatten the (column, function) pairs into "<column>_<function>".
        daily_df.columns = ['_'.join(pair) for pair in daily_df.columns]
        daily_df = daily_df.reset_index()

        # Tag every row with the coordinates reported by the response so the
        # locations stay distinguishable after concatenation.
        daily_df.insert(0, 'longitude', response.Longitude())
        daily_df.insert(0, 'latitude', response.Latitude())
        daily_frames.append(daily_df)

    if not daily_frames:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(daily_frames, ignore_index=True)

Actually, we will probably have to use MySQL.