In [1]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime, timedelta
import os
import sys
import pickle
import logging
from tqdm.notebook import trange, tqdm


## Define Functions

In [2]:
# TODO: Add error handling
def fetch_weather(latitudes, longitudes, start_date, end_date):
    """
    Request hourly archive weather from Open-Meteo for one or more locations.

    The request asks for the hourly variables temperature_2m, rain and
    relative_humidity_2m (in that order) with "timezone" set to "auto".

    Args:
        latitudes (iterable): Latitudes [1st_lat, 2nd_lat, ...]
        longitudes (iterable): Longitudes paired by position with the latitudes [1st_long, 2nd_long, ...]
        start_date (str): First day of the range in ISO 8601 format (i.e. YYYY-MM-DD)
        end_date (str): Last day of the range, in the same format as start_date

    Returns:
        list: Whatever ``openmeteo.weather_api`` returns. In this notebook that is a
            list of openmeteo_sdk ``WeatherApiResponse`` objects, and the collection
            loop treats it as one response per requested location.

    Raises:
        ValueError: If latitudes and longitudes do not have the same length.
    """
    # Materialise first so pandas Series / generators are accepted and can be measured
    latitudes = list(latitudes)
    longitudes = list(longitudes)

    # Fail fast on unpaired coordinates instead of spending an API call on a bad request
    if len(latitudes) != len(longitudes):
        raise ValueError(
            f"latitudes and longitudes must have the same length "
            f"(got {len(latitudes)} and {len(longitudes)})"
        )

    url = "https://archive-api.open-meteo.com/v1/archive"
    # Setup the Open-Meteo API client with cache and retry on error
    cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)

    params = {
        "latitude": latitudes,
        "longitude": longitudes,
        "start_date": start_date,
        "end_date": end_date,
        # process_response reads these variables by position, so keep the order stable
        "hourly": ["temperature_2m", "rain", "relative_humidity_2m"],
        "timezone": "auto"
    }

    responses = openmeteo.weather_api(url, params=params)
    return responses

In [5]:
# Function for processing the responses
def process_response(response):
    """
    Turn one weather response into a per-day summary table.

    The hourly block of ``response`` is read by variable position:
    0 -> Temperature, 1 -> Rain, 2 -> RelativeHumidity.

    Args:
        response: Object exposing ``Hourly()``, whose result provides ``Time()``,
            ``TimeEnd()``, ``Interval()`` (all in seconds) and
            ``Variables(i).ValuesAsNumpy()``.

    Returns:
        pandas.DataFrame: One row per day with the columns Datetime,
            TemperatureMean, TemperatureMax, TemperatureMin, RainSum,
            RelativeHumidityMean, RelativeHumidityMax, RelativeHumidityMin.
    """
    hourly = response.Hourly()

    # Timestamps cover [Time, TimeEnd) in steps of Interval seconds, labelled as UTC.
    # NOTE(review): fetch_weather requests timezone "auto", but the buckets below are
    # UTC days - confirm whether local-day aggregation was intended.
    timestamps = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=hourly.Interval()),
        inclusive="left",
    )

    # Assemble the hourly table, pulling each variable by its position
    series = {"Datetime": timestamps}
    for position, column in enumerate(("Temperature", "Rain", "RelativeHumidity")):
        series[column] = hourly.Variables(position).ValuesAsNumpy()
    hourly_frame = pd.DataFrame(series)

    # Collapse to calendar days; output names are <Column><Statistic>
    by_day = hourly_frame.groupby(pd.Grouper(key="Datetime", freq="D"))
    daily = by_day.agg(
        TemperatureMean=("Temperature", "mean"),
        TemperatureMax=("Temperature", "max"),
        TemperatureMin=("Temperature", "min"),
        RainSum=("Rain", "sum"),
        RelativeHumidityMean=("RelativeHumidity", "mean"),
        RelativeHumidityMax=("RelativeHumidity", "max"),
        RelativeHumidityMin=("RelativeHumidity", "min"),
    )

    return daily.reset_index()

In [4]:
def get_logger(logname="APIcalls.log"):
    """
    Return the root logger, set up to append DEBUG-and-above records to ``logname``.

    The log file is used to keep a trace of API calls in case the daily request
    limit is reached or a request fails part-way through a run.

    Calling this repeatedly with the same file is safe: a file handler is only
    attached if the root logger does not already have one for that file, so
    records are never written twice.

    Args:
        logname (str): Path of the log file. It is created if missing and
            appended to otherwise.

    Returns:
        logging.Logger: The root logger. Because it is the root logger at DEBUG
            level, records propagated from other libraries' loggers are written
            to the same file.
    """
    # Remember whether the file was there before the handler creates it,
    # so the status message below stays accurate
    already_existed = os.path.exists(logname)
    log_path = os.path.abspath(logname)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # logging.basicConfig silently does nothing when the root logger already has
    # handlers, so attach the file handler explicitly - but only once per file
    attached = False
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == log_path:
            attached = True
            break

    if not attached:
        fhandler = logging.FileHandler(filename=logname, mode="a", encoding="utf-8")
        # One format for new and existing files, so the log stays uniform across runs
        formatter = logging.Formatter(
            '%(asctime)s [%(levelname)s] - %(name)s:%(filename)s > %(message)s'
        )
        fhandler.setFormatter(formatter)
        logger.addHandler(fhandler)

    if already_existed:
        print(f"Loaded log {logname}")
    else:
        print(f"Created log {logname}")

    return logger

In [5]:
# insert(0, "ClusterID", )

In [6]:
def collect_data(df):
    """Placeholder for the data-collection step: not implemented yet, returns None."""
    return None

In [6]:
# Load the cluster table; later cells read its Latitude and Longitude columns
df = pd.read_csv('../Datasets/Clusters.csv')
# Preview the first two rows
df.head(2)

Unnamed: 0,ClusterID,Latitude,Longitude
0,100412,-35.855856,148.648649
1,101407,-36.216216,146.846847


## Test the Functions

In [8]:
# Smoke-test fetch_weather on the first n rows of df
n = 1  # number of leading cluster rows to request
latitudes = df.Latitude[:n]
longitudes = df.Longitude[:n]
# Fixed date range in ISO 8601 (YYYY-MM-DD) format
start_date = '2000-01-01'
end_date = '2024-07-31'
# Disabled alternative: a short rolling window from 3 days ago to yesterday
# start_date = datetime.today() - timedelta(3) # Get the date for 3 days ago
# start_date = start_date.strftime('%Y-%m-%d')
# end_date = datetime.today() - timedelta(1) # Get yesterday's date
# end_date = end_date.strftime('%Y-%m-%d')

responses = fetch_weather(latitudes, longitudes, start_date, end_date)

In [9]:
# Show the raw value returned by fetch_weather in the cell above
responses

[<openmeteo_sdk.WeatherApiResponse.WeatherApiResponse at 0x1ee1c72e3d0>]

In [None]:
# Approximate fetch time by number of locations (n):
# n = 10  => ~12 seconds
# n = 100 => ~77 seconds

In [10]:
# NOTE: sys.getsizeof is shallow - it counts only the list object / the single
# response object itself, not the data they refer to, so these numbers do not
# measure the memory held by the weather payload.
print(f'The size of arrays of {n} responses : {sys.getsizeof(responses)} bytes')
print(f'The size of each response : {sys.getsizeof(responses[0])} bytes')

The size of arrays of 1 responses : 88 bytes
The size of each response : 40 bytes


## Collect and Store Responses Using Pickle

In [7]:
# Pickle file that accumulates the fetched responses
pname = "responses.pkl"

responses = []
if os.path.exists(pname):
    print("File already exist")
else:
    # First run: seed the file with an empty list so the load below succeeds
    with open(pname, 'wb') as f:
        pickle.dump(responses, f)

# Read back whatever is currently stored on disk.
# pickle.load can execute arbitrary code - only use it on files this notebook wrote.
with open(pname, 'rb') as f:
    loaded_responses = pickle.load(f)

File already exist


In [8]:
# Number of responses already stored in the pickle file loaded above
len(loaded_responses)

200

In [11]:
# Fetch weather for every remaining row of df in batches and checkpoint to disk.
step = 100  # number of locations sent per API request
start_date = '2000-01-01'
end_date = '2024-07-31'
logger = get_logger("APIcalls.log")
pname = 'responses.pkl'

# Resume from what is actually on disk, so `start` cannot drift from the saved
# state when this cell is re-run after a partial run
if os.path.exists(pname):
    with open(pname, 'rb') as f:
        loaded_responses = pickle.load(f)
else:
    loaded_responses = []

responses = []
start = len(loaded_responses)  # assumes one stored response per row of df
end = len(df)

try:
    for i in trange(start, end, step):
        stop = min(i + step, end)  # clamp the last batch to the table length
        latitudes = df.Latitude[i:stop]
        longitudes = df.Longitude[i:stop]
        try:
            responses += fetch_weather(latitudes, longitudes, start_date, end_date)
        except Exception as err:
            # `i` is already the absolute row offset (the range starts at `start`)
            s = f"{err} Encountered at iteration {i}"
            print(s)
            logger.error(s)
            break
        logger.debug(f"Responses fetched for rows {i}-{stop}")
finally:
    # Persist on success, on API failure and on interruption alike.
    # Write the merged list: dumping only `responses` would overwrite and lose
    # everything saved by earlier runs.
    if responses:
        loaded_responses += responses
        with open(pname, 'wb') as f:
            pickle.dump(loaded_responses, f)
        logger.debug(f"Saved {len(loaded_responses)} responses to {pname}")

Loaded log APIcalls.log


  0%|          | 0/21 [00:00<?, ?it/s]

{'error': True, 'reason': 'Daily API request limit exceeded. Please try again tomorrow.'} Encountered at iteration 0
