In [1]:
!pip install ratelimit tenacity openaq tqdm


StatementMeta(, f003f4e9-a9c3-4490-85e5-656b3499e50e, 3, Finished, Available, Finished)

Collecting ratelimit
  Downloading ratelimit-2.2.1.tar.gz (5.3 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting openaq
  Downloading openaq-0.6.0-py3-none-any.whl.metadata (4.5 kB)
Collecting httpx<1.0,>=0.28.1 (from openaq)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting httpcore==1.* (from httpx<1.0,>=0.28.1->openaq)
  Downloading httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Collecting h11>=0.16 (from httpcore==1.*->httpx<1.0,>=0.28.1->openaq)
  Downloading h11-0.16.0-py3-none-any.whl.metadata (8.3 kB)
Downloading openaq-0.6.0-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.28.1-py3-none-any.whl (73 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import json
from datetime import datetime, timedelta,timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from openaq import OpenAQ
import time
from threading import Lock
from tqdm import tqdm
import fsspec
from ratelimit import limits, sleep_and_retry
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception
from requests.exceptions import HTTPError

StatementMeta(, f003f4e9-a9c3-4490-85e5-656b3499e50e, 4, Finished, Available, Finished)

In [3]:
# The OpenAQ API key is a secret: read it from the environment rather than
# embedding it in the notebook. A missing variable fails fast with KeyError.
client = OpenAQ(api_key=os.environ["OPENAQ_API_KEY"])

# Root folder (abfss URL) under which the sensor checkpoint and the downloaded
# measurement files are stored.
abfss_path = "abfss://4906b11e-1e59-4869-9321-062a4696a2db@onelake.dfs.fabric.microsoft.com/62794233-3c68-4109-ab1e-7666b1963827/Files/aq"
account_name = "4906b11e-1e59-4869-9321-062a4696a2db"
account_host = "onelake.dfs.fabric.microsoft.com"

# fsspec filesystem handle used for every read/write below.
fs = fsspec.filesystem(
    "abfss",
    account_name=account_name,
    account_host=account_host
)

StatementMeta(, f003f4e9-a9c3-4490-85e5-656b3499e50e, 5, Finished, Available, Finished)

In [4]:
# Read the sensor checkpoint document; its top-level keys are parsed as
# integer sensor ids.
with fs.open(f"{abfss_path}/sensors/sensors.json", "r") as file:
    checkpoint_data = json.loads(file.read())

sensors_list = list(map(int, checkpoint_data.keys()))

# Base folder that receives one sub-folder per sensor.
data_base = f"{abfss_path}/data"


def to_dict(obj):
    """Recursively convert an object graph into plain Python containers.

    Args:
        obj: Any value. Lists and dicts are walked element by element, and
            any other object exposing ``__dict__`` is turned into a dict of
            its (converted) attributes.

    Returns:
        A structure in which every list, dict and attribute-bearing object
        has been converted; all other values are returned unchanged.
    """
    if isinstance(obj, list):
        return [to_dict(item) for item in obj]
    if isinstance(obj, dict):
        # Recurse into mapping values as well; otherwise objects stored inside
        # a dict would be handed back unconverted.
        return {key: to_dict(value) for key, value in obj.items()}
    if hasattr(obj, "__dict__"):
        return {name: to_dict(value) for name, value in obj.__dict__.items()}
    return obj

StatementMeta(, f003f4e9-a9c3-4490-85e5-656b3499e50e, 6, Finished, Available, Finished)

In [5]:
# Shared by all worker threads so that only one measurements request is in
# flight at a time.
lock = Lock()

# Rate-limit budget handed to ratelimit.limits: `calls` calls per `periods`.
calls = 60
periods = 60

# Pause (passed to time.sleep) taken after each successful request while the
# lock is still held.
call_intervall = 1.1


@sleep_and_retry
@limits(calls=calls, period=periods)
def safe_measurements_list(**kwargs):
    """Call ``client.measurements.list`` under the shared lock and rate limit.

    All keyword arguments are forwarded unchanged. After the request returns,
    the function sleeps ``call_intervall`` before releasing the lock.

    Returns:
        The ``results`` attribute of the response.
    """
    with lock:
        response = client.measurements.list(**kwargs)
        result = response.results
        time.sleep(call_intervall)
    return result

def is_rate_limit_error(e):
    """Return True when ``e`` is an ``HTTPError`` carrying a 429 response.

    NOTE(review): ``HTTPError`` here is the requests exception, while the
    openaq package depends on httpx — confirm the SDK really raises this type.
    """
    if not isinstance(e, HTTPError):
        return False
    # The response may be absent on an HTTPError. Compare against None
    # explicitly: an error response object can itself be falsy.
    response = getattr(e, "response", None)
    return response is not None and response.status_code == 429

def is_timeout_error(e):
    """Return True when ``e`` is an ``HTTPError`` carrying a 408 response."""
    if not isinstance(e, HTTPError):
        return False
    # The response may be absent on an HTTPError. Compare against None
    # explicitly: an error response object can itself be falsy.
    response = getattr(e, "response", None)
    return response is not None and response.status_code == 408

def _is_retryable(exc):
    """Return True for errors worth retrying: rate-limit or timeout errors."""
    return is_rate_limit_error(exc) or is_timeout_error(exc)


@retry(
    retry=retry_if_exception(_is_retryable),
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=1, max=60),
)
def safe_measurements_retry(**kwargs):
    """Run ``safe_measurements_list`` with retries on retryable errors.

    Retries only when the raised exception is a rate-limit or timeout error,
    waiting exponentially (bounded to 1-60) between tries and stopping after
    5 attempts. Keyword arguments are forwarded unchanged.
    """
    return safe_measurements_list(**kwargs)




# Upper bound (UTC midnight) of the fetch window; each sensor's last
# timestamp is capped to this value.
end_date = datetime(year=2025, month=12, day=16, tzinfo=timezone.utc)

# Serializes updates to the shared checkpoint dict and rewrites of
# sensors.json, because fetch_sensor_data runs on several worker threads.
_checkpoint_lock = Lock()


def fetch_sensor_data(sensor_id):
    """Download one sensor's measurements into chunked JSON files.

    The sensor's time range (first timestamp up to the earlier of its last
    timestamp and ``end_date``) is walked in 100-day windows. Each window is
    paged through with ``limit=500`` and ``data="hours"``; every non-empty
    page is written to ``<data_base>/<sensor_id>/chunk_NNNNN.json``. After
    each page the sensor's ``last_fetch`` checkpoint entry is updated, and
    the whole checkpoint document is rewritten once the sensor is done.

    Args:
        sensor_id: Sensor id; must be a key (as a string) of
            ``checkpoint_data``.

    Returns:
        The string ``"Finished sensor <sensor_id>"``.
    """
    sensor_info = client.sensors.get(sensor_id).results[0]

    datetime_first = datetime.fromisoformat(
        sensor_info.datetime_first["utc"].replace("Z", "+00:00")
    )

    datetime_last = min(
        end_date,
        datetime.fromisoformat(
            sensor_info.datetime_last["utc"].replace("Z", "+00:00")
        ),
    )

    chunk_days = 100
    current_from = datetime_first

    sensor_folder = f"{data_base}/{sensor_id}"
    fs.mkdirs(sensor_folder, exist_ok=True)

    # Running file counter across all windows and pages of this sensor.
    chunk_index = 0

    while current_from < datetime_last:
        current_to = min(current_from + timedelta(days=chunk_days), datetime_last)
        page = 1

        while True:
            measurements = safe_measurements_retry(
                sensors_id=sensor_id,
                datetime_from=current_from.isoformat(),
                datetime_to=current_to.isoformat(),
                page=page,
                limit=500,
                data="hours",
            )

            # An empty page marks the end of this window.
            if not measurements:
                break

            file_path = f"{sensor_folder}/chunk_{chunk_index:05d}.json"
            with fs.open(file_path, "w") as f:
                json.dump([to_dict(m) for m in measurements], f, indent=2)

            last_dt = max(
                datetime.fromisoformat(
                    m.period.datetime_to.utc.replace("Z", "+00:00")
                )
                for m in measurements
            )
            # Another worker may be serializing checkpoint_data right now;
            # mutate it only while holding the checkpoint lock.
            with _checkpoint_lock:
                checkpoint_data[str(sensor_id)]["last_fetch"] = last_dt.isoformat()

            page += 1
            chunk_index += 1

        current_from = current_to

    # Only one thread at a time may rewrite the shared checkpoint file.
    with _checkpoint_lock:
        with fs.open(f"{abfss_path}/sensors/sensors.json", "w") as f:
            json.dump(checkpoint_data, f, indent=2)

    return f"Finished sensor {sensor_id}"



# Fetch all sensors on two worker threads and print each result as it
# completes. The client is closed even when a worker raised.
try:
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(fetch_sensor_data, sensor) for sensor in sensors_list]
        for future in as_completed(futures):
            # result() re-raises any exception raised inside the worker.
            print(future.result())
finally:
    client.close()

StatementMeta(, f003f4e9-a9c3-4490-85e5-656b3499e50e, 7, Finished, Available, Finished)

Finished sensor 1097
Finished sensor 673
Finished sensor 1102
Finished sensor 1103
Finished sensor 1152
Finished sensor 1145
Finished sensor 1146
Finished sensor 25520
Finished sensor 1662910


In [6]:
# NOTE(review): prints a constant only; looks like a leftover smoke-test cell — confirm and remove.
print(123)

StatementMeta(, f003f4e9-a9c3-4490-85e5-656b3499e50e, 8, Finished, Available, Finished)

123
