Nils Hornstein, 7369566

# TOAR REST-API Download Script
This Jupyter notebook contains download scripts to download data from the TOAR database. It is assumed that a logged-in user exists for TOAR, as this is required for the script to function properly. Such an user can be created via the TOAR dashboard at the following link: https://toar-data.fz-juelich.de/gui/v2/auth/logout=True.
<br>The notebook contains three different setups to download data from TOAR:
1. Via the TOAR REST API analysis service and data saved in JSON files
2. Via the TOAR REST API analysis service and data saved in csv files
3. Via the TOAR REST API time series endpoint and data saved in csv files

Each script creates its own folder structure to store the downloaded data and downloads the same measurement data. 

In [1487]:
from io import StringIO
import json
import pandas as pd
import requests
import time
from pathlib import Path
import zipfile
import io
import os
import calendar

# Configuration
Insert your access token that can be created in your user profile. This is the only modification you have to do in order to run the download scripts.
<br>***Note!** Your access token will be only valid for one hour after creation.*

In [1520]:
ACCESS_TOKEN = 'YOUR_ACCESS_TOKEN'

These arguments are given to each download script. As a result, each variant downloads the same data. This data consists of measurements of the variable 'particles up to 10 µm diameter' from a station in Cologne, with a specific flag and in the period from 1 December 2023 to 7 December 2023. 

In [1500]:
TOAR_SERVICE_URL="https://toar-data.fz-juelich.de/api/v2/"
headers = {'AccessToken': ACCESS_TOKEN}
flags = "AllOK"
start_date = "2023-12-01T00:00:00"
end_date = "2023-12-07T23:59:59"
date_range = f"{start_date},{end_date}"

# Data download via TOAR analysis service

In [1518]:
print(f"Send request to REST API ...")
start_time = time.time()

# Request the data of one time series on a specific variable, from a specific station, in a specific date range, and specific quality flags  
resp = requests.get( 
    f"{TOAR_SERVICE_URL}analysis/data/timeseries/"
    "?station_id=37,38,403,404"
    "&data_origin=Instrument"
    "&data_origin_type=Measurement"
    "&variable_id=13"
    f"&flags={flags}"
    f"&daterange={date_range}"
    "&limit=1", 
    headers=headers,
    timeout=(3.05, 20) 
)

result_url = resp.json().get('status', None)  # None, if task id does not exist
# print(result_url) # Control output of the task ID and URL

# Downlaod the file from our request as a ZIP archive
start_download_time = time.time()
resp = requests.get(result_url)
zip_content = BytesIO(resp.content)
end_download_time = time.time()
download_time = end_download_time - start_download_time
print(f"Download finished in {download_time:.2f} seconds.")

with zipfile.ZipFile(zip_content, "r") as zip_file:
    json_name = zip_file.namelist()[0]
    with zip_file.open(json_name) as file:
        content = file.read().decode("utf-8").strip()
        json_text = "\n".join(line for line in content.splitlines() if not line.startswith("#")) # delete first line of the JSON file which has no usage
        data = json.loads(json_text)
        
all_data = data.get("data", [])

# Get metadata to name folder dynamically
station_id = data.get("metadata", {}).get("station", {}).get("id", "unknown")
station_name = data.get("metadata", {}).get("station", {}).get("name", "unknown")
variable_name = data.get("metadata", {}).get("variable", {}).get("name", "unknown")
        
# Create dataframe 
df = pd.DataFrame(all_data)
if not df.empty and 'datetime' in df.columns:
    df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
    df.set_index('datetime', inplace=True)

print(df.head())
print("Number of rows:", len(df))

# Save data as JSON files for each day
if not df.empty:
    for day, daily_data in df.groupby(pd.Grouper(freq='D')):
        if daily_data.empty:
            continue
            
        folder_path = os.path.join(
            "TOAR Data via Analysis Service in JSON",
            f"Station {station_id}, {station_name}",
            f"Variable {variable_name}",
            str(day.year),
            calendar.month_name[day.month]
        )
        os.makedirs(folder_path, exist_ok=True)

        filepath = os.path.join(folder_path, f"{day.date()}.json")
        daily_json = (
            daily_data.reset_index()
            .assign(datetime=lambda df: df["datetime"].dt.strftime("%Y-%m-%dT%H:%M:%SZ"))
            .to_dict(orient="records")
        )

        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(daily_json, f, ensure_ascii=False, indent=2)

        print(f"{filepath} saved. Rows: {len(daily_data)}")
        
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Whole process finished in {elapsed_time:.2f} seconds.")

Send request to REST API ...
None


In [1479]:
print(f"Send request to REST API ...")
start_time = time.time()

# Request the data of one time series on a specific variable, from a specific station, in a specific date range, and specific quality flags  
resp = requests.get( 
    f"{TOAR_SERVICE_URL}analysis/data/timeseries/"
    "?station_id=37,38,403,404"
    "&data_origin=Instrument"
    "&data_origin_type=Measurement"
    "&variable_id=13"
    f"&flags={flags}"
    f"&daterange={date_range}"
    "&limit=1", 
    headers=headers,
    timeout=(3.05, 20) 
)

result_url = resp.json().get('status', None)  # None, if task id does not exist
#print(status_url) # Control output of the task ID and URL

# Downlaod the file from our request as a ZIP archive
start_download_time = time.time()
resp = requests.get(result_url)
zip_content = BytesIO(resp.content)
end_download_time = time.time()
download_time = end_download_time - start_download_time
print(f"Download finished in {download_time:.2f} seconds.")

with zipfile.ZipFile(zip_content, "r") as zip_file:
    json_name = zip_file.namelist()[0]
    with zip_file.open(json_name) as file:
        content = file.read().decode("utf-8").strip()
        json_text = "\n".join(line for line in content.splitlines() if not line.startswith("#")) # delete first line of the JSON file which has no usage
        data = json.loads(json_text)
        
all_data = data.get("data", [])

# Get metadata to name folder dynamically
station_id = data.get("metadata", {}).get("station", {}).get("id", "unknown")
station_name = data.get("metadata", {}).get("station", {}).get("name", "unknown")
variable_name = data.get("metadata", {}).get("variable", {}).get("name", "unknown")
        
# Create dataframe 
df = pd.DataFrame(all_data)
if not df.empty and 'datetime' in df.columns:
    df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
    df.set_index('datetime', inplace=True)

print(df.head())
print("Number of rows:", len(df))

# Save data as CSV files for each day
if not df.empty:
    start_date = df.index.min().normalize()
    end_date = df.index.max().normalize()
    current_day = start_date

    while current_day <= end_date:
        next_day = current_day + timedelta(days=1)
        daily_data = df[(df.index >= current_day) & (df.index < next_day)]
        
        if not daily_data.empty:
            year = str(current_day.year)
            month_name = calendar.month_name[current_day.month]
            folder_path = os.path.join(
                "TOAR Data via Analysis Service in csv",
                f"Station_{station_id or 'unknown'}",
                f"Variable_{variable_name or 'unknown'}",
                year,
                month_name
            )
            os.makedirs(folder_path, exist_ok=True)
            
            filename = f"{current_day.date()}.csv"
            filepath = os.path.join(folder_path, filename)
            daily_data.to_csv(filepath, encoding="utf-8")
            print(f"{filepath} saved. Rows: {len(daily_data)}")
        
        current_day = next_day
        
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Whole process finished in {elapsed_time:.2f} seconds.")

Send request to REST API ...


MissingSchema: Invalid URL 'None': No scheme supplied. Perhaps you meant https://None?

# Data download via time series endpoint

## Prerequisites (find your time series)

In [1526]:
# Find possible time series we want to download according to specified criteria via search service
resp = requests.get( 
    f"{TOAR_SERVICE_URL}/search/"
    "?station_id=37,38,403,404"
    "&data_origin=Instrument"
    "&data_origin_type=Measurement"
    "&variable_id=13"
    "&limit=None", 
    headers=headers,
    timeout=(3.05, 20) 
)
time_series = resp.json()
time_series_ids = [ts.get("id") for ts in time_series]
print(time_series_ids)

[19956, 19958, 19986, 19987, 31457, 31544, 31830, 31866, 31869, 31887, 31901, 32153, 32160]


In [1528]:
# Find a time series we want to download via timeseries endpoint search that is the first one that fulfil further specifications
resp = requests.get( 
    f"{TOAR_SERVICE_URL}/timeseries/{ids[0]}"
    f"?flags={flags}"
    f"&daterange={date_range}"
    "&limit=1", 
    headers=headers,
    timeout=(3.05, 20) 
)
ts = resp.json()
ts_id = ts.get("id")
print(ts)
print(ts_id)

{'id': 19956, 'label': '', 'order': 1, 'sampling_frequency': 'hourly', 'aggregation': 'mean of two values', 'data_start_date': '2012-01-01T00:00:00+00:00', 'data_end_date': '2025-12-03T21:00:00+00:00', 'data_origin': 'instrument', 'data_origin_type': 'measurement', 'provider_version': 'N/A', 'sampling_height': 2.0, 'additional_metadata': {}, 'doi': '', 'coverage': -1.0, 'station': {'id': 37, 'codes': ['DENW053'], 'name': 'Köln-Chorweiler', 'coordinates': {'lat': 51.019345, 'lng': 6.884636, 'alt': 45.0}, 'coordinate_validation_status': 'not checked', 'country': 'Germany', 'state': 'Nordrhein-Westfalen', 'type': 'background', 'type_of_area': 'urban', 'timezone': 'Europe/Berlin', 'additional_metadata': {'rice_production': 0.0, 'station_alt_flag': '0', 'wheat_production': 0.0, 'google_resolution': '153', 'soybean_production': 0.0, 'station_google_alt': '46', 'station_reported_alt': '45', 'station_landcover_description': 'Urbanandbuilt-up:53.9%,Croplands:20.3%,Mixedforest:17.4%,Cropland/Nat

## Date arithmetic

In [1530]:
print(f"Start the download of time series {ts_id} ...")
start_time = time.time()

# Download the time series we have found before
result = requests.get(
    f"{TOAR_SERVICE_URL}/data/timeseries/{ts_id}?format=csv",
    headers=headers,
    timeout=(3.05, 10)
)

timeseries_meta = json.loads("\n".join([line[1:] for line in result.text.split('\n') if line.startswith('#')]))

df = pd.read_csv(
    StringIO(result.text),
    comment="#",
    parse_dates=["datetime"],
    index_col="datetime"
)

# Make sure the index is timezone-aware in UTC
if df.index.tz is None:
    df.index = df.index.tz_localize("UTC")
else:
    df.index = df.index.tz_convert("UTC")

# define range
if not isinstance(start_date, pd.Timestamp):
    start_date = pd.to_datetime(start_date, utc=True)
if not isinstance(end_date, pd.Timestamp):
    end_date = pd.to_datetime(end_date, utc=True)

# Get metadata to name folder dynamically
station_id = timeseries_meta.get("station", {}).get("id", "unknown")
station_name = timeseries_meta.get("station", {}).get("name", "unknown")
variable_name = timeseries_meta.get("variable", {}).get("name", "unknown")

# Create a folder to store the csv files
folder_path = os.path.join(
    "TOAR Data via time series endpoint in csv",
    f"Station {station_id}, {station_name}",
    f"Variable {variable_name}",
    str(day.year),
    calendar.month_name[day.month]
)
os.makedirs(folder_path, exist_ok=True)

#  Iterate over each day in time range and save each as a csv file
current_day = start_date

while current_day <= end_date:
    next_day = current_day + timedelta(days=1)

    daily_data = df[(df.index >= current_day) & (df.index < next_day)]

    if not daily_data.empty:
        filename = f"{current_day.date()}.csv"
        filepath = os.path.join(folder_path, filename)
        daily_data.to_csv(filepath, encoding="utf-8")
        print(f"{filename} saved. Rows: {len(daily_data)}")

    current_day = next_day

end_time = time.time()
print(f"Whole process finished in {end_time - start_time:.2f} seconds.")

Start the download of time series 19956 ...


JSONDecodeError: Expecting value: line 1 column 1 (char 0)