In [1]:
import pandas as pd
import requests
from tqdm.notebook import tqdm

In [2]:
# Relative directory holding the CSV files this notebook reads and writes.
# File paths are built by plain string interpolation, so keep the trailing slash.
PATH_TO_DATASETS = "../Datasets/"

# GREEN SPACES

This dataset provides an overview of public green spaces, excluding solitary trees. It includes the location, type of vegetation, and management information for each green space.

In [27]:
# Load the raw public-green export; the file is semicolon-delimited.
green_spaces_csv = PATH_TO_DATASETS + 'openbaar-groen0.csv'
df = pd.read_csv(green_spaces_csv, sep=';')

In [28]:
# 'geo_point_2d' is split on the comma into two text columns, which are then
# converted to numbers; values that cannot be parsed become NaN (errors='coerce').
lat_lon_parts = df['geo_point_2d'].str.split(',', expand=True)
df[['LAT', 'LON']] = lat_lon_parts
for coord_col in ('LAT', 'LON'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

In [None]:
#df.to_csv(f'{PATH_TO_DATASETS}green-spaces.csv', index=False)

In [32]:
# Reload the green-spaces table from disk (default comma separator).
# NOTE(review): this file is presumably produced by the commented-out
# df.to_csv(...) export in this notebook — confirm it exists before running.
df = pd.read_csv(PATH_TO_DATASETS + 'green-spaces.csv')

In [33]:
# Drop the columns listed below. The result is assigned back instead of
# mutating in place, and errors='ignore' keeps the cell re-runnable: without
# it, a second run (or a green-spaces.csv that was already exported without
# these columns) raises KeyError.
df = df.drop(
    columns=['geo_point_2d', 'geo_shape', 'TECHN_KWAL', 'STREET'],
    errors='ignore',
)

In [None]:
#df.to_csv(f'{PATH_TO_DATASETS}green-spaces.csv', index=False)

# AIR POLLUTION

In [36]:
# Load the raw air-pollution monitoring export; the file is semicolon-delimited.
df = pd.read_csv(
    PATH_TO_DATASETS + 'real-time-fijnstof-monitoring.csv',
    sep=';',
)

In [39]:
# Columns to discard: everything whose name starts with 'content', plus the
# explicitly named columns below.
named_drops = ('Data', 'lossetimestamps', 'lossetimestamps_entities', 'separatedentities')
cols_to_drop = [
    col
    for col in df.columns
    if col.startswith('content') or col in named_drops
]

# Cleaning pipeline, applied in this order:
#   1. remove the unwanted columns
#   2. parse 'Timestamp' (unparseable values become NaT)
#   3. keep only rows that have both a UFP and an NO2 reading
#   4. remove exact duplicate rows
#   5. renumber the index from 0
df_clean = (
    df
    .drop(columns=cols_to_drop)
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], errors='coerce'))
    .dropna(subset=['UFP', 'NO2'])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
#df_clean.to_csv(f'{PATH_TO_DATASETS}airpollution.csv', index=False)

Unnamed: 0,Location,Timestamp,Latitude,Longitude,UFP,PM1,PM2.5,PM10,NO2,geopoint
0,521,2025-05-22 08:40:00+02:00,51.4379,5.3582,12059.0,8.33,14.71,32.09,16.0,"51.4379, 5.3582"
1,521,2025-05-22 08:50:00+02:00,51.4379,5.3582,11527.0,6.95,12.25,35.25,16.0,"51.4379, 5.3582"
2,521,2025-05-22 09:00:01+02:00,51.4379,5.3582,10414.0,6.49,11.57,19.8,13.0,"51.4379, 5.3582"
3,521,2025-05-22 09:10:00+02:00,51.4379,5.3582,9772.0,5.88,10.51,20.8,17.0,"51.4379, 5.3582"
4,525,2025-05-22 08:30:01+02:00,51.4546,5.3888,14094.0,7.83,14.39,24.58,23.0,"51.4546, 5.3888"
5,525,2025-05-22 08:40:00+02:00,51.4545,5.3887,16646.0,9.04,17.12,32.92,16.0,"51.4545, 5.3887"
6,525,2025-05-22 09:00:00+02:00,51.4545,5.3887,8498.0,6.69,12.76,30.24,11.0,"51.4545, 5.3887"
7,525,2025-05-22 09:10:00+02:00,51.4545,5.3887,9469.0,5.5,12.17,32.06,10.0,"51.4545, 5.3887"
8,525,2025-05-22 10:00:00+02:00,51.4545,5.3887,21292.0,5.19,7.9,17.55,24.0,"51.4545, 5.3887"
9,536,2025-05-22 09:10:00+02:00,51.4681,5.3901,7895.0,5.89,9.89,21.53,24.0,"51.4681, 5.3901"


In [None]:
# Download the measurement stream of every location in the project and write a
# per-location / per-year / per-entity summary (mean, min, max, std) to CSV.
BASE_URL = "https://api.dustmonitoring.nl/v1"
PROJECT_ID = 30
REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely when no timeout is given

locations_resp = requests.get(
    f"{BASE_URL}/project({PROJECT_ID})/locations",
    timeout=REQUEST_TIMEOUT,
)
# Fail loudly here: nothing below can work without the list of locations.
locations_resp.raise_for_status()
locations = locations_resp.json()['value']
location_ids = [loc['ID'] for loc in locations]

all_data = []
# (location id, reason) for every stream that could not be fetched, so that an
# empty result can be diagnosed instead of being skipped silently.
failed_locations = []

for loc_id in tqdm(location_ids, desc="Fetching data"):
    url = (
        f"{BASE_URL}/project({PROJECT_ID})/location({loc_id})/stream"
        "?$filter=timestamp gt 2025-05-23T00:00:00Z"
        "&$select=timestamp,entity,value"
    )
    try:
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        failed_locations.append((loc_id, repr(exc)))
        continue
    if resp.status_code != 200:
        failed_locations.append((loc_id, f"HTTP {resp.status_code}"))
        continue

    data = resp.json().get('value', [])
    # NOTE(review): the query selects 'timestamp,entity,value' while the keys
    # read below are 'Timestamp', 'Name' and 'Value' — confirm these match the
    # actual response payload.
    for entry in data:
        all_data.append({
            'Location': loc_id,
            'Timestamp': entry['Timestamp'],
            'Name': entry['Name'],
            'Value': entry['Value']
        })

if failed_locations:
    print(f"{len(failed_locations)} of {len(location_ids)} locations could not be fetched:")
    for failed_id, reason in failed_locations:
        print(f"  location {failed_id}: {reason}")

# Explicit columns keep the frame well-formed even when no records came back.
df = pd.DataFrame(all_data, columns=['Location', 'Timestamp', 'Name', 'Value'])

if df.empty:
    # Nothing to aggregate; do not write (or overwrite) the summary file.
    print("No measurements were returned; skipping the yearly summary.")
else:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['Year'] = df['Timestamp'].dt.year

    summary = (
        df.groupby(['Location', 'Year', 'Name'])['Value']
        .agg(['mean', 'min', 'max', 'std'])
        .reset_index()
    )

    summary.to_csv(f'{PATH_TO_DATASETS}airpollution_yearly_summary.csv', index=False)

Fetching data:   0%|          | 0/52 [00:00<?, ?it/s]

In [4]:
# Display the raw records gathered by the fetch loop (one dict per stream entry).
all_data

[]