Scraper voor waarneming.nl: de soort-ID's zijn overgenomen uit de URL's van waarneming.nl. De soortnamen zijn toegevoegd zodat ze later in het dataframe staan en soorten makkelijker terug te vinden zijn. Het resultaat is één grote CSV.

In [2]:
import requests
import threading
import os
import time
from datetime import datetime, timedelta
import pandas as pd

In [3]:
# Scrape settings; the fetch functions in this notebook read them as module-level globals.
start_data = "1980-01-01"  # first date to request (YYYY-MM-DD); spelling kept because other cells use this name
end_date = "2025-04-03"  # last date to request (YYYY-MM-DD)
Location = "Rotterdam"  # value sent as the `location` query parameter
time_sleep = 1  # seconds to pause between successive page requests

In [4]:
# Every run writes into its own directory, named after the moment this cell ran.
timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
batch_directory = "batches_" + timestamp
os.makedirs(batch_directory, exist_ok=True)

# Species to scrape: species id -> display name.
# The commented-out entries are kept as a quick reference for other species ids.
species_info = {
    # 185769: "rosse metselbij",
    # 6496: "zwanenbloem",
    # 7261: "zoete kers",
    # 410: "laatvlieger",
    # 712: "kleine vuurvlinder",
    # 6390: "steenbreekvaren",
    # 390: "egel",
    # 2135: "snoek",
    # 428: "vos",
    # 204: "zanglijster",
    6562: "Haagbeuk",
}

In [5]:

def daterange(start_date, end_date, step_years=1):
    """Yield consecutive ``(window_start, window_end)`` pairs covering a date span.

    Windows are ``step_years`` long, do not overlap, and both ends of each
    window are meant to be included by the caller. The last window is clipped
    to ``end_date``.

    Args:
        start_date: First day of the span (``datetime`` or ``date``).
        end_date: Last day of the span; it is part of the final window.
        step_years: Length of each window in whole years (at least 1).

    Yields:
        Tuples ``(window_start, window_end)`` in chronological order.

    Raises:
        ValueError: If ``step_years`` is smaller than 1 (the window would
            never advance).
    """
    if step_years < 1:
        raise ValueError("step_years must be at least 1")

    current = start_date
    # `<=` rather than `<`: when a window boundary lands exactly on end_date,
    # that final day still needs its own (one-day) window.
    while current <= end_date:
        target_year = current.year + step_years
        try:
            next_start = current.replace(year=target_year)
        except ValueError:
            # 29 February has no counterpart in a non-leap target year;
            # continue from 28 February instead of crashing.
            next_start = current.replace(year=target_year, day=28)
        yield (current, min(next_start - timedelta(days=1), end_date))
        current = next_start

def fetch_observations_for_species(species_id, results):
    """Download every observation of one species, one yearly batch at a time.

    The date span, location filter and page delay come from the module-level
    settings ``start_data``, ``end_date``, ``Location`` and ``time_sleep``;
    the display name comes from ``species_info``. Each batch is paged through
    until a page holds fewer rows than the page size. Rows collected before a
    batch fails are kept.

    Args:
        species_id: Species id used in the API path and as key into
            ``species_info``.
        results: Shared dict holding a ``'lock'`` and an ``'observations'``
            dict; the collected list is stored under
            ``results['observations'][species_id]`` while the lock is held.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    page_size = 1000
    max_retries = 5
    # Without a timeout a stalled connection would block this thread forever.
    request_timeout = 30  # seconds
    species_name = species_info[species_id]
    endpoint = f"https://observation.org/api/v1/species/{species_id}/observations/"

    print(f"\n Start ophalen van observaties voor soort {species_name} (ID: {species_id})...\n")

    start_dt = datetime.strptime(start_data, "%Y-%m-%d")
    end_dt = datetime.strptime(end_date, "%Y-%m-%d")
    all_observations = []

    for batch_start, batch_end in daterange(start_dt, end_dt):
        retries = max_retries  # 429 budget is per batch, shared by all its pages
        current_page = 1
        batch_observations = []
        date_after = batch_start.strftime("%Y-%m-%d")
        date_before = batch_end.strftime("%Y-%m-%d")
        success = False

        print(f"📅 [{species_name}] Ophalen van {date_after} t/m {date_before}...")

        while True:
            # Let requests build and escape the query string instead of
            # interpolating raw values (e.g. a location containing spaces).
            query = {
                "limit": page_size,
                "country_id": 166,
                "date_after": date_after,
                "date_before": date_before,
                "user": "",
                "location": Location,
                "page": current_page,
            }

            try:
                response = requests.get(endpoint, params=query, timeout=request_timeout)
                data = response.json() if response.status_code == 200 else None
            except (requests.RequestException, ValueError) as exc:
                # Network failure or a non-JSON body: give up on this batch but
                # keep running so the rows gathered so far are still stored.
                print(f"Fout ({exc}) bij ophalen van batch {date_after} - {date_before}")
                break

            if response.status_code == 200:
                if 'detail' in data:
                    # The API answers with a 'detail' message instead of results
                    # when there is nothing to return for this query.
                    print(f"⚠️ Geen data in deze batch ({date_after} - {date_before})")
                    success = True
                    break

                observations = data.get("results", [])
                batch_observations.extend(observations)

                # A short page means this was the last page of the batch.
                if len(observations) < page_size:
                    success = True
                    break

                current_page += 1
                time.sleep(time_sleep)

            elif response.status_code == 429 and retries > 0:
                print(f"⏳ Rate limit geraakt... wachten (poging {max_retries - retries + 1}/{max_retries})")
                time.sleep(5)
                retries -= 1
            else:
                print(f"Fout ({response.status_code}) bij ophalen van batch {date_after} - {date_before}")
                break

        all_observations.extend(batch_observations)

        if success:
            print(f"Batch voltooid: {len(batch_observations)} observaties van {date_after} t/m {date_before}\n")
        else:
            print(f"Batch mislukt: {date_after} t/m {date_before}\n")

    with results['lock']:
        results['observations'][species_id] = all_observations

    print(f"Totaal opgehaald voor {species_name}: {len(all_observations)} observaties.\n")


def fetch_all_observations():
    """Scrape every species in ``species_info`` in parallel and save the result.

    One thread per species runs ``fetch_observations_for_species``. After all
    threads have finished, each observation is tagged with its species name
    and everything is written to ``combined_observations.csv`` (``;``
    separated) and ``combined_observations.json`` (one record per line) inside
    ``batch_directory``. Nothing is written when no observations came back.
    """
    shared = {'observations': {}, 'lock': threading.Lock()}

    workers = []
    for species_id in species_info:
        worker = threading.Thread(
            target=fetch_observations_for_species,
            args=(species_id, shared),
        )
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()

    # Tag every row with the readable species name while flattening.
    combined = []
    for species_id, rows in shared['observations'].items():
        for row in rows:
            row['species_name'] = species_info[species_id]
        combined += rows

    if not combined:
        print("⚠️ Geen observaties gevonden.")
        return

    frame = pd.DataFrame(combined)
    csv_path = os.path.join(batch_directory, "combined_observations.csv")
    json_path = os.path.join(batch_directory, "combined_observations.json")
    frame.to_csv(csv_path, index=False, sep=';', encoding="utf-8")
    frame.to_json(json_path, orient='records', lines=True, force_ascii=False)
    print(f"✔️ Gegevens opgeslagen in: {batch_directory}")

# Run the scrape for every species currently enabled in `species_info`.
fetch_all_observations()



 Start ophalen van observaties voor soort Haagbeuk (ID: 6562)...

📅 [Haagbeuk] Ophalen van 1980-01-01 t/m 1980-12-31...
Batch voltooid: 0 observaties van 1980-01-01 t/m 1980-12-31

📅 [Haagbeuk] Ophalen van 1981-01-01 t/m 1981-12-31...
Batch voltooid: 0 observaties van 1981-01-01 t/m 1981-12-31

📅 [Haagbeuk] Ophalen van 1982-01-01 t/m 1982-12-31...
Batch voltooid: 0 observaties van 1982-01-01 t/m 1982-12-31

📅 [Haagbeuk] Ophalen van 1983-01-01 t/m 1983-12-31...
Batch voltooid: 0 observaties van 1983-01-01 t/m 1983-12-31

📅 [Haagbeuk] Ophalen van 1984-01-01 t/m 1984-12-31...
Batch voltooid: 0 observaties van 1984-01-01 t/m 1984-12-31

📅 [Haagbeuk] Ophalen van 1985-01-01 t/m 1985-12-31...
Batch voltooid: 0 observaties van 1985-01-01 t/m 1985-12-31

📅 [Haagbeuk] Ophalen van 1986-01-01 t/m 1986-12-31...
Batch voltooid: 0 observaties van 1986-01-01 t/m 1986-12-31

📅 [Haagbeuk] Ophalen van 1987-01-01 t/m 1987-12-31...
Batch voltooid: 0 observaties van 1987-01-01 t/m 1987-12-31

📅 [Haagbeuk] 

In [None]:
# fetch_all_observations()

## Oudere versies met max 1000 waarnemingen per soort voor testing

In [None]:
# Settings for the older single-request scraper; read as module-level globals.
start_data = "1986-01-01"  # value of the `date_after` query parameter (YYYY-MM-DD)
end_date = "2025-04-03"  # value of the `date_before` query parameter (YYYY-MM-DD)
Location = "Rotterdam"  # value of the `location` query parameter
time_sleep = 5  # seconds to sleep after a successful request and before a 429 retry

In [6]:
# Build a unique directory name from the current timestamp
timestamp = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
batch_directory = "batches_" + timestamp

# Create the directory (no error if it already exists)
os.makedirs(batch_directory, exist_ok=True)

# The species ids to fetch, mapped to their display names.
# The commented-out entries are kept as a quick reference for other species ids.
species_info = {
    # 185769: "rosse metselbij",
    # 6496: "zwanenbloem",
    # 7261: "zoete kers",
    # 410: "laatvlieger",
    # 712: "kleine vuurvlinder",
    # 6390: "steenbreekvaren",
    # 390: "egel",
    # 2135: "snoek",
    # 428: "vos",
    # 204: "zanglijster"
    6562: "Haagbeuk",
}

In [7]:
def fetch_observations_for_species(species_id, results):
    """Fetch one page (at most 1000 rows) of observations for one species.

    The query uses the module-level settings ``start_data``, ``end_date`` and
    ``Location``. A 429 response is retried after ``time_sleep`` seconds, up to
    five attempts in total; any other failure ends the attempt.

    Args:
        species_id: Species id used in the API path and as key into
            ``species_info``.
        results: Shared dict holding a ``'lock'`` and an ``'observations'``
            dict; on success the rows are stored under
            ``results['observations'][species_id]`` while the lock is held.

    Returns:
        None. Nothing is stored for the species when the request fails.
    """
    max_retries = 5  # maximum number of attempts when the API answers 429
    # Without a timeout a stalled connection would block this thread forever.
    request_timeout = 30  # seconds
    species_name = species_info[species_id]
    endpoint = f"https://observation.org/api/v1/species/{species_id}/observations/"
    # Passed via `params` so requests escapes the values.
    query = {
        "limit": 1000,
        "country_id": 166,
        "date_after": start_data,
        "date_before": end_date,
        "user": "",
        "location": Location,
    }

    print(f"Start ophalen van observaties voor soort {species_name} (ID: {species_id})...")

    for _ in range(max_retries):
        try:
            response = requests.get(endpoint, params=query, timeout=request_timeout)
        except requests.RequestException as exc:
            # Report network failures instead of letting them kill the thread.
            print(f"Fout bij ophalen voor soort {species_name} (ID: {species_id}): {exc}")
            return

        if response.status_code == 200:
            data = response.json()

            # The API reports an unknown species through a 'detail' message.
            if 'detail' in data and data['detail'] == "No Species matches the given query.":
                print(f"Geen observaties voor soort {species_name} (ID: {species_id}).")
                return

            observations = data.get("results", [])
            with results['lock']:
                results['observations'][species_id] = observations

            # Count the local list so the shared dict is not read outside the lock.
            print(f"Opgehaald {len(observations)} observaties voor soort {species_name} (ID: {species_id}).")
            time.sleep(time_sleep)  # rate-limit protection
            return

        if response.status_code == 429:
            # Report the real delay; the old text always claimed 5 seconds.
            print(f"Rate limit bereikt voor soort {species_name} (ID: {species_id}), opnieuw proberen in {time_sleep} seconden...")
            time.sleep(time_sleep)
            continue

        print(f"Fout bij ophalen voor soort {species_name} (ID: {species_id}): {response.status_code}")
        return  # any other status is not retried

    # Every attempt hit the rate limit; say so instead of returning silently.
    print(f"Rate limit blijft actief voor soort {species_name} (ID: {species_id}); gestopt na {max_retries} pogingen.")

In [8]:
def fetch_all_observations():
    """Run one fetch thread per species and save the merged observations.

    Starts a thread running ``fetch_observations_for_species`` for each key of
    ``species_info``, waits for all of them, adds a ``species_name`` field to
    every observation and writes the combined rows to
    ``combined_observations.csv`` (``;`` separated) and
    ``combined_observations.json`` (one record per line) in ``batch_directory``.
    """
    shared = {'observations': {}, 'lock': threading.Lock()}
    workers = []

    print(f"Start met ophalen van observaties voor {len(species_info)} soorten...")

    # One worker thread per configured species id
    for species_id in species_info:
        print(f"Start thread voor soort {species_info[species_id]} (ID: {species_id})...")
        worker = threading.Thread(
            target=fetch_observations_for_species,
            args=(species_id, shared),
        )
        workers.append(worker)
        worker.start()

    # Block until every worker has finished
    print("Wachten op het afronden van alle threads...")
    for worker in workers:
        worker.join()

    print("Alle threads zijn afgerond. Gegevens verzamelen...")

    # Flatten the per-species lists into one list, tagging each row with the
    # species name on the way
    merged = []
    for species_id, rows in shared['observations'].items():
        if not rows:
            continue
        for row in rows:
            row['species_name'] = species_info[species_id]
        merged += rows

    if not merged:
        print("Geen observaties gevonden voor de opgegeven soorten.")
        return

    frame = pd.DataFrame(merged)

    # CSV export, semicolon separated
    frame.to_csv(
        os.path.join(batch_directory, "combined_observations.csv"),
        index=False,
        sep=';',
        encoding="utf-8",
    )

    # JSON export, one record per line
    frame.to_json(
        os.path.join(batch_directory, "combined_observations.json"),
        orient='records',
        lines=True,
        force_ascii=False,
    )

    print(f"Alle observaties zijn succesvol opgeslagen in {batch_directory}.")

In [6]:
# Start fetching the data
fetch_all_observations()

Start met ophalen van observaties voor 10 soorten...
Start thread voor soort rosse metselbij (ID: 185769)...
Start ophalen van observaties voor soort rosse metselbij (ID: 185769)...
Start thread voor soort zwanenbloem (ID: 6496)...
Start ophalen van observaties voor soort zwanenbloem (ID: 6496)...
Start thread voor soort zoete kers (ID: 7261)...
Start ophalen van observaties voor soort zoete kers (ID: 7261)...
Start thread voor soort laatvlieger (ID: 410)...
Start ophalen van observaties voor soort laatvlieger (ID: 410)...
Start thread voor soort kleine vuurvlinder (ID: 712)...
Start ophalen van observaties voor soort kleine vuurvlinder (ID: 712)...
Start thread voor soort steenbreekvaren (ID: 6390)...
Start ophalen van observaties voor soort steenbreekvaren (ID: 6390)...
Start thread voor soort egel (ID: 390)...
Start ophalen van observaties voor soort egel (ID: 390)...
Start thread voor soort snoek (ID: 2135)...
Start ophalen van observaties voor soort snoek (ID: 2135)...
Start threa

In [8]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import ast

# Folder that holds the CSV files to convert
directory = 'batches_2025-04-09_15-38-22'

# Names of the files in that folder whose name ends with 'with_geometry.csv'
csv_files = list(
    filter(lambda name: name.endswith('with_geometry.csv'), os.listdir(directory))
)

def process_csv(file_path):
    """Read an observations CSV and return it as a GeoDataFrame.

    Each ``point`` cell must contain the text of a Python dict literal with a
    ``'coordinates'`` sequence, e.g. ``{'type': 'Point', 'coordinates': [x, y]}``.
    The first two coordinates become the x and y of a shapely ``Point``.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A GeoDataFrame with all CSV columns plus a ``geometry`` column, with
        its CRS set to EPSG:4326.
    """
    # NOTE(review): read with the default ',' separator, while the scraper in
    # this notebook writes ';'-separated files; confirm how the
    # '*with_geometry.csv' inputs are produced.
    df = pd.read_csv(file_path)

    def to_point(cell):
        # Parse the literal once per row instead of once per coordinate.
        coordinates = ast.literal_eval(cell)['coordinates']
        return Point(coordinates[0], coordinates[1])

    df['geometry'] = df['point'].apply(to_point)

    gdf = gpd.GeoDataFrame(df, geometry='geometry')

    # Assumes the coordinates are WGS84 longitude/latitude (EPSG:4326).
    # Reassigning the result avoids mutating the frame in place.
    gdf = gdf.set_crs('EPSG:4326', allow_override=True)

    return gdf

# Convert every matching CSV file to a GeoPackage stored next to it
for csv_file in csv_files:
    file_path = os.path.join(directory, csv_file)

    # Build the GeoDataFrame for this file
    gdf = process_csv(file_path)

    # Swap only the trailing extension; str.replace would also rewrite any
    # earlier '.csv' that happens to occur in the directory or file name.
    output_gpkg = os.path.splitext(file_path)[0] + '.gpkg'

    # Write the GeoDataFrame as a GeoPackage (GPKG) file
    gdf.to_file(output_gpkg, driver='GPKG')

    # Report where the file was written
    print(f"GeoDataFrame saved to {output_gpkg} as GeoPackage.")


In [9]:
# Load the combined export; ';' matches the separator the scraper uses when writing combined_observations.csv.
df = pd.read_csv("batches_2025-04-09_15-38-22/combined_observations.csv", sep=';')

In [10]:
# Show the loaded observations as the cell output.
df

Unnamed: 0,id,species,date,time,number,sex,point,accuracy,notes,is_certain,...,user_detail,modified,species_group,validation_status,location,location_detail,photos,sounds,permalink,species_name
0,342881366,7261,2025-04-01,13:02,1,U,"{'type': 'Point', 'coordinates': [4.5329755, 5...",4.0,,True,...,"{'id': 862457, 'name': 'Ellen', 'avatar': None}",2025-04-01T13:36:16.453879,10,J,10315,"{'id': 10315, 'name': 'Rotterdam - Ommoord', '...",['https://observation.org/media/photo/11287473...,[],https://waarneming.nl/observation/342881366/,zoete kers
1,342880389,7261,2025-04-01,12:55,1,U,"{'type': 'Point', 'coordinates': [4.5325256, 5...",4.0,,True,...,"{'id': 862457, 'name': 'Ellen', 'avatar': None}",2025-04-01T13:00:03.449545,10,J,10315,"{'id': 10315, 'name': 'Rotterdam - Ommoord', '...",['https://observation.org/media/photo/11287431...,[],https://waarneming.nl/observation/342880389/,zoete kers
2,342392516,7261,2025-03-27,18:40,1,U,"{'type': 'Point', 'coordinates': [4.5298177, 5...",4.0,,True,...,"{'id': 797050, 'name': 'Theo', 'avatar': None}",2025-03-27T18:55:29.553576,10,J,6280,"{'id': 6280, 'name': 'Rotterdam - Kralingsche ...",['https://observation.org/media/photo/11261394...,[],https://waarneming.nl/observation/342392516/,zoete kers
3,331071639,7261,2024-10-16,14:42,1,U,"{'type': 'Point', 'coordinates': [4.552328, 51...",4.0,,True,...,"{'id': 89724, 'name': 'Rob v Dorland', 'avatar...",2024-10-16T18:16:11.174559,10,J,16690,"{'id': 16690, 'name': 'Rotterdam - Nessebos', ...",['https://observation.org/media/photo/10674696...,[],https://waarneming.nl/observation/331071639/,zoete kers
4,331034189,7261,2024-10-16,14:19,1,U,"{'type': 'Point', 'coordinates': [4.4771260209...",5.0,,False,...,"{'id': 1006175, 'name': 'Richard', 'avatar': '...",2024-10-16T14:19:53.551012,10,O,10202,"{'id': 10202, 'name': 'Rotterdam - centrum', '...",['https://observation.org/media/photo/10671467...,[],https://waarneming.nl/observation/331034189/,zoete kers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4421,218973098,204,2021-07-02,19:47,1,U,"{'type': 'Point', 'coordinates': [4.4578924179...",3.0,,True,...,"{'id': 123974, 'name': 'Marco Tanis', 'avatar'...",2023-12-15T03:00:07.349689,1,A,43651,"{'id': 43651, 'name': 'Rotterdam - Wielewaal',...",[],[],https://waarneming.nl/observation/218973098/,zanglijster
4422,218931897,204,2021-07-03,12:03,1,U,"{'type': 'Point', 'coordinates': [4.5379918813...",10.0,,True,...,"{'id': 114006, 'name': 'Tiemen van Engelenhove...",2023-12-15T02:59:47.309527,1,A,44455,"{'id': 44455, 'name': 'Rotterdam - Het Lage La...",[],[],https://waarneming.nl/observation/218931897/,zanglijster
4423,218665875,204,2021-07-01,13:33,1,U,"{'type': 'Point', 'coordinates': [4.5472693443...",6.0,,True,...,"{'id': 11646, 'name': 'Anton van Meurs', 'avat...",2023-12-15T02:56:23.852477,1,A,21045,"{'id': 21045, 'name': 'Rotterdam - Prinsenpark...",[],[],https://waarneming.nl/observation/218665875/,zanglijster
4424,218623135,204,2021-06-30,16:07,1,U,"{'type': 'Point', 'coordinates': [4.541172, 51...",51.0,Papa-dag soort #82,True,...,"{'id': 196593, 'name': 'Vasco vd Berg', 'avata...",2023-12-15T02:56:08.953751,1,A,21044,"{'id': 21044, 'name': 'Rotterdam - Ommoordse V...",[],[],https://waarneming.nl/observation/218623135/,zanglijster
