In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


### GET full list of dataset IDs from OBIS API

In [2]:
# Define the URL
url = "https://api.obis.org/dataset/search?offset=0&limit=100000&q="

# Make the GET request; the timeout keeps a stalled connection from hanging the kernel
response = requests.get(url, timeout=60)

# Fail loudly on an HTTP error instead of just printing a message: the cells
# below index into `data`, so continuing without it would only surface later
# as a confusing NameError (or silently reuse `data` from an earlier run).
response.raise_for_status()
data = response.json()


In [3]:
# Spot-check the response structure: show the 'id' of the second record in 'results'
data['results'][1]['id']

'96041282-9386-4a8a-a564-fd753f1222b0'

In [4]:
# Pull the 'id' field out of every record in the search results, preserving order
datasetids = list(map(lambda rec: rec['id'], data['results']))


In [5]:
# Number of dataset IDs extracted from the search response
len(datasetids)

5323

In [6]:
# `ids` is the name the collection loop below iterates over; it refers to the same list as `datasetids`
ids=datasetids

### Use datasetids list as reference to lookup more information

##### Step 1) Use BeautifulSoup HTML scraping to extract the organizations listed under the "DATA PROVIDERS" section AND any organization names listed under "Contacts". E.g. for https://obis.org/dataset/e2c4a839-89b5-484f-a63c-df89c2f9f974 the Data Providers are 'DNB' and 'GBIF Norway', and the Contact Orgs are 'DNV', 'DNV', 'DNV', 'GBIF Norway', 'DNV'. The Contact Orgs serve as a fallback for datasets where no Data Providers are listed.

In [None]:
import time
import csv
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [None]:
# Function to extract data providers and contact organizations
def get_data_providers_and_contacts(dataset_id, timeout=30):
    """Scrape an OBIS dataset page for its data providers and contact organizations.

    Parameters
    ----------
    dataset_id : str
        Dataset identifier; interpolated into ``https://obis.org/dataset/{dataset_id}``.
    timeout : float or tuple, optional
        Timeout in seconds passed to the HTTP request so a stalled connection
        cannot block the caller indefinitely.

    Returns
    -------
    tuple
        ``(data_providers, contact_orgs, flag)``. On success both lists hold the
        text scraped from the page (possibly empty) and ``flag`` is ``None``.
        If the request fails for any reason (not only timeouts) the result is
        ``([], [], "TIMEOUT")``.
    """
    url = f"https://obis.org/dataset/{dataset_id}"

    # Retry strategy
    retry_strategy = Retry(
        total=3,  # Number of retries
        backoff_factor=1,  # Exponential backoff between retries
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these status codes
        allowed_methods=["HEAD", "GET", "OPTIONS"]  # Use allowed_methods instead of method_whitelist
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    try:
        # The session is a context manager: leaving the block closes it and its
        # adapters, so repeated calls do not accumulate open connection pools.
        with requests.Session() as http:
            http.mount("https://", adapter)
            http.mount("http://", adapter)
            response = http.get(url, timeout=timeout)
            response.raise_for_status()
            html_content = response.text
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return [], [], "TIMEOUT"

    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract data providers: first cell of each row in the table that follows
    # the "Data providers" heading.
    data_providers = []
    data_providers_section = soup.find('h3', string='Data providers')
    if data_providers_section:
        table = data_providers_section.find_next('table')
        if table:
            for row in table.find_all('tr')[1:]:  # Skip the header row
                cell = row.find('td')
                # Rows without a <td> carry no provider name; skip them
                # rather than failing on `None.text`.
                if cell is not None:
                    data_providers.append(cell.text)

    # Extract contact organizations: every <span class="smaller"> inside the
    # contacts sub-table that follows the "Contacts" cell.
    contact_orgs = []
    contacts_section = soup.find('td', string='Contacts')
    if contacts_section:
        subtable = contacts_section.find_next('table', class_='subtable table-contacts')
        if subtable:
            contact_orgs = [span.text for span in subtable.find_all('span', class_='smaller')]

    return data_providers, contact_orgs, None

##### Step 2) Use the OBIS API again to get the total number of occurrences for that dataset

In [None]:
# Function to get total occurrences
def get_total_occurrences(dataset_id, timeout=30):
    """Return the occurrence count the OBIS API reports for a dataset.

    Parameters
    ----------
    dataset_id : str
        Dataset identifier; passed as the ``datasetid`` query parameter.
    timeout : float or tuple, optional
        Timeout in seconds passed to the HTTP request so a stalled connection
        cannot block the caller indefinitely.

    Returns
    -------
    int or str
        The value of the ``total`` field in the JSON response (``0`` when the
        field is absent), or the string ``"TIMEOUT"`` when the request fails or
        the body is not valid JSON.
    """
    # size=0 asks for no records, so only the summary (including 'total') is used
    url = f"https://api.obis.org/occurrence?after=-1&size=0&datasetid={dataset_id}"
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode inside the try block: a non-JSON body raises a ValueError
        # subclass, which would otherwise abort the caller's whole loop.
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"Request failed: {e}")
        return "TIMEOUT"

    return data.get('total', 0)



##### Step 3) Run Code. FYI - takes some time (partly to avoid too many requests per second)

In [10]:
# Collect data with rate limiting and write to CSV in real-time
# Destination CSV (relative path): written by the collection loop and read back afterwards.
output_file = 'Dataset_and_Org_classifications/OBIS_DatasetID_2_orgnames.csv'
# Number of dataset IDs to process; used as the denominator of the progress message.
# Requires `ids` to be defined by the earlier cell (run the notebook top to bottom).
total_requests = len(ids)

NameError: name 'ids' is not defined

In [22]:
# Stream one row per dataset to the CSV. The file is opened once and flushed
# after every row, so partial results survive an interrupted run without
# reopening the file thousands of times. UTF-8 is set explicitly because
# organization names contain non-ASCII characters and pandas reads the file
# back as UTF-8 by default.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header to the CSV file
    writer.writerow(["dataset_id", "Data providers", "Contact Organizations", "Total Occurrences"])

    for i, dataset_id in enumerate(ids):
        providers, contact_orgs, timeout_flag = get_data_providers_and_contacts(dataset_id)

        if timeout_flag:
            # The page request failed: mark every field and skip the API call.
            row = [dataset_id, "TIMEOUT", "TIMEOUT", "TIMEOUT"]
        else:
            # The lists are written as their Python repr, e.g. "['Org A', 'Org B']".
            row = [dataset_id, providers, contact_orgs, get_total_occurrences(dataset_id)]

        writer.writerow(row)
        file.flush()  # Make the row visible on disk immediately

        # Print progress message ('\r' rewrites the same output line)
        print(f"Processing dataset {i + 1}/{total_requests} ({(i + 1) / total_requests * 100:.2f}%)", end='\r')

        # Rate limiting
        time.sleep(1)  # Adjust the sleep time as needed

print("\nData collection complete.")


Processing dataset 5323/5323 (100.00%)
Data collection complete.


In [11]:
# Load the collected results back from disk. The provider/contact columns were
# written as the string form of Python lists, so they are read back as plain
# strings (e.g. "['Org A', 'Org B']"), not as list objects.
df_OBISorgInfo = pd.read_csv(output_file)

In [12]:
# Display the loaded table for a visual check
df_OBISorgInfo

Unnamed: 0,dataset_id,Data providers,Contact Organizations,Total Occurrences
0,d6ae6af8-8eac-4fbd-9ce5-b49b1137351c,['University of Queensland'],"['University of Queensland', 'University of Qu...",3665
1,96041282-9386-4a8a-a564-fd753f1222b0,['CSIRO National Collections and Marine Infras...,"['CSIRO Environment', 'CSIRO Environment', 'CS...",19763
2,61dc18bb-e2ec-43a1-949c-120d58235b61,"['The Australian National University', 'CSIRO ...","['Australian National University', 'Australian...",1960
3,0c7b3f51-33c8-494c-ab8a-c5bd28f40f87,['Delaware Museum of Nature and Science – Moll...,"['Invert-E-Base Portal', 'Delaware Museum of N...",34280
4,8ea620e9-37d5-4ff6-9a02-0bc94356c2d5,"['Invert-E-Base Portal', 'Malacology Collectio...","['Invert-E-Base Portal', 'Malacology Collectio...",164632
...,...,...,...,...
5318,75e2a2b6-f58f-4c43-aefd-878cee7865b1,"['Australian Antarctic Division', 'Intergovern...",['Intergovernmental Oceanographic Commission o...,556
5319,cf494b72-7b4d-4ca6-bb90-6a96554dcd57,[],[],0
5320,424b6232-cb9a-489a-8894-30fd7b951d49,[],[],0
5321,ddf1a66a-6913-4653-9f3a-44072705ad64,[],[],0


In [14]:
# Row count of the loaded CSV, for comparison against the number of dataset IDs requested
len(df_OBISorgInfo)

5323

Expected output: 5323 rows, one per dataset ID retrieved from the OBIS API above.