# 0.1.2: Explore and preprocess GBIF citizen science data

## Imports and config

In [1]:
import json
from pathlib import Path
from pprint import pprint

import dask.dataframe as dd
import h3
import numpy as np
import pandas as pd

from src.conf.conf import get_config
from src.conf.environment import log

%load_ext autoreload
%autoreload 2

# Display all columns when printing a pandas DataFrame
pd.set_option("display.max_columns", None)

cfg = get_config()

Define some key paths

In [2]:
gbif_raw_dir = Path(cfg.gbif.raw.dir)
gbif_prep_dir = Path(cfg.interim.gbif.dir)

## Explore the raw data structure

First let's just browse the directory contents.

In [4]:
print("Directory contents of the raw GBIF data:")
for file in gbif_raw_dir.iterdir():
    print(file)

Directory contents of the raw GBIF data:
data/raw/all_tracheophyta_non-cult_2024-04-10/all_tracheophyta_non-cult_2024-04-10.json
data/raw/all_tracheophyta_non-cult_2024-04-10/all_tracheophyta_non-cult_2024-04-10.parquet


Let's parse the JSON file.

In [5]:
# parse the json file
json_fn = "all_tracheophyta_non-cult_2024-04-10.json"
with open(gbif_raw_dir / json_fn) as f:
    data = json.load(f)

pprint(data)

{'created': '2024-04-10T16:03:38.992+00:00',
 'doi': '10.15468/dl.b9ptxy',
 'downloadLink': 'https://api.gbif.org/v1/occurrence/download/request/0136703-240321170329656.zip',
 'eraseAfter': '2024-10-10T16:03:38.940+00:00',
 'key': '0136703-240321170329656',
 'license': 'http://creativecommons.org/licenses/by-nc/4.0/legalcode',
 'modified': '2024-04-10T17:02:55.607+00:00',
 'numberDatasets': 12645,
 'request': {'format': 'SIMPLE_PARQUET',
             'predicate': {'predicates': [{'key': 'TAXON_KEY',
                                           'matchCase': False,
                                           'type': 'equals',
                                           'value': '7707728'},
                                          {'predicate': {'key': 'DEGREE_OF_ESTABLISHMENT',
                                                         'matchCase': False,
                                                         'type': 'in',
                                                         'values': [

We can see from the included JSON file all the metadata surrounding the data request, including the exact query, as well as other information like request date, number of datasets, and number of records.

In [6]:
print(f"Number of datasets: {data['numberDatasets']:,}")
print(f"Total records: {data['totalRecords']:,}")

Number of datasets: 12,645
Total records: 386,455,349


### Load the dataframe with Dask

Before trying to load the parquet file directly into memory, let's print the size of the data in GB (it is provided in bytes).

In [7]:
print(f"Size of the data: {data['size'] / 1e9:.2f} GB")

Size of the data: 24.33 GB


Depending on your system, that may be a bit too big to load all at once. Especially because the data is saved in parquet format, which supports compression, so the real size may be significantly larger than that. Luckily, we can use `dask` to load large dataframes due to its ability to only load into memory exactly what we need for our given computations.

In [10]:
ddf = dd.read_parquet(gbif_raw_dir / "all_tracheophyta_non-cult_2024-04-10.parquet/*")
ddf.head()

Unnamed: 0,gbifid,datasetkey,occurrenceid,kingdom,phylum,class,order,family,genus,species,infraspecificepithet,taxonrank,scientificname,verbatimscientificname,verbatimscientificnameauthorship,countrycode,locality,stateprovince,occurrencestatus,individualcount,publishingorgkey,decimallatitude,decimallongitude,coordinateuncertaintyinmeters,coordinateprecision,elevation,elevationaccuracy,depth,depthaccuracy,eventdate,day,month,year,taxonkey,specieskey,basisofrecord,institutioncode,collectioncode,catalognumber,recordnumber,identifiedby,dateidentified,license,rightsholder,recordedby,typestatus,establishmentmeans,lastinterpreted,mediatype,issue
0,4155839127,8a863029-f435-446a-821e-275f4f641165,https://observation.org/observation/243417806,Plantae,Tracheophyta,Magnoliopsida,Asterales,Asteraceae,Silybum,Silybum marianum,,SPECIES,Silybum marianum (L.) Gaertn.,Silybum marianum,,NL,Cadier en Keer,Limburg,PRESENT,1.0,c8d737e0-2ff8-42e8-b8fc-6b805d26fc5f,50.8,5.75,5000.0,,,,,,2022-05-29,29.0,5.0,2022.0,3145214,3145214.0,HUMAN_OBSERVATION,,Observations,OBS.243417806,,[],NaT,CC_BY_NC_4_0,Stichting Observation International,[User 418761],[],,2024-01-25 12:03:49.367,[],[OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_CO...
1,3823408376,8a863029-f435-446a-821e-275f4f641165,https://observation.org/observation/243524281,Plantae,Tracheophyta,Liliopsida,Asparagales,Iridaceae,Iris,Iris pseudacorus,,SPECIES,Iris pseudacorus L.,Iris pseudacorus,,NL,Rotterdam - Prinsenpark,Zuid-Holland,PRESENT,1.0,c8d737e0-2ff8-42e8-b8fc-6b805d26fc5f,51.9,4.5,5000.0,,,,,,2022-05-30,30.0,5.0,2022.0,5298231,5298231.0,HUMAN_OBSERVATION,,Observations,OBS.243524281,,[],NaT,CC_BY_NC_4_0,Stichting Observation International,[User 746719],[],,2024-01-25 12:03:26.829,[StillImage],[OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_CO...
2,4056148605,8a863029-f435-446a-821e-275f4f641165,https://observation.org/observation/243600288,Plantae,Tracheophyta,Magnoliopsida,Fabales,Fabaceae,Vicia,,,GENUS,Vicia L.,Vicia spec.,,NL,Amstelveen - Keizer Karelpark,Noord-Holland,PRESENT,1.0,c8d737e0-2ff8-42e8-b8fc-6b805d26fc5f,52.25,4.85,5000.0,,,,,,2022-05-31,31.0,5.0,2022.0,2974751,,HUMAN_OBSERVATION,,Observations,OBS.243600288,,[],NaT,CC_BY_NC_4_0,Stichting Observation International,[User 753446],[],,2024-01-25 11:57:47.679,[StillImage],[OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_CO...
3,3904214750,8a863029-f435-446a-821e-275f4f641165,https://observation.org/observation/243760126,Plantae,Tracheophyta,Liliopsida,Asparagales,Orchidaceae,Neottia,Neottia ovata,,SPECIES,Neottia ovata (L.) Bluff & Fingerh.,Neottia ovata,,NL,Bunnik - Oud Amelisweerd,Utrecht,PRESENT,1.0,c8d737e0-2ff8-42e8-b8fc-6b805d26fc5f,52.05,5.15,5000.0,,,,,,2022-06-01,1.0,6.0,2022.0,2816250,2816250.0,HUMAN_OBSERVATION,,Observations,OBS.243760126,,[],NaT,CC_BY_NC_4_0,Stichting Observation International,[User 153271],[],,2024-01-25 12:04:37.647,[StillImage],[OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_CO...
4,3905757702,8a863029-f435-446a-821e-275f4f641165,https://observation.org/observation/243801621,Plantae,Tracheophyta,Magnoliopsida,Dipsacales,Caprifoliaceae,Lonicera,Lonicera periclymenum,,SPECIES,Lonicera periclymenum L.,Lonicera periclymenum,,NL,Haskerhornerpolder,Friesland,PRESENT,1.0,c8d737e0-2ff8-42e8-b8fc-6b805d26fc5f,52.9,5.8,5000.0,,,,,,2022-06-01,1.0,6.0,2022.0,5334277,5334277.0,HUMAN_OBSERVATION,,Observations,OBS.243801621,,[],NaT,CC_BY_NC_4_0,Stichting Observation International,[User 93322],[],,2024-01-25 11:57:59.972,[],[OCCURRENCE_STATUS_INFERRED_FROM_INDIVIDUAL_CO...


## TODO: Analyze data
- global distribution
- Num. with images
- coordinate uncertainty
- dataset information
- issues
- collection date

### Occurrence status

The data gives us occurence status (whether or not the species is present or absent in the observation). This could potentially be pretty valuable, as absence data information can be just as useful as presence information.

In [12]:
ddf.occurrencestatus.value_counts().compute()

occurrencestatus
ABSENT       3389108
PRESENT    383066241
Name: count, dtype: int64[pyarrow]

## Select relevant data

While there is a ton of interesting information in the GBIF data, we're primarily interested in present species, their official names, their locations, and their corresponding PFTs. Let's reload the data with only that information to reduce the memory footprint before we merge with our PFT information in TRY.

In [12]:
columns = [
    "species",
    "taxonrank",
    "decimallatitude",
    "decimallongitude",
]
ddf = dd.read_parquet(
    gbif_raw_dir / "all_tracheophyta_non-cult_2024-04-10.parquet/*",
    columns=columns,
    npartitions=60,
)

ddf = ddf[ddf["taxonrank"] == "SPECIES"].drop(columns=["taxonrank"])

## Load TRY PFTs

In [13]:
pfts = dd.read_csv(Path(cfg.trydb.raw.pfts), encoding="latin-1").drop(
    columns=["AccSpeciesID"]
)

In [11]:
pfts.head()

Unnamed: 0,AccSpeciesName,pft
0,Calamagrostis lapponica,Grass
1,Carex capitata,Grass
2,Carex rostrata,Grass
3,Carex saxatilis,Grass
4,Carex vaginata,Grass


Before matching, make sure we're dealing only with common, 2-word species names.

In [5]:
pfts.AccSpeciesName.str.split().str.len().value_counts().compute()

AccSpeciesName
1       3548
2     109849
3        719
4       6157
5         64
6         54
7         21
8         10
9          3
10         2
14         1
Name: count, dtype: int64

PFTs and GBIF species columns (not shown here due to extensive computation time) both have a number of records with species names longer than two words. We can truncate each record to only the first two words in hopes that this makes matching simpler.

In [5]:
ddf["speciesname"] = ddf["species"].str.extract("([A-Za-z]+ [A-Za-z]+)", expand=False)
pfts["speciesname"] = pfts["AccSpeciesName"].str.extract(
    "([A-Za-z]+ [A-Za-z]+)", expand=False
)

To improve merge efficiency, we'll drop missing values and set speciesname as the index.

In [6]:
ddf = (
    ddf.dropna(subset="speciesname").drop(columns=["species"]).set_index("speciesname")
)
pfts = (
    pfts.dropna(subset="speciesname")
    .drop(columns=["AccSpeciesName"])
    .set_index("speciesname")
)

Now we can merge the PFT information into the GBIF data and then reset the index.

In [7]:
# Merge pfts into ddf on speciesname index
ddf = dd.merge(ddf, pfts, left_index=True, right_index=True).reset_index()

And save the results directly to disk to avoid memory issues.

In [8]:
ddf.to_parquet(gbif_prep_dir / "gbif_pfts.parquet", write_index=False)

(Took about 33 mins)

## Inspect matched GBIF / PFT data

In [8]:
gbif_pfts = dd.read_parquet(gbif_prep_dir / "gbif_pfts.parquet")
gbif_pfts.head()

Unnamed: 0,speciesname,decimallatitude,decimallongitude,pft
0,Aa argyrolepis,-0.666,-78.5,Grass
1,Aa argyrolepis,-2.6,-78.666,Grass
2,Aa argyrolepis,-0.18389,-78.72889,Grass
3,Aa argyrolepis,-0.18389,-78.72889,Grass
4,Aa calceata,-14.82278,-68.19111,Shrub


In [17]:
unique_species_gbif_pfts = gbif_pfts.speciesname.nunique().compute()
unique_species_gbif = ddf.species.str.extract(
    "([A-Za-z]+ [A-Za-z]+)", expand=False
).nunique().compute()
unique_species_pfts = pfts.AccSpeciesName.str.extract(
    "([A-Za-z]+ [A-Za-z]+)", expand=False
).nunique().compute()

print(
    f"Species matched from GBIF: {unique_species_gbif_pfts / unique_species_gbif:.2%}"
)
print(
    f"Species matched from PFTS: {unique_species_gbif_pfts / unique_species_pfts:.2%}"
)

Species matched from GBIF: 28.17%
Species matched from PFTS: 80.00%


In [18]:
print(f"Total number of species: {unique_species_gbif_pfts:,}")

Total number of species: 88,516


## Subsample GBIF data with a regular hexagonal grid

One issue with the GBIF data is that species occurrences are obtained opportunistically and [tend to be clustered toward the global north](https://www.gbif.org/occurrence/map?occurrence_status=present). To reduce this sampling bias such that our GBIF data is more spatially balanced, we can subsampled the data by first binning it into equally spaced hexagons and then selecting a maximum number of observations from each hexagon.

In [9]:
def lat_lon_to_hex(lat: float, lon: float, resolution: int | float = 0.5):
    return h3.geo_to_h3(lat, lon, resolution)


def apply_hex_to_partition(df):
    return df.assign(
        hex=lat_lon_to_hex_vectorized(
            df.decimallatitude, df.decimallongitude, resolution=4
        )
    )

def sample_partition(df, n_samples=10000):
    return df.groupby(level=0).apply(
        lambda group: group if len(group) <= n_samples else group.sample(n_samples)
    )

# subsample the data first for testing
gbif_pfts = gbif_pfts.sample(frac=0.1)

# Vectorize the function
lat_lon_to_hex_vectorized = np.vectorize(lat_lon_to_hex)

meta = gbif_pfts._meta.assign(hex=pd.Series(dtype="object"))

# Apply the function to each partition
gbif_pfts = gbif_pfts.map_partitions(apply_hex_to_partition, meta=meta).persist()


gbif_pfts = gbif_pfts.set_index("hex", npartitions=180)

meta = gbif_pfts._meta_nonempty.sample(0)

subsampled_gbif_pfts = gbif_pfts.map_partitions(
    sample_partition, n_samples=1000, meta=meta
).compute()

As we can see, with a grid size of 4 degrees and a sample size of 1,000 occurrences per grid cell, we've reduced the number of occurrences to ~19%.  If everything went well, we should have the same number of hex cells in the original gbif_pfts dataframe as in the subsampled dataframe. We should also see a maximum of 1,000 occurrences in the hex bins.

In [15]:
subsampled_gbif_pfts.reset_index(drop=True).shape[0] / gbif_pfts.shape[0].compute()

0.18786533327592

In [19]:
gbif_pfts.reset_index().hex.nunique().compute() 

59641

In [30]:
subsampled_gbif_pfts.reset_index(level=0).reset_index(drop=True).hex.nunique()

59641

In [39]:
print(
    "Average number of occurrences per hex bin: "
    f"{subsampled_gbif_pfts.reset_index(level=0).groupby(level=0).size().mean():.2f}"
)

print(
    "Maximum number of occurrences per hex bin: "
    f"{subsampled_gbif_pfts.reset_index(level=0).groupby(level=0).size().max():,}"
)

Average number of occurrences per hex bin: 139.94
Maximum number of occurrences per hex bin: 1,000
