In [1]:
import pandas as pd
import numpy as np
from dwca.read import DwCAReader

In [2]:
# Location of the zipped archive, relative to this notebook
dwca_path = "../data/dwca-inboveg-niche-vlaanderen-events-v1.7.zip"

In [3]:
# Read the tab-separated event.txt file exposed by the archive reader.
# dtype=object disables type inference, so every column keeps its values as read.
with DwCAReader(dwca_path) as dwca:
    events = pd.read_csv(dwca.absolute_temporary_path('event.txt'), sep='\t', dtype=object)

In [4]:
def print_unique_values(df, max_values=20):
    """Print the distinct values of every column of a DataFrame.

    Each column is cast to ``str`` before de-duplication. Columns with at
    most ``max_values`` distinct values are printed as a comma-separated
    list; for all other columns a short notice is printed instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to summarise.
    max_values : int, optional
        Largest number of distinct values that is still listed in full
        (default 20, the previously hard-coded limit).

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # items() hands over each column as a Series, so duplicated column labels
    # do not turn the selection into a DataFrame (which has no .unique()).
    for column, series in df.items():
        unique_values = series.astype(str).unique().tolist()
        if len(unique_values) <= max_values:
            unique_values_for_print = ', '.join(unique_values)
        else:
            # Wording (including its spelling) is kept unchanged so previously
            # recorded output remains comparable.
            unique_values_for_print = (
                'more than {} values, check seperately'.format(max_values)
            )
        # str() keeps non-string labels (e.g. integer positions) from raising TypeError.
        print(str(column) + ':\n' + unique_values_for_print + '\n')

## Events

In [5]:
# How many event records were loaded (row count of the table)
events.shape[0]

569

In [6]:
# Column names of the events table, one per line
column_listing = '\n'.join(events.columns)
print(column_listing)

id
type
language
license
rightsHolder
accessRights
datasetID
datasetName
ownerInstitutionCode
eventID
samplingProtocol
sampleSizeValue
sampleSizeUnit
eventDate
verbatimEventDate
locationID
continent
countryCode
stateProvince
county
municipality
verbatimLocality
locationAccordingTo
verbatimLatitude
verbatimLongitude
verbatimCoordinateSystem
verbatimSRS
decimalLatitude
decimalLongitude
geodeticDatum
coordinateUncertaintyInMeters


In [7]:
# Overview of the distinct values found in each column of the events table
print_unique_values(df=events)

id:
more than 20 values, check seperately

type:
event

language:
en

license:
http://creativecommons.org/publicdomain/zero/1.0/

rightsHolder:
INBO

accessRights:
http://www.inbo.be/en/norms-for-data-use

datasetID:
http://dataset.inbo.be/Niche-Vlaanderen

datasetName:
InboVeg-niche-Vlaanderen

ownerInstitutionCode:
INBO

eventID:
more than 20 values, check seperately

samplingProtocol:
vegetationPlot- LONDO

sampleSizeValue:
9, 100, 4, 25

sampleSizeUnit:
m²

eventDate:
more than 20 values, check seperately

verbatimEventDate:
more than 20 values, check seperately

locationID:
more than 20 values, check seperately

continent:
Europe

countryCode:
BE

stateProvince:
Vlaams Gewest, nan, XY Onbekend

county:
Antwerpen, West-Vlaanderen, Limburg, Vlaams-Brabant, Oost-Vlaanderen, nan, XY Onbekend

municipality:
more than 20 values, check seperately

verbatimLocality:
more than 20 values, check seperately

locationAccordingTo:
MILKLIM-Gebieden

verbatimLatitude:
more than 20 values, check seperately

## Experiment

In [8]:
# Count how often each stateProvince value occurs. The column is cast to str
# first, so the counts are keyed by text labels.
state_province_text = events['stateProvince'].astype(str)
values = state_province_text.value_counts(sort=True, dropna=False)

In [9]:
# Show the labels of the value counts, i.e. the distinct stateProvince values that were counted
values.index

Index(['Vlaams Gewest', 'nan', 'XY Onbekend'], dtype='object')

In [11]:
# Smallest decimalLongitude in the table; the column is cast to float before taking the minimum
longitudes = events['decimalLongitude'].astype(float)
longitudes.min()

2.8536800000000002