In [1]:
import pandas as pd
import json
import requests
import random
import textblob
from textblob.classifiers import NaiveBayesClassifier

# Are there any keywords associated with a dataset type?

With the help of the [GBIF API](https://www.gbif.org/developer/summary) and textblob.

See the documentation I used to write this little script:
* https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
* https://stevenloria.com/simple-text-classification/

## I - get the dataset's features to analyse from GBIF

In [2]:
res = pd.DataFrame(columns=["Title", "Description", "type"])

# We are excluding datasets from organizations that generated the metadata automatically
# PLAZI, GEO-TER and PANGEA
exclude = ['7ce8aef0-9e92-11dc-8738-b8a03c50a862',
           'd5778510-eb28-11da-8629-b8a03c50a862',
           'ef69a030-3940-11dd-b168-b8a03c50a862']
offset = 0
step = 900
end_of_records = False
while not end_of_records:
    param = {
        "offset": offset,
        "limit": step
    }
    # Query API
    response = requests.get("http://api.gbif.org/v1/dataset", param)
    response = response.json()
    offset += step
    end_of_records = response["endOfRecords"]
    for dataset in response["results"]:
        # exclude dataset from PLAZI, GEO-TER and PANGEA
        if dataset["publishingOrganizationKey"] not in exclude:
            # Get title and description of the dataset
            res.at[dataset["key"], "Title"] = dataset["title"]
            if "description" in dataset:
                res.at[dataset["key"], "Description"] = dataset["description"]
            else:
                res.at[dataset["key"], "Description"] = ""
            res.at[dataset["key"], "type"] = dataset["type"]

## II - Set the parameters for the cleaning and analysis

In [3]:
feature = "Description" # we want to find words in the description
ferquency_threshold = 50 # how many most frequent word do we remove
rare_threshold_word_number = 15 # how many time a word should appear to be kept
size_training_set = 5000 # training set size

## III - Clean up

In [4]:
# Set everything to lower case
res[feature] = res[feature].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [5]:
# Remove punctuation
res[feature] = res[feature].str.replace('[^\w\s]', '')

# Remove taboo words (which could give bout the dataset type)
taboo_words = ["checklist", "occurrence", "event"]
for taboo in taboo_words:
    res[feature] = res[feature].str.replace(taboo, '')

In [6]:
# Find the most frequent words
freq = pd.Series(' '.join(res[feature]).split()).value_counts()[:ferquency_threshold]
# print(freq)
freq = list(freq.index)
# Remove the most frequent words
for taboo in freq:
    res[feature] = res[feature].replace(taboo, '')

In [7]:
# Find the rare words
freq = pd.Series(' '.join(res[feature]).split()).value_counts()
freq = freq[freq < rare_threshold_word_number]
freq = list(freq.index)
# Remove rare words
for taboo in freq:
    res[feature] = res[feature].replace(taboo, '')

In [8]:
# Correct spelling - not done here because it takes too much time
# res['Title'] = res['Title'].apply(lambda x: str(textblob.TextBlob(x).correct()))

In [9]:
# Lemmatization (remove some letters in words to make it more "universal")
res[feature] = res[feature].apply(lambda x: " ".join([textblob.Word(word).lemmatize() for word in x.split()]))

In [10]:
# Reformat dataset
dataset = list(res[[feature, 'type']].itertuples(index=False))
# Shuffle data
random.shuffle(dataset)

## IV - Train and test Classifier - Naive Bayesian

In [11]:
# Train classifier
cl = NaiveBayesClassifier(dataset[0:size_training_set])

In [12]:
# Test classifier
cl.accuracy(dataset[size_training_set:])

0.8605159597726278

In [13]:
# Put back the performance in the context 
# In our case the performance is barely any better than predicting the most abundant class:
# See numbers below, 83% of the datasets are occurrence datasets
res["type"].value_counts()

OCCURRENCE        7985
CHECKLIST         1159
METADATA           220
SAMPLING_EVENT     210
Name: type, dtype: int64

Over all **this model is not good**. I would conclude that there isnt any good way to infer the dataset type from the metadata.

In addition to this, no particular relevant word is clearly associated to a data type (see below).

## V - What word is associated with a dataset type

In [14]:
# Find the most informative words
cl.show_informative_features(10)

Most Informative Features
contains(httpswwwndffnlenglish) = True           METADA : OCCURR =   1477.9 : 1.0
      contains(positive) = True           METADA : OCCURR =    901.7 : 1.0
    contains(evaluating) = True           SAMPLI : OCCURR =    418.4 : 1.0
           contains(fim) = True           SAMPLI : OCCURR =    395.8 : 1.0
   contains(utilization) = True           SAMPLI : OCCURR =    264.6 : 1.0
       contains(without) = True           METADA : OCCURR =    170.9 : 1.0
      contains(overseas) = True           CHECKL : OCCURR =    170.4 : 1.0
           contains(cbd) = True           CHECKL : OCCURR =    161.0 : 1.0
       contains(flagged) = True           CHECKL : OCCURR =    147.0 : 1.0
     contains(kilometre) = True           METADA : OCCURR =    136.6 : 1.0


In [16]:
# Example to check out datasets containing one of the informative words: positive
res[res.Description.str.contains("positive")] # In most cases here, they are artifacts

Unnamed: 0,Title,Description,type
bbcdfbfc-d1d5-4ba1-a56f-bc4c96150a2f,Leopard and Weddell seal program 1999/2002,p20002001 season 31 quad based survey were con...,OCCURRENCE
f83746ee-4cf2-4e60-a720-dd508b559794,NDFF Waternet,pwaternet is the only water company in the net...,METADATA
09683b92-d4e8-4773-9959-e07ff47167f5,NDFF Unie van Bosgroepen,pde unie van bosgroepen is de overkoepelende o...,METADATA
99f3136c-d582-45a7-8b93-67c5ec52daab,NDFF City of Breda,pbreda is a city and municipality in the south...,METADATA
405f09d2-17a5-4630-b08a-052ae49510a4,NDFF City of Deventer,pder is a historic and pleasant city and munic...,METADATA
19e38826-ec3d-481f-bd19-57f5e7e8c17c,NDFF Delfland Water Authority,pdelfland is one of our country twentytwo wate...,METADATA
ffae2921-22c7-41e4-a0d2-41a33f0aeb2a,NDFF Rijnland Water Authority,prijnland is not the only district water board...,METADATA
cd2dd66d-d1c4-4512-a6d1-4048601572c8,NDFF Water Authority Schieland Krimpenerwaard,schieland en de krimpenerwaard is a regional w...,METADATA
d4974c0d-0e3b-49e4-ab7b-46c697a634f2,NDFF Water Authority Hollands Noorderkwartier,water authority holland noorderkwartier is a d...,METADATA
a4c3bfc2-9443-4cf5-ba56-fae8eb1aca6e,NDFF Monitoring program vascular plants in EU ...,in this protocol the following 5 vascular plan...,METADATA
