In [1]:
import pandas as pd
import json
import requests
import random
import textblob
from textblob.classifiers import NaiveBayesClassifier

# Can we predict if a dataset is part of a citizen science project?

With the help of the [GBIF API](https://www.gbif.org/developer/summary) and textblob.

See the documentation I used to write this script:
* https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
* https://stevenloria.com/simple-text-classification/

## What do I label as citizen science in the training set?

### What I label as "citizen science":
* Metadata explicitly includes the words "citizen" or "citizen science" (or for the French version "science" or "enquête" participative)
* The metadata mentions that the dataset is partly or entirely made by volunteers
* Bioblitz datasets

### What I don't label as "citizen science":
* What seems like compulsory student work
* Personal collections or notebooks (unless the description includes one of the clues listed above)

## I - get the dataset's features to analyse from GBIF

In [2]:
def extract_set_of_words(dataset):
    '''
    Concatenate the free-text metadata of a dataset record into one string.

    The title, the description, every keyword and every entry of the
    sampling description (including the individual method steps) are
    appended in that order, each followed by a single space.

    Parameters
    ----------
    dataset : dict
        One dataset record. Only "title" is required; "description",
        "keywordCollections" and "samplingDescription" are used when present.

    Returns
    -------
    str
        The space-separated text, ending with a trailing space.

    Raises
    ------
    KeyError
        If the record has no "title".
    '''
    parts = [dataset["title"]]
    if "description" in dataset:
        parts.append(dataset["description"])
    # Get keywords
    for kwcollection in dataset.get("keywordCollections", []):
        parts.extend(str(kw) for kw in kwcollection.get("keywords", []))
    # Get Methods
    for key, value in dataset.get("samplingDescription", {}).items():
        if key != "methodSteps":
            # str() keeps this branch consistent with keywords and method
            # steps: a non-string value must not abort the extraction.
            parts.append(str(value))
        else:
            parts.extend(str(methodStep) for methodStep in value)
    # Every part is followed by one space, including the last one
    return " ".join(parts) + " "

In [3]:
# Collect the text and language of every dataset listed by the GBIF API.
# Rows are gathered in a plain dict and turned into a DataFrame once at the
# end: growing a DataFrame one cell at a time gets very slow as it fills up.
records = {}

# We are excluding datasets from organizations that generated the metadata automatically
# PLAZI and PANGEA
exclude = ['7ce8aef0-9e92-11dc-8738-b8a03c50a862',
           'd5778510-eb28-11da-8629-b8a03c50a862']
offset = 0
step = 900
end_of_records = False
while not end_of_records:
    param = {
        "offset": offset,
        "limit": step
    }
    # Query API (fail loudly on HTTP errors instead of parsing an error page,
    # and never hang forever on a stalled connection)
    response = requests.get("https://api.gbif.org/v1/dataset", params=param, timeout=60)
    response.raise_for_status()
    response = response.json()
    offset += step
    end_of_records = response["endOfRecords"]
    for dataset in response["results"]:
        # exclude dataset from PLAZI and PANGEA, and skip checklists
        if dataset["publishingOrganizationKey"] not in exclude and dataset["type"] != "CHECKLIST":
            # Keyed by dataset UUID, so a dataset returned twice is stored once
            records[dataset["key"]] = {
                # Get title, description, keywords, methods of the dataset
                "Description": extract_set_of_words(dataset),
                "Language": dataset["language"],
            }

res = pd.DataFrame(
    list(records.values()),
    index=list(records.keys()),
    columns=["Description", "Language"],
)
# Name the index so that a TSV export carries the "UUID" header which the
# reload step (`set_index("UUID")`) relies on.
res.index.name = "UUID"

## Save data to avoid rerunning everything every time I debug my model

In [4]:
# Uncomment to refresh the cached copy of the descriptions after a new crawl.
# res.to_csv("../raw_descriptions.tsv", index=True, sep="\t")

# Reload the cached descriptions, indexed by their "UUID" column.
res = pd.read_table("../raw_descriptions.tsv").set_index("UUID")

## Load the training + testing sets: partially manually annotated data

In [5]:
# Manually annotated datasets, indexed by their "UUID" column.
training_set = pd.read_table("../some_manually_annotated_datasets.tsv").set_index("UUID")

## II - Set the parameters for data cleaning and analysis

In [6]:
# Column holding the text in which we look for words
feature = "Description"

# Number of most frequent words removed for each language
frequency_threshold = 15

# Minimum number of occurrences a word needs in order to be kept
rare_threshold_word_number = 2

# Number of annotated datasets used for training and parameter selection
# (the remaining annotated datasets form the test set)
parameter_selection_training_set = 800

# Number of folds used when looking for informative words
crossValidation = 4

# Number of most informative words kept from each trained model
triming_threshold_for_model = 15

## III - Clean up

In [7]:
# Turn underscores and hyphens into spaces so they act as word separators
for separator in ("_", "-"):
    res[feature] = res[feature].str.replace(separator, " ")

# Lower-case every word; re-joining the split words also collapses any run
# of whitespace into a single space
res[feature] = res[feature].apply(
    lambda text: " ".join(token.lower() for token in text.split())
)

Unfortunately, the translation stopped working after an hour of use. Apparently it is a common issue with libraries that rely on the Google API.

In [8]:
# # Translate sentence if needed
# for dataset in res.index.tolist():
#     if res.loc[dataset, "Language"] != "eng":
#         print(dataset)
#         res.at[dataset, feature] = str(textblob.TextBlob(res.loc[dataset, feature]).translate(to='en'))

In [9]:
# Remove punctuation, then digits.
# Both patterns are regular expressions, so regex=True has to be explicit:
# since pandas 2.0 `Series.str.replace` treats the pattern as a literal string
# by default, which would silently leave the text unchanged.
res[feature] = res[feature].str.replace(r'[^\w\s]', '', regex=True)
res[feature] = res[feature].str.replace(r'\d', '', regex=True)

In [10]:
# Find the most frequent words by language and remove them from the datasets
# written in that language.
# The removal is restricted to the rows of the language being processed:
# stripping the words from every row would let a language with only a handful
# of datasets delete its top words everywhere, and would make the outcome
# depend on the order in which the languages happen to be visited.
for language in res["Language"].dropna().unique():
    in_language = res["Language"] == language
    words = " ".join(res.loc[in_language, feature]).split()
    word_counts = pd.Series(words, dtype=object).value_counts()
    # A set gives constant-time membership tests in the filter below
    frequent_words = set(word_counts.index[:frequency_threshold])
    # Remove the most frequent words
    res.loc[in_language, feature] = res.loc[in_language, feature].apply(
        lambda text: " ".join(word for word in text.split() if word not in frequent_words)
    )

In [11]:
# Replace some key words, following the word -> replacement table stored on disk
words_to_replace = pd.read_table("../wd_replace.txt").set_index("word")
for word in words_to_replace.index:
    replacement = words_to_replace.loc[word, "replacement"]
    res[feature] = res[feature].str.replace(word, replacement)

In [12]:
# # Find the rare words # OPTIONAL because it takes too much time
# freq = pd.Series(' '.join(res[feature]).split()).value_counts()
# freq = freq[freq < rare_threshold_word_number]
# freq = list(freq.index)
# # Remove rare words
# res[feature] = res[feature].apply(lambda x: " ".join([c for c in x.split() if c not in freq]))

In [None]:
# Correct spelling - not done here because it takes too much time
# res[feature] = res[feature].apply(lambda x: str(textblob.TextBlob(x).correct()))

In [None]:
def _lemmatize_text(text):
    """Return `text` with each whitespace-separated word lemmatized by textblob."""
    return " ".join(textblob.Word(token).lemmatize() for token in text.split())


# Lemmatization: map word variants onto a common form to make them more "universal"
res[feature] = res[feature].apply(_lemmatize_text)

# Remove as many stand-alone "na" tokens as possible
res[feature] = res[feature].str.replace(" na ", " ")

In [None]:
# Training set: keep the datasets that are both described and annotated
data_training_set = pd.concat([res, training_set], join="inner", axis=1)

# Shuffle the annotated datasets, then split them into a training and a test set.
# A dedicated, seeded generator makes the split identical on every run, so the
# reported performance can be reproduced.
split_seed = 42
index_list = data_training_set.index.tolist()
random.Random(split_seed).shuffle(index_list)
if len(index_list) <= parameter_selection_training_set:
    # Fail here with a clear message rather than later with a division by zero
    raise ValueError(
        "Only %d annotated datasets are available: none would be left for testing "
        "after keeping %d for training." % (len(index_list), parameter_selection_training_set)
    )
index_training_set = index_list[:parameter_selection_training_set]
index_test_set = index_list[parameter_selection_training_set:]

# Reformat dataset: list of (text, label) tuples used to train the classifier
data_training_set = data_training_set.loc[index_training_set]
data_training_set = list(data_training_set[[feature, 'CS']].itertuples(index=False))

## IV - Train and test Classifier - Naive Bayesian

In [None]:
# Train one classifier per fold and collect the most informative words of each
word_for_model = []

# Each fold is a distinct, contiguous slice of the (already shuffled) training
# data. The slice has to start at fold * fold_size: starting at `fold` would
# produce overlapping, ever-growing subsets instead of separate folds.
fold_size = len(data_training_set) // crossValidation
for fold in range(crossValidation):
    fold_data = data_training_set[fold * fold_size:(fold + 1) * fold_size]
    cl = NaiveBayesClassifier(fold_data)
    informative_feature = cl.informative_features(triming_threshold_for_model)
    for word in informative_feature:
        # The feature name is wrapped as "contains(<word>)"; keep only the word
        word_for_model.append(word[0].replace("contains(", "").replace(")", ""))

In [None]:
# Label counts of the annotated datasets, to put the performance in context
training_set.loc[:, "CS"].value_counts()

In [None]:
# Number of datasets per language
res.loc[:, "Language"].value_counts()

## V - What word is associated with citizen science

In [None]:
# Find the most informative words: count how often each one was collected
collected_words = pd.Series(word_for_model)
keywords = collected_words.value_counts()
keywords

## VI - Train and test model on reduced set of words

In [None]:
# Words that were collected more than once across the trained models
recurring_words = keywords[keywords > 1].index
# A few hand-picked words are always part of the reduced vocabulary
reduced_set_of_words = set(recurring_words) | {"herbarium", "museum", "inaturalist"}

In [None]:
def _keep_reduced_words(text):
    """Return `text` restricted to the words found in `reduced_set_of_words`."""
    return " ".join(token for token in text.split() if token in reduced_set_of_words)


# Trim every description down to the reduced vocabulary
res[feature] = res[feature].apply(_keep_reduced_words)
res[feature] = res[feature].str.replace(" na ", " ")

In [None]:
# Reduced training set: annotated datasets joined with their trimmed descriptions
reduced_set = pd.concat([res, training_set], join="inner", axis=1)


def _labelled_pairs(index):
    """Return the (text, label) tuples of `reduced_set` for the given index labels."""
    selected = reduced_set.loc[index]
    return list(selected[[feature, 'CS']].itertuples(index=False))


# Reformat dataset, reusing the training/test split made earlier
reduced_training_set = _labelled_pairs(index_training_set)
test_set = _labelled_pairs(index_test_set)

# train dataset
# cl = NaiveBayesClassifier(reduced_training_set)
cl = textblob.classifiers.DecisionTreeClassifier(reduced_training_set)

In [None]:
# Show the decision rules learned by the classifier
tree_rules = cl.pseudocode()
print(tree_rules)

In [None]:
# Confusion matrix on the test set; "T" in the CS column is the positive label.
truePositive = 0
trueNegative = 0
falsePositive = 0
falseNegative = 0
for test in test_set:
    # Classify once per dataset and reuse the answer in both branches
    predicted_positive = cl.classify(test[0]) == "T"
    if test[1] == "T":
        if predicted_positive:
            truePositive += 1
        else:
            # Labelled "T" but not predicted "T": a missed positive
            falseNegative += 1
    else:
        if predicted_positive:
            # Not labelled "T" but predicted "T": a false alarm
            falsePositive += 1
        else:
            trueNegative += 1

In [None]:
# Report each confusion-matrix count as a percentage of the test set
print("Performance on testing set\n")
n_tested = len(test_set)
for caption, count in (
    ("True positive:\t", truePositive),
    ("True negative:\t", trueNegative),
    ("False positive:\t", falsePositive),
    ("False negative:\t", falseNegative),
):
    print(caption, count*100/n_tested)

In [None]:
# Accuracy of the classifier on the test set
print("Accuracy")
test_accuracy = cl.accuracy(test_set)
test_accuracy

In [None]:
# Label counts of the test set, to put the performance in context
reduced_set.loc[index_test_set, "CS"].value_counts()

## Predict the rest of the datasets

In [None]:
# Classify every dataset that was not manually annotated.
# The result is built in one go, indexed by dataset UUID with a single "CS"
# column. (`pd.DataFrame(["UUID", "CS"])` does not create those columns: it
# creates two junk rows holding the strings "UUID" and "CS" in a column named 0.)
not_annotated = ~res.index.isin(training_set.index)
to_check = pd.DataFrame(
    {"CS": [cl.classify(text) for text in res.loc[not_annotated, feature]]},
    index=res.index[not_annotated],
)
to_check.index.name = "UUID"

In [None]:
# Attach the raw titles and descriptions to the predictions
title_and_description = pd.read_table(
    "../raw_occurrence_dataset_descriptions_and_titles.tsv"
).set_index("UUID")
to_check_w_title = pd.concat([title_and_description, to_check], join="inner", axis=1)

# Save the predictions, sorted by predicted label in descending order
sorted_predictions = to_check_w_title.sort_values(["CS"], ascending=False)
sorted_predictions.to_csv("../test_model_subsample.tsv", sep="\t")