In [11]:
import json
import os
import re
import sys
import time

import pandas as pd
import requests
# import psycopg2

In [12]:
# GBIF API endpoints and URL prefixes. All of them use HTTPS.
grscicoll_api = 'https://api.gbif.org/v1/grscicoll/'
occurrence_api = 'https://api.gbif.org/v1/occurrence/search'
organization_api = 'https://api.gbif.org/v1/organization'
dataset_prefix = "https://www.gbif.org/dataset/"
# GitHub
# Credentials are read from the environment so that a personal access token
# never has to be typed into (and saved with) the notebook. When the
# variables are unset the values fall back to empty strings.
gh_username = os.environ.get("GITHUB_USERNAME", "")
github_header = {'Accept': "application/vnd.github.v3+json"}
gh_token = os.environ.get("GITHUB_TOKEN", "")
owner = "gbif"
repo = "collection-mobilization"
# Issues endpoint of the target repository and link to its README.
gh_api = "https://api.github.com/repos/"+owner+"/"+repo+"/issues"
link_readme = "https://github.com/"+owner+"/"+repo+"#readme"

# Scope
region = "ASIA"

In [13]:
# test_json = {
#     "title": "title"
# }

# post = requests.post(gh_api,
#                      data=json.dumps(test_json),
#                      auth=(gh_username, gh_token),
#                      headers=github_header)
# print(gh_api)
# post

In [14]:
def _publisher_search_section(inst_name, country, gbif_prefix):
    """Build the Markdown section listing GBIF publishers found by a name search."""
    section = "**Fuzzy name search in GBIF publisher**:\n"
    response = requests.get(organization_api,
                            params={"q": inst_name, "limit": 30, "country": country})
    if not response.ok:
        # Say so explicitly rather than leaving the heading with nothing under it.
        return section + "`Search failed`\n"
    matches = response.json().get("results") or []
    if not matches:
        return section + "`None`\n"
    return section + "".join(
        "* [" + org["title"] + "](" + gbif_prefix + "publisher/" + org["key"] + ")\n"
        for org in matches
    )


def _linked_occurrences_section(inst_key, gbif_prefix):
    """Build the Markdown section about occurrences linked to an institution key."""
    response = requests.get(occurrence_api,
                            params={"institution_key": inst_key,
                                    "limit": 0,  # only the count and the facet are needed
                                    "facet": "publishingOrg"})
    if not response.ok:
        return ""
    result = response.json()
    linked_occurrences = result["count"]
    section = ("**Number of linked occurrences**: `" + str(linked_occurrences) + "` ("
               + gbif_prefix + "occurrence/search?advanced=1&institution_key=" + inst_key + ")\n")
    if linked_occurrences != 0:
        section += "**Publishing organizations for linked occurrences**:\n"
        # Tolerate a response without facet data instead of failing on facets[0].
        facets = result.get("facets") or []
        publisher_counts = facets[0].get("counts", []) if facets else []
        for publisher in publisher_counts:
            section += "* " + gbif_prefix + "publisher/" + publisher["name"] + "\n"
    return section


def create_body_issue(inst, country, steps_to_check,
                      gbif_prefix='https://www.gbif.org/',
                      grscicoll_prefix='https://www.gbif.org/grscicoll/'):
    """
    Use institution information and find related GBIF records then format all of it in a GH issue

    * inst: institution dict; "name" and "key" are required, "masterSource",
      "code" and "homepage" are used when present.
    * country: value sent as the `country` filter of the publisher name search.
    * steps_to_check: text appended verbatim at the end of the body.
    * gbif_prefix / grscicoll_prefix: base URLs used to build the links.

    Returns the Markdown body as a string. Two HTTP GET requests are made:
    one to `organization_api` and one to `occurrence_api`.
    """
    markdown_body = "## "+inst["name"]+"\n"
    markdown_body += "**GRSciColl URL**: "+grscicoll_prefix+"institution/"+inst["key"]+"\n"
    # An entry without "masterSource" is treated like a plain GRSciColl entry.
    master_source = inst.get("masterSource", "GRSCICOLL")
    if master_source != "GRSCICOLL":
        markdown_body += "**Synchronized with** `"+master_source+"`\n"
    if "code" in inst:
        markdown_body += "**Code**: `"+inst["code"]+"`\n"
    if "homepage" in inst:
        markdown_body += "**Homepage**: "+ inst["homepage"]+"\n"

    markdown_body += _publisher_search_section(inst["name"], country, gbif_prefix)
    markdown_body += "\n\n"
    markdown_body += _linked_occurrences_section(inst["key"], gbif_prefix)

    markdown_body += steps_to_check
    return markdown_body

In [15]:
def post_issue(data, gh_api, gh_username, gh_token, github_header):
    """
    POST `data`, serialized as JSON, to the `gh_api` URL.

    * data: JSON-serializable payload describing the issue.
    * gh_username, gh_token: sent as the basic-auth credentials.
    * github_header: HTTP headers sent with the request.

    Nothing is returned; when the response is not OK the payload and the
    response object are printed.
    """
    payload = json.dumps(data)
    credentials = (gh_username, gh_token)
    response = requests.post(gh_api,
                             data=payload,
                             auth=credentials,
                             headers=github_header)
    if response.ok:
        return
    print("couldn't create issue for ", data)
    print(response)

In [16]:
def create_issue_for_institution_based_on_json(inst, country, gh_api, gh_username, gh_token, github_header, steps_to_check):
    """
    Create a GitHub issue for an institution entry:
    * inst: is an institution JSON as returned by the GRSciColl API.
    * country: country of the institution; also used as the issue label.
    * gh_api, gh_username, gh_token, github_header: passed through to `post_issue`.
    * steps_to_check: checklist text appended to the issue body.
    """
    issue = {
        "title": inst["name"],
        # Use the caller-supplied checklist rather than the notebook-level
        # `steps_to_follow` global, which may not exist or may differ.
        "body": create_body_issue(inst, country, steps_to_check),
        "labels": [country],
    }
    post_issue(issue, gh_api, gh_username, gh_token, github_header)

In [17]:
def create_issue_for_institution_per_country(country, already_issued, grscicoll_api, gh_api, gh_username, gh_token, github_header, steps_to_check, step=500):
    """
    Page through the GRSciColl institutions of `country` and create one
    GitHub issue per institution whose key is not in `already_issued`.

    * step: page size, also used as the offset increment between requests.

    Paging stops when a page reports `endOfRecords` or when a request fails,
    in which case the response is printed.
    """
    offset = 0
    while True:
        page_response = requests.get(grscicoll_api + 'institution/',
                                     {"country": country, "limit": step, "offset": offset})
        if not page_response.ok:
            print(page_response)
            break

        page = page_response.json()
        last_page = page["endOfRecords"]
        offset += step

        for inst in page["results"]:
            if inst["key"] in already_issued:
                continue
            create_issue_for_institution_based_on_json(inst, country, gh_api, gh_username, gh_token, github_header, steps_to_check)
            # Pause between issue creations.
            time.sleep(3)

        if last_page:
            break

In [28]:
def create_issue_for_institution(key, country, gh_api, gh_username, gh_token, github_header, steps_to_check):
    """
    Create a GitHub issue for an institution entry:
    * key: is an institution key in GRSciColl
    * country: country of the institution
    * gh_api, gh_username, gh_token, github_header, steps_to_check: passed
      through to `create_issue_for_institution_based_on_json`.

    The institution is fetched from the notebook-level `grscicoll_api` URL.
    When that request fails no issue is created and the response is printed.
    """
    institution = requests.get(grscicoll_api + 'institution/'+ key)
    if not institution.ok:
        # Report the failure (as the per-country loop does) instead of
        # silently doing nothing.
        print("couldn't fetch institution ", key)
        print(institution)
        return
    create_issue_for_institution_based_on_json(institution.json(), country, gh_api, gh_username, gh_token, github_header, steps_to_check)

In [19]:
# Checklist appended to the body of every created issue; it ends with a link
# to the repository README (`link_readme`).
steps_to_follow = """\n\n---
1. **Find out if the information is complete and up to date**:

- [ ]  Check the homepage if available or google the institution name in the country. Check if all collections for that institution are represented in GRSciColl.
    - Check whether it is an independent entry. If duplicated, merge with the selected entry and close this issue.
- [ ]  Add missing collections/information to the institution on GRSciColl directly in the registry. If some collections are digitized, put the information in the GRSciColl `Notes` field.

2. **Check if the data is also in GBIF**:

- [ ]  If there are GBIF occurrence records linked, check from which dataset/publisher they come. Is the institution a registered publisher? Or do the records come from a third party publisher? Are all the collections in GRSciColl also in GBIF? (add comments to the issue)
- [ ]  If no record is linked to GRSciColl, look for the institution name on the GBIF list of publishers. Is there any corresponding publisher? Have they published any data? (add comments to the issue)
- [ ]  If data has been published on GBIF but isn’t linked to GRSciColl, notify Marie (tag ManonGros), she can link the data.
- [ ]  Translate outcome of your checks into labels. See guidelines here: """ + link_readme

In [20]:
## If you want to query all the countries for a region
# countries = []
# all_countries = requests.get("https://api.gbif.org/v1/enumeration/country")
# if all_countries.ok:
#     all_countries = all_countries.json()
#     for country in all_countries:
#         if country["gbifRegion"] == region:
#             countries.append(country["iso2"])

In [21]:
# Countries to process, one entry per country code.
countries = ["VE"]

In [23]:
## If you want to have a list of all the issues already created (this defines `already_issued`, which the country loop below uses)
# page = 1
# end = False
# already_issued = []

# while not end:
#     issues = requests.get(gh_api, {"page":page}, auth=(gh_username, gh_token)).json()
#     page += 1
#     if len(issues) < 30:
#         end = True
        
#     for issue in issues:
#         search_res = re.search('https://www.gbif.org/grscicoll/institution/.+\n', issue["body"])
#         if search_res is not None:
#             already_issued.append(search_res.group(0).replace("\n","").replace("\r", "").replace("https://www.gbif.org/grscicoll/institution/",""))
#         else:
#             print(issue["body"])

In [24]:
# `already_issued` is only defined when the optional "issues already created"
# cell has been uncommented and run. Fall back to an empty list so this cell
# also works on a fresh kernel (every institution then gets an issue).
already_issued = globals().get("already_issued", [])

for country in countries:
    print(country)
    create_issue_for_institution_per_country(country, already_issued, grscicoll_api, gh_api, gh_username, gh_token, github_header, steps_to_follow)

VE


In [29]:
## For just one institution
create_issue_for_institution(
    key="a50c1ed5-76c3-478c-be0b-e781e7cb04eb",
    country="LT",
    gh_api=gh_api,
    gh_username=gh_username,
    gh_token=gh_token,
    github_header=github_header,
    steps_to_check=steps_to_follow,
)