# Comparing the metadata record of repositories to the corresponding record in DataCite

## Import

In [1]:
import json
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

## Figshare

### Get record from Figshare

In [2]:
# Personal Figshare API token, read from the environment (None when the variable is unset)
figshare_access_token = os.environ.get("FIGSHARE_ACCESS_TOKEN")

In [8]:
# Inspired by the example available here https://help.figshare.com/article/how-to-use-the-figshare-api#search-ids
# Collect the search hits for every dataset posted inside the date window.
# Unlike Zenodo, the search endpoint doesn't return all the metadata of each
# item, so this step is only used to gather the item ids.
BASE_URL = "https://api.figshare.com/v2"
results = []

date_after = "2024-12-28"
date_before = "2024-12-31"

search_logic = (
    f":item_type:dataset AND :posted_after:{date_after}"
    f" AND :posted_before:{date_before}"
)
# Build the payload as a dict directly: assembling JSON text by hand and
# parsing it back breaks as soon as the search string contains a quote.
y = {"search_for": search_logic}
query = json.dumps(y)  # JSON text form of the payload, kept for reference

# At most 10 pages of 1000 hits are requested, i.e. 10,000 items per run.
for j in range(1, 11):
    response = requests.post(
        f"{BASE_URL}/articles/search?page_size=1000&page={j}",
        params=y,
        timeout=60,  # never hang forever on a stalled connection
    )
    r = json.loads(response.content)
    if not isinstance(r, list):
        # Anything other than a list of hits (e.g. an error object) must not
        # be merged into the results; stop paging and report it.
        print(f"Unexpected search response on page {j} (HTTP {response.status_code}): {r}")
        break
    if not r:
        # An empty page means there are no more hits.
        break
    results.extend(r)

In [9]:
# Use the Figshare ids gathered by the search to get the full metadata of each item.
# Records are stored under consecutive integer keys (0, 1, 2, ...) in `dict_results`.
dict_results = {}
count = 0

# The header is identical for every request, so build it once. When no token is
# configured, send no Authorization header rather than the literal "token None".
api_call_headers = (
    {"Authorization": "token " + str(figshare_access_token)}
    if figshare_access_token
    else {}
)

for result in results:
    # Only dict entries are search hits carrying an "id".
    if not isinstance(result, dict):
        continue
    figshare_id = result["id"]

    try:
        r = requests.get(
            BASE_URL + "/articles/" + str(figshare_id),
            headers=api_call_headers,
            timeout=60,
        )
    except requests.RequestException as err:
        # One failed request should not discard everything fetched so far.
        print(f"Skipping article {figshare_id}: request failed ({err})")
        continue

    if not r.ok:
        # An error payload is not a metadata record; storing it would make the
        # later statistics count it as a dataset with every field missing.
        print(f"Skipping article {figshare_id}: HTTP {r.status_code}")
        continue

    metadata = json.loads(r.text)
    dict_results[count] = metadata
    count += 1

with open("outputs/figshare.json", "w", encoding="utf-8") as f:
    json.dump(dict_results, f, ensure_ascii=False, indent=4)

### Create UI stats json

In [48]:
# Load the saved Figshare records from disk into `results`
with open("outputs/figshare-nov.json", encoding="utf-8") as records_file:
    results = json.loads(records_file.read())

In [49]:
# Get stats
#
# Walk every record in `results` and, for each tracked metadata field, count
#   * "...Count":         the number of records that carry the field, and
#   * "...InstanceCount": for repeatable fields, the number of individual
#                         entries (authors, related materials, funding items)
#                         that carry it.
# A field is counted when its key is present in the record, whatever its value.
# Counters for fields with no Figshare counterpart are never incremented and stay at 0.


def _count_with_key(items, key):
    """Return how many of the dicts in *items* contain *key*."""
    return sum(1 for item in items if key in item)


datasetCount = 0
alternateIdentifierCount = 0
identfierCount = 0
languageCount = 0
geoLocationsCount = 0
rightsCount = 0
titlesCount = 0
publicationYearCount = 0
resourceTypeCount = 0
resourceTypeGeneralCount = 0
dateCount = 0

relatedItemsCount = 0
formatsCount = 0
subjectsCount = 0
versionCount = 0

relatedIdentifiersCount = 0
relatedIdentifiersInstanceCount = 0
relationTypeRelatedIdentifierCount = 0
relationTypeRelatedIdentifierInstanceCount = 0
relatedIdentifierTypeRelatedIdentifierCount = 0
relatedIdentifierTypeRelatedIdentifierInstanceCount = 0
resourceTypeGeneralRelatedIdentifierCount = 0
resourceTypeGeneralRelatedIdentifierInstanceCount = 0

contributorsCount = 0

creatorsCount = 0
creatorsInstanceCount = 0
affilationCreatorsCount = 0
affilationCreatorsInstanceCount = 0
nameTypeCreatorsCount = 0
nameTypeCreatorsInstanceCount = 0
affiliationIdentifierCreatorsCount = 0
affiliationIdentifierCreatorsInstanceCount = 0
affilationIdentifierSchemeCreatorsCount = 0
affilationIdentifierSchemeCreatorsInstanceCount = 0
nameIdentifierSchemeCreatorsCount = 0
nameIdentifierSchemeCreatorsInstanceCount = 0
nameIdentifierCreatorsCount = 0
nameIdentifierCreatorsInstanceCount = 0

fundingReferencesCount = 0
fundingReferencesInstanceCount = 0
awardURIFundingReferencesCount = 0
awardURIFundingReferencesInstanceCount = 0
awardTitleFundingReferencesCount = 0
awardTitleFundingReferencesInstanceCount = 0
awardNumberFundingReferencesCount = 0
awardNumberFundingReferencesInstanceCount = 0
funderIdentifierTypeFundingReferencesCount = 0
funderIdentifierTypeFundingReferencesInstanceCount = 0
funderIdentifierFundingReferencesCount = 0
funderIdentifierFundingReferencesInstanceCount = 0
funderNameFundingReferencesCount = 0
funderNameFundingReferencesInstanceCount = 0


sizesCount = 0
descriptionCount = 0
publisherCount = 0

for metadata in results.values():

    # Dataset count
    datasetCount += 1

    # Alternate identifier
    # -- Not in Figshare schema

    # Identifier
    if "doi" in metadata:
        identfierCount += 1

    # Language
    # -- Not in Figshare schema

    # Geo location
    # -- Not in Figshare schema

    # License
    if "license" in metadata:
        rightsCount += 1

    # Title
    if "title" in metadata:
        titlesCount += 1

    # Publication year
    # -- Not in Figshare schema; can be inferred from created_date

    # Resource type (the same Figshare key feeds both counters)
    if "defined_type_name" in metadata:
        resourceTypeCount += 1
        resourceTypeGeneralCount += 1

    # Dates
    if "published_date" in metadata:
        dateCount += 1

    # Related items
    # -- Not in Figshare schema

    # Formats
    # -- Not in Figshare schema

    # Subjects
    if "tags" in metadata:
        subjectsCount += 1

    # Version
    if "version" in metadata:
        versionCount += 1

    # Related identifiers
    if "related_materials" in metadata:
        relatedIdentifiersCount += 1
        related_materials = metadata["related_materials"]
        relatedIdentifiersInstanceCount += len(related_materials)

        # Relation type
        with_relation = _count_with_key(related_materials, "relation")
        relationTypeRelatedIdentifierInstanceCount += with_relation
        if with_relation:
            relationTypeRelatedIdentifierCount += 1

        # Related identifier type
        with_identifier_type = _count_with_key(related_materials, "identifier_type")
        relatedIdentifierTypeRelatedIdentifierInstanceCount += with_identifier_type
        if with_identifier_type:
            relatedIdentifierTypeRelatedIdentifierCount += 1

        # Resource type general
        # (the per-record counter is bumped here; it used to bump the instance
        # counter a second time, leaving the record count at 0)
        with_resource_type = _count_with_key(related_materials, "resource_type")
        resourceTypeGeneralRelatedIdentifierInstanceCount += with_resource_type
        if with_resource_type:
            resourceTypeGeneralRelatedIdentifierCount += 1

    # Contributors
    # -- Not in Figshare schema

    # Creators
    if "authors" in metadata:
        creatorsCount += 1
        authors = metadata["authors"]
        creatorsInstanceCount += len(authors)

        # affiliation
        with_affiliation = _count_with_key(authors, "affiliation")
        affilationCreatorsInstanceCount += with_affiliation
        if with_affiliation:
            affilationCreatorsCount += 1

        # name type
        # -- Not in schema

        # affiliation identifier
        # -- Not in schema

        # affiliation identifier scheme
        # -- Not in schema

        # identifier scheme
        # -- Not in schema

        # name identifier
        with_orcid = _count_with_key(authors, "orcid_id")
        nameIdentifierCreatorsInstanceCount += with_orcid
        if with_orcid:
            nameIdentifierCreatorsCount += 1

    # Funding references
    if "funding_list" in metadata:
        fundingReferencesCount += 1
        funding_list = metadata["funding_list"]
        fundingReferencesInstanceCount += len(funding_list)

        # award URI
        # (previously never counted per record because of a mis-cased flag name)
        with_url = _count_with_key(funding_list, "url")
        awardURIFundingReferencesInstanceCount += with_url
        if with_url:
            awardURIFundingReferencesCount += 1

        # award title
        with_title = _count_with_key(funding_list, "title")
        awardTitleFundingReferencesInstanceCount += with_title
        if with_title:
            awardTitleFundingReferencesCount += 1

        # award number
        with_grant_code = _count_with_key(funding_list, "grant_code")
        awardNumberFundingReferencesInstanceCount += with_grant_code
        if with_grant_code:
            awardNumberFundingReferencesCount += 1

        # funding identifier type
        # -- Not in schema

        # funding identifier
        # -- Not in schema

        # funder name
        with_funder_name = _count_with_key(funding_list, "funder_name")
        funderNameFundingReferencesInstanceCount += with_funder_name
        if with_funder_name:
            funderNameFundingReferencesCount += 1

    # Sizes
    if "size" in metadata:
        sizesCount += 1

    # Description
    if "description" in metadata:
        descriptionCount += 1

    # publisher
    # -- Not in schema but we know it's figshare

In [50]:
# Create UI stats json
#
# Turn the counters gathered by the stats cell into one entry per field:
#   count        - records carrying the field
#   instance     - individual entries carrying it (same as count for single-valued fields)
#   missing      - records without the field
#   field_status - label describing the field's status ("Mandatory", "Optional", "Not in schema")
#   completeness - count / datasetCount
# NOTE(review): "date", "sizes" and "description" are labelled "Not in schema"
# although the stats cell does count a Figshare key for them - confirm the labels.


def _field_entry(count, field_status, instance=None, subfields=None):
    """Build the stats entry of one field.

    *instance* defaults to *count* (single-valued fields). *subfields*, when
    given, is attached under the "subfields" key. Completeness is reported as
    0.0 when there are no records at all, instead of dividing by zero.
    """
    entry = {
        "count": count,
        "instance": count if instance is None else instance,
        "missing": datasetCount - count,
        "field_status": field_status,
        "completeness": count / datasetCount if datasetCount else 0.0,
    }
    if subfields is not None:
        entry["subfields"] = subfields
    return entry


dict_stats = {"stats": {"byResourceType": {"Dataset": {}}}}
dataset = dict_stats["stats"]["byResourceType"]["Dataset"]

dataset["alternateIdentifier"] = _field_entry(alternateIdentifierCount, "Not in schema")
dataset["identifier"] = _field_entry(identfierCount, "Mandatory")
dataset["language"] = _field_entry(languageCount, "Not in schema")
dataset["geoLocations"] = _field_entry(geoLocationsCount, "Not in schema")
dataset["rights"] = _field_entry(rightsCount, "Mandatory")
dataset["title"] = _field_entry(titlesCount, "Mandatory")
dataset["publicationYear"] = _field_entry(publicationYearCount, "Not in schema")
dataset["resourceType"] = _field_entry(resourceTypeCount, "Mandatory")
dataset["date"] = _field_entry(dateCount, "Not in schema")
dataset["relatedItems"] = _field_entry(relatedItemsCount, "Not in schema")
dataset["formats"] = _field_entry(formatsCount, "Not in schema")
dataset["subjects"] = _field_entry(subjectsCount, "Mandatory")
dataset["version"] = _field_entry(versionCount, "Mandatory")

dataset["relatedIdentifiers"] = _field_entry(
    relatedIdentifiersCount,
    "Optional",
    instance=relatedIdentifiersInstanceCount,
    subfields={
        "relationType": _field_entry(
            relationTypeRelatedIdentifierCount,
            "Optional",
            instance=relationTypeRelatedIdentifierInstanceCount,
        ),
        "relatedIdentifierType": _field_entry(
            relatedIdentifierTypeRelatedIdentifierCount,
            "Optional",
            instance=relatedIdentifierTypeRelatedIdentifierInstanceCount,
        ),
        "resourceTypeGeneral": _field_entry(
            resourceTypeGeneralRelatedIdentifierCount,
            "Optional",
            instance=resourceTypeGeneralRelatedIdentifierInstanceCount,
        ),
    },
)

dataset["contributors"] = _field_entry(contributorsCount, "Not in schema")

dataset["creators"] = _field_entry(
    creatorsCount,
    "Mandatory",
    instance=creatorsInstanceCount,
    subfields={
        "affiliation": _field_entry(
            affilationCreatorsCount,
            "Optional",
            instance=affilationCreatorsInstanceCount,
        ),
        "nameType": _field_entry(
            nameTypeCreatorsCount,
            "Not in schema",
            instance=nameTypeCreatorsInstanceCount,
        ),
        "affiliationIdentifier": _field_entry(
            affiliationIdentifierCreatorsCount,
            "Not in schema",
            instance=affiliationIdentifierCreatorsInstanceCount,
        ),
        "affiliationIdentifierScheme": _field_entry(
            affilationIdentifierSchemeCreatorsCount,
            "Not in schema",
            instance=affilationIdentifierSchemeCreatorsInstanceCount,
        ),
        "nameIdentifierScheme": _field_entry(
            nameIdentifierSchemeCreatorsCount,
            "Not in schema",
            instance=nameIdentifierSchemeCreatorsInstanceCount,
        ),
        "nameIdentifier": _field_entry(
            nameIdentifierCreatorsCount,
            "Optional",
            instance=nameIdentifierCreatorsInstanceCount,
        ),
    },
)

dataset["fundingReferences"] = _field_entry(
    fundingReferencesCount,
    "Mandatory",
    instance=fundingReferencesInstanceCount,
    subfields={
        "awardURI": _field_entry(
            awardURIFundingReferencesCount,
            "Optional",
            instance=awardURIFundingReferencesInstanceCount,
        ),
        "awardTitle": _field_entry(
            awardTitleFundingReferencesCount,
            "Mandatory",
            instance=awardTitleFundingReferencesInstanceCount,
        ),
        "awardNumber": _field_entry(
            awardNumberFundingReferencesCount,
            "Optional",
            instance=awardNumberFundingReferencesInstanceCount,
        ),
        "funderIdentifierType": _field_entry(
            funderIdentifierTypeFundingReferencesCount,
            "Not in schema",
            instance=funderIdentifierTypeFundingReferencesInstanceCount,
        ),
        "funderIdentifier": _field_entry(
            funderIdentifierFundingReferencesCount,
            "Not in schema",
            instance=funderIdentifierFundingReferencesInstanceCount,
        ),
        "funderNameFunding": _field_entry(
            funderNameFundingReferencesCount,
            "Optional",
            instance=funderNameFundingReferencesInstanceCount,
        ),
    },
)

dataset["sizes"] = _field_entry(sizesCount, "Not in schema")
dataset["description"] = _field_entry(descriptionCount, "Not in schema")
dataset["publisher"] = _field_entry(publisherCount, "Not in schema")

In [52]:
# Replace the nested "byResourceType" layout: the per-field stats are exposed
# directly under stats -> fields.
dict_stats["stats"] = {"fields": dataset}

In [53]:
# Save the UI stats as pretty-printed UTF-8 JSON
with open("outputs/figshare-ui-nov.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(dict_stats, ensure_ascii=False, indent=4))