# Build output.jsonl.gz from the DataCite public data file.
#
# Stages:
#   1. Locate every DataCite_Public_Data_File_2024.tar below the current directory.
#   2. Stream each archive member matching dois/*/*.jsonl.gz to stdout
#      (nothing is unpacked to disk).
#   3. Decompress the concatenated gzip members.
#   4. Reduce each record to .attributes.{doi, subjects, geoLocations, dates};
#      jq emits null for any of those keys a record does not have.
#   5. Recompress the slimmed-down JSON lines into output.jsonl.gz.
find . -type f -name "DataCite_Public_Data_File_2024.tar" -print0 |
  xargs -0 -I{} tar --extract --to-stdout --file {} --wildcards 'dois/*/*.jsonl.gz' |
  gzip -dc |
  jq -c '.attributes | {doi, subjects, geoLocations, dates}' |
  gzip > output.jsonl.gz

In [47]:
from collections import Counter
import gzip
import json

def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list.

    The upstream jq filter emits ``null`` for missing keys, so a field can be
    present in the record and still be ``None``.
    """
    return value if isinstance(value, list) else []


def summarize_records(lines):
    """Tally subject schemes, Coverage dates and geoLocations over JSONL lines.

    Args:
        lines: Iterable of text lines, each expected to hold one JSON object
            with (possibly null) ``subjects``, ``geoLocations`` and ``dates``
            keys.

    Returns:
        A tuple ``(scheme_counts, uniq_scheme_counts, geo_count,
        date_coverage)`` where

        * ``scheme_counts`` counts every subject entry per ``schemeUri``,
        * ``uniq_scheme_counts`` counts records per ``schemeUri`` (a record
          contributes at most 1 to each URI),
        * ``geo_count`` is the number of records with a non-empty
          ``geoLocations`` list,
        * ``date_coverage`` is the number of date entries whose ``dateType``
          is ``'Coverage'``.

    Lines that are not valid JSON, or that are not JSON objects, are reported
    on stdout and skipped.
    """
    scheme_counts = Counter()
    uniq_scheme_counts = Counter()
    geo_count = 0
    date_coverage = 0

    for line_num, line in enumerate(lines, 1):
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Line {line_num}: JSON decode error: {e}")
            continue

        if not isinstance(obj, dict):
            print(f"Line {line_num}: expected a JSON object, got {type(obj).__name__}")
            continue

        # The key is always present after the jq projection (null when the
        # record has none), so test for actual content rather than membership.
        if _as_list(obj.get('geoLocations')):
            geo_count += 1

        for date in _as_list(obj.get('dates')):
            if isinstance(date, dict) and date.get('dateType') == 'Coverage':
                date_coverage += 1

        scheme_uris_seen = set()
        for subject in _as_list(obj.get('subjects')):
            if isinstance(subject, dict):
                scheme_uri = subject.get('schemeUri')
                if scheme_uri:
                    scheme_uris_seen.add(scheme_uri)
                    scheme_counts[scheme_uri] += 1
        # One increment per record per scheme, regardless of repeats above.
        uniq_scheme_counts.update(scheme_uris_seen)

    return scheme_counts, uniq_scheme_counts, geo_count, date_coverage


# __name__ is "__main__" both for a script run and inside a notebook cell, so
# the results below still land in the global namespace; the guard only keeps a
# plain import of this code from trying to read the data file.
if __name__ == "__main__":
    with gzip.open('output.jsonl.gz', 'rt', encoding='utf-8') as f:
        (scheme_counts, uniq_scheme_counts,
         geo_count, date_coverage) = summarize_records(f)

    print(dict(scheme_counts))
    print(dict(uniq_scheme_counts))
    print(f"Dates of the type Coverage: {date_coverage}")
    print(f"Records with a geoLocations: {geo_count}")


{'http://www.abs.gov.au/ausstats/abs@.nsf/0/6BB427AB9696C225CA2574180004463E': 2146880, 'http://dewey.info/': 95211, 'https://thesaurus.babylon-software.com/': 5, 'http://www.isotc211.org/2005/resources/Codelist/gmxCodelists.xml': 286, 'http://www.inchi-trust.org/': 14756, 'https://www.abs.gov.au/Ausstats/abs@.nsf/Latestproducts/4AE1B46AE2048A28CA25741800044242?': 8, 'http://www.oecd.org/science/inno/38235147.pdf': 13547186, 'http://gcmdservices.gsfc.nasa.gov/kms/concepts/concept_scheme/sciencekeywords/?format=xml': 1381, 'https://dewey.info/': 22123, 'http://id.loc.gov/authorities/subjects': 20252764, 'http://d-nb.info/standards/elementset/gnd': 18, 'http://d-nb.info/gnd/': 12500, 'http://id.loc.gov/authorities/subjects.html': 1160836, 'http://aims.fao.org/aos/agrovoc/': 395, 'http://www.fao.org/docrep/003/u1808f/u1808F00.htm': 82, 'http://www.fao.org/docrep/003/u1808e/u1808e00.htm': 82, 'http://schema.org/keywords': 138, 'http://uknowledge.uky.edu/assets/taxonomy.pdf': 16, 'http://pu