# Locate the DataCite tarball below the current directory and stream the
# archive members matching dois/updated_2025-01/*.jsonl.gz to stdout, then
# decompress them, keep only doi/subjects/geoLocations/dates from each
# record's .attributes (one compact JSON object per line), and recompress.
find . -type f -name "DataCite_Public_Data_File_2024.tar" \
    -exec tar -x -O -f {} --wildcards 'dois/updated_2025-01/*.jsonl.gz' \; |
  gzip -dc |
  jq -c '.attributes | {doi, subjects, geoLocations, dates}' |
  gzip > output25.jsonl.gz

# Keep only the final 100000 lines of the projected output as a sample file.
gzip -dc output25.jsonl.gz | tail -n 100000 | gzip > last_lines.jsonl.gz

In [None]:
from collections import Counter
import gzip
import json

def _as_list(value):
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def summarize_records(lines):
    """Tally subject schemes, Coverage dates and geoLocations over JSONL lines.

    Args:
        lines: Iterable of strings, each expected to hold one JSON object with
            optional ``subjects``, ``geoLocations`` and ``dates`` entries.

    Returns:
        A 4-tuple ``(scheme_counts, uniq_scheme_counts, geo_count,
        date_coverage)`` where:

        * ``scheme_counts`` counts every subject carrying a ``schemeUri``;
        * ``uniq_scheme_counts`` counts each ``schemeUri`` at most once per
          record;
        * ``geo_count`` is the number of records with a non-empty
          ``geoLocations`` value;
        * ``date_coverage`` is the number of date entries whose ``dateType``
          is ``'Coverage'``.

    Lines that are not valid JSON are reported on stdout and skipped; lines
    that decode to something other than an object are skipped silently.
    """
    scheme_counts = Counter()
    uniq_scheme_counts = Counter()
    geo_count = 0
    date_coverage = 0

    for line_num, line in enumerate(lines, 1):
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Line {line_num}: JSON decode error: {e}")
            continue

        if not isinstance(obj, dict):
            continue

        # The jq projection `{doi, subjects, geoLocations, dates}` emits every
        # key (null when the source record lacks it), so a plain membership
        # test would count every record. Test the value instead.
        if obj.get('geoLocations'):
            geo_count += 1

        # `.get(key, [])` would still return None for an explicit null, which
        # is not iterable; normalise anything that is not a list to [].
        for date in _as_list(obj.get('dates')):
            if isinstance(date, dict) and date.get('dateType') == 'Coverage':
                date_coverage += 1

        scheme_uris_seen = set()
        for subject in _as_list(obj.get('subjects')):
            if isinstance(subject, dict):
                scheme_uri = subject.get('schemeUri')
                if scheme_uri:
                    scheme_uris_seen.add(scheme_uri)
                    scheme_counts[scheme_uri] += 1
        # One increment per distinct scheme per record.
        uniq_scheme_counts.update(scheme_uris_seen)

    return scheme_counts, uniq_scheme_counts, geo_count, date_coverage


# True both when run as a script and when executed as a notebook cell, so the
# result names below are still defined at top level in those contexts.
if __name__ == "__main__":
    with gzip.open('last_lines.jsonl.gz', 'rt', encoding='utf-8') as f:
        scheme_counts, uniq_scheme_counts, geo_count, date_coverage = (
            summarize_records(f)
        )

    print(dict(scheme_counts))
    print(dict(uniq_scheme_counts))
    print(f"Dates of the type Coverage: {date_coverage}")
    print(f"Records with a geoLocations: {geo_count}")


{'https://kaken.nii.ac.jp/': 163906, 'http://id.loc.gov/authorities/subjects': 163906, 'http://www.abs.gov.au/ausstats/abs@.nsf/0/6BB427AB9696C225CA2574180004463E': 376, 'http://dewey.info/': 436, 'http://www.oecd.org/science/inno/38235147.pdf': 1904, 'https://dewey.info/': 3, 'http://www.oecd.org/science/inno': 1, 'http://astrothesaurus.org/': 9, 'https://minorplanetcenter.net/iau/info/PackedDes.html': 9, 'https://github.com/PLOS/plos-thesaurus': 6, 'https://d-nb.info/gnd/': 121, 'https://prsinfo.clinicaltrials.gov/definitions.html#PrimaryCondition': 3881, 'https://www.microbiologyresearch.org/content/journal/ijsem/10.1099/ijsem.0.005585': 1040, 'http://id.loc.gov/authorities/subjects/sh2011004061': 3, 'http://id.loc.gov/authorities/subjects/sh85043149': 1, 'http://id.loc.gov/authorities/subjects/sh85146874': 3, 'http://id.loc.gov/authorities/subjects/sh85002717': 2, 'http://id.loc.gov/authorities/subjects/sh85105993': 1, 'https://calenda.org/subjects': 74, 'http://zbw.eu/stw/descript