## Functionality for getting a data dump, processing and saving it

In [27]:
from sickle import Sickle
import xml.etree.ElementTree as ET
from habanero import Crossref

In [28]:
OAI_ZB_PREVIEW_NS = 'https://zbmath.org/OAI/2.0/oai_zb_preview/'
OAI_NS = 'http://www.openarchives.org/OAI/2.0/' # the OAI namespace
TAGS = ['author', 'author_ids', 'document_title', 'source', 'classifications', 'language', 'links', 'keywords', 'doi', 'publication_year', 'serial']
ZBMATH_NS = 'https://zbmath.org/zbmath/elements/1.0/'
CONFLICT_TXT = 'zbMATH Open Web Interface contents unavailable due to conflicting licenses.'

In [29]:
def dump_records(out_dir, from_date=None, until_date=None):
    # date has to have format like 2012-12-12
    sickle = Sickle('https://oai.zbmath.org/v1')
    if from_date and until_date:
        records = sickle.ListRecords(**{'metadataPrefix': 'oai_dc','from': from_date, 'until': until_date})
    elif from_date:
        records = sickle.ListRecords(**{'metadataPrefix': 'oai_dc','from': from_date})
    elif until_date:
        records = sickle.ListRecords(**{'metadataPrefix': 'oai_dc','until': until_date})
    else:
        records = sickle.ListRecords(metadataPrefix='oai_dc')
    with open(out_dir + 'data_dump.txt', 'w+') as f:
        for rec in records:
            f.write(rec.raw + '\n')

In [30]:
def read_xml(xml_file):
    with open(xml_file) as f:
        xml_object = ET.fromstringlist(["<root>", f.read(), "</root>"])
    records = tree.getroot().findall(get_tag('record'))
    return(records)

In [31]:
def get_tag(tag_name, namespace=OAI_NS):
    """
    Returns a fully qualified tag name.
    @param namespace URL of a namespace|None (OAI_NS is default)
    """
    return '{{{}}}{}'.format(namespace, tag_name)

In [95]:
def parse_record(xml_record, verbose=False):
    """
    Parse bibliographic record details from XML Element.
    @returns dict
    """
    is_conflict = False
    new_entry = {}
    # zbMath identifier
    zb_id = xml_record.find(get_tag('header')).find(get_tag('identifier')).text 
    new_entry['id'] = zb_id
    # read tags
    zb_preview = xml_record.find(get_tag('metadata')).find(get_tag('zbmath', OAI_ZB_PREVIEW_NS))
    if zb_preview:
        for tag in TAGS:
            value = zb_preview.find(get_tag(tag, ZBMATH_NS))
            if value is not None:
                if len(value):
                    # element has children
                    texts = []
                    for child in value:
                        texts.append(child.text)
                    text = ';'.join(texts) # multiple values are rendered as a semicolon-separated string
                else:
                    # element content is a simple text
                    text = zb_preview.find(get_tag(tag, ZBMATH_NS)).text
                
                if text.startswith(CONFLICT_TXT):
                    # License conflict
                    if verbose:
                        print('Licensing conflict for id "{}" tag "{}"'.format(zb_id, tag))
                 #       print(text)
                    is_conflict = True
                    text = None
            
                new_entry[tag] = text
        if is_conflict:
            print("in conflict")
            if "doi" in new_entry:
                if new_entry["doi"]:
                    print(new_entry)
                    get_info_from_doi(new_entry)
                else:
                    return None
            else: 
                return None
        return new_entry

In [96]:
def get_info_from_doi(entry_dict):
    internal_tags = ["author_id", "source", "classifications", "language", "links", "keywords"]
    cr = Crossref(mailto="pusch@zib.de")
    doi = entry_dict['doi']
    try:
        work_info = cr.works(ids = doi)
        print("in get doi info try")
        print(work)
    except:
        print("work not found, agency:")
        agency = cr.registration_agency(doi)
        print(agency)
        print(a)
    for key, val in dict.items():
        if val is None and val not in internal_tags:
            parse_doi_info(val, work_info['message'])
        

In [None]:
def parse_doi_info(val, work_info):
    if val == 'author':
        #looks like: 'author': [{'given': 'Max', 'family': 'Mustermann', 'sequence': 'first', 'affiliation': []}]
        all_authors = ""
        author_list = []
        if 'author' not in work_info:
            return None
        for author_dict in work_info['author']:
            first_name = author_dict['given']
            family_name = author_dict['family']
            #if we don't know the family name, it is too hard to identify the author
            if not family_name:
                pass
            else:
                #first name not necessarily needed
                if not first_name:
                    author_list.append(family_name)
                else:
                    author_list.append(family_name + ", " + first_name)
        if not author_list:
            return None
        else: return ";".join(author_list)
    elif val == 'document_title':
        if 'title' not in work_info:
            return None
        return work_info['title']
    elif val == 'publication_year':
        # 'published': {'date-parts': [[2008]]}}
        if 'published' not in work_info:
            return None
        return work_info['published']['date_parts'][0][0]
    elif val == 'serial':
        
        
        

In [97]:
def do_all(xml_file, out_file):
    with open(xml_file) as infile:
        with open(out_file, 'a') as outfile:
            outfile.write((",").join(TAGS))
            record_string = ""
            for line in infile:
                record_string = record_string + line
                if line.endswith('</record>\n'):
                    element = ET.fromstring(record_string)
                    record = parse_record(element)
                    record_string = ""
                    if record:
                        outfile.write(''.join(str(x) for x in record.values()))
                    
                
    # read records from file one by one DONE
    # for each record:
        # parse and extract information DONE
        # if something is missing, query by doi
        # write to file one by one in csv format

In [98]:
do_all('../../test.txt', 'test_output.txt')

in conflict
in conflict
in conflict
in conflict
in conflict
in conflict
in conflict
in conflict
in conflict
in conflict
{'id': 'oai:zbmath.org:5346296', 'author': None, 'author_ids': 'applebaum.david-b', 'document_title': None, 'source': None, 'classifications': '60-01;94-01', 'language': None, 'keywords': 'introduction to probability theory;information theory;entropy;Markov chains', 'doi': '10.1017/CBO9780511755262', 'publication_year': '2008', 'serial': None}
in get doi info try
{'status': 'ok', 'message-type': 'work', 'message-version': '1.0.0', 'message': {'indexed': {'date-parts': [[2022, 4, 5]], 'date-time': '2022-04-05T11:29:22Z', 'timestamp': 1649158162947}, 'publisher-location': 'Cambridge', 'edition-number': '2', 'reference-count': 0, 'publisher': 'Cambridge University Press', 'isbn-type': [{'value': '9780511755262', 'type': 'print'}], 'content-domain': {'domain': [], 'crossmark-restriction': False}, 'short-container-title': [], 'published-print': {'date-parts': [[2008]]}, 'D

NameError: name 'a' is not defined

In [85]:
records =read_xml('test.xml')

MemoryError: 

---------------------------------------------------------------------------------------------------------------------------------------------------

In [48]:
tree = read_xml('reduced.xml')

In [49]:
root = tree.getroot()

In [50]:
print(root.tag)
print(root.attrib)

{http://www.openarchives.org/OAI/2.0/}record
{}


In [51]:
for child in root:
    print(child.tag, child.attrib)

{http://www.openarchives.org/OAI/2.0/}header {}
{http://www.openarchives.org/OAI/2.0/}metadata {}


In [81]:
parse_record(root, verbose=True)

{'id': 'oai:zbmath.org:5346175',
 'author': 'Moskalenko, V. A.; Entel, P.; Digor, D. F.; Dohotaru, L. A.; Citro, R.',
 'document_title': 'A diagram approach to the strong coupling in the single-impurity Anderson model',
 'source': 'Theor. Math. Phys. 155, No. 3, 914-935 (2008); translation from Teor. Mat. Fiz. 155, No. 3, 474-497 (2008).',
 'classifications': '82B44',
 'language': 'English',
 'keywords': "strong correlation;spectral function;Dyson equation;Green's function;Anderson impurity model",
 'doi': '10.1007/s11232-008-0077-9',
 'publication_year': '2008',
 'serial': "Theoretical and Mathematical Physics;Springer US, New York, NY; Pleiades Publishing, New York, NY; MAIK ``Nauka/Interperiodica'', Moscow"}

In [99]:
cr = Crossref(mailto="pusch@zib.de")
work_info = cr.works(ids = '10.1007/s11232-008-0077-9')

In [100]:
work_info

{'status': 'ok',
 'message-type': 'work',
 'message-version': '1.0.0',
 'message': {'indexed': {'date-parts': [[2022, 4, 2]],
   'date-time': '2022-04-02T12:26:12Z',
   'timestamp': 1648902372026},
  'reference-count': 47,
  'publisher': 'Springer Science and Business Media LLC',
  'issue': '3',
  'license': [{'start': {'date-parts': [[2008, 6, 1]],
     'date-time': '2008-06-01T00:00:00Z',
     'timestamp': 1212278400000},
    'content-version': 'tdm',
    'delay-in-days': 0,
    'URL': 'http://www.springer.com/tdm'}],
  'content-domain': {'domain': [], 'crossmark-restriction': False},
  'short-container-title': ['Theor Math Phys'],
  'published-print': {'date-parts': [[2008, 6]]},
  'DOI': '10.1007/s11232-008-0077-9',
  'type': 'journal-article',
  'created': {'date-parts': [[2008, 6, 25]],
   'date-time': '2008-06-25T11:38:51Z',
   'timestamp': 1214393931000},
  'page': '914-935',
  'source': 'Crossref',
  'is-referenced-by-count': 9,
  'title': ['A diagram approach to the strong co