## Functionality for getting a data dump, processing and saving it

In [12]:
from sickle import Sickle
import xml.etree.ElementTree as ET

In [80]:
OAI_ZB_PREVIEW_NS = 'https://zbmath.org/OAI/2.0/oai_zb_preview/'
OAI_NS = 'http://www.openarchives.org/OAI/2.0/' # the OAI namespace
TAGS = ['author', 'author_ids', 'document_title', 'source', 'classifications', 'language', 'links', 'keywords', 'doi', 'publication_year', 'serial']
ZBMATH_NS = 'https://zbmath.org/zbmath/elements/1.0/'
CONFLICT_TXT = 'zbMATH Open Web Interface contents unavailable due to conflicting licenses.'

In [None]:
def dump_records(out_dir, from_date=None, until_date=None):
    # date has to have format like 2012-12-12
    sickle = Sickle('https://oai.zbmath.org/v1')
    if from_date and until_date:
        records = sickle.ListRecords(**{'metadataPrefix': 'oai_dc','from': from_date, 'until': until_date})
    elif from_date:
        records = sickle.ListRecords(**{'metadataPrefix': 'oai_dc','from': from_date})
    elif until_date:
        records = sickle.ListRecords(**{'metadataPrefix': 'oai_dc','until': until_date})
    else:
        records = sickle.ListRecords(metadataPrefix='oai_dc')
    with open(out_dir + 'data_dump.txt', 'w+') as f:
    for rec in records:
        f.write(rec.raw + '\n')

In [84]:
def read_xml(xml_file):
    with open(xml_file) as f:
        xml_object = ET.fromstringlist(["<root>", f.read(), "</root>"])
    records = tree.getroot().findall(get_tag('record'))
    return(records)

In [68]:
def get_tag(tag_name, namespace=OAI_NS):
    """
    Returns a fully qualified tag name.
    @param namespace URL of a namespace|None (OAI_NS is default)
    """
    return '{{{}}}{}'.format(namespace, tag_name)

In [74]:
def parse_record(xml_record, verbose=False):
    """
    Parse bibliographic record details from XML Element.
    @returns dict
    """
    new_entry = {}
    # zbMath identifier
    zb_id = xml_record.find(get_tag('header')).find(get_tag('identifier')).text 
    new_entry['id'] = zb_id
    # read tags
    zb_preview = xml_record.find(get_tag('metadata')).find(get_tag('zbmath', OAI_ZB_PREVIEW_NS))
    if zb_preview:
        for tag in TAGS:
            value = zb_preview.find(get_tag(tag, ZBMATH_NS))
            if value is not None:
                if len(value):
                    # element has children
                    texts = []
                    for child in value:
                        texts.append(child.text)
                    text = ';'.join(texts) # multiple values are rendered as a semicolon-separated string
                else:
                    # element content is a simple text
                    text = zb_preview.find(get_tag(tag, ZBMATH_NS)).text
                
                if text == CONFLICT_TXT:
                    # License conflict
                    if verbose:
                        print('Licensing conflict for id "{}" tag "{}"'.format(zb_id, tag))
                 #       print(text)
                    return None
            
                new_entry[tag] = text
        return new_entry

In [90]:
def do_all(xml_file):
    with open(xml_file) as infile:
        record_string = ""
        for line in infile:
            record_string = record_string + line
            print(line)
            print(record_file)
            if line.endswith('</record>'):
                print(record_string)
                print(a)
                
    # read records from file one by one
    # for each record:
        # parse and extract information
        # if something is missing, query by doi
        # write to file one by one in csv format

In [None]:
do_all('test.txt')

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [85]:
records =read_xml('test.xml')

MemoryError: 

---------------------------------------------------------------------------------------------------------------------------------------------------

In [48]:
tree = read_xml('reduced.xml')

In [49]:
root = tree.getroot()

In [50]:
print(root.tag)
print(root.attrib)

{http://www.openarchives.org/OAI/2.0/}record
{}


In [51]:
for child in root:
    print(child.tag, child.attrib)

{http://www.openarchives.org/OAI/2.0/}header {}
{http://www.openarchives.org/OAI/2.0/}metadata {}


In [81]:
parse_record(root, verbose=True)

{'id': 'oai:zbmath.org:5346175',
 'author': 'Moskalenko, V. A.; Entel, P.; Digor, D. F.; Dohotaru, L. A.; Citro, R.',
 'document_title': 'A diagram approach to the strong coupling in the single-impurity Anderson model',
 'source': 'Theor. Math. Phys. 155, No. 3, 914-935 (2008); translation from Teor. Mat. Fiz. 155, No. 3, 474-497 (2008).',
 'classifications': '82B44',
 'language': 'English',
 'keywords': "strong correlation;spectral function;Dyson equation;Green's function;Anderson impurity model",
 'doi': '10.1007/s11232-008-0077-9',
 'publication_year': '2008',
 'serial': "Theoretical and Mathematical Physics;Springer US, New York, NY; Pleiades Publishing, New York, NY; MAIK ``Nauka/Interperiodica'', Moscow"}

In [69]:
get_tag('header')

'{http://www.openarchives.org/OAI/2.0/}header'

In [70]:
a = root.find('header')
b = root.find(get_tag('header'))
c= root.find('{http://www.openarchives.org/OAI/2.0/}header')

In [71]:
print(a)

None


In [72]:
print(b)

<Element '{http://www.openarchives.org/OAI/2.0/}header' at 0x7f9c8b50d6d8>


In [73]:
print(c)

<Element '{http://www.openarchives.org/OAI/2.0/}header' at 0x7f9c8b50d6d8>


In [57]:
print(ET.tostring(root))

b'<ns0:record xmlns:ns0="http://www.openarchives.org/OAI/2.0/" xmlns:ns1="https://zbmath.org/OAI/2.0/oai_zb_preview/" xmlns:ns2="https://zbmath.org/zbmath/elements/1.0/"><ns0:header><ns0:identifier>oai:zbmath.org:5346175</ns0:identifier><ns0:datestamp>2008-09-23T14:10:58Z</ns0:datestamp><ns0:setSpec>82</ns0:setSpec></ns0:header><ns0:metadata><ns1:zbmath><ns2:document_id>5346175</ns2:document_id><ns2:time>2008-09-23T14:10:58Z</ns2:time><ns2:document_type>j</ns2:document_type><ns2:publication_year>2008</ns2:publication_year><ns2:author>Moskalenko, V. A.; Entel, P.; Digor, D. F.; Dohotaru, L. A.; Citro, R.</ns2:author><ns2:document_title>A diagram approach to the strong coupling in the single-impurity Anderson model</ns2:document_title><ns2:source>Theor. Math. Phys. 155, No. 3, 914-935 (2008); translation from Teor. Mat. Fiz. 155, No. 3, 474-497 (2008).</ns2:source><ns2:classifications><ns2:classification>82B44</ns2:classification></ns2:classifications><ns2:keywords><ns2:keyword>strong co