# Filter papers by software

In [39]:
# load utilities shared between notebooks
%run definitions_and_functions.ipynb

There are millions of references to papers in the zbMath database. For now, we only need those related to the list of mathematical software (`data/swMATH-initial.csv`) that has been imported into the MaRDI Portal.

In [2]:
import pandas as pd

# Read the swMath software table from CSV and keep the 'Len' column as a plain list.
# NOTE(review): 'Len' is presumably the column holding the software names — confirm against the CSV header.
software_df = pd.read_csv('data/swMATH-initial.csv')
softwares = software_df.loc[:, 'Len'].tolist()

## Get a list of records related to a single software
Use the helper/filter endpoint to get a list of papers related to a particular software. In this case 'Gfan'.

**Doesn't work for some software (e.g. FORTRAN); OAI support has been emailed about this.**

In [3]:
import requests
from urllib.parse import quote

software = 'Gfan'
# Percent-encode the name so characters such as '+', '&' or '#' cannot corrupt the query string.
REQUEST_URL = "{}&filter=software:{}".format(FILTER, quote(software, safe=''))

# get data from API
# The dict must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so a positional dict ends up in the query string
# instead of being sent as HTTP headers.
headers = {'accept': 'text/xml'}
all_records_xml = requests.get(REQUEST_URL, headers=headers)
if all_records_xml.status_code == 200:
    # save raw data in local file; explicit encoding so the result does not
    # depend on the platform's default text encoding
    with open('data/software_records_{}.xml'.format(software), 'w', encoding='utf-8') as f:
        f.write(all_records_xml.text)
else:
    print(all_records_xml.reason)

Define a function that parses all records in the data set and puts them into a pandas data frame.

In [50]:
import xml.etree.ElementTree as ET
import pandas as pd

def parse_record_list(records, verbose=False):
    """
    Build a table from a list of 'record' XML elements.

    Every element is handed to parse_record(); falsy results are left out.
    The resulting frame is indexed by its 'id' column.

    @param records list of 'record' XML elements
    @param verbose if True, print a short summary, or a warning when no 'id' column was produced
    @return pandas DataFrame indexed by 'id', or None if the parsed data has no 'id' column
            (the summary message attributes discarded entries to licensing conflicts)
    """
    parsed = [parse_record(element) for element in records]
    kept = [details for details in parsed if details]

    table = pd.DataFrame(kept)

    # without an 'id' column there is nothing to index by
    if 'id' not in table.columns:
        if verbose:
            print("Problem reading zbMath id's. No data?")
        return None

    table = table.set_index('id')

    if verbose:
        discarded = len(records) - len(table)
        print('Imported {} entries (discarded {} for licensing conflicts)'.format(len(table), discarded))

    return table

Load the data from previously saved XML dump. Parse the tree, get a list of records

In [47]:
tree = ET.parse('data/software_records_Gfan.xml')
list_ids = tree.getroot().find(ns('ListRecords')) # when parsing XML from file, call getroot()
records = list_ids.findall(ns('record'))
# Keep the complete frame in `records_df`: assigning the result of .head() would
# truncate it to the first five rows, and that truncated frame is what gets
# written to CSV afterwards.
records_df = parse_record_list(records, True)
# preview only
records_df.head()

Imported 56 entries (discarded 44 for licensing conflicts)


Save the data for later usage

In [8]:
# write the parsed records (including their index) to a CSV file
records_df.to_csv(path_or_buf='data/software_records_Gfan.csv')

## Get records for all software
Go through the list of all software, get the records from API, put them in a data frame.

If there's an API error, log and continue.

In [54]:
import requests
from datetime import datetime
from urllib.parse import quote

IMPORT_LOG = "data/api.log"
# Loop-invariant. Must be passed to requests.get() by keyword: its second
# positional parameter is `params`, so a positional dict would be appended to
# the query string instead of being sent as HTTP headers.
headers = {'accept': 'text/xml'}
# One data frame per software; concatenated once after the loop. Growing a
# DataFrame inside the loop is quadratic, and DataFrame.append() no longer
# exists in pandas 2.x.
collected_frames = []
counter = 0
for software in softwares:
    # percent-encode the name so characters such as '+', '&' or '#' cannot corrupt the query string
    REQUEST_URL = "{}&filter=software:{}".format(FILTER, quote(str(software), safe=''))

    try:
        # get data from API
        all_records_xml = requests.get(REQUEST_URL, headers=headers)
        # any non-success status is treated as an API error, not only HTTP 500
        if all_records_xml.status_code != 200:
            raise ZbMathOpenAPIException(all_records_xml.reason)

        # parse the tree, get a list of records
        tree = ET.fromstring(all_records_xml.text)
        list_ids = tree.find(ns('ListRecords')) # when parsing XML from string, don't call getroot()
        # find() returns None when the response has no ListRecords element;
        # treat that as "no records" instead of crashing the whole run
        records = list_ids.findall(ns('record')) if list_ids is not None else []
        # parse record details into a data frame
        records_df = parse_record_list(records)

        # add name of software and remember the new records
        if records_df is not None:
            records_df['software'] = software
            collected_frames.append(records_df)

        # print feedback
        counter += 1
        if counter % 2 == 0:
            print("processed {}/{} entries".format(counter, len(softwares)))

    except (ZbMathOpenAPIException, requests.RequestException, ET.ParseError) as e:
        # if the API reports an error, the request itself fails, or the
        # response is not well-formed XML: log and continue with the next software
        with open(IMPORT_LOG, "a", encoding="utf-8") as logfile:
            logfile.write("{} [ERROR] While reading {} from zbMath Open API help/filter endpoint: {}\n".format(datetime.now(), software, e))

# the final data frame (empty if nothing could be imported)
all_details = pd.concat(collected_frames) if collected_frames else pd.DataFrame()

print("Done. Check {} for errors".format(IMPORT_LOG))

processed 2/39597 entries
processed 4/39597 entries
processed 6/39597 entries
processed 8/39597 entries
Done. Check data/api.log for errors


In [27]:
# DataFrame.append() was removed in pandas 2.0; pd.concat() is the supported equivalent.
# NOTE(review): this adds whatever `records_df` currently holds to `all_details`,
# and re-running the cell adds the same rows again — confirm this cell is still needed.
all_details = pd.concat([all_details, records_df])