In [1]:
import logging
import os
import re
import unicodedata

import pandas as pd
import requests
import urllib3
from lxml import etree, objectify

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
def clean_prefixes(xml_root):
    '''
    Strip the namespace part from every element tag under ``xml_root``, in place.

    Namespaced tags are stored as ``{namespace-uri}localname``; everything up to
    and including the first ``}`` is dropped so that later ``find``/``findall``
    calls can use bare tag names such as ``record``.

    Parameters
    ----------
    xml_root : element
        Root element of the tree to clean. It is modified in place.

    Returns
    -------
    element
        The same ``xml_root`` object that was passed in.
    '''
    # ``iter()`` replaces the deprecated ``getiterator()``, which has been
    # removed from xml.etree elements in Python 3.9+.
    for element in xml_root.iter():
        tag = element.tag
        # Only string-like tags can carry a ``{uri}`` prefix; nodes whose tag is
        # not a string (e.g. comments in lxml) are left alone.
        if not hasattr(tag, 'find'):
            continue
        closing_brace = tag.find('}')
        if closing_brace >= 0:
            element.tag = tag[closing_brace + 1:]
    return xml_root

def get_url_to_parse(base_url, start=1, total_results=5743, results_per_page=200):
    '''
    Build the list of result-page URLs for a UN Digital Library search.

    The first page is requested once to read the total number of hits, which is
    looked up in the response with the pattern ``: <count> -->``. One URL is then
    generated for every block of ``results_per_page`` records, from ``start`` up
    to that total.

    Parameters
    ----------
    base_url : str
        Search URL holding every query parameter except ``rg``, ``jrec`` and
        ``of``, which are appended here.
    start : int
        Index of the first record to fetch (sent as ``jrec``).
    total_results : int
        Ignored: the value is always replaced by the count found in the
        response. Kept only so existing callers keep working.
    results_per_page : int
        Number of records per page (sent as ``rg``). The default of 200 is the
        maximum number of records a regular user can download.

    Returns
    -------
    list of str
        One URL per result page, in ascending ``jrec`` order.

    Raises
    ------
    ValueError
        If ``results_per_page`` is smaller than 1, or if the total number of
        results cannot be found in the response.
    '''
    if results_per_page < 1:
        # A non-positive page size would never advance past ``start``.
        raise ValueError('results_per_page must be a positive integer')

    def page_url(first_record):
        # Single place where the paging parameters are appended.
        return base_url + '&rg=' + str(results_per_page) + '&jrec=' + str(first_record) + '&of=xm'

    first_page = page_url(start)
    with urllib3.PoolManager() as http:
        response = http.request('GET', first_page)
    # Search the decoded text rather than the repr() of the raw bytes.
    body = response.data.decode('utf-8', errors='replace')
    match = re.search(r":\s(\d+)\s-->", body)
    if match is None:
        raise ValueError('Could not find the total number of results in the response from ' + first_page)
    total_results = int(match.group(1))
    print(total_results)
    return [page_url(first_record)
            for first_record in range(start, total_results + 1, results_per_page)]

def url_to_xml(url):
    '''
    Download one URL and parse the response as an XML tree.

    Parameters
    ----------
    url : str
        Address of an XML document. It is printed before the request is sent.

    Returns
    -------
    The tree produced by ``objectify.parse`` for the response body.

    Raises
    ------
    Exception
        Any error raised while downloading or parsing is logged with its
        traceback and then re-raised, so the caller sees the real cause instead
        of a later failure on an unassigned variable.
    '''
    print(url)
    with urllib3.PoolManager(num_pools=40) as http:
        try:
            response = http.request('GET', url, preload_content=False)
        except Exception:
            logger.exception('Failed to download %s', url)
            raise
        try:
            # The body is streamed (preload_content=False), so it has to be
            # parsed while the connection pool is still open.
            return objectify.parse(response)
        except Exception:
            logger.exception('Failed to parse the XML returned by %s', url)
            raise
        finally:
            response.release_conn()

def merge_xml(trees,root,child):
    '''
    Combine the matching children of several XML trees into one document.

    Parameters
    ----------
    trees : iterable
        Parsed XML trees; each must provide ``getroot()``.
    root : str
        Tag name of the root element created for the merged document.
    child : str
        Path passed to ``findall`` on each (prefix-cleaned) source root to
        select the nodes to collect.

    Returns
    -------
    An ``etree.ElementTree`` whose root is a new ``root`` element holding the
    selected nodes, in the order the trees were given.

    Note: tags of the source trees are cleaned in place by ``clean_prefixes``.
    NOTE(review): with lxml, appending a node re-parents it, so the source
    trees presumably lose the collected nodes - confirm if they are reused.
    '''
    merged_root = etree.Element(root)
    for source_tree in trees:
        cleaned_root = clean_prefixes(source_tree.getroot())
        # findall() returns a list, so extending from it is the same as
        # appending each match one by one.
        merged_root.extend(cleaned_root.findall(child))
    return etree.ElementTree(merged_root)

def save_marcxml(url,save_path):
    '''
    Download every result page of a search and save them as one MARCXML file.

    Parameters
    ----------
    url : str
        Base search URL, passed to ``get_url_to_parse`` to obtain the page URLs.
    save_path : str
        Destination of the merged XML document.

    The number of ``record`` elements in the merged document is printed before
    the file is written.
    '''
    pages_list = get_url_to_parse(url)
    xml_trees = [url_to_xml(page) for page in pages_list]
    merged_xml = merge_xml(xml_trees,'collection','record')
    print(len(merged_xml.findall('record')))
    # Write to the destination the caller asked for, not to a global ``path``
    # that only exists when a notebook cell happens to have defined it.
    merged_xml.write(save_path)

def get_metadata(r):
    """
    Turn one MARC XML record into a flat dictionary of metadata.

    Each field is looked up with ``get_md_value``, which yields an empty string
    (nothing found), a string (one value found) or a list of strings (several
    values found). The title is always a single string built from subfields
    a, b and c of field 245, separated by single spaces.

    Parameters
    ----------
    r : element
        A ``record`` element whose tags carry no namespace prefix.

    Returns
    -------
    dict
        The keys ``record_id``, ``title``, ``subject-topics``, ``subject-geo``,
        ``publication_date``, ``symbol`` and ``body``, followed by the entries
        returned by ``get_files_md`` (file description -> url).
    """
    def _as_text(value):
        # A title part may come back as None (empty element) or as a list
        # (repeated subfield); both used to break the string concatenation.
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return " ".join(item for item in value if item)
        return value

    # Each record will be a dictionary and include
    metadata =  {
        # Standard MARC fields
        "record_id": get_md_value(r,"controlfield[@tag='001']"),
        # The title field concatenates 3 subfields (a, b, c)
        "title": " ".join(
            _as_text(get_md_value(r,"datafield[@tag='245']/subfield[@code='" + code + "']"))
            for code in ("a", "b", "c")),
        "subject-topics": get_md_value(r,"datafield[@tag='650']/subfield[@code='a']"),
        "subject-geo": get_md_value(r,"datafield[@tag='651']/subfield[@code='a']"),
        "publication_date": get_md_value(r,"datafield[@tag='269']/subfield[@code='a']"),
        #"subject-corporates": get_md_value(r,"datafield[@tag='610']/subfield[@code='a']"),
        #"subject-titles": get_md_value(r,"datafield[@tag='630']/subfield[@code='a']"),
        # Local MARC fields used for UN documents
        "symbol": get_md_value(r,"datafield[@tag='191']/subfield[@code='a']"),
        "body": get_md_value(r,"datafield[@tag='191']/subfield[@code='b']"),
        #"session": get_md_value(r,"datafield[@tag='191']/subfield[@code='c']"),
    }
    # get_files_md returns another dictionary {description: url}
    files_info = get_files_md(r)
    # Merge the initial metadata dictionary with the dictionary containing the files information
    metadata = {**metadata,**files_info}
    return metadata

def get_files_md(r):
    '''
    Collect the file links of a record as a ``{description: url}`` dictionary.

    Every ``datafield`` with tag 856 and ind1 4 contributes one entry: the key
    is ``"url-"`` followed by the text of subfield ``y`` and the value is
    whatever ``get_md_value`` returns for subfield ``u``.

    Parameters
    ----------
    r : element
        A ``record`` element whose tags carry no namespace prefix.

    Returns
    -------
    dict
        Empty when the record has no matching field. When two fields share the
        same description, the later one wins.
    '''
    files = {}
    for item in r.findall("datafield[@tag='856'][@ind1='4']"):
        label = get_md_value(item, "subfield[@code='y']")
        # The key must be a string: flatten repeated subfields and treat an
        # empty element as an empty description.
        if isinstance(label, (list, tuple)):
            label = " ".join(part for part in label if part)
        elif label is None:
            label = ""
        # Language names can arrive with decomposed accents (letter followed by
        # a combining mark); NFC folds them into the precomposed form so that
        # the same language always yields the same key.
        description = unicodedata.normalize('NFC', "url-" + label)
        files[description] = get_md_value(item, "subfield[@code='u']")
    return files

def get_md_value(marc_xml_record,query):
    """
    Return the text of the element(s) of a MARC XML record matched by a query.

    Parameters
    ----------
    marc_xml_record : element
        The record (or any element) to search with ``findall``.
    query : str
        Path expression identifying the targeted element(s).

    Returns
    -------
    str or list of str
        An empty string when nothing matches, the element text when exactly one
        element matches, and a list of texts (in document order) when several
        match. An element without text counts as an empty string, so callers
        never receive None.
    """
    texts = [element.text if element.text is not None else ""
             for element in marc_xml_record.findall(query)]
    if len(texts) > 1:
        return texts  # multiple values, returns a list of strings
    if texts:
        return texts[0]  # one value, returns a string
    return ""  # no value, returns an empty string

def convert_marxml_to_csv(input_path,output_path):
    '''
    Convert a MARC XML file into a CSV file with one row per record.

    Parameters
    ----------
    input_path : str
        Location of the XML file to read; it is parsed as UTF-8.
    output_path : str
        Location of the CSV file to write.

    Every ``record`` element found directly under the (prefix-cleaned) root is
    turned into a dictionary by ``get_metadata``; the dictionaries become the
    rows of a DataFrame that is written with ``to_csv``.
    '''
    # Parser forced to UTF-8, then namespace prefixes stripped from the tags.
    utf8_parser = etree.XMLParser(encoding='utf-8')
    marc_root = clean_prefixes(etree.parse(input_path, utf8_parser).getroot())
    rows = []
    for record in marc_root.findall("record"):
        rows.append(get_metadata(record))
    pd.DataFrame(rows).to_csv(output_path)

def get_csv(year, metadata_dir='data/acquisition/metadata/'):
    '''
    Load the metadata CSV of one year, indexed by ``record_id``.

    Parameters
    ----------
    year : str or int
        Base name of the file; ``<metadata_dir>/<year>.csv`` is read.
    metadata_dir : str
        Directory holding the CSV files. Defaults to the location that used to
        be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        The file content with the ``record_id`` column as index.
    '''
    # str() lets callers pass the year as a number as well as a string.
    csv_path = os.path.join(metadata_dir, str(year) + '.csv')
    return pd.read_csv(csv_path).set_index('record_id')

In [3]:
# Search query sent to the UN Digital Library: years 2000 to 2017, restricted to
# the "General Assembly", "Security Council" or "Economic and social council"
# collections, within "Documents and Publications".
url = 'http://digitallibrary.un.org/search?ln=en&p=year%3A2000-%3E2017+AND+(collection%3A%22General+Assembly%22+OR+collection%3A%22Security+Council%22+OR+collection%3A%22Economic+and+social+council%22)+AND+collection%3A%22Documents+and+Publications%22'
# Destination handed to save_marcxml() in the next cell.
# NOTE(review): the name ends with a bare '.' and has no extension - confirm
# whether '.xml' was intended or whether this is meant as a prefix.
path= 'data/acquisition/metadata/doc_2000_2017.'

In [4]:
# Download every result page for the query above and write the merged records
# to `path`. save_marcxml prints the hit count found in the first response,
# then the number of merged records.
save_marcxml(url,path)

151804
