In [1]:
# Namespace URIs of the research-output import vocabulary and the commons vocabulary.
RESEARCH_OUTPUTS_NS = 'v1.publication-import.base-uk.pure.atira.dk'
COMMONS_NS = 'v3.commons.pure.atira.dk'

# The same URIs wrapped in braces ("{uri}"), ready to be prepended to a tag name.
RO_NS = '{' + RESEARCH_OUTPUTS_NS + '}'
C_NS = '{' + COMMONS_NS + '}'

# Prefix -> namespace URI mapping handed to lxml when elements are created.
NSMAP = dict(ro=RESEARCH_OUTPUTS_NS, comm=COMMONS_NS)

# Fallback values written to the output when the source record has no usable value.
DEFAULT_PEER_REVIEW_VALUE = 'false'
FOR_APPROVAL_WORKFLOW_STATUS = 'forApproval'
DEFAULT_WORKFLOW_STATUS = 'approved'
DEFAULT_PUBLICATION_STATUS = 'published'
DEFAULT_MANAGING_ORG_UNIT = '1'
DEFAULT_DOCUMENT_VERSION = 'publishersversion'
DEFAULT_DOCUMENT_ACCESS = 'unknown'
DEFAULT_VISIBILITY = 'Public'
DEFAULT_JOURNAL_WORKFLOW_STATUS = 'approved'
DEFAULT_PUBLISHER_WORKFLOW_STATUS = 'approved'
JACS_LOGICAL_NAME = 'JACSSubjectClassifications'

# Matches an ISSN (8 characters, optional hyphen, last character a digit or X/x),
# optionally followed by " (Print)" / " (Online)" in either capitalisation.
# Group 1 is the ISSN, group 2 the qualifier ('' when absent).
# Raw string: the pattern contains backslashes (\d, \(, \)) that are invalid
# escape sequences in a normal string literal.
ISSN_REGEX = r'(\d{4}-?\d{3}(?:\d|[xX]))(?: \(([pP]rint|[oO]nline)\))?'

# Source content type -> Pure subtype (looked up when the publication's subType is set).
SUBTYPE_MAP = {
    'thesis': 'other',
    'article': 'article',
    'exhibition': 'exhibition',
    'patent': 'patent',
    'other': 'other',
    'artefact': 'artefact',
    'conference_item': 'paper',
    'dataset': 'other',
    'composition': 'composition',
    'book': 'book',
    'book_section': 'chapter',
    'image': 'digitalorvisualproducts',
    'video': 'digitalorvisualproducts',
    'performance': 'performance',
    'monograph': 'other',
    'audio': 'digitalorvisualproducts',
}

# Source thesis type -> target thesis subtype.
THESIS_SUBTYPE_MAP = {
    'mphil': 'master',
    'phd': 'doc',
}

# Source date type / publication state -> target publication status type.
PUB_STATUS_MAP = {
    'submitted': 'submitted',
    'accepted': 'inpress',
    'inpress': 'inpress',
    'published_online': 'epub',
    'published': 'published',
    'pub': 'published',
    'completed': 'published',
    'in_prep': 'inprep',
    'unpub': 'unpublished',
}

# Source event type -> target event type.
EVENT_MAP = {
    'conference': 'conference',
    'other': 'other',
    'workshop': 'workshop',
}

# Source output medium (lower-cased) -> target output media value.
MEDIA_MAP = {
    'online': 'online',
    'cd': 'cd',
    'dvd': 'dvd',
    'film': 'film',
    'exhibition': 'other',
    'performance': 'other',
    'other': 'other',
}

# Source document content type -> target electronic version type.
VERSION_MAP = {
    'other': 'other',
    'draft': 'preprint',
    'supplemental': 'other',
    'published': 'publishersversion',
    'accepted': 'authorsversion',
    'submitted': 'preprint',
}

#LICENCE_MAP  = {'cc_by_nc_nd_4': 'cc_by_nc_nd_4', 'cc_by_4': 'cc_by_4','arr': 'arr', 'cc_by_nc_nd': 'cc_by_nc_nd','cc_by_nd': 'cc_by_nd',
#                'cc_by_nd_4': 'cc_by_nd_4' }

# Source metadata visibility -> target visibility value.
VISIBILITY_MAP = dict(show='Public')

# Source document security level -> target public-access value.
PERMISSION_MAP = dict(public='open', staffonly='embargoed')

# Source language code -> target locale ("language_COUNTRY"; 'und' has no country part).
LANG_MAP = {
    'en': 'en_GB',
    'fr': 'fr_FR',
    'de': 'de_DE',
    'ru': 'ru_RU',
    'zh': 'zh_CN',
    'ja': 'ja_JP',
    'it': 'it_IT',
    'nl': 'nl_NL',
    'pt': 'pt_PT',
    'pl': 'pl_PL',
    'cs': 'cs_CZ',
    'ko': 'ko_KR',
    'id': 'id_ID',
    'vi': 'vi_VN',
    'es': 'es_ES',
    'el': 'el_GR',
    'fa': 'fa_IR',
    'fi': 'fi_FI',
    'aus': 'en_GB',
    'bi': 'bi_VU',
    'sk': 'sk_SK',
    'is': 'is_IS',
    'sv': 'sv_SE',
    'other': 'und',
}

# Three-letter lower-case month abbreviation -> two-digit month number.
MONTH_MAP = {
    abbreviation: '{:02d}'.format(number)
    for number, abbreviation in enumerate(
        ('jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'),
        start=1,
    )
}

# Locale used when a record carries no document language.
DEFAULT_LANGUAGE = 'en_GB'

# Opening and closing text wrapped around the serialised publications.
# The root element declares the "ro" and "comm" namespace prefixes once for the whole document.
XML_HEADER = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<ro:publications xmlns:ro="v1.publication-import.base-uk.pure.atira.dk" xmlns:comm="v3.commons.pure.atira.dk">'
)
XML_FOOTER = '</ro:publications>'

# Source division code -> human-readable division name.
DIVISIONS = {
    "rc1": "(The) Helen Hamlyn Centre for Design",
    "rc2": "Intelligent Mobility Design Centre",
    "rc3": "Material Science Research Centre",
    "s1": "School of Architecture",
    "ri": "Research & Knowledge Exchange Office",
    "s7": "School of Arts and Humanities",
    "s2": "School of Communication",
    "rc": "Research Centres",
    "s3": "School of Design",
    "rc4": "Computer Science",
    "other_sch": "Research & Knowledge Exchange Office",
}

# Whether serialised XML is indented for readability.
PRETTY_PRINT = True

In [2]:
# Prefix map used when searching the source (EPrints) XML with "ep:"-prefixed paths.
EP_NSMAP = dict(ep='http://eprints.org/ep2/data/2.0')

In [3]:
import logging
from lxml import etree as et
import urllib
def escape_entities(text):
    """Return ``text`` with the five XML special characters replaced by entities.

    Replaces ``&``, ``<``, ``>``, ``"`` and ``'`` with ``&amp;``, ``&lt;``, ``&gt;``,
    ``&quot;`` and ``&apos;`` respectively.
    """
    # '&' has to go first, otherwise the ampersands introduced by the other
    # replacements would themselves be escaped again.
    substitutions = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ('\'', '&apos;'),
    )
    for character, entity in substitutions:
        text = text.replace(character, entity)
    return text

def fit_string(source, length):
    """Return ``source`` shortened to at most ``length`` characters.

    Strings that already fit are returned unchanged. Longer strings are logged
    with a warning and cut so that the result, including a trailing ``'...'``,
    is exactly ``length`` characters long.

    Args:
        source: the string to fit.
        length: maximum number of characters allowed in the result.

    Returns:
        A string no longer than ``length`` (for ``length >= 0``).
    """
    if len(source) <= length:
        return source
    logging.warning('String value {} longer than max size: {}. Shortening it'.format(source, length))
    marker = '...'
    if length < len(marker):
        # No room for the ellipsis: a negative slice end would return a string
        # LONGER than the limit, so hard-truncate instead.
        return source[:max(length, 0)]
    return source[:length - len(marker)] + marker

def marshal_xml(tree):
    """Serialise ``tree`` to a UTF-8 decoded string without its namespace declarations.

    The ``xmlns:ro`` and ``xmlns:comm`` declarations (each followed by a space)
    are removed from the serialised text.
    """
    serialised = et.tostring(tree, pretty_print=PRETTY_PRINT, encoding="UTF-8").decode("UTF-8")
    for prefix, uri in (('ro', RESEARCH_OUTPUTS_NS), ('comm', COMMONS_NS)):
        # The trailing space is part of the match, so a declaration that is
        # immediately followed by '>' or '/>' is left in place.
        declaration = 'xmlns:{}="{}" '.format(prefix, uri)
        serialised = serialised.replace(declaration, '')
    return serialised

In [4]:
def map(id, content_type, xml_in):
    """Parse one source record and convert it with the mapper for its content type.

    Note: the function name shadows the builtin ``map`` (and ``id`` the builtin
    ``id``); both names are kept because callers depend on them.

    Args:
        id: identifier of the publication, passed through to the mapper.
        content_type: key used to select the mapper from ``MAPPING_FUNCTIONS``.
        xml_in: the source record as XML text.

    Returns:
        Whatever the selected mapping function returns.

    Raises:
        Exception: if no mapping function is registered for ``content_type``.
    """
    tree = et.fromstring(xml_in)
    # Logged once (the message used to be emitted twice per record).
    logging.info('Mapping publication {} with type {}'.format(id, content_type))
    mapping_function = MAPPING_FUNCTIONS.get(content_type)
    if not mapping_function:
        raise Exception('No mapping function available for content type {}'.format(content_type))
    return mapping_function(id, content_type, tree)

In [5]:
def match_person(p):
    """Look up a person in ``v_persons_matching`` and return the single matching row.

    When ``p`` carries an id, the id is tried first: as a substring of the
    ``identifiers`` column or as a case-insensitive match on ``email``. If that
    finds nothing, or ``p`` has no id, a case-insensitive match on family and
    given name is attempted. Uses the module-level ``conn`` database connection.

    All values are passed as bound parameters, so ``p`` is never modified and
    quotes in names or ids need no manual escaping.

    Args:
        p: dict with keys ``'family'`` and ``'given'`` and optionally ``'id'``.

    Returns:
        The single matching row (columns ``id``, ``org_id``), or ``None`` when
        nothing matches or more than one row matches.
    """
    logging.debug(p)
    sql_id_match = ('select id, org_id from v_persons_matching '
                    'where identifiers like %s or lower(email) = lower(%s)')
    sql_name_match = 'select id, org_id from v_persons_matching where lower(family_name) = lower(%s) and lower(given_name) = lower(%s)'
    cur1 = conn.cursor()
    try:
        person_id = p.get('id')
        if person_id is not None:
            logging.info('Attempting match by id with {}'.format(person_id))
            logging.info(sql_id_match)
            # The LIKE wildcards live in the bound value, not in the SQL text.
            cur1.execute(sql_id_match, ('%{}%'.format(person_id), person_id))
            res = cur1.fetchall()
            if len(res) == 0:
                logging.warning('Author id {} not matched. Attempting match by name'.format(person_id))
                cur1.execute(sql_name_match, (p['family'], p['given']))
                res = cur1.fetchall()
        else:
            logging.info('Attempting match by name with {} {}'.format(p['family'], p['given']))
            cur1.execute(sql_name_match, (p['family'], p['given']))
            res = cur1.fetchall()
    finally:
        # Always release the cursor, including when a query raises.
        cur1.close()

    if len(res) == 0:
        logging.warning('Unable to match author {} {}. It will be treated as external'.format(p['given'], p['family']))
        return None
    if len(res) > 1:
        # Ambiguous: refuse to guess which internal person is meant.
        logging.error('Multiple matches for author {} {}.'.format(p['given'], p['family']))
        return None
    return res[0]

In [7]:
# Accumulates [display name, source id, division name] for every person that could
# not be matched to an internal record; shared across all calls to set_persons().
external_author = []


def _ep_text(node, path, default=None):
    """Return the text of the first ``path`` match below ``node``, or ``default`` if absent or empty."""
    found = node.find(path, EP_NSMAP)
    if found is None or found.text is None:
        return default
    return found.text


def _read_people(meta, list_path, with_role=False):
    """Read the name/id dicts for every item under ``list_path``; add ``type_con`` when ``with_role``."""
    people = []
    for item in meta.findall(list_path, EP_NSMAP):
        entry = {'family': _ep_text(item, 'ep:name/ep:family', ''),
                 'given': _ep_text(item, 'ep:name/ep:given', ''),
                 'id': _ep_text(item, 'ep:id')}
        if with_role:
            # The role is the lower-cased last three characters of the source type
            # value; 'oth' when the type is missing.
            entry['type_con'] = _ep_text(item, 'ep:type', 'oth')[-3:].lower()
        people.append(entry)
    return people


def _add_person(persons, p, role, division):
    """Append one author element for ``p``; return the internal organisation id, or None."""
    author = et.SubElement(persons, RO_NS+'author', nsmap=NSMAP)
    # TODO: change role
    et.SubElement(author, RO_NS+'role', nsmap=NSMAP).text = role
    person = et.SubElement(author, RO_NS+'person', nsmap=NSMAP)
    et.SubElement(person, RO_NS+'firstName', nsmap=NSMAP).text = p['given']
    et.SubElement(person, RO_NS+'lastName', nsmap=NSMAP).text = p['family']

    internal_person = match_person(p)
    if internal_person is None:
        person.set('origin', 'external')
        external_person = [(p['given'] + ' ' + p['family']).title(), p['id'], division]
        if external_person not in external_author:
            external_author.append(external_person)
        return None

    # Internal person: attach the matched id and, when known, the affiliation.
    logging.info('Internal person matched: {}'.format(internal_person[0]))
    person.set('id', internal_person[0])
    person.set('origin', 'internal')
    org_id = internal_person[1]
    if org_id is not None:
        orgs = et.SubElement(author, RO_NS+'organisations', nsmap=NSMAP)
        org = et.SubElement(orgs, RO_NS+'organisation', nsmap=NSMAP)
        org.set('id', org_id)
    return org_id


def set_persons(pub, meta):
    """Add the ``persons`` element (creators, contributors, editors) to ``pub``.

    Every person is looked up with ``match_person``. Matches are written as
    internal persons with their id and organisation; everyone else is written
    as external and recorded once in the module-level ``external_author`` list.

    Args:
        pub: target publication element; its ``id`` attribute is used for logging.
        meta: source record element searched with ``ep:``-prefixed paths.

    Returns:
        The organisation id of the first internal creator or editor that has
        one, or ``None``. Contributors never determine the owner.
    """
    # The publication id is read from the element: there is no ``id`` variable in
    # this scope, so formatting ``id`` would log the builtin function.
    logging.debug('{}\tSetting persons'.format(pub.get('id')))

    # Division name of the record, looked up by the TEXT of the first division item.
    division_node = meta.find('ep:divisions/ep:item', EP_NSMAP)
    division = DIVISIONS.get(division_node.text) if division_node is not None else None

    # (people, fixed role or None to use each person's own type_con, may set owner)
    groups = (
        (_read_people(meta, 'ep:creators/ep:item'), 'author', True),
        (_read_people(meta, 'ep:contributors/ep:item', with_role=True), None, False),
        (_read_people(meta, 'ep:editors/ep:item'), 'editor', True),
    )

    persons = et.SubElement(pub, RO_NS+'persons', nsmap=NSMAP)
    owner = None
    for people, fixed_role, may_set_owner in groups:
        for p in people:
            role = fixed_role if fixed_role is not None else p['type_con']
            org_id = _add_person(persons, p, role, division)
            # The first internal organisation encountered becomes the owner.
            if may_set_owner and org_id is not None and not owner:
                owner = org_id
    return owner

In [1]:
import re
def _set_locale(node, lang_code):
    """Set ``lang`` (and ``country`` when the code has one) on ``node`` from a 'll_CC' code."""
    parts = lang_code.split('_')
    node.set('lang', parts[0])
    # Codes such as 'und' carry no country part.
    if len(parts) == 2:
        node.set('country', parts[1])


def _add_publication_statuses(pub, meta):
    """Add ``publicationStatuses`` with one entry per status, dated with its greatest date."""
    pub_statuses = et.SubElement(pub, RO_NS+'publicationStatuses', nsmap=NSMAP)
    latest = {}
    for st in meta.findall('ep:dates/ep:item', EP_NSMAP):
        date_value = st.find('ep:date', EP_NSMAP).text
        status_node = st.find('ep:date_type', EP_NSMAP)
        if status_node is None:
            # No per-date type: fall back to the record-level publication state.
            status_node = meta.find('ep:ispublished', EP_NSMAP)
        status_value = DEFAULT_PUBLICATION_STATUS if status_node is None else status_node.text
        status = PUB_STATUS_MAP.get(status_value)
        logging.debug('Status: {}'.format(status))
        if not status:
            raise Exception('Unknown publication status {}'.format(status_value))
        # If the status is already included, keep the greater date (string comparison).
        known = latest.get(status)
        if known is None or date_value > known:
            latest[status] = date_value
    for status, date_text in latest.items():
        pub_status = et.SubElement(pub_statuses, RO_NS+'publicationStatus', nsmap=NSMAP)
        et.SubElement(pub_status, RO_NS+'statusType', nsmap=NSMAP).text = status
        date = et.SubElement(pub_status, RO_NS+'date', nsmap=NSMAP)
        date_parts = date_text.split('-')
        et.SubElement(date, C_NS+'year', nsmap=NSMAP).text = date_parts[0]
        if len(date_parts) > 1:
            et.SubElement(date, C_NS+'month', nsmap=NSMAP).text = date_parts[1]
        if len(date_parts) == 3:
            et.SubElement(date, C_NS+'day', nsmap=NSMAP).text = date_parts[2]
    if len(pub_statuses) == 0:
        raise Exception('No publication status available')


def _select_language(meta):
    """Return the target locale of the most frequent document language, or the default."""
    languages = [l.text for l in meta.findall('ep:documents/ep:document/ep:language', EP_NSMAP)]
    if len(languages) == 0:
        return DEFAULT_LANGUAGE
    counts = {}
    for code in languages:
        counts[code] = counts.get(code, 0) + 1
    highest = max(counts.values())
    # On a tie, the language that occurs last in document order wins.
    selected_lang = [code for code in languages if counts[code] == highest][-1]
    logging.info("Language: {}".format(selected_lang))
    lang_code = LANG_MAP.get(selected_lang)
    if lang_code is None:
        raise Exception('Unknown language: {}'.format(selected_lang))
    return lang_code


def _add_keywords(pub, id, meta):
    """Add the free-text keywords of the record, if it has any non-blank ones."""
    keys = meta.find('ep:keywords', EP_NSMAP)
    if keys is None or not keys.text:
        return
    # Keywords are separated by ';' or ','; blank entries (e.g. from a trailing
    # separator) are skipped.
    keys_value = [k for k in re.split(r"[;,]\s*", keys.text) if k.strip()]
    if not keys_value:
        return
    logging.debug('{}\tSetting free text keywords'.format(id))
    keywords = et.SubElement(pub, RO_NS+'keywords', nsmap=NSMAP)
    keys_cont = et.SubElement(keywords, C_NS+'logicalGroup', nsmap=NSMAP)
    keys_cont.set('logicalName', 'keywordContainers')
    sks = et.SubElement(keys_cont, C_NS+'structuredKeywords', nsmap=NSMAP)
    sk = et.SubElement(sks, C_NS+'structuredKeyword', nsmap=NSMAP)
    fks = et.SubElement(sk, C_NS+'freeKeywords', nsmap=NSMAP)
    for k in keys_value:
        fk = et.SubElement(fks, C_NS+'freeKeyword', nsmap=NSMAP)
        text = et.SubElement(fk, C_NS+'text', nsmap=NSMAP)
        _set_locale(text, DEFAULT_LANGUAGE)
        text.text = k


def _add_electronic_versions(pub, meta):
    """Add one ``electronicVersionFile`` per source document whose format is not 'other'."""
    documents = meta.findall('ep:documents/ep:document', EP_NSMAP)
    if not documents:
        return
    ev = et.SubElement(pub, RO_NS+'electronicVersions', nsmap=NSMAP)
    logging.debug('Setting Electronic Versions')
    for document_tag in documents:
        # Skip all files having type 'other'
        fmt = document_tag.find('./ep:format', EP_NSMAP)
        if fmt is not None and fmt.text == 'other':
            continue

        # Flatten the document subtree into {local tag name: text}; when a tag
        # occurs more than once the last occurrence wins.
        extracted_data = {}
        for item in document_tag.iter():
            if not isinstance(item.tag, str):
                continue  # comments / processing instructions have no tag name
            tag = item.tag.rsplit('}', 1)[-1]
            extracted_data[tag] = item.text
            if tag == 'file' and item.get('id') is not None:
                extracted_data['file_id'] = item.get('id')

        evfile = et.SubElement(ev, RO_NS+'electronicVersionFile', nsmap=NSMAP)
        if 'content' in extracted_data:
            et.SubElement(evfile, RO_NS+'version', nsmap=NSMAP).text = VERSION_MAP.get(extracted_data['content'], 'other')
        if 'license' in extracted_data:
            et.SubElement(evfile, RO_NS+'licence', nsmap=NSMAP).text = extracted_data['license']
        if 'security' in extracted_data:
            et.SubElement(evfile, RO_NS+'publicAccess', nsmap=NSMAP).text = PERMISSION_MAP.get(extracted_data['security'], 'unknown')
        if 'date_embargo' in extracted_data:
            et.SubElement(evfile, RO_NS+'embargoEndDate', nsmap=NSMAP).text = extracted_data['date_embargo']
        if 'doc_title' in extracted_data:
            et.SubElement(evfile, RO_NS+'title', nsmap=NSMAP).text = extracted_data['doc_title']

        file_id = extracted_data.get('file_id', 'file1')
        file = et.SubElement(evfile, RO_NS+'file', nsmap=NSMAP, attrib={'id': file_id})
        if 'filename' in extracted_data:
            et.SubElement(file, RO_NS+'filename', nsmap=NSMAP).text = extracted_data['filename']
        if 'url' in extracted_data:
            et.SubElement(file, RO_NS+'fileLocation', nsmap=NSMAP).text = extracted_data['url']
        if 'mime_type' in extracted_data:
            et.SubElement(file, RO_NS+'mimetype', nsmap=NSMAP).text = extracted_data['mime_type']
        if 'filesize' in extracted_data:
            et.SubElement(file, RO_NS+'filesize', nsmap=NSMAP).text = extracted_data['filesize']
        # Deposit date: the first whitespace-separated token of mtime with its
        # '-'-separated parts reversed (e.g. 'YYYY-MM-DD hh:mm:ss' -> 'DD-MM-YYYY').
        mtime_tokens = (extracted_data.get('mtime') or '').split()
        if mtime_tokens:
            et.SubElement(file, RO_NS+'depositDate', nsmap=NSMAP).text = '-'.join(mtime_tokens[0].split('-')[::-1])
        et.SubElement(file, RO_NS+'source', nsmap=NSMAP).text = 'EPrints'
        et.SubElement(file, RO_NS+'externalRepositoryState', nsmap=NSMAP).text = 'STORED'

    # Remove node if no electronic versions
    if len(ev) == 0:
        pub.remove(ev)


def set_basic_metadata(pub, id, type, meta):
    """Populate ``pub`` with the metadata shared by every publication type.

    Elements are appended in a fixed order: peerReviewed, publicationStatuses,
    workflow, language, title, abstract, persons, owner, keywords, urls,
    electronicVersions, existingStores, transferToRepository,
    bibliographicalNotes, visibility.

    Args:
        pub: target publication element, modified in place.
        id: publication id, written to the ``id`` attribute.
        type: source content type, translated through ``SUBTYPE_MAP``.
        meta: source record element searched with ``ep:``-prefixed paths.

    Raises:
        Exception: for an unknown subtype, publication status or language, or
            when the record yields no publication status at all.
        KeyError: for a metadata visibility value missing from ``VISIBILITY_MAP``.
    """
    logging.debug('{} - Setting basic metadata'.format(id))
    ## ATTRIBUTES
    pub.set('id', id)
    subtype = SUBTYPE_MAP.get(type)
    if not subtype:
        raise Exception('Unknown Pure subtype for {}'.format(type))
    pub.set('subType', subtype)

    # Peer reviewed
    refereed = meta.find('ep:refereed', EP_NSMAP)
    pr_value = refereed.text.lower() if refereed is not None and refereed.text else DEFAULT_PEER_REVIEW_VALUE
    et.SubElement(pub, RO_NS+'peerReviewed', nsmap=NSMAP).text = pr_value

    # Publication statuses
    _add_publication_statuses(pub, meta)

    # Workflow: records in the 'archive' state are approved, everything else awaits approval
    archived = meta.find('ep:eprint_status', EP_NSMAP).text
    workflow = DEFAULT_WORKFLOW_STATUS if archived == 'archive' else FOR_APPROVAL_WORKFLOW_STATUS
    et.SubElement(pub, RO_NS+'workflow', nsmap=NSMAP).text = workflow

    # Language: the most frequent language of the documents associated with this record
    lang_code = _select_language(meta)
    et.SubElement(pub, RO_NS+'language', nsmap=NSMAP).text = lang_code

    # Title (uses the publication language as the title language to avoid errors)
    logging.debug('{}\tSetting title'.format(id))
    title = et.SubElement(pub, RO_NS+'title', nsmap=NSMAP)
    title_text = et.SubElement(title, C_NS+'text', nsmap=NSMAP)
    title_value = meta.find('ep:title', EP_NSMAP).text
    _set_locale(title_text, lang_code)
    title_text.text = et.CDATA(title_value)

    # Abstract (same language as the publication); skipped when absent or empty
    abstract_in = meta.find('ep:abstract', EP_NSMAP)
    if abstract_in is not None and abstract_in.text:
        logging.debug('{}\tSetting abstract'.format(id))
        abstract = et.SubElement(pub, RO_NS+'abstract', nsmap=NSMAP)
        abstract_text = et.SubElement(abstract, C_NS+'text', nsmap=NSMAP)
        _set_locale(abstract_text, lang_code)
        abstract_text.text = et.CDATA(abstract_in.text)

    # Persons: returns the first internal org unit id encountered, so it can be set as the owner
    owner = set_persons(pub, meta)

    # Owner: falls back to the default managing org unit when no internal one was found
    if not owner:
        owner = DEFAULT_MANAGING_ORG_UNIT
    logging.info('Setting owner: {}'.format(owner))
    et.SubElement(pub, RO_NS+'owner', nsmap=NSMAP).set('id', owner)

    # Keywords (element is only created when there is at least one keyword)
    _add_keywords(pub, id, meta)

    # URL
    url = meta.find('ep:official_url', EP_NSMAP)
    if url is not None:
        urls = et.SubElement(pub, RO_NS+'urls', nsmap=NSMAP)
        url_entry = et.SubElement(urls, RO_NS+'url', nsmap=NSMAP)
        et.SubElement(url_entry, RO_NS+'url', nsmap=NSMAP).text = url.text

    # Electronic versions
    _add_electronic_versions(pub, meta)

    # Storage
    exs = et.SubElement(pub, RO_NS+'existingStores', nsmap=NSMAP)
    ex = et.SubElement(exs, RO_NS+'existingStore', nsmap=NSMAP)
    et.SubElement(ex, RO_NS+'storeName', nsmap=NSMAP).text = 'BCU ePrints'
    eid = meta.find('ep:eprintid', EP_NSMAP)
    if eid is not None:
        et.SubElement(ex, RO_NS+'storeContentId', nsmap=NSMAP).text = eid.text

    # Transfer to repository: set to FALSE (these are already in the repository)
    et.SubElement(pub, RO_NS+'transferToRepository', nsmap=NSMAP).text = 'false'

    # Bibliographical notes: the funders of the record, one per line
    funders = [f.text for f in meta.findall('ep:funders/ep:item', EP_NSMAP) if f.text]
    if funders:
        bib_notes = et.SubElement(pub, RO_NS+'bibliographicalNotes', nsmap=NSMAP)
        logging.info('Setting bibliographical note')
        bib_note = et.SubElement(bib_notes, RO_NS+'bibliographicalNote', nsmap=NSMAP)
        note_text = et.SubElement(bib_note, C_NS+'text', nsmap=NSMAP)
        _set_locale(note_text, DEFAULT_LANGUAGE)
        note_text.text = '<br/>'.join(funders)

    # Visibility: always written; the default applies when the source has no value
    visibility = meta.find('ep:metadata_visibility', EP_NSMAP)
    value = DEFAULT_VISIBILITY
    if visibility is not None:
        value = VISIBILITY_MAP[visibility.text]
    et.SubElement(pub, RO_NS+'visibility', nsmap=NSMAP).text = value

    # External IDs
    # TODO
    
    # External IDs
    # TODO

In [9]:
import re
def map_article(id, type, meta):
    """Map a source record to a ``contributionToJournal`` element and serialise it.

    Adds the basic metadata followed by pages, number of pages, journal number,
    journal volume and the journal itself (title, ISSNs, publisher, workflow).

    Args:
        id: publication id.
        type: source content type.
        meta: source record element searched with ``ep:``-prefixed paths.

    Returns:
        The result of ``marshal_xml`` for the populated element.
    """
    logging.info('Mapping publication {} to journal article'.format(id))
    pub = et.Element(RO_NS+'contributionToJournal', nsmap=NSMAP)
    # Basic metadata
    set_basic_metadata(pub, id, type, meta)

    # Pages
    page_range = meta.find('ep:pagerange', EP_NSMAP)
    if page_range is not None:
        et.SubElement(pub, RO_NS+'pages', nsmap=NSMAP).text = page_range.text

    # Page number
    page_num = meta.find('ep:pages', EP_NSMAP)
    if page_num is not None:
        et.SubElement(pub, RO_NS+'numberOfPages', nsmap=NSMAP).text = page_num.text

    # Journal number (limited to 64 characters); skipped when the source node is empty
    issue = meta.find('ep:number', EP_NSMAP)
    if issue is not None and issue.text is not None:
        et.SubElement(pub, RO_NS+'journalNumber', nsmap=NSMAP).text = fit_string(issue.text, 64)

    # Journal volume
    volume = meta.find('ep:volume', EP_NSMAP)
    if volume is not None:
        et.SubElement(pub, RO_NS+'journalVolume', nsmap=NSMAP).text = volume.text

    # Journal
    journal = et.SubElement(pub, RO_NS+'journal', nsmap=NSMAP)
    journaltit = meta.find('ep:publication', EP_NSMAP)
    if journaltit is not None:
        et.SubElement(journal, RO_NS+'title', nsmap=NSMAP).text = journaltit.text

    # ISSN. By default an ISSN is considered print if no qualifier is given.
    issn = meta.find('ep:issn', EP_NSMAP)
    if issn is not None:
        results = re.findall(ISSN_REGEX, issn.text or '')
        if len(results) == 0:
            logging.warning('Unknown ISSN format: {}'.format(issn.text))
        logging.debug('Parsed issns: {}'.format(results))
        # The pattern accepts the qualifier in either capitalisation, so compare
        # case-insensitively; anything that is not print is electronic.
        print_issns = [value for value, medium in results if medium.lower() in ('', 'print')]
        electronic_issns = [value for value, medium in results if medium.lower() not in ('', 'print')]
        # Containers are only created when they have content, print first.
        for container_tag, values in (('printIssns', print_issns), ('electronicIssns', electronic_issns)):
            if values:
                container = et.SubElement(journal, RO_NS+container_tag, nsmap=NSMAP)
                for value in values:
                    et.SubElement(container, RO_NS+'issn', nsmap=NSMAP).text = value

    # Publisher
    publisher = meta.find('ep:publisher', EP_NSMAP)
    if publisher is not None:
        publishers = et.SubElement(journal, RO_NS+'publisher', nsmap=NSMAP)
        et.SubElement(publishers, RO_NS+'name', nsmap=NSMAP).text = publisher.text
        et.SubElement(publishers, RO_NS+'workflow', nsmap=NSMAP).text = DEFAULT_PUBLISHER_WORKFLOW_STATUS

    # Journal workflow
    et.SubElement(journal, RO_NS+'workflow', nsmap=NSMAP).text = DEFAULT_JOURNAL_WORKFLOW_STATUS

    return marshal_xml(pub)

In [10]:
def map_chapter(id, type, meta):
    """Map a source record to a ``chapterInBook`` element and serialise it.

    Adds the basic metadata followed by pages, number of pages, article number,
    place of publication, volume, print ISBNs, host publication title,
    publisher, editors and series.

    Args:
        id: publication id.
        type: source content type.
        meta: source record element searched with ``ep:``-prefixed paths.

    Returns:
        The result of ``marshal_xml`` for the populated element.
    """
    logging.info('Mapping publication {} to book chapter'.format(id))
    pub = et.Element(RO_NS+'chapterInBook', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)
    # pages
    page_range = meta.find('ep:pagerange', EP_NSMAP)
    if page_range is not None:
        et.SubElement(pub, RO_NS+'pages', nsmap=NSMAP).text = page_range.text
    # numberOfPages
    page_num = meta.find('ep:pages', EP_NSMAP)
    if page_num is not None:
        et.SubElement(pub, RO_NS+'numberOfPages', nsmap=NSMAP).text = page_num.text
    # articleNumber
    an = meta.find('ep:number', EP_NSMAP)
    if an is not None:
        et.SubElement(pub, RO_NS+'articleNumber', nsmap=NSMAP).text = an.text
    # placeOfPublication
    place_of_pub = meta.find('ep:place_of_pub', EP_NSMAP)
    if place_of_pub is not None:
        et.SubElement(pub, RO_NS+'placeOfPublication', nsmap=NSMAP).text = place_of_pub.text
    # volume
    volume = meta.find('ep:volume', EP_NSMAP)
    if volume is not None:
        et.SubElement(pub, RO_NS+'volume', nsmap=NSMAP).text = volume.text
    # printIsbns: several ISBNs may be given separated by '; '; an empty node is skipped
    keys = meta.find('ep:isbn', EP_NSMAP)
    if keys is not None and keys.text:
        if len(keys.text) >= 20:
            logging.warning('Suspect ISBN value for publication {}: {}'.format(id, keys.text))
        keys_cont = et.SubElement(pub, RO_NS+'printIsbns', nsmap=NSMAP)
        for k in keys.text.split('; '):
            et.SubElement(keys_cont, RO_NS+'isbn', nsmap=NSMAP).text = k
    # hostPublicationTitle - To be reviewed with customer
    # NOTE(review): this copies the record's own title; confirm whether the source
    # has a separate field holding the title of the containing book.
    hostpubtitle = meta.find('ep:title', EP_NSMAP)
    if hostpubtitle is not None:
        et.SubElement(pub, RO_NS+'hostPublicationTitle', nsmap=NSMAP).text = hostpubtitle.text
    # publisher
    publisher = meta.find('ep:publisher', EP_NSMAP)
    if publisher is not None:
        publishers = et.SubElement(pub, RO_NS+'publisher', nsmap=NSMAP)
        et.SubElement(publishers, RO_NS+'name', nsmap=NSMAP).text = publisher.text
        et.SubElement(publishers, RO_NS+'workflow', nsmap=NSMAP).text = DEFAULT_PUBLISHER_WORKFLOW_STATUS
    # editors
    editorsin = meta.find('ep:editors', EP_NSMAP)
    if editorsin is not None:
        names = editorsin.findall('ep:item/ep:name', EP_NSMAP)
        editors = et.SubElement(pub, RO_NS+'editors', nsmap=NSMAP)
        for n in names:
            # Either name part may be missing in the source; use '' in that case
            # instead of failing on a missing node.
            given = n.find('ep:given', EP_NSMAP)
            family = n.find('ep:family', EP_NSMAP)
            editor = et.SubElement(editors, RO_NS+'editor', nsmap=NSMAP)
            et.SubElement(editor, C_NS+'firstname', nsmap=NSMAP).text = given.text if given is not None else ''
            et.SubElement(editor, C_NS+'lastname', nsmap=NSMAP).text = family.text if family is not None else ''
    # series
    seriesin = meta.find('ep:series', EP_NSMAP)
    if seriesin is not None:
        series = et.SubElement(pub, RO_NS+'series', nsmap=NSMAP)
        serie = et.SubElement(series, RO_NS+'serie', nsmap=NSMAP)
        et.SubElement(serie, RO_NS+'name', nsmap=NSMAP).text = seriesin.text
    return marshal_xml(pub)

In [11]:
def map_book(id, type, meta):
    """Map a source record to a ``book`` element and serialise it.

    After the basic metadata, the element receives (each only when present in
    the source, in this order): numberOfPages, placeOfPublication, volume,
    printIsbns, series and publisher.

    Returns:
        The result of ``marshal_xml`` for the populated element.
    """
    logging.info('Mapping publication {} to book'.format(id))
    pub = et.Element(RO_NS+'book', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)

    def source_node(field):
        # First source element with the given field name, or None.
        return meta.find('ep:' + field, EP_NSMAP)

    # Fields copied verbatim: (source field, target element)
    plain_fields = (
        ('pages', 'numberOfPages'),
        ('place_of_pub', 'placeOfPublication'),
        ('volume', 'volume'),
    )
    for source_field, target_tag in plain_fields:
        node = source_node(source_field)
        if node is not None:
            et.SubElement(pub, RO_NS+target_tag, nsmap=NSMAP).text = node.text

    # printIsbns: one isbn element per '; '-separated value
    isbn_node = source_node('isbn')
    if isbn_node is not None:
        raw_isbns = isbn_node.text
        if len(raw_isbns) >= 20:
            logging.warning('Suspect ISBN value for publication {}: {}'.format(id, raw_isbns))
        isbn_values = raw_isbns.split('; ')
        isbn_container = et.SubElement(pub, RO_NS+'printIsbns', nsmap=NSMAP)
        for isbn in isbn_values:
            et.SubElement(isbn_container, RO_NS+'isbn', nsmap=NSMAP).text = isbn

    # series
    series_node = source_node('series')
    if series_node is not None:
        series_list = et.SubElement(pub, RO_NS+'series', nsmap=NSMAP)
        series_entry = et.SubElement(series_list, RO_NS+'serie', nsmap=NSMAP)
        et.SubElement(series_entry, RO_NS+'name', nsmap=NSMAP).text = series_node.text

    # publisher
    publisher_node = source_node('publisher')
    if publisher_node is not None:
        publisher_out = et.SubElement(pub, RO_NS+'publisher', nsmap=NSMAP)
        et.SubElement(publisher_out, RO_NS+'name', nsmap=NSMAP).text = publisher_node.text
        et.SubElement(publisher_out, RO_NS+'workflow', nsmap=NSMAP).text = DEFAULT_PUBLISHER_WORKFLOW_STATUS

    return marshal_xml(pub)

In [12]:
def map_other(id, type, meta):
    """Build an ``other`` contribution element from one publication record.

    Copies the optional ``ep:output_media`` (translated through
    ``MEDIA_MAP``), ``ep:place_of_pub``, ``ep:isbn`` and ``ep:publisher``
    children of ``meta`` into the new element; a field that is missing from
    ``meta`` is left out.

    Args:
        id: Publication identifier, used in log messages and forwarded to
            ``set_basic_metadata``.
        type: Publication type, forwarded to ``set_basic_metadata``.
        meta: Element-like record queried with ``find(path, EP_NSMAP)``.

    Returns:
        The result of ``marshal_xml`` applied to the populated element.
    """
    logging.info('Mapping publication {} to other contribution'.format(id))
    pub = et.Element(RO_NS+'other', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)
    # outputMedia
    output_media = meta.find('ep:output_media', EP_NSMAP)
    if output_media is not None:
        # A present-but-empty element has text None; treat it like any other
        # unrecognised medium so it maps to 'other' instead of raising.
        value = MEDIA_MAP.get((output_media.text or '').lower(), 'other')
        et.SubElement(pub, RO_NS+'outputMedia', nsmap=NSMAP).text = value
    # numberOfPages
    # N/A
    # placeOfPublication
    place_of_pub = meta.find('ep:place_of_pub', EP_NSMAP)
    if place_of_pub is not None:
        et.SubElement(pub, RO_NS+'placeOfPublication', nsmap=NSMAP).text = place_of_pub.text
    # edition
    # N/A
    # volume
    # N/A
    # printIsbns
    keys = meta.find('ep:isbn', EP_NSMAP)
    # An empty ISBN element (text None) would break len()/split(); skip it.
    if keys is not None and keys.text:
        if len(keys.text) >= 20:
            # Longer than a single ISBN is expected to be: flag it for review
            logging.warning('Suspect ISBN value for publication {}: {}'.format(id, keys.text))
        # Several ISBNs may be packed into one field, separated by '; '
        keys_value = keys.text.split('; ')
        keys_cont = et.SubElement(pub, RO_NS+'printIsbns', nsmap=NSMAP)
        for k in keys_value:
            et.SubElement(keys_cont, RO_NS+'isbn', nsmap=NSMAP).text = k
    # electronicIsbns
    # N/A
    # series
    # N/A
    # publisher
    publisher = meta.find('ep:publisher', EP_NSMAP)
    if publisher is not None:
        publishers = et.SubElement(pub, RO_NS+'publisher', nsmap=NSMAP)
        # name
        et.SubElement(publishers, RO_NS+'name', nsmap=NSMAP).text = publisher.text
        et.SubElement(publishers, RO_NS+'workflow', nsmap=NSMAP).text = DEFAULT_PUBLISHER_WORKFLOW_STATUS
    # caseNotes
    # N/A
    return marshal_xml(pub)

In [13]:
# Eventdate transformations
from datetime import date
def double(num):
    """Left-pad a one-character string with '0'; return anything else as is.

    Used to turn a day or month such as '5' into '05'. Strings of any other
    length, including the empty string, are returned unchanged.
    """
    return '0' + num if len(num) == 1 else num

def quad(year):
    """Expand a short year string to four digits by prefixing a century.

    A four-character string is returned untouched. Otherwise the numeric
    value is compared with the last two digits of the current year: larger
    values get the prefix '19', all others '20'.

    Raises:
        ValueError: if a string that is not four characters long cannot be
            converted with ``int``.
    """
    if len(year) == 4:
        return year
    this_year_short = date.today().year % 100
    century = '19' if int(year) > this_year_short else '20'
    return century + year

def get_start_end(eventdate):
    """Parse a free-text event date or date range into start and end dates.

    The text is normalised first: the range word ' to ' and every run of
    non-word characters become a single hyphen, and ordinal suffixes
    ('1st', '22nd', ...) are dropped. The result is then tried against a
    fixed list of day/month/year layouts. Numeric dates are read day first.
    Month names are looked up in ``MONTH_MAP`` by their first three
    lowercase letters (values are assumed to be two-digit month strings -
    confirm against its definition).

    Args:
        eventdate: Raw date text, e.g. '12-14 June 2010'. May be None or
            empty.

    Returns:
        A ``(start_date, end_date)`` tuple of 'DD-MM-YYYY' strings. For a
        single date both items are equal. When the text is empty or matches
        no known layout, both items are today's date.
    """
    # Default value for start_date and end_date
    fallback = date.today().strftime("%d-%m-%Y")
    if not eventdate:
        # Nothing to parse (e.g. an empty element whose text is None)
        return fallback, fallback

    # The range word must become a separator: deleting it would glue the
    # two day numbers together ('12 to 14' -> '1214') and defeat every pattern.
    eventdate = re.sub(r' to ', '-', eventdate)
    # Replace any separator or space with hyphen
    eventdate = re.sub(r'\W+', '-', eventdate)
    # Drop ordinal suffixes that directly follow a digit
    eventdate = re.sub(r"(?<=\d)(st|nd|rd|th)\b", "", eventdate)
    # All patterns are anchored at the start, so a leading separator left by
    # surrounding whitespace or punctuation would otherwise block any match.
    eventdate = eventdate.strip('-')

    # Possible regex patterns
    patterns = [
    r'^(\d{1,2})-(\d{1,2})-(\d{2,4})-(\d{1,2})-(\d{1,2})-(\d{2,4})', # Two dates num, different months, two years
    r'^(\d{1,2})-(\d{1,2})-(\d{1,2})-(\d{1,2})-(\d{2,4})', # Two dates num, different months, one year
    r'^(\d{1,2})-(\d{1,2})-(\d{1,2})-(\d{2,4})', # Two Dates Num, same month
    r'^(\d{1,2})-(\d{1,2})-(\d{2,4})', # Single Date Num
    r'^([a-zA-Z]+)-(\d{1,2})-(\d{2,4})', # Single Date Reverse
    r'^(\d{1,2})-(\d{1,2})-([a-zA-Z.]+)-(\d{2,4})', # Two Dates, same month
    r'^(\d{1,2})-([a-zA-Z.]+)-(\d{1,2})-([a-zA-Z.]+)-(\d{2,4})', # Two Dates, different months
    r'^(\d{1,2})-([a-zA-Z.]+)-(\d{2,4})-(\d{1,2})-([a-zA-Z.]+)-(\d{2,4})', # Two dates, two months, two years
    r'^([a-zA-Z.]+)-(\d{1,2})-(\d{1,2})-(\d{2,4})', # Two Dates, same month, reverse
    r'^([a-zA-Z.]+)-(\d{1,2})-([a-zA-Z]+)-(\d{1,2})-(\d{2,4})', # Two Dates, different months, reverse
    r'^(\d{1,2})-([a-zA-Z]+)-(\d{2,4})', # Single Date
    ]

    def month_number(token):
        """Return the MONTH_MAP entry for a month word, or None if unknown."""
        if token.isalpha():
            return MONTH_MAP.get(token.lower()[:3])
        return None

    def compose(day, month, year):
        """Join a day, an already formatted month and a year as DD-MM-YYYY."""
        return double(day) + '-' + month + '-' + quad(year)

    # Standardization of dates
    for pattern in patterns:
        found = re.search(pattern, eventdate)
        if not found:
            continue
        parts = found.groups()
        count = len(parts)

        if all(part.isnumeric() for part in parts):
            if count == 6:
                # day-month-year, day-month-year
                return (compose(parts[0], double(parts[1]), parts[2]),
                        compose(parts[3], double(parts[4]), parts[5]))
            if count == 5:
                # day-month, day-month, shared year
                return (compose(parts[0], double(parts[1]), parts[4]),
                        compose(parts[2], double(parts[3]), parts[4]))
            if count == 4:
                # day, day, shared month and year
                return (compose(parts[0], double(parts[2]), parts[3]),
                        compose(parts[1], double(parts[2]), parts[3]))
            if count == 3:
                single = compose(parts[0], double(parts[1]), parts[2])
                return single, single

        # From here on at least one group is a word. A word that is not a
        # known month makes the layout unusable; in that case fall through
        # to the next pattern instead of raising KeyError.
        months = [month_number(part) for part in parts]
        if count == 6:
            # day month year, day month year
            if months[1] is not None and months[4] is not None:
                return (compose(parts[0], months[1], parts[2]),
                        compose(parts[3], months[4], parts[5]))
        elif count == 3:
            if months[0] is not None:
                # month day year
                single = compose(parts[1], months[0], parts[2])
                return single, single
            if months[1] is not None:
                # day month year
                single = compose(parts[0], months[1], parts[2])
                return single, single
        elif count == 4:
            if months[2] is not None:
                # day day month year
                return (compose(parts[0], months[2], parts[3]),
                        compose(parts[1], months[2], parts[3]))
            if months[0] is not None:
                # month day day year
                return (compose(parts[1], months[0], parts[3]),
                        compose(parts[2], months[0], parts[3]))
        elif count == 5:
            if months[1] is not None and months[3] is not None:
                # day month day month year
                return (compose(parts[0], months[1], parts[4]),
                        compose(parts[2], months[3], parts[4]))
            if months[0] is not None and months[2] is not None:
                # month day month day year
                return (compose(parts[1], months[0], parts[4]),
                        compose(parts[3], months[2], parts[4]))

    return fallback, fallback
 
  

In [14]:
def map_conference(id, type, meta):
    """Build a ``contributionToConference`` element from one publication record.

    An ``event`` child is always created. Its ``type``, ``title``,
    ``location`` and ``startDate``/``endDate`` children are added only when
    ``meta`` carries ``ep:event_type``, ``ep:event_title``,
    ``ep:event_location`` and ``ep:event_dates`` respectively. The two dates
    are produced by ``get_start_end`` from the ``ep:event_dates`` text.

    Returns:
        The result of ``marshal_xml`` applied to the populated element.
    """
    logging.info('Mapping publication {} to conference'.format(id))
    pub = et.Element(RO_NS+'contributionToConference', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)

    def source(tag):
        # Look up one child of the incoming record
        return meta.find(tag, EP_NSMAP)

    def child(parent, qualified_name):
        # Append one namespaced child to the outgoing document
        return et.SubElement(parent, qualified_name, nsmap=NSMAP)

    # pages and numberOfPages are not mapped (N/A)

    # Event container and its optional details
    event_el = child(pub, RO_NS+'event')

    kind = source('ep:event_type')
    if kind is not None:
        child(event_el, RO_NS+'type').text = kind.text

    name = source('ep:event_title')
    if name is not None:
        title_el = child(event_el, RO_NS+'title')
        child(title_el, C_NS+'text').text = name.text

    place = source('ep:event_location')
    if place is not None:
        child(event_el, RO_NS+'location').text = place.text

    when = source('ep:event_dates')
    if when is not None:
        first_day, last_day = get_start_end(when.text)
        child(event_el, RO_NS+'startDate').text = first_day
        child(event_el, RO_NS+'endDate').text = last_day

    return marshal_xml(pub)

In [15]:
def map_non_textual(id, type, meta):
    """Build a ``nonTextual`` contribution element from one publication record.

    Adds ``outputMedia`` (``ep:output_media`` translated through
    ``MEDIA_MAP``, unknown values becoming 'other') when the source field is
    present, then an ``event`` child that is always created. The event's
    ``type``, ``title``, ``location`` and ``startDate``/``endDate`` children
    are added only when ``meta`` carries ``ep:event_type``,
    ``ep:event_title``, ``ep:event_location`` and ``ep:event_dates``.

    Args:
        id: Publication identifier, used in log messages and forwarded to
            ``set_basic_metadata``.
        type: Publication type, forwarded to ``set_basic_metadata``.
        meta: Element-like record queried with ``find(path, EP_NSMAP)``.

    Returns:
        The result of ``marshal_xml`` applied to the populated element.
    """
    logging.info('Mapping publication {} to non textual contribution'.format(id))
    pub = et.Element(RO_NS+'nonTextual', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)
    # outputMedia
    output_media = meta.find('ep:output_media', EP_NSMAP)
    if output_media is not None:
        # A present-but-empty element has text None; treat it like any other
        # unrecognised medium so it maps to 'other' instead of raising.
        value = MEDIA_MAP.get((output_media.text or '').lower(), 'other')
        et.SubElement(pub, RO_NS+'outputMedia', nsmap=NSMAP).text = value
    # Event
    event = et.SubElement(pub, RO_NS+'event', nsmap=NSMAP)
    eventType = meta.find('ep:event_type', EP_NSMAP)
    if eventType is not None:
        et.SubElement(event, RO_NS+'type', nsmap=NSMAP).text = eventType.text
    eventTitle = meta.find('ep:event_title', EP_NSMAP)
    if eventTitle is not None:
        title = et.SubElement(event, RO_NS+'title', nsmap=NSMAP)
        et.SubElement(title, C_NS+'text', nsmap=NSMAP).text = eventTitle.text
    location = meta.find('ep:event_location', EP_NSMAP)
    if location is not None:
        et.SubElement(event, RO_NS+'location', nsmap=NSMAP).text = location.text
    eventdate = meta.find('ep:event_dates', EP_NSMAP)
    if eventdate is not None:
        # Free-text dates are normalised into a start/end pair
        start_date, end_date = get_start_end(eventdate.text)
        et.SubElement(event, RO_NS+'startDate', nsmap=NSMAP).text = start_date
        et.SubElement(event, RO_NS+'endDate', nsmap=NSMAP).text = end_date
    return marshal_xml(pub)

In [16]:
def map_working_paper(id, type, meta):
    """Build a ``workingPaper`` element from one publication record.

    Source field to target element, each emitted only when the source field
    is present in ``meta``:

    * ``ep:pagerange``    -> ``pages``
    * ``ep:pages``        -> ``numberOfPages``
    * ``ep:place_of_pub`` -> ``placeOfPublication``
    * ``ep:volume``       -> ``volume``
    * ``ep:number``       -> ``edition``
    * ``ep:publisher``    -> ``publisher`` (name plus default workflow)
    * ``ep:isbn``         -> ``printIsbns`` (one ``isbn`` per '; ' item)
    * ``ep:series``       -> ``series/serie/name``

    Args:
        id: Publication identifier, used in log messages and forwarded to
            ``set_basic_metadata``.
        type: Publication type, forwarded to ``set_basic_metadata``.
        meta: Element-like record queried with ``find(path, EP_NSMAP)``.

    Returns:
        The result of ``marshal_xml`` applied to the populated element.
    """
    logging.info('Mapping publication {} to working paper'.format(id))
    pub = et.Element(RO_NS+'workingPaper', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)
    # pages
    pages = meta.find('ep:pagerange', EP_NSMAP)
    if pages is not None:
        et.SubElement(pub, RO_NS+'pages').text = pages.text
    # numberOfPages
    noOfpages = meta.find('ep:pages', EP_NSMAP)
    if noOfpages is not None:
        et.SubElement(pub, RO_NS+'numberOfPages').text = noOfpages.text
    # placeOfPublication
    place_of_pub = meta.find('ep:place_of_pub', EP_NSMAP)
    if place_of_pub is not None:
        et.SubElement(pub, RO_NS+'placeOfPublication', nsmap=NSMAP).text = place_of_pub.text
    # volume
    volume = meta.find('ep:volume', EP_NSMAP)
    if volume is not None:
        et.SubElement(pub, RO_NS+'volume').text = volume.text
    # edition (taken from the source record's number field)
    edition = meta.find('ep:number', EP_NSMAP)
    if edition is not None:
        et.SubElement(pub, RO_NS+'edition').text = edition.text
    # publisher
    publisher = meta.find('ep:publisher', EP_NSMAP)
    if publisher is not None:
        publishers = et.SubElement(pub, RO_NS+'publisher', nsmap=NSMAP)
        # name
        et.SubElement(publishers, RO_NS+'name', nsmap=NSMAP).text = publisher.text
        et.SubElement(publishers, RO_NS+'workflow', nsmap=NSMAP).text = DEFAULT_PUBLISHER_WORKFLOW_STATUS
    # printIsbns
    keys = meta.find('ep:isbn', EP_NSMAP)
    # An empty ISBN element (text None) would break len()/split(); skip it.
    if keys is not None and keys.text:
        if len(keys.text) >= 20:
            # Longer than a single ISBN is expected to be: flag it for review
            logging.warning('Suspect ISBN value for publication {}: {}'.format(id, keys.text))
        # Several ISBNs may be packed into one field, separated by '; '
        keys_value = keys.text.split('; ')
        keys_cont = et.SubElement(pub, RO_NS+'printIsbns', nsmap=NSMAP)
        for k in keys_value:
            et.SubElement(keys_cont, RO_NS+'isbn', nsmap=NSMAP).text = k
    # series
    seriesin = meta.find('ep:series', EP_NSMAP)
    if seriesin is not None:
        series = et.SubElement(pub, RO_NS+'series', nsmap=NSMAP)
        serie = et.SubElement(series, RO_NS+'serie', nsmap=NSMAP)
        et.SubElement(serie, RO_NS+'name', nsmap=NSMAP).text = seriesin.text
    return marshal_xml(pub)

In [17]:
def map_patent(id, type, meta):
    """Build a ``patent`` element from one publication record.

    Beyond what ``set_basic_metadata`` adds, only the optional ``ep:date``
    field is copied, into a ``date`` child. The ipc, patentNumber,
    priorityDate, priorityNumber, country and publisher fields are not
    mapped (N/A).

    Returns:
        The result of ``marshal_xml`` applied to the populated element.
    """
    logging.info('Mapping publication {} to patent'.format(id))
    pub = et.Element(RO_NS+'patent', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)

    patent_date = meta.find('ep:date', EP_NSMAP)
    if patent_date is None:
        # No date in the source record: nothing further to add
        return marshal_xml(pub)

    date_el = et.SubElement(pub, RO_NS+'date')
    date_el.text = patent_date.text
    return marshal_xml(pub)

In [None]:
def map_thesis(id, type, meta):
    """Build a ``thesis`` element from one publication record.

    The ``ep:thesis_name`` field is mandatory: its text is written to a
    ``qualification`` child and translated through ``THESIS_SUBTYPE_MAP``
    into the ``subType`` attribute of the new element. The optional
    ``ep:pages`` field is copied to ``numberOfPages``.

    Args:
        id: Publication identifier, used in log messages and forwarded to
            ``set_basic_metadata``.
        type: Publication type, forwarded to ``set_basic_metadata``.
        meta: Element-like record queried with ``find(path, EP_NSMAP)``.

    Returns:
        The result of ``marshal_xml`` applied to the populated element.

    Raises:
        Exception: if ``ep:thesis_name`` is missing, or if its text has no
            entry in ``THESIS_SUBTYPE_MAP``.
    """
    logging.info('Mapping publication {} to thesis'.format(id))
    pub = et.Element(RO_NS+'thesis', nsmap=NSMAP)
    set_basic_metadata(pub, id, type, meta)
    # NOTE(review): the original carried a bare TODO here; only the
    # qualification and the page count are mapped so far.
    # Qualification
    qual = meta.find('ep:thesis_name', EP_NSMAP)
    if qual is None:
        # Checked first so that a missing field is reported as such instead
        # of failing on qual.text below.
        raise Exception('Missing qualification information')
    et.SubElement(pub, RO_NS+'qualification', nsmap=NSMAP).text = qual.text
    thesis_subtype = THESIS_SUBTYPE_MAP.get(qual.text)
    if thesis_subtype is None:
        raise Exception('Unknown thesis type: {}'.format(qual.text))
    # Only reached for a recognised qualification
    pub.set('subType', thesis_subtype)
    # Number of pages
    pages_in = meta.find('ep:pages', EP_NSMAP)
    if pages_in is not None:
        et.SubElement(pub, RO_NS+'numberOfPages').text = pages_in.text
    return marshal_xml(pub)

In [18]:
# Dispatch table: publication type name -> function that builds its XML.
# The keys are the same type names that SUBTYPE_MAP uses.
# NOTE(review): presumably looked up with the record's type by the caller - confirm.
MAPPING_FUNCTIONS = {
    'conference_item': map_conference,
    'patent': map_patent,
    'article': map_article,
    'book': map_book,
    'book_section': map_chapter,
}
# Every non-textual output type shares one mapper
MAPPING_FUNCTIONS.update(dict.fromkeys(
    ('exhibition', 'performance', 'composition', 'artefact',
     'audio', 'video', 'image'),
    map_non_textual,
))
MAPPING_FUNCTIONS.update({
    'monograph': map_other,
    'dataset': map_other,
    'thesis': map_thesis,
    'other': map_other,
})