# ISO20022 : Extractor notebook
Copyright : IBM, 2018<br>
Author : **Karl Hegarty**, IBM Ireland<br>

# Industry Models Additional Utilities and Samples
```
Important Note: the items in this project are being shared on an "as-is" basis. Users may copy and modify Source Components and Sample Materials for internal use only provided however that Licensee may not alter or delete any copyright information or notices contained in the Source Components or Sample Materials. IBM provides the Source Components and Sample Materials without obligation of support and "AS IS", WITH NO WARRANTY OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING THE WARRANTY OF TITLE, NON-INFRINGEMENT OR NON-INTERFERENCE AND THE IMPLIED WARRANTIES AND CONDITIONS OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

There is no direct support offered for these components.
IBM, Industry Models Development Team.
```

## Download eRepository & Initialisation

The e-Repository is provided in EMF format together with the ISO 20022 ecore implementation metamodel. 

The EMF repository is available for download __[here](https://www.iso20022.org/e_dictionary.page)__<br>
See "Download the e-Repository" link <br>

## Main execution section

In [1]:
import urllib.request
import glob, os
import zipfile
import re
import json
import csv
import datetime
import logging
import time

# There are two options for how message elements are generated: 'byId' and 'byName'.
#   'byId'   - create unique messageElement / type terms (every element keeps its own term, names are kept unique)
#   'byName' - full reuse of terms, where name duplicates are added as isATypeOf relationships to the original terms
# A mix / intermediate option, where individual items are kept unique, is possible but not currently implemented.

messageElementsUniqueness='byId'
#messageElementsUniqueness='byName'       # WIP

## over-ride main csv and log run
csv_write_file="output/ISO20022-v2013.csv"
logfile='output/ISO20022.log'

# Create the working directories BEFORE logging is configured: basicConfig(filename=...) opens
# the log file immediately, so 'output/' has to exist at that point.
for working_dir in ('iso20022/', 'output/'):
    os.makedirs(working_dir, exist_ok=True)

# delete log file and restart/run kernel to create a fresh log file
logging.basicConfig(filename=logfile, filemode='w', format='%(asctime)s %(levelname)-8s %(message)s', level=logging.DEBUG)
# basicConfig() returns None; fetch the root logger explicitly so that `logger` is a usable object.
logger=logging.getLogger()
logging.debug('\tISO20022 Extractor Started :')
igc_sc_root='iso20022_2018demo'

    
# Scope filter for messageSets.
# Keys are messageSet xmi ids; the values are human-readable names kept for reference.
# An empty dictionary means "no limit": every 'Latest version' messageSet is processed.
# Exactly one assignment of limit_2_messageSets should be left un-commented.
#limit_2_messageSets={}                   # leave empty for all message sets (latest versions)
#limit_2_messageSets={'_wRoFw02rEeG_I4xRYCA_7g': 'Change or Verify Account Identification - ISO - Latest version'}       ## comment this line 'out' to run all messageSets

# Alternative scope: BPS (disabled)
# limit_2_messageSets={
#        '_wRoFw02rEeG_I4xRYCA_7g': 'Change or Verify Account Identification - ISO - Latest version', 
#        '_wRoFxE2rEeG_I4xRYCA_7g': 'Bank Account Management - ISO - Latest version ', 
#        '_wRx2xE2rEeG_I4xRYCA_7g': 'Bank-to-Customer Cash Management - ISO - Latest version', 
#        '_moz48_2VEeGjgKzdN0DbWA': 'Exceptions and Investigations - ISO - Latest version', 
#        '_ivfzI_2XEeG0oal_knwJ7A': 'Payments Initiation - ISO - Latest version', 
#        '_AnUXc_2YEeG0oal_knwJ7A': 'Payments Mandates - ISO - Latest version', 
#        '_N1zK8_2YEeG0oal_knwJ7A': 'Notification to Receive  - ISO - Latest version', 
#        '_tk1vk_2YEeG0oal_knwJ7A': 'Payments Clearing and Settlement - ISO - Latest version', 
#        '_hijK0WiLEeOuqdLlpUIWBw': 'Investment Funds - ISO - Latest version',
#        '_xiys0WfyEeOLcMuJoUvTAg': 'Settlement And Reconciliation - ISO - Latest version'
#    }

# Active scope: Demo (8 messageSets)
# NOTE: the trailing space in the 'Bank Account Management' name is reproduced as written here.
limit_2_messageSets={
        '_wRoFxE2rEeG_I4xRYCA_7g': 'Bank Account Management - ISO - Latest version ',
        '_ivfzI_2XEeG0oal_knwJ7A': 'Payments Initiation - ISO - Latest version', 
        '_AnUXc_2YEeG0oal_knwJ7A': 'Payments Mandates - ISO - Latest version', 
        '_tk1vk_2YEeG0oal_knwJ7A': 'Payments Clearing and Settlement - ISO - Latest version', 
        '_wiatARJsEeSstbhSoCHcWw': 'Post-Trade Foreign Exchange - ISO - Latest version', 
        '_vZbScWwCEeSvG_1tXIrsqQ': 'Central CounterParty (CCP) Securities Clearing - ISO - Latest version', 
        '_MvlNQM7hEeSdANavIlVXcg': 'Cross-Border Transactions Currency Control Reporting - ISO - Latest version', 
        '_eEhc0MEOEea7jLfvGi1PDw': 'Money Market Statistical Reporting - ISO - Latest version'
    }

# Fetch and unpack the ISO 20022 eRepository archive, then report on it.
isofilename='iso20022/20180314_ISO20022_2013_eRepository.zip'
url = "https://www.iso20022.org/sites/default/files/documents/eRepositories/Metamodel/20180314_ISO20022_2013_eRepository.zip"

# Download only when the archive itself is missing. Testing for an empty directory is not
# enough: any unrelated file in 'iso20022/' would skip the download and make os.stat() below fail.
if not os.path.exists(isofilename):
    urllib.request.urlretrieve(url, isofilename)

# Unpack whenever at least one archive member is not present on disk yet, so an
# interrupted or never-performed extraction is repaired on the next run.
with zipfile.ZipFile(isofilename,"r") as zip_ref:
    if not all(os.path.exists(os.path.join('iso20022/', member)) for member in zip_ref.namelist()):
        zip_ref.extractall('iso20022/')

# Verify eRepository exists (os.stat raises if it does not); a single stat call feeds both names.
stat=os.stat(isofilename)
(mode, ino, dev, nlink, uid, gid, size, atime, mtime, ctime) = stat

print("Checking file exists: " + isofilename + "\tlast modified: %s" % time.ctime(mtime))
print("The number of messageSets in scope is: " + str(len(limit_2_messageSets)))

Checking file exists: iso20022/20180314_ISO20022_2013_eRepository.zip	last modified: Fri Nov  2 16:30:59 2018
The number of messageSets in scope is: 8


In [2]:
from lxml import etree

# Parse the extracted eRepository and select the 'Latest version' messageSets.
isofile = "iso20022/20180314_ISO20022_2013_eRepository.iso20022"
nsxmi = {'xmi': 'http://www.omg.org/XMI'}
nsxsi = {'xsi': 'http://www.w3.org/2001/XMLSchema-instance'}
nsre = {"re": "http://exslt.org/regular-expressions"}

iso=etree.parse(isofile)
isoroot=iso.getroot()

# Known data bug(s) in messageSet names: a name with a trailing space is mapped to the corrected name
bug_list={"Bank Account Management - ISO - Latest version ": "Bank Account Management - ISO - Latest version"}

# Namespace prefixes used by the xpath expressions below (each prefix listed once).
nspcs = {"re": "http://exslt.org/regular-expressions", "xsi": 'http://www.w3.org/2001/XMLSchema-instance'}
# The second re:match alternative catches names that end in a stray trailing space.
latest_version_messageSets=isoroot.xpath("//businessProcessCatalogue/topLevelCatalogueEntry[@xsi:type='iso20022:MessageSet'][re:match(@name, 'Latest version$') or re:match(@name, 'Latest version $')]", namespaces=nspcs)
messageElements=isoroot.xpath("//dataDictionary/topLevelDictionaryEntry/messageElement", namespaces=nspcs)
print ("There are " + str(len(latest_version_messageSets)) + " - 'latest' messageSets in the eRepository")

There are 36 - 'latest' messageSets in the eRepository


In [3]:
# Build up dictionaries from the XML dataDictionary topLevelDictionaryEntry elements:
#   iso['tld'][id]                     name / type / definition of each top level entry, plus a
#                                      'messageElements' map holding the ids of its messageElement children
#   iso['messageElements'][id]         name / definition / type information of each messageElement
#   iso['messageElementsParents'][id]  'parents only' map: top level entry id -> {child messageElement ids}
#   typeindex[type value]              {type attribute name: id of the last messageElement seen with that type}

tlds=isoroot.xpath("//dataDictionary/topLevelDictionaryEntry", namespaces=nspcs)
logdata={}
typeindex={}
log_index=0
# NOTE: `iso` is rebound here to a plain dict (it previously named the parsed XML tree);
# the tree root remains reachable through `isoroot`.
iso={}
iso['tld']={}
iso['messageElements']={}
iso['messageElementsParents']={}
for items in tlds:
    item_id=items.get('{http://www.omg.org/XMI}id')
    item_type=items.get('{http://www.w3.org/2001/XMLSchema-instance}type')
    iso['tld'][item_id]={
        'name': items.get('name'),
        'type': item_type,
        'definition': items.get('definition')
    }
    # Iterate the element directly; getchildren() is deprecated.
    for c_items in items:
        if c_items.tag != "messageElement":
            continue
        mEl_id=c_items.get('{http://www.omg.org/XMI}id')
        # Create the per-entry map only once. The previous membership test misspelled the key
        # ('messsageElements'), so the map was reset for every child and only the last one survived.
        iso['tld'][item_id].setdefault('messageElements', {})[mEl_id]={}
        mEl_entry=iso['messageElements'].setdefault(mEl_id, {})
        mEl_entry.setdefault('name', c_items.get('name'))
        mEl_entry.setdefault('definition', c_items.get('definition'))
        # The first non-empty attribute among complexType / simpleType / type decides the element type.
        for type_attr in ('complexType', 'simpleType', 'type'):
            type_value=c_items.get(type_attr)
            if type_value:
                mEl_entry.setdefault('mEl_type', type_attr)
                mEl_entry.setdefault('mEl_type_value', type_value)
                typeindex[type_value]={type_attr: mEl_id}
                break
        iso['messageElementsParents'].setdefault(item_id, {})[mEl_id]={}
print ("top Level Dictionary items loaded to dictionary iso[\'tld\'][{id}]\t\t\t" + format(len(iso['tld']),',d'))
print ("top Level Dictionary items loaded to dictionary iso[\'messageElements\'][{id}]\t\t" + format(len(iso['messageElements']),',d'))

top Level Dictionary items loaded to dictionary iso['tld'][{id}]			17,470
top Level Dictionary items loaded to dictionary iso['messageElements'][{id}]		80,218


In [4]:
## from businessProcessCatalogue/topLevelCatalogueEntry build dictionaries for messageBuildingBlocks and messageDefinitions
#   iso['messageDefinitions'][id]     name / definition, plus a 'messageBuildingBlocks' map of child ids
#                                     (the map is only present when the definition has building blocks)
#   iso['messageBuildingBlocks'][id]  name / definition / mbb_type / mbb_type_value

MessageDefinitions=isoroot.xpath("//businessProcessCatalogue/topLevelCatalogueEntry/messageDefinition", namespaces=nsxmi)
iso['messageDefinitions']={}
iso['messageBuildingBlocks']={}
for mds in MessageDefinitions:
    mds_id=mds.get('{http://www.omg.org/XMI}id')
    iso['messageDefinitions'][mds_id]={
        'name': mds.get('name'),
        'definition': mds.get('definition')
    }
    # Iterate the element directly; getchildren() is deprecated.
    for subelement in mds:
        if subelement.tag != 'messageBuildingBlock':
            continue
        mbbs=subelement
        mbbs_id=mbbs.get('{http://www.omg.org/XMI}id')
        iso['messageDefinitions'][mds_id].setdefault('messageBuildingBlocks', {})[mbbs_id]={}
        mbb_entry=iso['messageBuildingBlocks'].setdefault(mbbs_id, {})
        mbb_entry.setdefault('name', mbbs.get('name'))
        mbb_entry.setdefault('definition', mbbs.get('definition'))
        # The first non-empty attribute among complexType / simpleType / type decides the block type.
        for type_attr in ('complexType', 'simpleType', 'type'):
            type_value=mbbs.get(type_attr)
            if type_value:
                mbb_entry.setdefault('mbb_type', type_attr)
                mbb_entry.setdefault('mbb_type_value', type_value)
                break
        else:
            # no break: none of the three type attributes was present
            print ("a messageBuildingBlock had no type")

print ("Message definitions loaded to dictionary iso[\'messageDefinitions\'][{id}]\t" + format(len(iso['messageDefinitions']),',d'))

Message definitions loaded to dictionary iso['messageDefinitions'][{id}]	1,519


In [5]:
## Collect the in-scope 'latest' messageSets into iso['messageSets'], keyed by xmi id.
## Each entry holds 'name', 'definition' and - when the messageSet lists any - a
## 'messageDefinitions' map keyed by messageDefinition id.
iso['messageSets']={}
for lms in latest_version_messageSets:
    lms_id=lms.get('{http://www.omg.org/XMI}id')
    # A non-empty limit_2_messageSets restricts processing to the ids it names.
    if limit_2_messageSets and lms_id not in limit_2_messageSets:
        continue
    raw_name = lms.get('name')
    mSetName = bug_list.get(raw_name, raw_name)        # swap in the corrected name for known-bad names
    set_entry = {
        'name': mSetName,
        'definition': lms.get('definition')
    }
    messageDefinition_text = lms.get('messageDefinition')
    if messageDefinition_text:
        # the attribute is a space separated list of messageDefinition ids
        set_entry['messageDefinitions'] = {md_id: {} for md_id in messageDefinition_text.split(' ')}
    iso['messageSets'][lms_id] = set_entry

log_index+=1
loaded_summary = "Message Sets loaded to dictionary iso[\'messageSets\']\t\t" + format(len(iso['messageSets']),',d')
logdata.setdefault(log_index, loaded_summary)

print (loaded_summary)


Message Sets loaded to dictionary iso['messageSets']		8


In [6]:
# Categories and terms are emitted as rows of a csv file.
# Rows are ordered so that categories & root elements appear before any row that references them;
# a separate notebook loads the csv representation into IGC.

## containers for the csv rows and the ids that already have a category / term
csvdata={}
cathash={}
termhash={}

root_description = "ISO 20022 Financial Services - Universal financial industry message scheme is the ISO Standard for Financial Services Messaging."

#### row 1: the root category
csv_index = 1
iso_category_index = csv_index
csvrow = {
        'index': csv_index,
        'element_type': 'category',
        'element_name': igc_sc_root,
        'parent_category': '',
        'short_description': root_description
        }
csvdata[csv_index] = csvrow
logging.debug('\tadd CSV row :\t' + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])

#### row 2: the root term, filed under the root category
csv_index += 1
iso_term_index = csv_index
csvrow = {
        'index': csv_index,
        'element_type': 'term',
        'element_name': igc_sc_root, # e.g. iso20022_2018mmdd_
        'parent_category': iso_category_index,
        'short_description': root_description,
        'label': 'supportive content'
        }
csvdata[csv_index] = csvrow
logging.debug('\tadd CSV row :\t' + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])
print ("Creating root cagtegory and term - " + igc_sc_root)

Creating root cagtegory and term - iso20022_2018demo


In [7]:
# Emit two csv rows per in-scope messageSet: a category under the root category,
# and a term inside that category that is a type of the root term.

for mSet, set_entry in iso['messageSets'].items():                     ## for messageSets in latest_version_messageSets
    ### messageSet category
    csv_index += 1
    set_definition = set_entry['definition']
    # Definitions above 254 characters are cut for the short description and kept whole as the long one.
    if len(set_definition) <= 254:
        short_desc, long_desc = set_definition, ''
    else:
        short_desc, long_desc = set_definition[:255], set_definition
    csvrow = {
            'index': csv_index,
            'element_type': 'category',
            'element_name': set_entry['name'],
            'parent_category': iso_category_index,
            'short_description': short_desc,
            'long_description': long_desc
            }
    csvdata.setdefault(csv_index, csvrow)
    set_entry.setdefault('cat_index', csv_index)
    cathash.setdefault(mSet, 'messageSet')
    logging.debug('\tadd CSV row : \tmessageSets\t' + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])

    ### messageSet term
    csv_index += 1
    csvrow = {
            'index': csv_index,
            'element_type': 'term',
            'element_name': set_entry['name'],
            'parent_category': set_entry['cat_index'],
            'short_description': short_desc,
            'long_description': long_desc,
            'label': 'supportive content',
            'custom_external_reference': 'messageSet: ' + mSet,
            'isATypeOf': iso_term_index
            }
    csvdata.setdefault(csv_index, csvrow)
    set_entry.setdefault('term_index', csv_index)
    set_entry.setdefault('isATypeOf', iso_term_index)
    termhash.setdefault(mSet, 'messageSet')
    logging.debug('\tadd CSV row : \tmessageSets\t' + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])

print ('fin - messageSets added - csv count: ' + format(csv_index, ',d'))

fin - messageSets added - csv count: 18


In [8]:
# process the message Definitions dictionary (for the in scope messageSets) to create csv rows of categories and terms
# A messageDefinition referenced by more than one messageSet gets a single category/term pair;
# every further messageSet is appended to the 'isATypeOf' list of the existing term.
for mSet in iso['messageSets']:                     ## for messageSets in latest_version_messageSets
    mset_entry = iso['messageSets'][mSet]
    # 'messageDefinitions' is only present when the messageSet lists definitions - tolerate its absence
    for mdefs in mset_entry.get('messageDefinitions', {}):       ### messageDefinitions
        mdef_entry = iso['messageDefinitions'][mdefs]
        # Descriptions are computed once and shared by the category and the term row.
        definition_text = mdef_entry['definition'] or ''
        if len(definition_text) > 254:
            short_desc = definition_text[:255]
            long_desc = definition_text
        else:
            short_desc = definition_text
            long_desc = ''

        if mdefs not in cathash:
            ## message Definition categories
            csv_index += 1
            csvrow = {
                    'index': csv_index,
                    'element_type': 'category',
                    'element_name': mdef_entry['name'],
                    'parent_category': mset_entry['cat_index'],
                    'short_description': short_desc,
                    'long_description': long_desc
                    }
            csvdata.setdefault(csv_index, csvrow)
            mdef_entry.setdefault('cat_index', csv_index)
            cathash.setdefault(mdefs, 'messageDefinition')
            logging.debug('\tadd CSV row : \tmessageDefinitions\t' + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])
        else:
            print ("messageDefinition duplicate - skipping")

        if mdefs not in termhash:
            ## message Definition terms
            csv_index += 1
            isATypeOf=mset_entry['term_index']
            csvrow = {
                    'index': csv_index,
                    'element_type': 'term',
                    'element_name': mdef_entry['name'],
                    'parent_category': mdef_entry['cat_index'],
                    'short_description': short_desc,
                    'long_description': long_desc,
                    'label': 'supportive content',
                    'custom_external_reference': 'messageDefinition: ' + mdefs,
                    'isATypeOf': isATypeOf
                    }
            csvdata.setdefault(csv_index, csvrow)
            mdef_entry.setdefault('term_index', csv_index)
            mdef_entry.setdefault('isATypeOf', isATypeOf)
            termhash.setdefault(mdefs, 'messageDefinition')
            logging.debug('\tadd CSV row : \tmessageDefinitions\t' + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])
        else:
            print ("Duplicate messageDefinition : " + str(mdef_entry['name']))
            current_isATypeOf = mdef_entry['isATypeOf']
            # term indexes are integers: convert before concatenating (str + int raises TypeError)
            new_isATypeOf = str(current_isATypeOf) + ', ' + str(mset_entry['term_index'])
            mdef_entry['isATypeOf'] = new_isATypeOf
            # keep the already emitted csv row in step with the dictionary, as is done for messageBuildingBlocks
            csvdata[mdef_entry['term_index']]['isATypeOf'] = new_isATypeOf
            logging.debug('\tadd IsATypeOf \tmessageDefinitions\t' + str(mdefs) + "\tfrom : " + str(current_isATypeOf) + "\tto : " + str(new_isATypeOf))

print ('fin - messageDefinition pairs added - csv count: ' + format(csv_index, ',d'))

fin - messageDefinition pairs added - csv count: 172


In [9]:
# Emit csv term rows for the messageBuildingBlocks of the in-scope messageSets / messageDefinitions.
# A building block is identified by its name plus its type value: the first occurrence of that key
# creates the term, every later occurrence only extends the 'isATypeOf' list of that original term.

namemgr={}
iso['mbbs_orig']={}
for mSet, mset_entry in iso['messageSets'].items():                     ## for messageSets in latest_version_messageSets
    for mdefs in mset_entry['messageDefinitions']:       ### messageDefinitions
        mdef_entry = iso['messageDefinitions'][mdefs]
        for mbbs in mdef_entry['messageBuildingBlocks']:
            mbb_entry = iso['messageBuildingBlocks'][mbbs]
            ## uniqueness key: message block name + type value
            mbb_name = mbb_entry['name']
            mbb_type = mbb_entry['mbb_type_value']
            mbb_key = mbb_name + '_' + mbb_type

            if mbb_key in termhash:
                # Duplicate messageBuildingBlock: point it at the original and widen the original's isATypeOf
                orig_mbbs = termhash[mbb_key]
                orig_entry = iso['messageBuildingBlocks'][orig_mbbs]
                mbb_entry.setdefault('duplicate', orig_mbbs)
                current_isATypeOf = orig_entry['isATypeOf']
                new_isATypeOf = str(current_isATypeOf) + ', ' + str(mdef_entry['term_index'])
                orig_entry['isATypeOf'] = new_isATypeOf
                c_index = orig_entry['term_index']
                csvdata[c_index]['isATypeOf']=new_isATypeOf
                logging.debug('\tadd IsATypeOf \tmessageBuildingBlocks\t' + str(mbbs) + "\tfrom : " + str(current_isATypeOf) + "\tto : " + str(new_isATypeOf))
                continue

            # First occurrence of this key: create the term row
            csv_index += 1
            mbb_definition = mbb_entry['definition']
            if len(mbb_definition) <= 254:
                short_desc, long_desc = mbb_definition, ''
            else:
                short_desc, long_desc = mbb_definition[:255], mbb_definition
            isATypeOf=mdef_entry['term_index']
            pcat=mdef_entry['cat_index']
            name=mbb_entry['name']
            csvrow = {
                    'index': csv_index,
                    'element_type': 'term',
                    'element_name': name,
                    'parent_category': pcat,
                    'short_description': short_desc,
                    'long_description': long_desc,
                    'label': 'supportive content',
                    'custom_external_reference': 'messageBuildingBlock: ' + mbbs,
                    'isATypeOf': isATypeOf
                    }
            csvdata.setdefault(csv_index, csvrow)
            logging.debug('\tadd CSV row : \tmessageBuildingBlocks\t' + str(mbb_key) + "\t" + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])
            mbb_entry.setdefault('term_index', csv_index)
            mbb_entry.setdefault('isATypeOf', isATypeOf)
            mbb_entry.setdefault('pcat', pcat)
            # in 'byId' mode only the first building block seen with a given name is registered in namemgr
            if messageElementsUniqueness == 'byId' and name not in namemgr:
                namemgr[name]={mbbs: name}
            iso['mbbs_orig'][mbbs]={
                'term_index': csv_index,
                'isATypeOf': isATypeOf,
                'pcat': pcat
            }
            termhash.setdefault(mbb_key, mbbs)   # remember which id created the term for this key
print ('fin - messageBuildingBlock terms added - csv count: ' + format(csv_index, ',d'))


fin - messageBuildingBlock terms added - csv count: 328


## Option #1 - process messageElements 'byId'
### High volume of terms, no reuse

In [10]:
# byId : Part 1 of 3 
# process messageElements 'byId' : High(est) volume of terms, no reuse
# the recursive function mElTree continues through the complex/type/simple messageElement 'types' to fill out the full message

if messageElementsUniqueness == 'byId':
    iso['mEl']={}
    ic=csv_index

    ## process element by id 
    def processelement(xmiid, parentid, type_text, element_type):
        """Register `xmiid` in iso['mEl'] as a child of `parentid`.

        First sighting: the element inherits the parent's 'pcat', receives the next
        running index (global `ic`) and stores `type_text`; the parent is recorded in
        its 'isATypeOf' map with a count of 1.
        Later sightings: the parent is added to 'isATypeOf', or its count is incremented
        when it is already present.

        `element_type` is only used in the log messages.
        `parentid` must already be present in iso['mEl'].
        """
        global ic
        if xmiid not in iso['mEl']:
            iso['mEl'][xmiid]={}
            iso['mEl'][xmiid]['isATypeOf']={}
            iso['mEl'][xmiid]['isATypeOf'].setdefault(parentid, 1)
            iso['mEl'][xmiid].setdefault('pcat', iso['mEl'][parentid]['pcat'])
            ic+=1
            iso['mEl'][xmiid].setdefault('index', ic)
            iso['mEl'][xmiid].setdefault('type_text', type_text)
            logging.debug('\tmessageElements\t' + str(ic) + "\tcreate " + element_type + " :\t" + xmiid + "\tparent: " + parentid)
        else:
            if parentid not in iso['mEl'][xmiid]['isATypeOf']:
                iso['mEl'][xmiid]['isATypeOf'].setdefault(parentid, 1)
                logging.debug('\tmessageElements\t' + str(ic) + "\tnew isatypeof " + element_type + " :\t" + xmiid + "\tparent " + parentid)
            else:
                iso['mEl'][xmiid]['isATypeOf'][parentid] += 1
                logging.debug('\tmessageElements\t' + str(ic) + "\tupdate isatypeof " + element_type + " :\t" + xmiid + "pcount=" + parentid + str(iso['mEl'][xmiid]['isATypeOf'][parentid]))

    def mElTree(parentid, xmiid, type_text):
        """Walk the type `xmiid` (a top level dictionary entry id) below `parentid`.

        The type itself is registered under `parentid`. When it owns messageElements,
        each of them is registered under the type, and the walk recurses into the
        element's own type value with the element as parent.
        Ids that are not top level dictionary entries are reported and skipped.
        """
        if xmiid in iso['tld']:
            if xmiid not in iso['messageElementsParents']:
                processelement(xmiid, parentid, type_text, 'eleaf')
            else:
                processelement(xmiid, parentid, type_text, 'nonleaf')
                for cEls in iso['messageElementsParents'][xmiid]:
                    if cEls in iso['messageElements']:
                        cEl_type_text=iso['messageElements'][cEls]['mEl_type']
                        cEl_type_value=iso['messageElements'][cEls]['mEl_type_value']
                        processelement(cEls, xmiid, cEl_type_text, 'gchild')                    
                        mElTree(cEls, cEl_type_value, cEl_type_text)
                    else:
                        print ("not a message element\t: " + cEls)
        else:
            # report the offending id (this line used to reference an undefined name and raised NameError)
            print ("unknown item encountered : \t:\t" + str(xmiid))

    for mbbs in iso['mbbs_orig']:                     # only create branches for mbbs that are original terms
        mbb_type=iso['messageBuildingBlocks'][mbbs]['mbb_type']
        type_val=iso['messageBuildingBlocks'][mbbs]['mbb_type_value']
        logging.debug('\tmessageElements\t' + str(ic) + "\tcreate for parent:\t" + mbbs)
        # seed the tree with the building block itself, reusing its existing csv term index
        iso['mEl'][mbbs]={}
        iso['mEl'][mbbs]['isATypeOf']={}
        iso['mEl'][mbbs].setdefault('pcat', iso['messageBuildingBlocks'][mbbs]['pcat'])
        # NOTE(review): setdefault with a single argument stores the isATypeOf value as a KEY mapped to None - confirm this is intended
        iso['mEl'][mbbs]['isATypeOf'].setdefault(iso['messageBuildingBlocks'][mbbs]['isATypeOf'])
        iso['mEl'][mbbs].setdefault('index', iso['messageBuildingBlocks'][mbbs]['term_index'])
        mElTree(mbbs, type_val, mbb_type)
    print ('fin - messageElement part 1 - byId')

fin - messageElement part 1 - byId


In [11]:
# byId : Part 2 of 3 : dataClasses / codes
# process messageElements 'byId' : High(est) volume of terms, no reuse
# this section processes codesets by adding to the messageElement dictionary a json array of codeSets for 
# in scope messageElements

# Upper bound (in characters) for the serialised code list attached to one element.
MAX_CODEJSON_CHARS = 7800

def _dump_codes(entries):
    """Serialise a list of code entries exactly as stored under 'codejson'."""
    return json.dumps(entries, indent=4, separators=(',', ': '))

def _capped_codejson(entries):
    """Return (json_text, kept_count) for the longest leading run of `entries`
    whose serialised form does not exceed MAX_CODEJSON_CHARS characters."""
    kept = []
    for entry in entries:
        kept.append(entry)
        if len(_dump_codes(kept)) > MAX_CODEJSON_CHARS:
            # adding further entries can only make the text longer - stop here
            kept.pop()
            break
    return _dump_codes(kept), len(kept)

csets=isoroot.xpath("//dataDictionary/topLevelDictionaryEntry[@xsi:type='iso20022:CodeSet']/code", namespaces=nspcs)
codecounter=0
iso['codeset']={}

# codeset id -> id named by the codeset's 'trace' attribute
tracelookup={}
for cs in csets:
    pid=cs.getparent().get('{http://www.omg.org/XMI}id')
    if cs.getparent().get('trace'):
        tracelookup.setdefault(pid, cs.getparent().get('trace'))    
    if pid not in iso['codeset']:
        iso['codeset'][pid]={}            
    # missing attributes are stored as empty strings
    code = cs.get('codeName') or ''
    name = cs.get('name') or ''
    definition = cs.get('definition') or ''
    index=pid + '_' + name
    # the first code seen for an index wins; later ones with the same index are ignored
    if index not in iso['codeset'][pid]:
        iso['codeset'][pid][index]={'code': code, 'name': name, 'definition': definition}

if messageElementsUniqueness == 'byId':
    for mEl in iso['mEl']:
        if mEl in tracelookup:
            # the codeset traces to another codeset: take the entries from the traced one,
            # matching them by swapping the id prefix of each entry key
            xmiid = tracelookup[mEl]
            selected_codes = (iso['codeset'][xmiid][code_key.replace(mEl, xmiid)] for code_key in iso['codeset'][mEl])
        elif mEl in iso['codeset']:
            selected_codes = iso['codeset'][mEl].values()
        else:
            continue
        codejson_text, kept_count = _capped_codejson(selected_codes)
        codecounter += kept_count
        iso['mEl'][mEl].setdefault('codejson', codejson_text)

print ('fin - codecounter = ' + str(codecounter))

fin - codecounter = 462


In [12]:
# byId : Part 3 of 3 
# process messageElements 'byId' : High(est) volume of terms, no reuse
# build the messageElement csv rows (terms)
# there is a namemgr that appends a numeric suffix to repeated names - Agent, Agent (1), Agent (2) - to preserve uniqueness


if messageElementsUniqueness == 'byId':
    # NOTE(review): the isATypeOf values below use the 'index' stored in iso['mEl'], while rows are
    # numbered with csv_index; this presumably relies on iso['mEl'] being iterated in insertion order - confirm
    for item in iso['mEl']:
        if item in iso['mbbs_orig']: # message building blocks already have their term rows
            continue
        if item in iso['messageElements']:
            name=iso['messageElements'][item]['name']
            definition=iso['messageElements'][item]['definition']
        elif item in iso['tld']:
            name=iso['tld'][item]['name']
            definition=iso['tld'][item]['definition']
        else:
            print ("item unknown" + item)
            # fall back to the id itself instead of silently reusing the previous item's name/definition;
            # a row is still written so the row numbering stays unchanged
            name=item
            definition=''

        # a missing definition attribute is treated as an empty description
        definition = definition or ''
        if len(definition) > 254:
            short_desc = definition[:255]
            long_desc = definition
        else: 
            short_desc = definition
            long_desc = ''
        pcat=iso['mEl'][item]['pcat']
        if 'codejson' in iso['mEl'][item]:
            codetext=iso['mEl'][item]['codejson']
            label_text = 'supportive content, enumeration'
        else:
            codetext=''
            label_text = 'supportive content'
        # comma separated list of the indexes of every parent of this element
        isatypeof = ', '.join(str(iso['mEl'][iato]['index']) for iato in iso['mEl'][item]['isATypeOf'])
        example = 'messageElement - ' + str(iso['mEl'][item]['type_text'])
        # first use of a name keeps it as-is; repeats get ' (n)' appended, n being the number of earlier uses
        if name not in namemgr:
            namemgr[name]={}
            namemgr[name].setdefault(item, name)
            name_text = name
        else:
            namecounter=len(namemgr[name])
            name_text= name + ' (' + str(namecounter) + ')'
            namemgr[name].setdefault(item, name_text)
        csv_index+=1
        csvrow = {
                'index': csv_index,
                'element_type': 'term',
                'element_name': name_text,
                'parent_category': pcat,
                'short_description': short_desc,
                'long_description': long_desc,
                'label': label_text,
                'custom_external_reference': 'messageElement: ' + item,
                'custom_Enumeration_Values': codetext,
                'isATypeOf': isatypeof
                }
        csvdata.setdefault(csv_index, csvrow)
        logging.debug('\tadd CSV row : \tmessageElement\t' + str(item) + "\t" + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])            
    print ('fin - messageElement part 2 - byId - csv count: ' + format(csv_index, ',d'))   
else:
    print ('skipping : messageElement - byId')

fin - messageElement part 2 - byId - csv count: 3,707


## Option #2 - process messageElements 'byName' 
### lower volume of terms, high reuse

In [13]:
# byName : Part 1 of 2
# Process messageElements 'byName' : lower volume of terms, high reuse.
#
# This option is not yet implemented (work in progress).
#
if messageElementsUniqueness == 'byName':
    iso['mEl']={}
    ic=csv_index

    def processelement(xmiid, parentid, type_text, element_type):
        """Register the element `xmiid` in iso['mEl'], keyed by the element's name.

        xmiid        -- id looked up in iso['messageElements'], then iso['tld'], to find the name
        parentid     -- id of the parent; recorded as a key of the entry's 'isATypeOf' dict
        type_text    -- stored as 'type_text' when the entry is first created
        element_type -- label written to the debug log only

        The first time a name is seen a new entry is created with the next
        index (ic). Later sightings of the same name only add the parent to
        'isATypeOf' or increase the count already held for that parent.
        Elements whose id is in neither dictionary are reported and skipped.
        """
        global ic, pcat
        # NOTE(review): this resets the module-level pcat, so new entries below store '' - confirm intent
        pcat=''
        if xmiid in iso['messageElements']:
            mEl_name=iso['messageElements'][xmiid]['name']
        elif xmiid in iso['tld']:
            mEl_name=iso['tld'][xmiid]['name']
        else:
            # without a name there is no key to file the element under
            print ("unknown name\t" + xmiid)
            return

        # diagnostic only: the parent id should be known to one of the dictionaries
        if not any(parentid in iso[section] for section in ('messageElements', 'messageBuildingBlocks', 'tld')):
            print ("unknown parent name\t" + parentid)

        entry = iso['mEl'].get(mEl_name)
        if entry is None:
            ic+=1
            iso['mEl'][mEl_name] = {
                'orig_id': xmiid,
                'isATypeOf': {parentid: 1},
                'pcat': pcat,
                'index': ic,
                'type_text': type_text,
            }
            logging.debug('\tmessageElements\t' + str(ic) + "\tcreate " + element_type + " :\t" + mEl_name + "\tparent: " + parentid)
        elif parentid not in entry['isATypeOf']:
            entry['isATypeOf'][parentid] = 1
            logging.debug('\tmessageElements\t' + str(ic) + "\tnew isatypeof " + element_type + " :\t" + mEl_name + "\tparent " + parentid)
        else:
            entry['isATypeOf'][parentid] += 1
            logging.debug('\tmessageElements\t' + str(ic) + "\tupdate isatypeof " + element_type + " :\t" + mEl_name + "pcount=" + parentid + str(entry['isATypeOf'][parentid]))

    def mElTree(parentid, xmiid, type_text):
        """Register the type `xmiid` under `parentid`, then walk its child elements.

        A type with no entry in iso['messageElementsParents'] is registered as a
        leaf. Otherwise each child message element is registered with the type
        as its parent and the walk recurses into the child's own type.
        """
        if xmiid not in iso['tld']:
            print ("unknown item encountered : \t:\t" + xmiid)
            return
        if xmiid not in iso['messageElementsParents']:
            processelement(xmiid, parentid, type_text, 'eleaf')
            return
        processelement(xmiid, parentid, type_text, 'nonleaf')
        for cEls in iso['messageElementsParents'][xmiid]:
            if cEls in iso['messageElements']:
                cEl_type_text=iso['messageElements'][cEls]['mEl_type']
                cEl_type_value=iso['messageElements'][cEls]['mEl_type_value']
                processelement(cEls, xmiid, cEl_type_text, 'gchild')
                mElTree(cEls, cEl_type_value, cEl_type_text)
            else:
                print ("not a message element\t: " + cEls)

    for mbbs in iso['mbbs_orig']:                     # only create branches for mbbs that are original terms
        mbb_type=iso['messageBuildingBlocks'][mbbs]['mbb_type']
        mbb_name=iso['messageBuildingBlocks'][mbbs]['name']
        type_val=iso['messageBuildingBlocks'][mbbs]['mbb_type_value']
        logging.debug('\tmessageElements\t' + str(ic) + "\tcreate for parent:\t" + mbbs)
        iso['mEl'][mbb_name]={}
        iso['mEl'][mbb_name]['isATypeOf']={}
        iso['mEl'][mbb_name].setdefault('orig_id', mbbs)
        pcat=iso['messageBuildingBlocks'][mbbs]['pcat']
        iso['mEl'][mbb_name].setdefault('pcat', pcat)
        # NOTE(review): this stores the block's own isATypeOf value as a KEY (with value None) - confirm intent
        iso['mEl'][mbb_name]['isATypeOf'].setdefault(iso['messageBuildingBlocks'][mbbs]['isATypeOf'])
        iso['mEl'][mbb_name].setdefault('index', iso['messageBuildingBlocks'][mbbs]['term_index'])
        # the parent is passed by id (not by name): every parent lookup is keyed by id
        mElTree(mbbs, type_val, mbb_type)
    print ('fin - messageElement part 1 - byName')

In [14]:
# byName : Part 2 of 2
# Process messageElements 'byName' : lower volume of terms, high reuse.
# Builds one csv row (term) for every iso['mEl'] entry that is not a message building block.
#
# This option is not yet implemented (work in progress).
#

if messageElementsUniqueness == 'byName':
    for itemname in iso['mEl']:
        orig_id = iso['mEl'][itemname]['orig_id']
        if orig_id in iso['mbbs_orig']:
            # Message building blocks produce no row here; they only set the
            # pcat that the rows built after them use as parent category.
            # iso['mEl'] is keyed by name, so the entry is read by itemname.
            pcat=iso['mEl'][itemname]['pcat']
            print ("mseeage block " + itemname)
            continue

        item = orig_id
        if item in iso['messageElements']:
            source = iso['messageElements'][item]
        elif item in iso['tld']:
            source = iso['tld'][item]
        else:
            # skip: there is no name or definition to build a row from
            print ("item unknown" + item)
            continue
        name = source['name']
        definition = source['definition']

        # long definitions are truncated for the short description and kept in full as the long one
        if len(definition) > 254:
            short_desc = definition[:255]
            long_desc = definition
        else:
            short_desc = definition
            long_desc = ''

        # translate each parent id to its name, then to the index stored under that name
        parent_indexes = []
        for iato in iso['mEl'][itemname]['isATypeOf']:
            if iato in iso['tld']:
                iato_name = iso['tld'][iato]['name']
            elif iato in iso['messageElements']:
                iato_name = iso['messageElements'][iato]['name']
            elif iato in iso['messageBuildingBlocks']:
                iato_name = iso['messageBuildingBlocks'][iato]['name']
            else:
                # skip: an unresolved parent has no index to reference
                print ("no name for " + iato)
                continue
            parent_indexes.append(str(iso['mEl'][iato_name]['index']))
        isatypeof = ', '.join(parent_indexes)

        name_text = name + " (" + item + ")"
        csv_index+=1
        csvrow = {
                'index': csv_index,
                'element_type': 'term',
                'element_name': name_text,
                'parent_category': pcat,
                'short_description': short_desc,
                'long_description': long_desc,
                'label': 'supportive content',
                # key must be one of the csv writer's fieldnames, otherwise DictWriter rejects the row
                'custom_external_reference': item,
                'isATypeOf': isatypeof
                }
        csvdata.setdefault(csv_index, csvrow)
        logging.debug('\tadd CSV row : \tmessageElement\t' + str(item) + "\t" + str(csv_index) + "\t" + csvrow['element_type'] + "\t" + csvrow['element_name'])
    print ('fin - messageElement part 2 - byName - csv count: ' + format(csv_index, ',d'))
else:
    print ('skipping : messageElement - byName')

skipping : messageElement - byName


## Creating csv of categories and terms

In [15]:
# Write the collected category and term rows (csvdata) to the import csv file.
# Rows are emitted in index order, 1 .. len(csvdata).
fieldnames = [
    'index',
    'element_type',
    'element_name',
    'parent_category',
    'short_description',
    'long_description',
    'label',
    'custom_Enumeration_Values',
    'custom_external_reference',
    'isATypeOf',
]
with open(csv_write_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(csvdata[row_number] for row_number in range(1, len(csvdata) + 1))
print ("Created {} - {:,d} row(s) for import.".format(csv_write_file, len(csvdata)))

Created output/ISO20022-v2013.csv - 3,707 row(s) for import.


In [16]:
# Emit a final debug-level log record marking the end of the extraction run.
logging.debug('\tISO Extractor Ended')

# End of Notebook execution.

## debug and other useful info

In [17]:
# Switch for the diagnostic cells that follow; change to True to enable them.
debug = False
if not debug:
    print ("debug mode : Off")
else:
    print ("debug mode : On")
    # print the name of every message set that is in neither the latest nor the previous version list
    all_messageSets = isoroot.xpath(
        "//businessProcessCatalogue/topLevelCatalogueEntry[@xsi:type='iso20022:MessageSet']",
        namespaces=nspcs)
    previous_version_messageSets = isoroot.xpath(
        "//businessProcessCatalogue/topLevelCatalogueEntry[@xsi:type='iso20022:MessageSet'][re:match(@name, 'Previous version$')]",
        namespaces=nspcs)
    for message_set in all_messageSets:
        if not (message_set in latest_version_messageSets or message_set in previous_version_messageSets):
            print (message_set.get('name'))

debug mode : Off


In [18]:
# Write the code set analysis rows (csetlist) to output/cset_analysis.csv.
# csetlist is only created by the cell that follows this one, so on a fresh
# kernel this cell reports that and writes nothing instead of failing.
if debug:
    cset_rows = globals().get('csetlist')
    if cset_rows is None:
        print ("csetlist is not defined - run the cell that builds it (below), then re-run this cell")
    else:
        with open('output/cset_analysis.csv', 'w', newline='', encoding='utf-8') as csetfile:
            fieldnames = ['type', 'id', 'name', 'derivation', 'trace', 'code_id', 'code_name', 'code_definition', 'code_codeName']
            writer = csv.DictWriter(csetfile, fieldnames=fieldnames)
            writer.writeheader()
            # rows are keyed 1 .. len(csetlist)
            for icsv in range(1, len(cset_rows) + 1):
                writer.writerow(cset_rows[icsv])
        print ("Created output/cset_analysis.csv - " + format(len(cset_rows), ',d') + " row(s) for import.")

In [19]:
# Build csetlist: one row per <code> child (that has a 'trace' attribute) of
# every CodeSet entry in the data dictionary. Rows are keyed 1 .. cset_index.
if debug:
    isocodeset=isoroot.xpath("//dataDictionary/topLevelDictionaryEntry[@xsi:type='iso20022:CodeSet']", namespaces=nspcs)

    csetlist={}
    cset_index=0
    for ics in isocodeset:
        codesetid=ics.get('{http://www.omg.org/XMI}id')
        codesetname=ics.get('name')
        codesetderivation=ics.get('derivation')
        codesettrace=ics.get('trace')
        # iterating the element yields its direct children (replaces the deprecated getchildren())
        for cs in ics:
            if cs.tag != 'code' or not cs.get('trace'):
                continue
            csetrow={
                'type': 'iso20022:CodeSet',
                'id': codesetid,
                'name': codesetname,
                'derivation': codesetderivation,
                'trace': codesettrace,
                'code_id': cs.get('code'),
                'code_name': cs.get('name'),
                'code_definition': cs.get('definition'),
                'code_codeName': cs.get('codeName')
            }
            cset_index+=1
            csetlist.setdefault(cset_index, csetrow)
    print ('fin - len cset ' + str(cset_index))

In [20]:
if debug:
    # Print an "'id': 'name', " pair for each latest-version message set whose
    # name is listed in messages_in_scope.

    # BPS listing
    #messages_in_scope={'Change or Verify Account Identification - ISO - Latest version', 'Bank Account Management - ISO - Latest version ', 'Bank-to-Customer Cash Management - ISO - Latest version', 'Exceptions and Investigations - ISO - Latest version', 'Notification to Receive  - ISO - Latest version', 'Payments Initiation - ISO - Latest version', 'Payments Mandates - ISO - Latest version', 'Payments Clearing and Settlement - ISO - Latest version', 'Investment Funds - ISO - Latest version', 'Settlement And Reconciliation - ISO - Latest version'}

    # Demo listing
    messages_in_scope = {
        'Bank Account Management - ISO - Latest version ',
        'Payments Initiation - ISO - Latest version',
        'Payments Mandates - ISO - Latest version',
        'Payments Clearing and Settlement - ISO - Latest version',
        'Post-Trade Foreign Exchange - ISO - Latest version',
        'Central CounterParty (CCP) Securities Clearing - ISO - Latest version',
        'Cross-Border Transactions Currency Control Reporting - ISO - Latest version',
        'Money Market Statistical Reporting - ISO - Latest version',
    }

    # the 'Bank Account Management' name is listed above with a trailing space;
    # this maps that spelling to the one without it
    bug_list = {
        "Bank Account Management - ISO - Latest version ": "Bank Account Management - ISO - Latest version",
    }

    for message_set in latest_version_messageSets:
        name = message_set.get('name')
        ms_id = message_set.get('{http://www.omg.org/XMI}id')
        if name not in messages_in_scope:
            continue
        print ("'{}': '{}', ".format(ms_id, name))


In [21]:
def _count_xsi_types(entries):
    """Return a dict mapping each xsi:type attribute value found on `entries` to its number of occurrences.

    entries -- iterable of elements offering .get(attribute_name)
    """
    counts = {}
    for entry in entries:
        xsi_type = entry.get('{http://www.w3.org/2001/XMLSchema-instance}type')
        counts[xsi_type] = counts.get(xsi_type, 0) + 1
    return counts


if debug:
    ## Summary of types for ;
    ##           //dataDictionary/topLevelDictionaryEntry
    ##           //businessProcessCatalogue/topLevelCatalogueEntry

    isodataDictionarytopLevelDictionaryEntry=isoroot.xpath("//dataDictionary/topLevelDictionaryEntry", namespaces=nspcs)

    print ("Total for //dataDictionary/topLevelDictionaryEntry\t" + str(len(isodataDictionarytopLevelDictionaryEntry)))
    isotypes=_count_xsi_types(isodataDictionarytopLevelDictionaryEntry)
    print (json.dumps(isotypes, sort_keys=True, indent=4, separators=(',', ': ')))

    isobusinessProcessCataloguetopLevelDictionaryEntry=isoroot.xpath("//businessProcessCatalogue/topLevelCatalogueEntry", namespaces=nspcs)

    print ("\nTotal for //businessProcessCatalogue/topLevelCatalogueEntry\t" + str(len(isobusinessProcessCataloguetopLevelDictionaryEntry)))
    bpcisotypes=_count_xsi_types(isobusinessProcessCataloguetopLevelDictionaryEntry)
    print (json.dumps(bpcisotypes, sort_keys=True, indent=4, separators=(',', ': ')))

## The following hides / unhides code for this notebook

In [22]:
# Display an HTML snippet whose script hides the notebook's code input areas
# once the page is ready, together with a link that toggles them on and off.
# NOTE(review): the script relies on jQuery ($) and on 'div.input' elements being
# present in the page - presumably the classic Notebook front end; confirm for others.
from IPython.display import HTML

# The HTML object is the last expression of the cell, so it is rendered as the cell's output.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')