diff --git a/.travis.yml b/.travis.yml index 2aed753a..12c0e98b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,4 +3,3 @@ before_install: - sudo apt-get install -qq libxml2-utils script: - wget "https://raw.githubusercontent.com/IATI/IATI-Codelists/version-2.02/codelist.xsd"; wget "https://raw.githubusercontent.com/IATI/IATI-Codelists/version-2.02/xml.xsd"; xmllint --schema codelist.xsd --noout xml/* - diff --git a/convert.py b/convert.py index bcf4e8d0..a7a44688 100644 --- a/convert.py +++ b/convert.py @@ -1,31 +1,262 @@ +from collections import OrderedDict +from datetime import date +from os import system +from os.path import join +import re + +from lxml import etree as ET +import unicodecsv as csv + + """ Converts codelist files from external sources into the format used by IATI. -Currently only supports the IANA Media Types code list (FileFormat). +Note not all external codelists are converted automatically yet. """ +etparser = ET.XMLParser(encoding='utf-8', remove_blank_text=True) +today = str(date.today()) -from lxml import etree as ET -template = ET.parse('templates/FileFormat.xml', ET.XMLParser(remove_blank_text=True)) -codelist_items = template.find('codelist-items') +# Adapted from code at http://effbot.org/zone/element-lib.htm +def indent(elem, level=0, shift=2): + i = '\n' + level * ' ' * shift + if len(elem): + if not elem.text or not elem.text.strip(): + elem.text = i + ' ' * shift + if not elem.tail or not elem.tail.strip(): + # hack to remove trailing newline + if level: + elem.tail = i + for elem in elem: + indent(elem, level + 1, shift) + if not elem.tail or not elem.tail.strip(): + elem.tail = i + else: + if level and (not elem.tail or not elem.tail.strip()): + elem.tail = i + + +def str_update(current, proposed): + if current and re.split(r'[\s|\xa0]+', current) == re.split(r'[\s|\xa0]+', proposed): + return current + elif proposed: + return proposed.replace('\r', '').replace('. ', '. ').strip() + return None + + +def create_codelist_item(keys): + tmpl_path = join('templates', 'generic-codelist-item.xml') + xml = ET.parse(tmpl_path, etparser).getroot() + if 'name_en' not in keys: + xml.remove(xml.find('name')) + if 'description_en' not in keys: + xml.remove(xml.find('description')) + if 'category' not in keys: + xml.remove(xml.find('category')) + return xml + + +def update_codelist_item(codelist_item, code_dict): + # update code + if not codelist_item.find('code').text: + codelist_item.find('code').text = code_dict['code'] + if 'category' in code_dict: + # update category + codelist_item.find('category').text = code_dict['category'] + + if 'name_en' in code_dict: + for name_el in codelist_item.findall('name/narrative'): + if name_el.get('{http://www.w3.org/XML/1998/namespace}lang') == 'fr': + name_el.text = str_update(name_el.text, code_dict.get('name_fr')) + else: + name_el.text = str_update(name_el.text, code_dict.get('name_en')) + + if 'description_en' in code_dict: + for description_el in codelist_item.findall('description/narrative'): + if description_el.get('{http://www.w3.org/XML/1998/namespace}lang') == 'fr': + if code_dict['description_fr']: + description_el.text = str_update(description_el.text, code_dict['description_fr']) + else: + codelist_item.find('description').remove(description_el) + else: + description_el.text = str_update(description_el.text, code_dict['description_en']) + + return codelist_item + + +def source_to_xml(tmpl_name, source_name, lookup, source_data=None): + old_xml = ET.parse(join('xml', '{}.xml'.format(tmpl_name)), etparser) + + tmpl_path = join('templates', '{}.xml'.format(tmpl_name)) + xml = ET.parse(tmpl_path, etparser) + codelist_items = xml.find('codelist-items') + + if not source_data: + source_path = join('source', '{}.csv'.format(source_name)) + with open(source_path) as f: + reader = csv.DictReader(f) + source_data = [{ + k: x.get(v) for k, v in lookup.items() + } for x in reader if x[lookup['code']]] + + source_data_dict = OrderedDict([(source_data_row['code'].upper(), source_data_row) for source_data_row in source_data]) + + old_codelist_els = old_xml.xpath('//codelist-item') + while old_codelist_els: + old_codelist_el = old_codelist_els.pop(0) + old_codelist_code = old_codelist_el.find('code').text.upper() + # peek at the first code + if source_data_dict: + new_code_dict = list(source_data_dict.values())[0] + if new_code_dict['code'].upper() != old_codelist_code and not old_xml.xpath('//codelist-item/code[text()="{}"]/..'.format(new_code_dict['code'])): + # add a new code, with activation date of today + new_codelist_item = create_codelist_item(new_code_dict.keys()) + new_codelist_item = update_codelist_item(new_codelist_item, new_code_dict) + new_codelist_item.attrib['status'] = 'active' + new_codelist_item.attrib['activation-date'] = today + codelist_items.append(new_codelist_item) + source_data_dict.popitem(last=False) + # push the last popped item onto the front of the queue + old_codelist_els.insert(0, old_codelist_el) + continue + + if old_codelist_code in source_data_dict: + # it's in the current codes, so update it + new_code_dict = source_data_dict[old_codelist_code] + updated_codelist_item = update_codelist_item(old_codelist_el, new_code_dict) + codelist_items.append(updated_codelist_item) + del source_data_dict[old_codelist_code] + elif old_codelist_el.attrib.get('status') == 'withdrawn': + # it's an old withdrawn code, so just copy it + codelist_items.append(old_codelist_el) + elif codelist_items.xpath('//codelist-item/code[text()="{}"]/..'.format(old_codelist_el.find('code').text)): + # some codelist items are hard-coded, and should just + # be left as is + pass + else: + # it's a newly withdrawn code, so mark the withdrawal date + # as today, and copy it + old_codelist_el.attrib['status'] = 'withdrawn' + old_codelist_el.attrib['withdrawal-date'] = today + codelist_items.append(old_codelist_el) + + output_path = join('xml', '{}.xml'.format(tmpl_name)) + for el in xml.iter('*'): + if el.text is not None: + if not el.text.strip(): + # force tag self-escaping + el.text = None + indent(xml.getroot(), 0, 4) + xml.write(output_path, encoding='utf-8', pretty_print=True) + + +# throw away any local changes +system('git checkout -- xml/') + +fileformat_lookup = { + 'code': 'Media Type', + 'category': 'Type', +} +source_to_xml('FileFormat', 'media-types', fileformat_lookup) + +currency_lookup = { + 'code': 'AlphabeticCode', + 'name_en': 'Currency', + 'withdrawal_date': 'WithdrawalDate', +} +# TODO: source data includes withdrawn codes! +# This needs to be factored in when parsing. +source_to_xml('Currency', 'currencies', currency_lookup) + +country_lookup = { + 'code': 'code', + 'name_en': 'name_en', +} +source_path = join('source', 'countries.csv') +with open(source_path) as f: + reader = csv.DictReader(f) + countries = [{ + 'code': x['ISO3166-1-Alpha-2'], + 'name_en': x['CLDR display name'], + } for x in reader if x['ISO3166-1-Alpha-2']] +source_to_xml('Country', 'countries', country_lookup, source_data=countries) + +language_lookup = { + 'code': 'alpha2', + 'name_en': 'English', +} +source_to_xml('Language', 'languages', language_lookup) + +lookup_no_category = { + 'code': 'code', + 'name_en': 'name_en', + 'name_fr': 'name_fr', + 'description_en': 'description_en', + 'description_fr': 'description_fr', +} +source_to_xml('AidType-category', 'aid_type_categories', lookup_no_category) +source_to_xml('FlowType', 'flow_types', lookup_no_category) +source_to_xml('SectorCategory', 'sector_categories', lookup_no_category) -media_types = ET.parse('source/media-types.xml') -for registry in media_types.findall('{http://www.iana.org/assignments}registry'): - registry_id = registry.attrib['id'] - for record in registry.findall('{http://www.iana.org/assignments}record'): - codelist_item = ET.Element('codelist-item') +lookup_no_desc = { + 'code': 'code', + 'name_en': 'name_en', + 'name_fr': 'name_fr', +} +source_to_xml('CollaborationType', 'collaboration_types', lookup_no_desc) +source_to_xml('CRSChannelCode', 'channel_codes', lookup_no_desc) - code = ET.Element('code') - code.text = registry_id + '/' + record.find('{http://www.iana.org/assignments}name').text - codelist_item.append(code) +source_path = join('source', 'finance_type_categories.csv') +with open(source_path) as f: + reader = csv.DictReader(f) + finance_type_categories = [] + for finance_type_category in reader: + if finance_type_category['code'] == '0': + continue + finance_type_categories.append(finance_type_category) +source_to_xml('FinanceType-category', None, lookup_no_desc, source_data=finance_type_categories) - category = ET.Element('category') - category.text = registry_id - codelist_item.append(category) +lookup = { + 'code': 'code', + 'category': 'category', + 'name_en': 'name_en', + 'name_fr': 'name_fr', + 'description_en': 'description_en', + 'description_fr': 'description_fr', +} +source_to_xml('AidType', 'aid_types', lookup) - codelist_items.append(codelist_item) +source_path = join('source', 'finance_types.csv') +with open(source_path) as f: + reader = csv.DictReader(f) + finance_types = [] + for finance_type in reader: + if finance_type['category'] == '0': + continue + if finance_type['name_en'] == '': + finance_type['name_en'] = finance_type['description_en'] + if finance_type['name_fr'] == '': + finance_type['name_fr'] = finance_type['description_fr'] + finance_types.append(finance_type) +source_to_xml('FinanceType', None, lookup, source_data=finance_types) -template.write('xml/FileFormat.xml', pretty_print=True) +source_path = join('source', 'sectors.csv') +with open(source_path) as f: + reader = csv.DictReader(f) + sectors = [] + for sector in reader: + if sector['voluntary_code'] != '': + sector['code'] = sector['voluntary_code'] + for txt in ['name_en', 'name_fr', 'description_en', 'description_fr']: + sector[txt] = re.sub(r' +', ' ', sector[txt]) + sectors.append(sector) +sectors = sorted(sectors, key=lambda x: x['code']) +source_to_xml('Sector', None, lookup, source_data=sectors) +source_path = join('source', 'recipients.csv') +with open(source_path) as f: + reader = csv.DictReader(f) + regions = [x for x in reader if x['income_group'] == 'Unallocated by income'] +source_to_xml('Region', None, lookup, source_data=regions) diff --git a/get.sh b/get.sh index 627026d2..e93a72a8 100755 --- a/get.sh +++ b/get.sh @@ -1 +1,25 @@ -wget "https://www.iana.org/assignments/media-types/media-types.xml" -O source/media-types.xml +mkdir source + +# IANA +wget "https://raw.githubusercontent.com/datasets/media-types/master/media-types.csv" -O source/media-types.csv + +# ISO 4217 +wget "https://raw.githubusercontent.com/datasets/currency-codes/master/data/codes-all.csv" -O source/currencies.csv + +# ISO 3166 +wget "https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv" -O source/countries.csv + +# ISO 639-1 +wget "https://raw.githubusercontent.com/datasets/language-codes/master/data/language-codes-full.csv" -O source/languages.csv + +# DAC CRS +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/aid_types.csv" -O source/aid_types.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/aid_type_categories.csv" -O source/aid_type_categories.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/collaboration_types.csv" -O source/collaboration_types.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/channel_codes.csv" -O source/channel_codes.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/finance_types.csv" -O source/finance_types.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/finance_type_categories.csv" -O source/finance_type_categories.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/flow_types.csv" -O source/flow_types.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/recipients.csv" -O source/recipients.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/sectors.csv" -O source/sectors.csv +wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/sector_categories.csv" -O source/sector_categories.csv diff --git a/requirements.txt b/requirements.txt index 7b403f33..10e6d209 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ lxml==3.4.1 +unicodecsv==0.14.1 diff --git a/templates/AidType-category.xml b/templates/AidType-category.xml new file mode 100644 index 00000000..6e41b198 --- /dev/null +++ b/templates/AidType-category.xml @@ -0,0 +1,12 @@ + + + + Aid Type (category) + + + This codelists exists to group the Aid Type codelist into categories. It is not used as a codelist in its own right. + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/AidType.xml b/templates/AidType.xml new file mode 100644 index 00000000..e38a301e --- /dev/null +++ b/templates/AidType.xml @@ -0,0 +1,9 @@ + + + + Aid Type + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/CRSChannelCode.xml b/templates/CRSChannelCode.xml new file mode 100644 index 00000000..097e1ef7 --- /dev/null +++ b/templates/CRSChannelCode.xml @@ -0,0 +1,9 @@ + + + + CRS Channel Code + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/CollaborationType.xml b/templates/CollaborationType.xml new file mode 100644 index 00000000..1c7b9f7b --- /dev/null +++ b/templates/CollaborationType.xml @@ -0,0 +1,9 @@ + + + + Collaboration Type + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/Country.xml b/templates/Country.xml new file mode 100644 index 00000000..936fbb5c --- /dev/null +++ b/templates/Country.xml @@ -0,0 +1,25 @@ + + + + Country + + + + The Country codelist is generated from the ISO 3166-1 part of the + ISO 3166 standard. The standard makes allowance, alongside the + officially assigned codes, for code elements to be expanded by + using either reserved codes or user-assigned codes. IATI currently + defines additional codes in the XA -XZ range. + + + http://www.iso.org/iso/home/standards/country_codes.htm + + + + XK + + Kosovo + + + + diff --git a/templates/Currency.xml b/templates/Currency.xml new file mode 100644 index 00000000..ebf525b9 --- /dev/null +++ b/templates/Currency.xml @@ -0,0 +1,12 @@ + + + + Currency + + + ISO 4217 Currency used for all transactions and budgets + + http://www.iso.org/iso/home/standards/currency_codes.htm + + + diff --git a/templates/FileFormat.xml b/templates/FileFormat.xml index 8e5a0475..3fd97227 100644 --- a/templates/FileFormat.xml +++ b/templates/FileFormat.xml @@ -1,8 +1,12 @@ - - File Format - File format of published documents. - http://www.iana.org/assignments/media-types - - + + + File Format + + + File format of published documents. + + http://www.iana.org/assignments/media-types + + diff --git a/templates/FinanceType-category.xml b/templates/FinanceType-category.xml new file mode 100644 index 00000000..d20b1381 --- /dev/null +++ b/templates/FinanceType-category.xml @@ -0,0 +1,12 @@ + + + + Finance Type (category) + + + This codelists exists to group the Finance Type codelist into categories. It is not used as a codelist in its own right. + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/FinanceType.xml b/templates/FinanceType.xml new file mode 100644 index 00000000..ac53792a --- /dev/null +++ b/templates/FinanceType.xml @@ -0,0 +1,12 @@ + + + + Finance Type + + + DAC/CRS transaction classification used to distinguish financial instruments, e.g. grants or loans. + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/FlowType.xml b/templates/FlowType.xml new file mode 100644 index 00000000..e5fbae4f --- /dev/null +++ b/templates/FlowType.xml @@ -0,0 +1,12 @@ + + + + Flow Type + + + DAC/CRS distinction between ODA (official development assistance) and other types of resource flow. + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/Language.xml b/templates/Language.xml new file mode 100644 index 00000000..0649cfdb --- /dev/null +++ b/templates/Language.xml @@ -0,0 +1,9 @@ + + + + Language + + http://www.iso.org/iso/home/standards/language_codes.htm + + + diff --git a/templates/Region.xml b/templates/Region.xml new file mode 100644 index 00000000..bc1b1be4 --- /dev/null +++ b/templates/Region.xml @@ -0,0 +1,9 @@ + + + + Region + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/Sector.xml b/templates/Sector.xml new file mode 100644 index 00000000..43d98788 --- /dev/null +++ b/templates/Sector.xml @@ -0,0 +1,9 @@ + + + + DAC 5 Digit Sector + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/SectorCategory.xml b/templates/SectorCategory.xml new file mode 100644 index 00000000..fdaaab5e --- /dev/null +++ b/templates/SectorCategory.xml @@ -0,0 +1,9 @@ + + + + DAC 3 Digit Sector + + http://www.oecd.org/dac/stats/dacandcrscodelists.htm + + + diff --git a/templates/generic-codelist-item.xml b/templates/generic-codelist-item.xml new file mode 100644 index 00000000..9315797b --- /dev/null +++ b/templates/generic-codelist-item.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + +