diff --git a/.travis.yml b/.travis.yml
index 2aed753a..12c0e98b 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,4 +3,3 @@ before_install:
- sudo apt-get install -qq libxml2-utils
script:
- wget "https://raw.githubusercontent.com/IATI/IATI-Codelists/version-2.02/codelist.xsd"; wget "https://raw.githubusercontent.com/IATI/IATI-Codelists/version-2.02/xml.xsd"; xmllint --schema codelist.xsd --noout xml/*
-
diff --git a/convert.py b/convert.py
index bcf4e8d0..a7a44688 100644
--- a/convert.py
+++ b/convert.py
@@ -1,31 +1,262 @@
+from collections import OrderedDict
+from datetime import date
+from os import system
+from os.path import join
+import re
+
+from lxml import etree as ET
+import unicodecsv as csv
+
+
"""
Converts codelist files from external sources into the format used by IATI.
-Currently only supports the IANA Media Types code list (FileFormat).
+Note not all external codelists are converted automatically yet.
"""
+etparser = ET.XMLParser(encoding='utf-8', remove_blank_text=True)
+today = str(date.today())
-from lxml import etree as ET
-template = ET.parse('templates/FileFormat.xml', ET.XMLParser(remove_blank_text=True))
-codelist_items = template.find('codelist-items')
+# Adapted from code at http://effbot.org/zone/element-lib.htm
+def indent(elem, level=0, shift=2):
+    """Re-indent an element tree in place by rewriting whitespace.
+
+    Whitespace-only (or missing) ``text`` and ``tail`` values of ``elem``
+    and its descendants are replaced so that every nesting level is
+    indented by ``shift`` spaces. Non-blank text and tails are kept.
+
+    :param elem: element to indent; modified in place
+    :param level: nesting depth of ``elem`` (0 for the root)
+    :param shift: number of spaces per nesting level
+    """
+    pad = '\n' + level * ' ' * shift
+
+    def is_blank(value):
+        return not value or not value.strip()
+
+    if not len(elem):
+        # leaf element: only the tail matters, and the root never gets one
+        if level and is_blank(elem.tail):
+            elem.tail = pad
+        return
+
+    if is_blank(elem.text):
+        elem.text = pad + ' ' * shift
+    # hack to remove trailing newline: the root's tail is left alone
+    if is_blank(elem.tail) and level:
+        elem.tail = pad
+
+    last_child = None
+    for child in elem:
+        indent(child, level + 1, shift)
+        last_child = child
+    # the closing tag of ``elem`` follows the last child, so that child's
+    # tail drops back to the parent's indentation
+    if is_blank(last_child.tail):
+        last_child.tail = pad
+
+
+def str_update(current, proposed):
+    """Choose the text to store for a narrative element.
+
+    :param current: text already present in the XML, or ``None``
+    :param proposed: text from the source data, or ``None``/empty
+    :returns: ``None`` when nothing is proposed; ``current`` when the two
+        strings differ only in whitespace (so hand-formatted XML is not
+        churned); otherwise a tidied copy of ``proposed``
+    """
+    if not proposed:
+        # Covers both '' and None; previously None reached re.split()
+        # and raised TypeError whenever ``current`` had text.
+        return None
+    # \xa0 is listed explicitly because \s does not match a non-breaking
+    # space for every string type. The former '|' inside the character
+    # class was a literal pipe, not an alternation.
+    whitespace = r'[\s\xa0]+'
+    if current and re.split(whitespace, current) == re.split(whitespace, proposed):
+        return current
+    # Drop carriage returns and collapse a double space after a full stop.
+    return proposed.replace('\r', '').replace('.  ', '. ').strip()
+
+
+def create_codelist_item(keys):
+    """Build an empty ``codelist-item`` element from the generic template.
+
+    Child elements for which the source data has no field are removed.
+
+    :param keys: field names available for the code; only membership is tested
+    :returns: root element parsed from ``templates/generic-codelist-item.xml``
+    """
+    item = ET.parse(join('templates', 'generic-codelist-item.xml'), etparser).getroot()
+    # (field that must be available, child element removed when it is not)
+    optional_children = (
+        ('name_en', 'name'),
+        ('description_en', 'description'),
+        ('category', 'category'),
+    )
+    for field, tag in optional_children:
+        if field not in keys:
+            item.remove(item.find(tag))
+    return item
+
+
+def update_codelist_item(codelist_item, code_dict):
+    """Copy the values of one source row into a ``codelist-item`` element.
+
+    :param codelist_item: element to update; modified in place
+    :param code_dict: source row. ``code`` is required; ``category``,
+        ``name_en``/``name_fr`` and ``description_en``/``description_fr``
+        are applied when present
+    :returns: the same ``codelist_item`` element
+    """
+    lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
+
+    # update code (an existing code is never overwritten)
+    code_el = codelist_item.find('code')
+    if not code_el.text:
+        code_el.text = code_dict['code']
+    if 'category' in code_dict:
+        # update category
+        codelist_item.find('category').text = code_dict['category']
+
+    if 'name_en' in code_dict:
+        for name_el in codelist_item.findall('name/narrative'):
+            field = 'name_fr' if name_el.get(lang_attr) == 'fr' else 'name_en'
+            # ``or ''``: a missing translation must not reach str_update()
+            # as None, which fails when the element already has text.
+            name_el.text = str_update(name_el.text, code_dict.get(field) or '')
+
+    if 'description_en' in code_dict:
+        for description_el in codelist_item.findall('description/narrative'):
+            if description_el.get(lang_attr) == 'fr':
+                # .get(): the French column is optional in the source data
+                description_fr = code_dict.get('description_fr')
+                if description_fr:
+                    description_el.text = str_update(description_el.text, description_fr)
+                else:
+                    # no French text available: drop the empty narrative
+                    codelist_item.find('description').remove(description_el)
+            else:
+                description_el.text = str_update(
+                    description_el.text, code_dict['description_en'] or '')
+
+    return codelist_item
+
+
+def source_to_xml(tmpl_name, source_name, lookup, source_data=None):
+    """Regenerate ``xml/<tmpl_name>.xml`` from its template and source data.
+
+    The previously generated file is walked in document order and merged
+    with the source rows:
+
+    * codes present in both are updated in place;
+    * codes only in the source are added as ``active`` with today's
+      activation date;
+    * codes only in the old file are kept and, unless already withdrawn
+      or hard-coded in the template, marked ``withdrawn`` as of today.
+
+    :param tmpl_name: base name of both ``templates/<name>.xml`` and the
+        ``xml/<name>.xml`` file that is read and then overwritten
+    :param source_name: base name of ``source/<name>.csv``; only used
+        when ``source_data`` is empty or not given
+    :param lookup: maps output field names to CSV column names; only used
+        when the CSV is read here
+    :param source_data: optional pre-built list of row dicts, each with
+        at least a ``code`` key
+    """
+    def new_active_item(code_dict):
+        # brand-new code: build it from the item template, activated today
+        item = create_codelist_item(code_dict.keys())
+        item = update_codelist_item(item, code_dict)
+        item.attrib['status'] = 'active'
+        item.attrib['activation-date'] = today
+        return item
+
+    old_xml = ET.parse(join('xml', '{}.xml'.format(tmpl_name)), etparser)
+
+    tmpl_path = join('templates', '{}.xml'.format(tmpl_name))
+    xml = ET.parse(tmpl_path, etparser)
+    codelist_items = xml.find('codelist-items')
+
+    if not source_data:
+        source_path = join('source', '{}.csv'.format(source_name))
+        with open(source_path) as f:
+            reader = csv.DictReader(f)
+            source_data = [{
+                k: x.get(v) for k, v in lookup.items()
+            } for x in reader if x[lookup['code']]]
+
+    # keyed by upper-cased code, in source order
+    source_data_dict = OrderedDict(
+        (source_data_row['code'].upper(), source_data_row)
+        for source_data_row in source_data)
+
+    for old_codelist_el in old_xml.xpath('//codelist-item'):
+        old_codelist_code = old_codelist_el.find('code').text.upper()
+
+        # Peek at the front of the source queue: every leading code that is
+        # neither this one nor still waiting in the old file is new, so it
+        # is emitted before the current old item.
+        while source_data_dict:
+            new_code_dict = next(iter(source_data_dict.values()))
+            if new_code_dict['code'].upper() == old_codelist_code:
+                break
+            if old_xml.xpath('//codelist-item/code[text()="{}"]/..'.format(new_code_dict['code'])):
+                break
+            codelist_items.append(new_active_item(new_code_dict))
+            source_data_dict.popitem(last=False)
+
+        if old_codelist_code in source_data_dict:
+            # it's in the current codes, so update it
+            new_code_dict = source_data_dict[old_codelist_code]
+            updated_codelist_item = update_codelist_item(old_codelist_el, new_code_dict)
+            # append() moves the element out of old_xml into the new tree
+            codelist_items.append(updated_codelist_item)
+            del source_data_dict[old_codelist_code]
+        elif old_codelist_el.attrib.get('status') == 'withdrawn':
+            # it's an old withdrawn code, so just copy it
+            codelist_items.append(old_codelist_el)
+        elif codelist_items.xpath('//codelist-item/code[text()="{}"]/..'.format(old_codelist_el.find('code').text)):
+            # some codelist items are hard-coded, and should just
+            # be left as is
+            pass
+        else:
+            # it's a newly withdrawn code, so mark the withdrawal date
+            # as today, and copy it
+            old_codelist_el.attrib['status'] = 'withdrawn'
+            old_codelist_el.attrib['withdrawal-date'] = today
+            codelist_items.append(old_codelist_el)
+
+    # Source codes that follow the last old item were never reached by the
+    # peek above and used to be dropped; they are new codes as well.
+    for new_code_dict in source_data_dict.values():
+        codelist_items.append(new_active_item(new_code_dict))
+
+    output_path = join('xml', '{}.xml'.format(tmpl_name))
+    for el in xml.iter('*'):
+        if el.text is not None and not el.text.strip():
+            # force tag self-escaping
+            el.text = None
+    indent(xml.getroot(), 0, 4)
+    xml.write(output_path, encoding='utf-8', pretty_print=True)
+
+
+# Driver section: regenerate each codelist under xml/ from its source file.
+# Each *_lookup dict maps an output field name (key) to the column name in
+# the source CSV (value); source_to_xml() uses it when it reads the CSV.
+
+# throw away any local changes
+system('git checkout -- xml/')
+
+fileformat_lookup = {
+ 'code': 'Media Type',
+ 'category': 'Type',
+}
+source_to_xml('FileFormat', 'media-types', fileformat_lookup)
+
+# NOTE(review): 'withdrawal_date' is copied into each row, but none of the
+# functions in this file appear to read it yet (see the TODO below).
+currency_lookup = {
+ 'code': 'AlphabeticCode',
+ 'name_en': 'Currency',
+ 'withdrawal_date': 'WithdrawalDate',
+}
+# TODO: source data includes withdrawn codes!
+# This needs to be factored in when parsing.
+source_to_xml('Currency', 'currencies', currency_lookup)
+
+# Countries are pre-parsed here because the wanted columns are selected by
+# hand; rows without an ISO3166-1-Alpha-2 value are skipped. When the list
+# is non-empty, source_to_xml() uses it directly instead of the lookup.
+country_lookup = {
+ 'code': 'code',
+ 'name_en': 'name_en',
+}
+source_path = join('source', 'countries.csv')
+with open(source_path) as f:
+ reader = csv.DictReader(f)
+ countries = [{
+ 'code': x['ISO3166-1-Alpha-2'],
+ 'name_en': x['CLDR display name'],
+ } for x in reader if x['ISO3166-1-Alpha-2']]
+source_to_xml('Country', 'countries', country_lookup, source_data=countries)
+
+language_lookup = {
+ 'code': 'alpha2',
+ 'name_en': 'English',
+}
+source_to_xml('Language', 'languages', language_lookup)
+
+# Codelists with names and descriptions in English and French, no category.
+lookup_no_category = {
+ 'code': 'code',
+ 'name_en': 'name_en',
+ 'name_fr': 'name_fr',
+ 'description_en': 'description_en',
+ 'description_fr': 'description_fr',
+}
+source_to_xml('AidType-category', 'aid_type_categories', lookup_no_category)
+source_to_xml('FlowType', 'flow_types', lookup_no_category)
+source_to_xml('SectorCategory', 'sector_categories', lookup_no_category)
-media_types = ET.parse('source/media-types.xml')
-for registry in media_types.findall('{http://www.iana.org/assignments}registry'):
- registry_id = registry.attrib['id']
- for record in registry.findall('{http://www.iana.org/assignments}record'):
- codelist_item = ET.Element('codelist-item')
+# Codelists that only carry a code plus English and French names.
+lookup_no_desc = {
+ 'code': 'code',
+ 'name_en': 'name_en',
+ 'name_fr': 'name_fr',
+}
+source_to_xml('CollaborationType', 'collaboration_types', lookup_no_desc)
+source_to_xml('CRSChannelCode', 'channel_codes', lookup_no_desc)
- code = ET.Element('code')
- code.text = registry_id + '/' + record.find('{http://www.iana.org/assignments}name').text
- codelist_item.append(code)
+# Finance type categories: read the CSV here so rows whose code is '0' can
+# be skipped. The raw rows are passed as source_data, so when the list is
+# non-empty source_to_xml() does not read a CSV (hence source_name=None).
+source_path = join('source', 'finance_type_categories.csv')
+with open(source_path) as f:
+ reader = csv.DictReader(f)
+ finance_type_categories = []
+ for finance_type_category in reader:
+ if finance_type_category['code'] == '0':
+ continue
+ finance_type_categories.append(finance_type_category)
+source_to_xml('FinanceType-category', None, lookup_no_desc, source_data=finance_type_categories)
- category = ET.Element('category')
- category.text = registry_id
- codelist_item.append(category)
+# Full lookup: code, category, and names/descriptions in English and French.
+lookup = {
+ 'code': 'code',
+ 'category': 'category',
+ 'name_en': 'name_en',
+ 'name_fr': 'name_fr',
+ 'description_en': 'description_en',
+ 'description_fr': 'description_fr',
+}
+source_to_xml('AidType', 'aid_types', lookup)
- codelist_items.append(codelist_item)
+# Finance types: rows in category '0' are skipped, and an empty name falls
+# back to the description in the same language.
+source_path = join('source', 'finance_types.csv')
+with open(source_path) as f:
+ reader = csv.DictReader(f)
+ finance_types = []
+ for finance_type in reader:
+ if finance_type['category'] == '0':
+ continue
+ if finance_type['name_en'] == '':
+ finance_type['name_en'] = finance_type['description_en']
+ if finance_type['name_fr'] == '':
+ finance_type['name_fr'] = finance_type['description_fr']
+ finance_types.append(finance_type)
+source_to_xml('FinanceType', None, lookup, source_data=finance_types)
-template.write('xml/FileFormat.xml', pretty_print=True)
+# Sectors: a non-empty voluntary_code replaces the row's code, runs of
+# spaces in the text fields are collapsed, and rows are sorted by code
+# (string comparison) before conversion.
+source_path = join('source', 'sectors.csv')
+with open(source_path) as f:
+ reader = csv.DictReader(f)
+ sectors = []
+ for sector in reader:
+ if sector['voluntary_code'] != '':
+ sector['code'] = sector['voluntary_code']
+ for txt in ['name_en', 'name_fr', 'description_en', 'description_fr']:
+ sector[txt] = re.sub(r' +', ' ', sector[txt])
+ sectors.append(sector)
+sectors = sorted(sectors, key=lambda x: x['code'])
+source_to_xml('Sector', None, lookup, source_data=sectors)
+# Regions: only the recipients whose income_group is
+# 'Unallocated by income' are kept from recipients.csv.
+source_path = join('source', 'recipients.csv')
+with open(source_path) as f:
+ reader = csv.DictReader(f)
+ regions = [x for x in reader if x['income_group'] == 'Unallocated by income']
+source_to_xml('Region', None, lookup, source_data=regions)
diff --git a/get.sh b/get.sh
index 627026d2..e93a72a8 100755
--- a/get.sh
+++ b/get.sh
@@ -1 +1,25 @@
-wget "https://www.iana.org/assignments/media-types/media-types.xml" -O source/media-types.xml
+# Download every external source file that convert.py reads into source/.
+# -p: do not fail when source/ already exists (the script is meant to be re-run).
+mkdir -p source
+
+# IANA
+wget "https://raw.githubusercontent.com/datasets/media-types/master/media-types.csv" -O source/media-types.csv
+
+# ISO 4217
+wget "https://raw.githubusercontent.com/datasets/currency-codes/master/data/codes-all.csv" -O source/currencies.csv
+
+# ISO 3166
+wget "https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv" -O source/countries.csv
+
+# ISO 639-1
+wget "https://raw.githubusercontent.com/datasets/language-codes/master/data/language-codes-full.csv" -O source/languages.csv
+
+# DAC CRS
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/aid_types.csv" -O source/aid_types.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/aid_type_categories.csv" -O source/aid_type_categories.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/collaboration_types.csv" -O source/collaboration_types.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/channel_codes.csv" -O source/channel_codes.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/finance_types.csv" -O source/finance_types.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/finance_type_categories.csv" -O source/finance_type_categories.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/flow_types.csv" -O source/flow_types.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/recipients.csv" -O source/recipients.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/sectors.csv" -O source/sectors.csv
+wget "https://raw.githubusercontent.com/datasets/dac-crs-codes/master/data/sector_categories.csv" -O source/sector_categories.csv
diff --git a/requirements.txt b/requirements.txt
index 7b403f33..10e6d209 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
lxml==3.4.1
+unicodecsv==0.14.1
diff --git a/templates/AidType-category.xml b/templates/AidType-category.xml
new file mode 100644
index 00000000..6e41b198
--- /dev/null
+++ b/templates/AidType-category.xml
@@ -0,0 +1,12 @@
+
+
+
+ Aid Type (category)
+
+
+ This codelist exists to group the Aid Type codelist into categories. It is not used as a codelist in its own right.
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/AidType.xml b/templates/AidType.xml
new file mode 100644
index 00000000..e38a301e
--- /dev/null
+++ b/templates/AidType.xml
@@ -0,0 +1,9 @@
+
+
+
+ Aid Type
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/CRSChannelCode.xml b/templates/CRSChannelCode.xml
new file mode 100644
index 00000000..097e1ef7
--- /dev/null
+++ b/templates/CRSChannelCode.xml
@@ -0,0 +1,9 @@
+
+
+
+ CRS Channel Code
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/CollaborationType.xml b/templates/CollaborationType.xml
new file mode 100644
index 00000000..1c7b9f7b
--- /dev/null
+++ b/templates/CollaborationType.xml
@@ -0,0 +1,9 @@
+
+
+
+ Collaboration Type
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/Country.xml b/templates/Country.xml
new file mode 100644
index 00000000..936fbb5c
--- /dev/null
+++ b/templates/Country.xml
@@ -0,0 +1,25 @@
+
+
+
+ Country
+
+
+
+ The Country codelist is generated from the ISO 3166-1 part of the
+ ISO 3166 standard. The standard makes allowance, alongside the
+ officially assigned codes, for code elements to be expanded by
+ using either reserved codes or user-assigned codes. IATI currently
+ defines additional codes in the XA-XZ range.
+
+
+ http://www.iso.org/iso/home/standards/country_codes.htm
+
+
+
+ XK
+
+ Kosovo
+
+
+
+
diff --git a/templates/Currency.xml b/templates/Currency.xml
new file mode 100644
index 00000000..ebf525b9
--- /dev/null
+++ b/templates/Currency.xml
@@ -0,0 +1,12 @@
+
+
+
+ Currency
+
+
+ ISO 4217 Currency used for all transactions and budgets
+
+ http://www.iso.org/iso/home/standards/currency_codes.htm
+
+
+
diff --git a/templates/FileFormat.xml b/templates/FileFormat.xml
index 8e5a0475..3fd97227 100644
--- a/templates/FileFormat.xml
+++ b/templates/FileFormat.xml
@@ -1,8 +1,12 @@
-
- File Format
- File format of published documents.
- http://www.iana.org/assignments/media-types
-
-
+
+
+ File Format
+
+
+ File format of published documents.
+
+ http://www.iana.org/assignments/media-types
+
+
diff --git a/templates/FinanceType-category.xml b/templates/FinanceType-category.xml
new file mode 100644
index 00000000..d20b1381
--- /dev/null
+++ b/templates/FinanceType-category.xml
@@ -0,0 +1,12 @@
+
+
+
+ Finance Type (category)
+
+
+ This codelist exists to group the Finance Type codelist into categories. It is not used as a codelist in its own right.
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/FinanceType.xml b/templates/FinanceType.xml
new file mode 100644
index 00000000..ac53792a
--- /dev/null
+++ b/templates/FinanceType.xml
@@ -0,0 +1,12 @@
+
+
+
+ Finance Type
+
+
+ DAC/CRS transaction classification used to distinguish financial instruments, e.g. grants or loans.
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/FlowType.xml b/templates/FlowType.xml
new file mode 100644
index 00000000..e5fbae4f
--- /dev/null
+++ b/templates/FlowType.xml
@@ -0,0 +1,12 @@
+
+
+
+ Flow Type
+
+
+ DAC/CRS distinction between ODA (official development assistance) and other types of resource flow.
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/Language.xml b/templates/Language.xml
new file mode 100644
index 00000000..0649cfdb
--- /dev/null
+++ b/templates/Language.xml
@@ -0,0 +1,9 @@
+
+
+
+ Language
+
+ http://www.iso.org/iso/home/standards/language_codes.htm
+
+
+
diff --git a/templates/Region.xml b/templates/Region.xml
new file mode 100644
index 00000000..bc1b1be4
--- /dev/null
+++ b/templates/Region.xml
@@ -0,0 +1,9 @@
+
+
+
+ Region
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/Sector.xml b/templates/Sector.xml
new file mode 100644
index 00000000..43d98788
--- /dev/null
+++ b/templates/Sector.xml
@@ -0,0 +1,9 @@
+
+
+
+ DAC 5 Digit Sector
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/SectorCategory.xml b/templates/SectorCategory.xml
new file mode 100644
index 00000000..fdaaab5e
--- /dev/null
+++ b/templates/SectorCategory.xml
@@ -0,0 +1,9 @@
+
+
+
+ DAC 3 Digit Sector
+
+ http://www.oecd.org/dac/stats/dacandcrscodelists.htm
+
+
+
diff --git a/templates/generic-codelist-item.xml b/templates/generic-codelist-item.xml
new file mode 100644
index 00000000..9315797b
--- /dev/null
+++ b/templates/generic-codelist-item.xml
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
+
+
+
+