## Import libraries

In [1]:
import lxml.etree as ET
import csv
import pandas as pd
import numpy as np

## Import files

In [2]:
# Parse the XML input file; `xml` is the root element that later cells iterate over.
xml_file = ET.parse("UG_DATA.xml")
xml = xml_file.getroot()

In [3]:
# Load the code list lookup table (columns: Codelist, Code, Name).
# Codes are read as strings because they are later compared with XML attribute
# values, which are always strings. Only empty fields are treated as missing:
# with pandas' default NA handling a literal code such as "NA" would be parsed
# as NaN and could never be matched.
mapping = pd.read_csv(
    "codelists.csv",
    dtype={"Code": str},
    keep_default_na=False,
    na_values=[""],
)
mapping

Unnamed: 0,Codelist,Code,Name
0,FREQ,A,Annual
1,FREQ,S,"Half-yearly, semester"
2,FREQ,Q,Quarterly
3,FREQ,M,Monthly
4,FREQ,W,Weekly
5,FREQ,D,Daily
6,FREQ,H,Hourly
7,FREQ,B,Daily – businessweek
8,FREQ,N,Minutely
9,OBS_STATUS,A,Normal value


## Convert XML to CSV
Create an empty dataframe:

In [4]:
# Start from an empty dataframe; it is filled from the XML in the next cell.
xml_data=pd.DataFrame()

Populate xml_data with values from the XML:

In [5]:
# Build one record (a dict of attribute name -> value) per <Obs> element and
# create the dataframe in a single step.
#
# Attributes are read directly from each element's attribute mapping instead of
# splitting its string representation, so values containing ", ", ": " or quote
# characters (for example free-text comments) are kept intact, and an element
# without attributes no longer raises an error.
records = []
for series in xml.iter('Series'):
    # Series-level attributes (the disaggregations) apply to every observation
    # inside the series.
    disaggregations = dict(series.attrib)
    for obs in series.findall('Obs'):
        # Combine the series-level and observation-level attributes into one row.
        records.append({**disaggregations, **dict(obs.attrib)})

# Attributes that are absent for a given observation become NaN in its row.
# Columns are ordered alphabetically by name and rows get a 0..n-1 index.
xml_data = pd.DataFrame(records)
xml_data = xml_data[sorted(xml_data.columns)]
xml_data.head()

Unnamed: 0,AGE,COMMENT_OBS,COMPOSITE_BREAKDOWN,CUST_BREAKDOWN,DISABILITY_STATUS,EDUCATION_LEV,FREQ,INCOME_WEALTH_QUANTILE,NATURE,OBS_STATUS,...,REF_AREA,REPORTING_TYPE,SERIES,SEX,SOURCE_DETAIL,TIME_DETAIL,TIME_PERIOD,UNIT_MEASURE,UNIT_MULT,URBANISATION
0,_T,,_T,_T,_T,_T,A,_T,C,A,...,UG,N,SI_POV_NAHC,_T,UNHS,2017,2017,PERCENT,0,_T
1,Y0T4,,_T,_T,_T,_T,A,_T,C,A,...,UG,N,SH_STA_STNT,F,UDHS,2016,2016,PERCENT,0,_T
2,Y0T4,,_T,_T,_T,_T,A,_T,C,A,...,UG,N,SH_STA_STNT,F,UDHS,2011,2011,PERCENT,0,_T
3,Y0T4,,_T,_T,_T,_T,A,_T,C,A,...,UG,N,SH_STA_STNT,M,UDHS,2016,2016,PERCENT,0,_T
4,Y0T4,,_T,_T,_T,_T,A,_T,C,A,...,UG,N,SH_STA_STNT,M,UDHS,2011,2011,PERCENT,0,_T


Drop columns that contain only "_T": such a column has a single breakdown type, so it is not required:

In [6]:
# Collect every column whose only distinct value is "_T", then remove them all
# in a single call.
total_only_columns = [
    name for name in xml_data.columns
    if set(xml_data[name]) == {"_T"}
]
xml_data.drop(columns=total_only_columns, inplace=True)

# The observation value column is called _REPVAR_ from here on.
xml_data.rename(columns={"OBS_VALUE": "_REPVAR_"}, inplace=True)

xml_data.head()

Unnamed: 0,AGE,COMMENT_OBS,DISABILITY_STATUS,FREQ,NATURE,OBS_STATUS,_REPVAR_,REF_AREA,REPORTING_TYPE,SERIES,SEX,SOURCE_DETAIL,TIME_DETAIL,TIME_PERIOD,UNIT_MEASURE,UNIT_MULT
0,_T,,_T,A,C,A,21.4,UG,N,SI_POV_NAHC,_T,UNHS,2017,2017,PERCENT,0
1,Y0T4,,_T,A,C,A,26.9,UG,N,SH_STA_STNT,F,UDHS,2016,2016,PERCENT,0
2,Y0T4,,_T,A,C,A,37.0,UG,N,SH_STA_STNT,F,UDHS,2011,2011,PERCENT,0
3,Y0T4,,_T,A,C,A,30.9,UG,N,SH_STA_STNT,M,UDHS,2016,2016,PERCENT,0
4,Y0T4,,_T,A,C,A,29.9,UG,N,SH_STA_STNT,M,UDHS,2011,2011,PERCENT,0


## Convert xml_data codes to the names found in mapping
Create a new empty dataframe (same index as xml_data):

In [7]:
# Empty dataframe with one row per xml_data row; its columns are added in the next cell.
csv_data=pd.DataFrame(index=xml_data.index)

For each row in xml_data, convert the DSD code id to the DSD code name using mapping:

In [8]:
# Translate each coded column of xml_data into its human-readable name.
#
# A single Code -> Name lookup is built per column and applied to the whole
# column at once, instead of scanning `mapping` again for every cell.

# Columns copied across unchanged (they hold values, not codes).
passthrough_columns = ["_REPVAR_", "TIME_PERIOD"]

# Keep only (Codelist, Code) pairs that identify exactly one name. A code that
# is missing from the mapping, or listed more than once for its code list,
# is left blank (NaN) in csv_data.
unique_codes = (
    mapping
    .dropna(subset=["Code"])
    .drop_duplicates(subset=["Codelist", "Code"], keep=False)
)

for col in xml_data.columns.drop(passthrough_columns):
    # The code list used for a column is the one named after that column.
    lookup = unique_codes.loc[unique_codes["Codelist"] == col].set_index("Code")["Name"]
    names = xml_data[col].map(lookup)
    # Columns where no value matches any code are left out of csv_data.
    if names.notna().any():
        csv_data[col] = names

for col in passthrough_columns:
    csv_data[col] = xml_data[col]

Below you can see the difference between the xml_data and csv_data dataframes:

In [9]:
# Last rows of the data as read from the XML (values are still codes).
xml_data.tail()

Unnamed: 0,AGE,COMMENT_OBS,DISABILITY_STATUS,FREQ,NATURE,OBS_STATUS,_REPVAR_,REF_AREA,REPORTING_TYPE,SERIES,SEX,SOURCE_DETAIL,TIME_DETAIL,TIME_PERIOD,UNIT_MEASURE,UNIT_MULT
96,Y0T4,,_T,A,C,A,32.2,UG,N,SG_REG_BRTH,M,UDHS,2017,2017,PERCENT,0
97,Y0T4,,_T,A,C,A,32.2,UG,N,SG_REG_BRTH,_T,UDHS,2017,2017,PERCENT,0
98,_T,,_T,A,C,A,35.0,UG,N,VC_VOV_DCMN,_T,NGPSS,2017,2017,PERCENT,0
99,_T,,_T,A,C,A,32.0,UG,N,VC_VOV_DCMN,F,NGPSS,2017,2017,PERCENT,0
100,_T,,_T,A,C,A,39.0,UG,N,VC_VOV_DCMN,M,NGPSS,2017,2017,PERCENT,0


In [10]:
# Last rows of the converted data (codes replaced by their names from mapping).
csv_data.tail()

Unnamed: 0,AGE,FREQ,NATURE,OBS_STATUS,REPORTING_TYPE,SEX,UNIT_MEASURE,UNIT_MULT,_REPVAR_,TIME_PERIOD
96,under 5 years old,Annual,Country Data,Normal value,National,Male,Percent,Units,32.2,2017
97,under 5 years old,Annual,Country Data,Normal value,National,Both sexes or no breakdown by sex,Percent,Units,32.2,2017
98,All age ranges or no breakdown by age,Annual,Country Data,Normal value,National,Both sexes or no breakdown by sex,Percent,Units,35.0,2017
99,All age ranges or no breakdown by age,Annual,Country Data,Normal value,National,Female,Percent,Units,32.0,2017
100,All age ranges or no breakdown by age,Annual,Country Data,Normal value,National,Male,Percent,Units,39.0,2017


## Write out indicator to CSV
To view all of the csv data, view the outputted CSV file:

In [12]:
# Save the converted data to uganda_xml2csv.csv using UTF-8 encoding.
csv_data.to_csv("uganda_xml2csv.csv", encoding="utf-8")