## Import libraries

In [1]:
import lxml.etree as ET
import csv
import pandas as pd
import numpy as np

## Import files

In [2]:
# Parse the indicator XML file and keep its root element; a later cell
# walks the root's 'Series' elements and their 'Obs' children.
xml_file = ET.parse("indicator_1-2-1.xml")
xml = xml_file.getroot()

In [3]:
# Lookup table pairing DSD names/codes (DSD_Dim, DSD_Dim_Code) with the
# names/codes wanted in the output (EXT_Dim, EXT_Dim_Code).
mapping=pd.read_csv("121_newmapping.csv")
mapping

Unnamed: 0,DSD_Dim,DSD_Dim_Code,EXT_Dim,EXT_Dim_Code,DSD_Attri,DSD_Attri_Code,DSD_Dim_Type
0,_REPVAR_,SDG,Value,NONE,,,STUDYVAR
1,FREQ,A,,,,4.0,DIM
2,REPORTING_TYPE,N,,,,4.0,DIM
3,SERIES,SI_POV_NAHC,,,,4.0,DIM
4,REF_AREA,826,,,,4.0,DIM
5,SEX,F,Sex,Female,,3.0,DIM
6,SEX,M,Sex,Male,,3.0,DIM
7,SEX,_T,Sex,_,,3.0,DIM
8,AGE,_T,Age,_,,3.0,DIM
9,AGE,Y0T16,Age,15 and under,,3.0,DIM


## Create some objects
Create an empty dataframe to hold the data transferred from the XML file:

In [4]:
# Empty placeholder; the populate cell below fills it with one row per observation.
csv_data=pd.DataFrame()

## Populate csv_data dataframe

In [5]:
# Build one record per <Obs> element by combining the attributes of its
# enclosing <Series> (the disaggregations) with the observation's own
# attributes.
#
# The attribute mappings are read directly instead of being recovered from
# their string representation, so values that contain ", ", ": ", quotes or
# braces are kept intact. All records are collected first and the dataframe
# is built once, which avoids DataFrame.append (removed in pandas 2.0) and
# the repeated copying caused by growing a dataframe row by row.
records = []
for series in xml.iter('Series'):
    disaggregations = dict(series.attrib)
    for obs in series.findall('Obs'):
        # Series attributes first, then observation attributes, so the
        # resulting column order matches the order in the XML.
        records.append({**disaggregations, **dict(obs.attrib)})

# Every value is kept as the string found in the XML; rows are indexed 0..n-1.
csv_data = pd.DataFrame(records)
csv_data.head()

Unnamed: 0,FREQ,REPORTING_TYPE,SERIES,REF_AREA,SEX,AGE,URBANISATION,INCOME_WEALTH_QUANTILE,EDUCATION_LEV,OCCUPATION,CUST_BREAKDOWN,COMPOSITE_BREAKDOWN,DISABILITY_STATUS,TIME_PERIOD,OBS_VALUE,OBS_STATUS,UNIT_MULT,UNIT_MEASURE
0,A,N,SI_POV_NAHC,826,F,Y0T16,_T,_T,_T,_T,_T,_T,_T,2005,31.4,A,0,PERCENT
1,A,N,SI_POV_NAHC,826,F,Y0T16,_T,_T,_T,_T,_T,_T,_T,2006,30.5,A,0,PERCENT
2,A,N,SI_POV_NAHC,826,F,Y0T16,_T,_T,_T,_T,_T,_T,_T,2007,28.6,A,0,PERCENT
3,A,N,SI_POV_NAHC,826,F,Y0T16,_T,_T,_T,_T,_T,_T,_T,2008,29.7,A,0,PERCENT
4,A,N,SI_POV_NAHC,826,F,Y0T16,_T,_T,_T,_T,_T,_T,_T,2009,28.9,A,0,PERCENT


Drop any column that contains only "_T": such a column has a single breakdown type, so it is not required:

In [6]:
# Collect the columns whose every value is "_T", then drop them in one call.
total_only = [name for name in csv_data.columns if set(csv_data[name]) == {"_T"}]
csv_data.drop(columns=total_only, inplace=True)

# The mapping refers to the observation value as _REPVAR_, so use that name here too.
csv_data.rename(columns={"OBS_VALUE": "_REPVAR_"}, inplace=True)

csv_data.head()

Unnamed: 0,FREQ,REPORTING_TYPE,SERIES,REF_AREA,SEX,AGE,TIME_PERIOD,_REPVAR_,OBS_STATUS,UNIT_MULT,UNIT_MEASURE
0,A,N,SI_POV_NAHC,826,F,Y0T16,2005,31.4,A,0,PERCENT
1,A,N,SI_POV_NAHC,826,F,Y0T16,2006,30.5,A,0,PERCENT
2,A,N,SI_POV_NAHC,826,F,Y0T16,2007,28.6,A,0,PERCENT
3,A,N,SI_POV_NAHC,826,F,Y0T16,2008,29.7,A,0,PERCENT
4,A,N,SI_POV_NAHC,826,F,Y0T16,2009,28.9,A,0,PERCENT


## Convert csv_data to values found in mapping
Create a new empty dataframe (same length as csv_data):

In [7]:
# No columns yet, but the same row index as csv_data, so columns copied
# across from csv_data line up row for row.
indicator=pd.DataFrame(index=csv_data.index)

For each column in csv_data create a column in indicator (df) which has the corresponding column name from the mapping:

In [8]:
# Walk a snapshot of the column labels, because csv_data is renamed as we go.
for dsd_name in list(csv_data.columns):
    ext_names = mapping.loc[mapping["DSD_Dim"] == dsd_name, "EXT_Dim"]
    if ext_names.empty:
        # No mapping row for this column: leave it out of indicator.
        continue
    # Use the first matching row. When that row has no EXT_Dim the name is
    # NaN, so all such columns end up sharing one NaN-labelled column in
    # indicator (the next cell drops it).
    ext_name = ext_names.iloc[0]
    indicator[ext_name] = csv_data[dsd_name]
    csv_data.rename(columns={dsd_name: ext_name}, inplace=True)
csv_data.head()

Unnamed: 0,nan,nan.1,nan.2,nan.3,Sex,Age,Year,Value,Obs.status,Unit.mult,Unit.measure
0,A,N,SI_POV_NAHC,826,F,Y0T16,2005,31.4,A,0,PERCENT
1,A,N,SI_POV_NAHC,826,F,Y0T16,2006,30.5,A,0,PERCENT
2,A,N,SI_POV_NAHC,826,F,Y0T16,2007,28.6,A,0,PERCENT
3,A,N,SI_POV_NAHC,826,F,Y0T16,2008,29.7,A,0,PERCENT
4,A,N,SI_POV_NAHC,826,F,Y0T16,2009,28.9,A,0,PERCENT


Remove any 'nan' columns (as they're not needed for NRP):

In [9]:
# Columns whose mapping row has no EXT_Dim were stored under a NaN label.
# errors="ignore" stops this cell raising KeyError when every column had a
# name in the mapping and there is therefore no NaN column to remove.
indicator.drop(columns=[np.nan], inplace=True, errors="ignore")
indicator.head()

Unnamed: 0,Sex,Age,Year,Value,Obs.status,Unit.mult,Unit.measure
0,F,Y0T16,2005,31.4,A,0,PERCENT
1,F,Y0T16,2006,30.5,A,0,PERCENT
2,F,Y0T16,2007,28.6,A,0,PERCENT
3,F,Y0T16,2008,29.7,A,0,PERCENT
4,F,Y0T16,2009,28.9,A,0,PERCENT


Convert each DSD value in `indicator` to its NRP value using the mapping:

In [10]:
# Translate the DSD codes in every column except Year and Value.
# The lookup for a column is built once from the mapping rows whose EXT_Dim
# is that column, rather than filtering the mapping again for every cell.
for col in [name for name in indicator.columns if name not in ("Year", "Value")]:
    candidates = mapping.loc[mapping["EXT_Dim"] == col, ["DSD_Dim_Code", "EXT_Dim_Code"]]
    # A code is translated only when exactly one mapping row matches it;
    # codes with no match or several matches are left unchanged.
    candidates = (
        candidates
        .dropna(subset=["DSD_Dim_Code"])
        .drop_duplicates(subset="DSD_Dim_Code", keep=False)
    )
    lookup = dict(zip(candidates["DSD_Dim_Code"], candidates["EXT_Dim_Code"]))
    indicator[col] = indicator[col].map(lambda code: lookup.get(code, code))
indicator.tail()

Unnamed: 0,Sex,Age,Year,Value,Obs.status,Unit.mult,Unit.measure
268,_,_,2013,24.8,Normal value,Units,Percentage (%)
269,_,_,2014,24.1,Normal value,Units,Percentage (%)
270,_,_,2015,23.5,Normal value,Units,Percentage (%)
271,_,_,2016,22.2,Normal value,Units,Percentage (%)
272,_,_,2017,17.0,Normal value,Units,Percentage (%)


In [11]:
# Swap the dots in the column names for spaces, e.g. "Obs.status" -> "Obs status".
indicator.rename(columns=lambda name: name.replace(".", " "), inplace=True)

Change any "_" values to empty strings:

In [12]:
# Cells whose whole value is "_" become empty strings.
indicator.replace(to_replace="_", value="", inplace=True)
indicator.tail()

Unnamed: 0,Sex,Age,Year,Value,Obs status,Unit mult,Unit measure
268,,,2013,24.8,Normal value,Units,Percentage (%)
269,,,2014,24.1,Normal value,Units,Percentage (%)
270,,,2015,23.5,Normal value,Units,Percentage (%)
271,,,2016,22.2,Normal value,Units,Percentage (%)
272,,,2017,17.0,Normal value,Units,Percentage (%)


Rearrange columns as the NRP requires the first column to be Year and the last column to be Value:

In [14]:
# Year goes first, Value goes last, everything else keeps its current order.
middle_columns = [name for name in indicator.columns if name not in ("Year", "Value")]
indicator = indicator[["Year", *middle_columns, "Value"]]
indicator.head()

Unnamed: 0,Year,Sex,Age,Obs status,Unit mult,Unit measure,Value
0,2005,Female,15 and under,Normal value,Units,Percentage (%),31.4
1,2006,Female,15 and under,Normal value,Units,Percentage (%),30.5
2,2007,Female,15 and under,Normal value,Units,Percentage (%),28.6
3,2008,Female,15 and under,Normal value,Units,Percentage (%),29.7
4,2009,Female,15 and under,Normal value,Units,Percentage (%),28.9


## Write to CSV
Encode in utf-8 and write with no index column:

In [15]:
# Write the finished table as UTF-8, leaving out the dataframe's index column.
output_path = "1.2.1_xml2csv.csv"
indicator.to_csv(output_path, index=False, encoding="utf-8")