# Analysis of children's health data from Kolding

In [1]:
import gzip
import matplotlib
import pandas
import seaborn
import xml.etree.ElementTree as ET

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load data into a pandas DataFrame

In [3]:
boern_xml = 'Boern.xml.gz'
col_names = ['Child', 'Year', 'Gender', 'District', 'Weight', 'Height', 'Head Circumference']

# ET.parse reads the whole document, so the gzip handle can be released as
# soon as parsing is done; everything below works on the in-memory tree.
with gzip.open(boern_xml) as fin:
    tree = ET.parse(fin)
root = tree.getroot()

# Every element directly under the root is one record. Its sub-elements are
# mapped onto `col_names` purely by position.
# NOTE(review): this assumes the XML lists the fields in the same order as
# `col_names` -- confirm against the file.
child_data = []
for record_no, child in enumerate(root):
    child_attrib_list = [child_attrib.text for child_attrib in child]
    if len(child_attrib_list) > len(col_names):
        raise ValueError(
            'Record {:d} has {:d} fields, expected at most {:d}.'.format(
                record_no, len(child_attrib_list), len(col_names)))
    # Pad short records with None (not 0) so pandas treats the missing fields
    # as missing data instead of as a genuine measurement of zero.
    child_attrib_list.extend([None] * (len(col_names) - len(child_attrib_list)))
    child_data.append(child_attrib_list)

child_df = pandas.DataFrame(child_data, columns=col_names)

In [4]:
# These columns were filled from XML element text, so convert them to numeric
# dtypes before doing any arithmetic on them.
numeric_cols = ['Child', 'Year', 'Weight', 'Height', 'Head Circumference']
child_df[numeric_cols] = child_df[numeric_cols].apply(pandas.to_numeric)

In [None]:
# Report the number of measurement rows and of distinct child identifiers.
n_entries = len(child_df)
n_children = len(child_df['Child'].unique())
print('Read {:d} entries for {:d} children.'.format(n_entries, n_children))

Read 116242 entries for 26125 children.


## Calculate children's age at each measurement

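Note that the first measurement for each child is taken at birth (age 0), so a child's age at each measurement is the measurement year minus the year of that child's first measurement.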

In [None]:
# Split the measurements by child so that each child's earliest recorded year
# can serve as that child's age-0 reference point.
child_grouped = child_df.groupby('Child')

child_age_dfs = []
for child_id, child_data in child_grouped:
    min_year = child_data['Year'].min()
    # Age = years elapsed since this child's first measurement, appended as a
    # new last column named 'Age'.
    child_data = child_data.assign(Age=child_data['Year'] - min_year)
    child_age_dfs.append(child_data)

In [None]:
# NOTE(review): `child_data` here is whatever the per-child loop in the
# previous cell last bound it to, i.e. the rows of a single child, not the
# full dataset. The result is only displayed, never stored. Confirm whether a
# grouping over all children was intended, or whether this cell can go.
child_data.groupby('Age')

In [None]:
# Stack the per-child frames (each carrying an 'Age' column) back into one
# table; rows keep their original index labels.
child_age_df = pandas.concat(child_age_dfs, axis=0)

In [None]:
# Histogram of measurement ages, restricted to ages strictly greater than 1.
age_vec = child_age_df['Age']
age_vec_gt_1 = age_vec[age_vec > 1]

# `seaborn.distplot` is deprecated since seaborn 0.11 in favour of `histplot`;
# fall back to `distplot` only on installations that predate `histplot`.
if hasattr(seaborn, 'histplot'):
    ax = seaborn.histplot(age_vec_gt_1, kde=False)
else:
    ax = seaborn.distplot(age_vec_gt_1, kde=False)

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel='Age (years since first measurement)',
    ylabel='Number of measurements',
    title='Measurements per age (age > 1)',
);