In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mpld3
import seaborn as sn
from lxml import etree
# Use seaborn's 'notebook' plotting context for all figures below
sn.set_context('notebook')

# Visualise PERSiST output

The GUI for PERSiST built using the INCA Core is not yet working properly. However, the INCA Core command line version produces results in XML format. This notebook reads the XML output and creates some basic plots.

The XML files are unnecessarily large and cumbersome to work with - there must be a better way of storing the output?

The simulated values are stored between `DataContainer` tags. The code below first identifies all these tags, then walks up the XML tree to identify the "parents", which specify what the values actually represent. For a relatively basic PERSiST setup, this output file seems to contain more than 2600 daily resolution annual time series (!). Surely this isn't necessary?

In [2]:
# Parse output
# NOTE(review): this is a hard-coded absolute local path - point it at your
# own copy of the PERSiST XML results before running
in_xml = r'C:\Data\James_Work\JHI\Modelling_Workshop_April_2017\PERSiST results.xml'
tree = etree.parse(in_xml)

# Container for data. Keys are tuples of ancestor names (top down, ending
# with the DataContainer's own 'name (units)' label); values are lists of
# floats, one per element visited inside that DataContainer
data = {}

# Walk XML file
for elem in tree.iter(tag='DataContainer'):
    # Label for the series itself, built from the tag's attributes
    ancest = ['%s (%s)' % (elem.attrib['name'], elem.attrib['units'])]

    # Walk up tree to find ancestors (nearest parent first)
    for par in elem.iterancestors():
        ancest.append(par.attrib['name'])

    # List of ancestors from top down. A tuple is hashable, so it can be
    # used as a dict key (and pandas turns tuple keys into column levels)
    ancest = tuple(ancest[::-1])

    # Get values. Note that elem.iter() yields the DataContainer element
    # itself first, followed by its descendants, so the first entry of
    # vals comes from the container's own text rather than a data value
    vals = []
    for val in elem.iter():
        # Assume either number or NaN. An element with no text at all has
        # val.text == None, and float(None) raises TypeError rather than
        # ValueError, so both must be caught
        try:
            vals.append(float(val.text))
        except (TypeError, ValueError):
            vals.append(np.nan)

    data[ancest] = vals

# Build DF
df = pd.DataFrame(data)

# Remove first row: it corresponds to the DataContainer element itself
# (whose text is just whitespace such as '\n'), not to a simulated value
df = df[df.index != 0]

df.head()

Unnamed: 0_level_0,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results,PERSiST results
Unnamed: 0_level_1,Lake,Lake,Lake,Lake,Lake,Lake,Lake,Lake,Lake,Lake,...,Stream_2,Stream_2,Stream_2,Stream_2,Stream_2,Stream_2,Stream_2,Stream_2,Stream_2,Stream_2
Unnamed: 0_level_2,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,...,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches,Reaches
Unnamed: 0_level_3,LAE02,LAE02,LAE02,LAE02,LAE02,LAE02,LAE02,LAE02,LAE02,LAE02,...,LakeSouth,LakeSouth,LakeSouth,LakeSouth,LakeSouth,LakeSouth,LakeSouth,LakeSouth,LakeSouth,LakeSouth
Unnamed: 0_level_4,Landscape units,Landscape units,Landscape units,Landscape units,Landscape units,Landscape units,Landscape units,Landscape units,Landscape units,Landscape units,...,Landscape units,Landscape units,Landscape units,Reaches equations,Reaches equations,Reaches equations,Reaches equations,Reaches equations,Reaches equations,Reaches equations
Unnamed: 0_level_5,Forest,Forest,Forest,Forest,Forest,Forest,Forest,Forest,Forest,Forest,...,Open Water,Open Water,Open Water,Reach depth (m),Reach flow (m3/s),Reach flow input (m3/s),Reach time constant (days),Reach velocity (m/s),Reach volume (m3),Total diffuse flow output (m3/s)
Unnamed: 0_level_6,Landscape units equations,Landscape units equations,Landscape units equations,Landscape units equations,Landscape units equations,Landscape units equations,Soils,Soils,Soils,Soils,...,Soils,Soils,Soils,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_7,Diffuse flow output (m3/s),Rainfall (mm/day),Snow depth (mm/day),Snow fall (mm/day),Snow melt (mm/day),Total runoff to reach (mm/day),Direct_Runoff,Direct_Runoff,Direct_Runoff,Direct_Runoff,...,Soil_Water,Soil_Water,Soil_Water,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_8,NaN,NaN,NaN,NaN,NaN,NaN,Soils equations,Soils equations,Soils equations,Soils equations,...,Soils equations,Soils equations,Soils equations,NaN,NaN,NaN,NaN,NaN,NaN,NaN
Unnamed: 0_level_9,NaN,NaN,NaN,NaN,NaN,NaN,Drought runoff (mm/day),Evapotranspiration (mm/day),Evapotranspiration X3 (mm/day),Evapotranspiration X4 (mm/day),...,Water depth 2 (mm/day),Water depth 3 (mm/day),Water depth 4 (mm/day),NaN,NaN,NaN,NaN,NaN,NaN,NaN
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


This file has 2670 columns. What are all these series? Pandas has conveniently created a hierarchical index on the columns, which makes it fairly easy to access the series of interest (once the Core XML tag conventions are understood). However, I still think this is fiddly and unnecessarily complicated.

In [3]:
# Plot results for reach LAE02.
# Drill down through the column hierarchy one level at a time until only
# the "Reaches equations" series for this reach remain
df2 = df
for level_key in ('PERSiST results', 'Stream_1', 'Reaches', 'LAE02',
                  'Reaches equations'):
    df2 = df2[level_key]

# Use the first of the remaining column levels as plain column labels
df2.columns = df2.columns.get_level_values(0)

# Draw each column in its own subplot
f = df2.plot(subplots=True, figsize=(12, 10))

# Get rid of "None" in mpld3 by giving every legend an empty title
for ax in f.flatten():
    legend = ax.legend()
    legend.set_title('')

plt.tight_layout()
mpld3.display()