# Provenance document for vulnerability model NRML file

This notebook generates a provenance document describing the creation of an NRML-format file containing the full suite of wind vulnerability models used in the National Wind Risk Assessment (2023/24).

Dates, paths and filenames are set manually in the relevant cells. 

In [1]:
import os
from prov.model import ProvDocument
from prov.dot import prov_to_dot
from datetime import datetime
from time import ctime, localtime, strftime

# Create the provenance document and register every namespace prefix used by
# the entity, agent and activity identifiers/attributes in the cells below.
provdoc = ProvDocument()
provdoc.set_default_namespace("")
# Fixed typo in the host name ("ga.gova.au" -> "ga.gov.au") so the project
# namespace uses the same domain as the agent mailboxes.
provdoc.add_namespace("nwra", "http://www.ga.gov.au/hazards")
provdoc.add_namespace("prov", "http://www.w3.org/ns/prov#")
provdoc.add_namespace("xsd", "http://www.w3.org/2001/XMLSchema#")
provdoc.add_namespace("foaf", "http://xmlns.com/foaf/0.1/")
provdoc.add_namespace("void", "http://vocab.deri.ie/void#")
provdoc.add_namespace("dcterms", "http://purl.org/dc/terms/")
provdoc.add_namespace("git", "http://example.org/gitontology#")

<Namespace: git {http://example.org/gitontology#}>

In [2]:
DATEFMT = "%Y-%m-%dT%H:%M:%S"

def flModDate(filename, dateformat="%Y-%m-%d %H:%M:%S"):
    """
    Return the last modified date of the input file (in local time).

    :param str filename: file name (full path).
    :param str dateformat: Format string for the date (default
                           '%Y-%m-%d %H:%M:%S'). Pass a falsy value
                           (e.g. ``None``) to get a ``datetime`` object
                           instead of a string.
    :returns: File modification date/time as a string, or as a
              :class:`datetime.datetime` when `dateformat` is falsy.
    :rtype: str or datetime.datetime
    :raises OSError: if `filename` cannot be stat'ed (e.g. it does not
                     exist). A message naming the file is printed first.

    Example: modDate = flModDate('C:/foo/bar.csv' ,
                                 dateformat='%Y-%m-%dT%H:%M:%S')
    """
    try:
        si = os.stat(filename)
    except OSError:
        # OSError covers IOError and WindowsError (both are aliases of it in
        # Python 3) and, unlike WindowsError, exists on every platform.
        print(f"Input file is not a valid file: {filename}")
        # Re-raise: there is no modification time to report, and carrying on
        # would only fail on the unbound `si` below.
        raise
    moddate = localtime(si.st_mtime)
    if dateformat:
        return strftime(dateformat, moddate)
    # `datetime` is the class here (from datetime import datetime), so it is
    # called directly rather than as `datetime.datetime`.
    return datetime(*moddate[:6])

### Entities

In [3]:
# --- e1: source spreadsheet containing the vulnerability curves -------------
e1file = "X:/georisk/HaRIA_B_Wind/projects/acs/2. DATA/1. Work Unit Assessment/NWRA/vulnerability/ACS Phase 0 wind vulnerability curves.xlsx"
e1_attributes = {
    "dcterms:title": "ACS Phase 0 wind vulnerability curves.xlsx",
    "dcterms:description": "This spreadsheet documents the adjustment of GA's heuristic National Wind Risk Assessment vulnerability curves originally produced in 2009 - 2010 for CaRSA and subsequently ACS Phase 0.",
    # The "created" stamp is the file's last-modified time on disk.
    "dcterms:created": flModDate(e1file, DATEFMT),
    "prov:location": e1file,
    "dcterms:format": "Microsoft Excel spreadsheet",
    "dcterms:creator": "Martin Wehner",
}
e1 = provdoc.entity("nwra:sourceVulnerabilityFile", e1_attributes)

# --- e2: CSV model set ------------------------------------------------------
e2file = "X:/georisk/HaRIA_B_Wind/projects/acs/2. DATA/1. Work Unit Assessment/NWRA/vulnerability/NWRA domestic wind 2023 vulnerability set.csv"
e2_attributes = {
    "dcterms:title": "NWRA domestic wind vulnerability model set",
    "dcterms:description": "NWRA vulnerability model set that describes the full set of wind vulnerability models",
    "dcterms:created": flModDate(e2file, DATEFMT),
    "prov:location": e2file,
    "dcterms:format": "Microsoft Excel Comma Separated Values",
}
e2 = provdoc.entity("nwra:vulnerabilityModelSet", e2_attributes)

# --- e3: NRML (XML) version of the model set --------------------------------
e3file = "X:/georisk/HaRIA_B_Wind/projects/acs/2. DATA/1. Work Unit Assessment/NWRA/vulnerability/domestic_wind_2023.xml"
e3_attributes = {
    "dcterms:title": "NWRA domestic wind vulnerability model set NRML",
    "dcterms:description": "NRML format of the domestic wind vulnerability functions to be used in the NWRA",
    "dcterms:created": flModDate(e3file, DATEFMT),
    "prov:location": e3file,
    "dcterms:format": "Natural hazard and Risk Markup Language (NRML)",
}
e3 = provdoc.entity("nwra:vulnerabilityNRMLFile", e3_attributes)

### Agents

In [4]:
# Organisation on whose behalf the staff agents below act.
provdoc.agent(":GeoscienceAustralia",
              {
                  "foaf:name": "Geoscience Australia",
                  "dcterms:type": "foaf:Organization",
                  "foaf:mbox": "hazards@ga.gov.au"
              }
             )

# Staff member associated with extracting the model set.
provdoc.agent(":NHIStaff",
              {
                  "foaf:name": "Craig Arthur",
                  "dcterms:type": "prov:Person",
                  "foaf:mbox": "craig.arthur@ga.gov.au",
              })

# Staff member the source vulnerability spreadsheet is attributed to.
provdoc.agent(":VRMStaff",
              {
                  "foaf:name": "Martin Wehner",
                  "dcterms:type": "prov:Person",
                  "foaf:mbox": "martin.wehner@ga.gov.au",
              })

# Need to get the git repo data for git revision, url, etc.:
# (commit hash and date below are entered by hand.)
provdoc.agent(":HazImp",
              {
                  "foaf:name": "HazImp Impact Assessment",
                  # PROV names this class "SoftwareAgent" (capital S), in line
                  # with "prov:Person" above.
                  "dcterms:type": "prov:SoftwareAgent",
                  "prov:location": "https://github.com/GeoscienceAustralia/hazimp",
                  "git:commitHash": "2e6cd7166bfc491489805ebe21323643116a58d6",
                  "git:commitDate": "2023-07-23T11:04:00"
              })



<ProvAgent: HazImp>

### Activities

In [5]:
# Activity 1: manual extraction of the model set from the source spreadsheet.
# NOTE(review): endTime was 2023-10-12 00:00, i.e. before the 15:10 generation
# time recorded for this activity's output in the relations cell. It is set to
# 15:10 here so the output is generated within the activity's interval.
# Confirm against the actual working times.
a1 = provdoc.activity("nwra:extractModelSet",
                      startTime=datetime(2023, 10, 12),
                      endTime=datetime(2023, 10, 12, 15, 10),
                      other_attributes={
                          "dcterms:title": "Extract Model Set",
                          "dcterms:description": "Extract the model set from the source vulnerability file to a simplified spreadsheet"
                      })

# Activity 2: conversion of the CSV model set to NRML.
a2 = provdoc.activity("nwra:translateModelSet",
                      startTime=datetime(2023, 10, 12, 15, 40),
                      endTime=datetime(2023, 10, 12, 15, 41),
                      other_attributes={
                          "dcterms:title": "Translate model set",
                          "dcterms:description": "Translate vulnerability model set from a csv format to NRML format"
                      })

### Associations

In [6]:
# Derivation chain: source spreadsheet -> CSV model set -> NRML file.
provdoc.wasDerivedFrom(e2, e1, a1)
provdoc.wasDerivedFrom(e3, e2, a2)

# Generation time of each output by its producing activity.
provdoc.wasGeneratedBy(e2, a1, time=datetime(2023, 10, 12, 15, 10))
provdoc.wasGeneratedBy(e3, a2, time=datetime(2023, 10, 12, 15, 41))

# Time at which each activity used its input.
provdoc.used(a1, e1, datetime(2023, 10, 12))
provdoc.used(a2, e2, datetime(2023, 10, 12, 15, 40))

# Link activities and entities to their agents.
provdoc.wasAssociatedWith(a1, ":NHIStaff")
provdoc.wasAssociatedWith(a2, ":HazImp")
provdoc.wasAttributedTo(e1, ":VRMStaff")
provdoc.wasAttributedTo(e2, ":NHIStaff")

# Both staff agents act on behalf of the organisation. ":VRMStaff" now
# carries the same ":" prefix as every other agent reference in this cell.
provdoc.actedOnBehalfOf(":NHIStaff", ":GeoscienceAustralia")
provdoc.actedOnBehalfOf(":VRMStaff", ":GeoscienceAustralia")

<ProvDelegation: (VRMStaff, GeoscienceAustralia)>

### Write the provenance

Write the provenance information to XML and generate a directed graph of the associations

In [7]:
# Build a top-to-bottom, labelled graph of the provenance document, save it
# as a PNG, then serialise the document itself as XML next to it.
dot = prov_to_dot(
    provdoc,
    use_labels=True,
    direction="TB",
)
dot.write_png('windvulnerabilitymodelprovenance.png')
provdoc.serialize(
    'windvulnerabilitymodelprovenance.xml',
    format='xml',
)

### Validate the PROV-XML

Run the resulting XML file through the schema validation to ensure the XML is valid PROV information. If there is an exception raised at this point, then there is an error somewhere back in the definition of the elements. Check the specific line indicated in the XML file for errors.

If no errors are reported, then the XML file is a valid PROV-XML document.

In [8]:
from lxml import etree
from pathlib import Path

# Local copy of the PROV-XML schema definition (path is set manually).
PROVXML_SCHEMA="C:/WorkSpace/prov/prov.xsd"

# Compile the schema, parse the serialised provenance file, and assert that
# the document conforms; a non-conforming document raises here.
schema_tree = etree.parse(PROVXML_SCHEMA)
schema = etree.XMLSchema(schema_tree)
prov_tree = etree.parse('windvulnerabilitymodelprovenance.xml')
schema.assert_(prov_tree)