# Sample Skyspark VAV Validation

# 1) Setup

## Imports

In [None]:
# ----------------------------------------
# Imports
# ----------------------------------------
import os
import json
import re

from rdflib import Namespace, SH, RDF, BNode, Graph
from pyshacl import validate
from dotenv import load_dotenv
load_dotenv()

from tasty import constants as tc
from tasty import graphs as tg

## Inputs
Define the key variables and input information here

***Items to Change***
- `SHAPE`: this is the name of the SHACL equipment shape against which you would like to validate your sample equipment in the instance data
- `SAMPLE`: this is the name of the sample equipment in your instance data
- `input_namespace_uri`: this is the namespace uri used for your sample equipment in the instance data
- `raw_data_graph_filename`: this is the filename/filepath to save the raw instance data (in turtle format) retrieved from the Skyspark API call
- `data_graph_filename`: this is the filename/filepath to save the cleaned/processed instance data for the data graph to be used for validation
- `shapes_graph_filename`: this is the filename/filepath of the SHACL shapes data for the shape graph 
***Remaining Items*** </br>
These items should be okay as is, but can be changed if need be. If you are printing out results, <u>*make sure that the output directory exists in your local file structure*</u>.
- `output_directory`: this is the directory where output files will be printed to below
- `tasty_main_directory`: this is the absolute path of the main tasty directory. It should just be the parent directory of the current working directory.

In [None]:
# ----------------------------------------
# User Defined Variables
# ----------------------------------------

# Name of the SHACL equipment shape to validate against; used below as the
# key into tc.PH_SHAPES_NREL.
SHAPE = 'NREL-VAV-SD-Cooling-Only-Shape'
# Local name of the sample equipment in the instance data; combined with
# input_namespace_uri below to build the validation target node.
SAMPLE = '214466de-7abb28a7'
# Namespace URI the sample equipment is resolved against.
input_namespace_uri = 'urn:/_#'

# The three paths below are joined onto tasty_main_directory wherever they
# are used, i.e. they are relative to the main tasty directory.
raw_data_graph_filename = 'examples/output/sample_skyspark_vav_raw.ttl'
data_graph_filename = 'examples/output/sample_skyspark_vav_clean.ttl'
shapes_graph_filename = 'tasty/generated_shapes/haystack_all.ttl'

# Output directory, resolved against the current working directory.
# NOTE(review): this uses 'example_data/output' while the instance data files
# above live under 'examples/output' - confirm the difference is intended.
output_directory = os.path.join(os.path.abspath(''), 'example_data/output')
# Parent of the current working directory (the path is not normalised, so it
# keeps the trailing '../').
tasty_main_directory = os.path.join(os.path.abspath(''), '../')
# print(tasty_main_directory)

## API Request From Skyspark 
NOTE - Must be connected to NREL network to access the api endpoint

In [None]:
import requests

# Axon filter selecting the UFVAV-3 equip record plus every point whose
# equipRef resolves to it.
axon_query_string = '(point and equipRef->navName=="UFVAV-3") or (equip and navName=="UFVAV-3")'

# load skyspark URL from .env file
skyspark_api_url = os.environ.get('API_ENDPOINT')
if not skyspark_api_url:
    # Fail with an actionable message instead of the opaque "Invalid URL 'None'"
    # error requests would raise further down.
    raise RuntimeError("API_ENDPOINT is not set; define it in the environment or in the .env file")

# Without a timeout requests waits forever, which is exactly what happens when
# the machine is not on the network that can reach the endpoint.
response = requests.get(
    skyspark_api_url,
    params={'filter': axon_query_string},
    headers={'Accept': 'text/turtle'},
    timeout=60,
)

# The status code is printed without a newline; every branch below must
# finish that line.
print(response.status_code, end=" - ")
if response.status_code == 200:
    print("Sucess")
elif response.status_code == 404:
    print("Not Found")
else:
    # Any other status: show the server's reason phrase so the line is complete.
    print(response.reason)

# print(response.headers['Content-Type'])
# response.encoding = 'utf-8'
raw_skyspark_data = response.text
print(raw_skyspark_data)

In [None]:
# Persist the raw turtle payload returned by the API call
f = os.path.join(tasty_main_directory, raw_data_graph_filename)
with open(f, 'w') as raw_out:
    raw_out.write(raw_skyspark_data)
print(f"raw instance data saved to '{raw_data_graph_filename}' ")

### Pre-Process Raw Input File
The raw data file generated by Skyspark cannot be parsed by rdflib as is, and also needs to be updated in a couple of other ways:
1. The date-time fields generated by Skyspark cause errors in rdflib's parse function. These fields are removed.
2. The Skyspark namespace prefix is an underscore "\_" (this may be worth changing in the future), but more importantly this prefix is not associated with a URI namespace. The file will be parsed, however there is no way to reference the associated equipment as the namespace is not defined. The following prefix/namespace pairing is added: `@prefix _: <urn:/_#>`.
3. Finally, the Project Haystack namespaces that are listed in the raw file are version '3.9.9', however the shapes graph that is currently generated by tasty uses '3.9.10'. So the namespaces for the Project Haystack prefixes are updated to version '3.9.10'.

There are likely more elegant ways to handle these changes. For #3, we can likely generate a haystack version '3.9.9' graph to avoid this requirement. For #2 there is likely a way to bind this prefix to the graph prior to parsing the instance data file.

In [None]:
# ----------------------------------------
# Pre Process raw skyspark .ttl file 
# ----------------------------------------

# read in the file
f1 = os.path.join(tasty_main_directory, raw_data_graph_filename)
with open(f1, 'r') as raw_file:
    filedata = raw_file.read()

# REMOVE DATE-TIME FIELDS
#-------------------------
# Patterns are raw strings so that regex escapes such as \^ and \. are not
# interpreted (and warned about) as Python string escapes.
# remove date-time fields in the middle of the definition (lines ending in ';')
filedata = re.sub(r'\n.*\^{2}xsd:dateTime.*;', '', filedata)
# remove date-time fields at the end of the definition: drop the ';' that
# closed the previous line together with the date-time line, and terminate the
# definition with '.'. The final dot is escaped so it only matches the literal
# statement terminator.
filedata = re.sub(r';\n.*\^{2}xsd:dateTime.*\.', '.', filedata)

# add urn namespace to graph (inserted once, ahead of the first @prefix line)
filedata = filedata.replace('@prefix', '@prefix _: <urn:/_#> .\n@prefix', 1)

# change the project haystack namespaces to v10
# Dots are escaped so only the literal version string matches, and the
# lookahead keeps longer version numbers (e.g. '/3.9.91') untouched.
filedata = re.sub(r'/3\.9\.9(?!\d)', '/3.9.10', filedata)

# print(filedata)
f2 = os.path.join(tasty_main_directory, data_graph_filename)
with open(f2, 'w') as clean_file:
    clean_file.write(filedata)
print(f"cleaned instance data saved to '{data_graph_filename}' ")

# 2) Main Code

## Definitions
This defines additional variables and helper functions to be used below

In [None]:
# ----------------------------------------
# Variables and Constants
# ----------------------------------------

# Namespace built from the user-supplied URI; the sample equipment is
# resolved against it.
NAMESPACE = Namespace(input_namespace_uri)
# Node identifying the SHACL equipment shape, looked up by name in
# tc.PH_SHAPES_NREL (despite the variable name, it is used as a graph node).
shape_name = tc.PH_SHAPES_NREL[SHAPE]
# Full URI of the sample equipment; added as sh:targetNode further down.
target_node = NAMESPACE[SAMPLE]

# Namespace used as the fallback for tags that cannot be resolved in the
# ontology (custom tags).
PHCUSTOM = Namespace("https://project-haystack.org/def/custom#")
# NOTE(review): in the visible code POINT is only referenced by the
# commented-out invalid_tags list below.
POINT = Namespace("https://skyfoundry.com/def/point/3.0.27#")

# known_tags = ["zone", "air", "temp", "sensor", "sp", "cmd", "discharge", "damper", "humidity", "co2", "occupied",
#                            "occupancyIndicator", "cooling", "heating", "effective", "occ", "unocc", "standby", "operating", "mode", "request",
#                            "leaving", "entering", "flow", "min", "max", "pressure", ] 

# invalid_tags = [tc.PHIOT_3_9_10["point"], tc.PHIOT_3_9_10["his"], POINT["hisCollectCov"], tc.PHIOT_3_9_10["cur"]]

# ----------------------------------------
# Helper Function Definitions
# ----------------------------------------

def get_data_graph():
    """Return the data graph used for validation.

    Starts from the graph returned by
    ``tg.get_versioned_graph(tc.HAYSTACK, tc.V3_9_10)`` and parses the cleaned
    instance data (``data_graph_filename`` under ``tasty_main_directory``)
    into it as Turtle.
    """
    instance_path = os.path.join(tasty_main_directory, data_graph_filename)
    graph = tg.get_versioned_graph(tc.HAYSTACK, tc.V3_9_10)
    graph.parse(instance_path, format='turtle')
    return graph


def get_shapes_graph():
    """Return the shapes graph used for validation.

    Starts from the graph returned by
    ``tg.get_versioned_graph(tc.HAYSTACK, tc.V3_9_10)`` and parses the SHACL
    shapes file (``shapes_graph_filename`` under ``tasty_main_directory``)
    into it as Turtle.
    """
    shapes_path = os.path.join(tasty_main_directory, shapes_graph_filename)
    graph = tg.get_versioned_graph(tc.HAYSTACK, tc.V3_9_10)
    graph.parse(shapes_path, format='turtle')
    return graph


def print_graph_to_file(g, filename):
    """Serialize graph `g` as Turtle to ``<output_directory>/<filename>.ttl``.

    Args:
        g: graph object exposing ``serialize(destination, format=...)``.
        filename: base name of the output file, without the ``.ttl`` extension.
    """
    # Create the output directory on demand so a missing directory does not
    # abort the export with FileNotFoundError.
    os.makedirs(output_directory, exist_ok=True)
    output_filename = os.path.join(output_directory, filename + ".ttl")
    g.serialize(output_filename, format='turtle')


def print_graph(g):
    """Print graph `g` to stdout in Turtle format.

    Args:
        g: graph object exposing ``serialize(format=...)``.
    """
    serialized = g.serialize(format='turtle')
    # Older rdflib releases return bytes from serialize(); rdflib 6+ returns
    # str, on which .decode() would raise AttributeError. Accept both.
    if isinstance(serialized, bytes):
        serialized = serialized.decode('utf-8')
    print(serialized)
    

## Generate and Process Graphs

### Create Data, Shapes, and Ontology Graphs 
Create the data and shapes graph using the helper functions defined above. The data and shapes graphs are generated using rdflib's `parse` function to import the graphs defined in `data_graph_filename` and the `shapes_graph_filename` respectively. The ontology graph is generated by the `load_ontology` method from tasty's `graphs` module (imported as `tg`).

In [None]:
# ----------------------------------------
# Generate Graphs
# ----------------------------------------

# Data Graph
# (cleaned instance data parsed as Turtle by get_data_graph)
data_graph = get_data_graph()
print("...loaded data graph")

# Shapes Graph
# (SHACL shapes file parsed as Turtle by get_shapes_graph)
shapes_graph = get_shapes_graph()
print("...loaded shapes graph")

# Ontology Graph
# (loaded by tasty for the same schema/version the two graphs above use;
# later passed to pySHACL as ont_graph and used to resolve tag namespaces)
ont_graph = tg.load_ontology(tc.HAYSTACK, tc.V3_9_10)
print("...loaded ontology graph")

### Get List of Valid Tags With Namespaces
Create a list of valid tags with their prefixes, so we can remove all extraneous tags from the data graph (see below) </br>
The methodology employed here is as follows:
1. loop through each of the `.json` files in the `tasty/source_shapes/haystack` directory 
2. iterate over all shapes in each file
3. extract any `tags` or `tags-custom` associated with each shape
4. if the tag is not yet in the tags list, then it is added
5. once all tags are added, generate a new list of namespaced tags (i.e. complete URIs for tags); iterate over the original list and for each tag use the `get_namespaced_term` method from tasty's `graphs` module (imported as `tg`).

In [None]:
# ----------------------------------------
# Create a List of Valid Tags
# ----------------------------------------
source_shapes_dir = os.path.join(tasty_main_directory, 'tasty/source_shapes')
schema = tc.HAYSTACK

source_shapes_schema_dir = os.path.join(source_shapes_dir, schema.lower())
files = [os.path.join(source_shapes_schema_dir, f) for f in 
         os.listdir(source_shapes_schema_dir) if f.endswith('.json')]

# go through schema files and extract valid tags
# A set de-duplicates in O(1) per tag instead of scanning a growing list.
unique_tags = set()
for shapes_file in files:
    # open file and read in json to python dict
    with open(shapes_file, 'r') as f:
        shapes_doc = json.load(f)
    # for each shape, collect both its regular and its custom tags
    for shape in shapes_doc['shapes']:
        unique_tags.update(shape.get('tags', []))
        unique_tags.update(shape.get('tags-custom', []))

print("...generated tag list")
print("...adding namespaces")
# sort tags list
valid_tags = sorted(unique_tags)

# add namespaces to all valid tags
valid_tags_ns = []
# Number of tags the ontology lookup could not resolve (reported as False by
# tg.get_namespaced_term) and that were therefore placed in the custom
# namespace. Counted here, before the fallback overwrites the False value;
# checking afterwards can never succeed and always reported 0.
false_count = 0
for tag in valid_tags:
    tag_ns = tg.get_namespaced_term(ont_graph, tag)
    # take care of custom tags
    if tag_ns is False:
        tag_ns = PHCUSTOM[tag]
        false_count += 1
    valid_tags_ns.append(tag_ns)

    print(f"Tag: {tag:<20} URI: {tag_ns}")

print(false_count)


### Post Process of Data Graph

#### (Ignore) Bind Default Namespace to Graph
**Ignore this section; this issue was taken care of above during the raw-input pre-processing**

The default namespace generated by skyspark appears to be an underscore "\_". There is no prefix namespace associated with this on the output .ttl graph from skyspark, so there is no official URI. For the purpose of this exercise, we will use "urn:\/\_#", but we may wish to revisit this. The namespace is defined above under the "Inputs" section. 

In [None]:
# data_graph.bind("_", NAMESPACE)

# # fix project-haystack namespaces
# data_graph.bind("ph", tc.PH_3_9_10, replace = True)
# data_graph.bind("phScience", tc.PHSCIENCE_3_9_10, replace = True)
# data_graph.bind("phIoT", tc.PHIOT_3_9_10, replace = True)

#### a) Keep Only Valid Tags in Data Graph

In [None]:
# keep only valid tags
# Build the lookup set once; membership tests on the list are O(n) per tag.
allowed_tags = set(valid_tags_ns)

# Both queries are snapshotted with list() before any removal so the graph is
# never modified while one of its lazy triples() generators is being consumed.
equip_points = list(data_graph.triples((None, tc.PHIOT_3_9_10["equipRef"], target_node)))
for s1, p1, o1 in equip_points:
    print(f"...processing node: \t{s1}")
    for s, p, o in list(data_graph.triples((s1, tc.PH_3_9_10["hasTag"], None))):
        if o not in allowed_tags:
            data_graph.remove((s, p, o))

#### b) Add First Class Point Type

Had to modify this section - the turtle format that comes back from a get request is slightly different than the one that comes back from the Skyspark UI </br>
(as both of these are subclasses of "point" there should be a way to access it, but unclear what the RDF triple search criteria should look like) </br>
**Alternatively, we could look for everything with a point tag - but we have to do this before above step where we remove the 'point' tag** </br>
NOTE: if 'his', 'cur', 'writable', or 'weather' tags are not removed, first class point may not be correct, as it assumes mutual exclusivity  

In [None]:
from tasty.skyspark import point_mapper as pm

# load the point tree
# Built from the Haystack 3.9.10 defs file shipped with tasty.
# NOTE(review): presumably 'point' names the def the tree is rooted at -
# confirm against point_mapper.PointTree.
file = os.path.join(tasty_main_directory, 'tasty/schemas/haystack/defs_3_9_10.ttl')
pt = pm.PointTree(file, 'point')
# Root node handed to pt.determine_first_class_point_type() for every point.
root = pt.get_root()


def clean_points(s, p, o):
    """Give point `s` a first class point type and drop the tags it implies.

    Reads the point's ``hasTag`` values from ``data_graph``, asks the point
    tree ``pt`` for the first class point type matching those tags, adds that
    type to the point as ``rdf:type`` and removes the tags listed on the
    returned type from the point. Progress is printed along the way.

    Args:
        s: the point node to process.
        p: unused; accepted so a whole triple can be passed in.
        o: unused; accepted so a whole triple can be passed in.
    """
    has_tag = tc.PH_3_9_10["hasTag"]

    print(f"Point: \t{s}")
    print("Tags: ", end="")

    # A tag's short name is whatever follows the first '#' of its URI
    tags = [
        tag_uri[tag_uri.find('#') + 1:]
        for _, _, tag_uri in data_graph.triples((s, has_tag, None))
    ]
    for tag_name in tags:
        print(f"\t{tag_name}")

    # Resolve the first class point type from the collected tag names
    fc_point = pt.determine_first_class_point_type(root, tags)
    print(f"\t...First Class Entity Type: {fc_point.type}\n")

    # Type the point with its first class point type
    data_graph.add((s, RDF.type, tc.PHIOT_3_9_10[fc_point.type]))

    # The namespace of each implied tag is not known here, so the removal is
    # attempted in all three Haystack namespaces.
    # TODO: develop method for determining proper namespace
    candidate_namespaces = (tc.PHIOT_3_9_10, tc.PHSCIENCE_3_9_10, tc.PH_3_9_10)
    for tag in fc_point.tags:
        for namespace in candidate_namespaces:
            data_graph.remove((s, has_tag, namespace[tag]))




# Get all points with an equipRef
# clean_points() adds and removes triples on data_graph, so the matches are
# snapshotted with list() first rather than mutating the graph while its lazy
# triples() generator is still being consumed.
for s, p, o in list(data_graph.triples((None, tc.PHIOT_3_9_10["equipRef"], None))):
    clean_points(s, p, o)

In [None]:
# Show the post-processed data graph as Turtle for visual inspection
print_graph(data_graph)

### Add Sample Equipment as Target Node

First we add a triple to the shapes graph:
- The **subject** is the SHACL equipment shape
- The **predicate** is `sh:targetNode`
- The **object** is the sample equipment

This indicates that the sample equipment should conform to the overall SHACL equipment shape 

In [None]:
# add Instance Equipment as target node to SHACL Equipment Shape
# (modifies shapes_graph in place: shape_name gets an sh:targetNode triple
# pointing at the sample equipment)
shapes_graph.add((shape_name, SH.targetNode, target_node))
print(f"\tadded '{target_node}' as target node to {shape_name}")

Next we iterate over all *nodes* of the SHACL equipment shape using rdflib's `triples()` function which supports basic triple pattern matching ([see documentation here](https://rdflib.readthedocs.io/en/stable/intro_to_graphs.html)). For each triple with a subject of the SHACL equipment shape and predicate of `sh:node`, we take the object (i.e. all of the functional group shapes which constitute the equipment shape) and add the sample equipment as a target node to these shapes. This is done so that the validation results will identify specific points that fail to validate, rather than simply functional group shapes.</br>
So for each *node* (functional group shape) add a triple to the shapes graph:
- The **subject** is the *node* (functional group shape)
- The **predicate** is `sh:targetNode`
- The **object** is the sample equipment

Ultimately, this means we are indicating that the sample equipment should conform to each of these functional group shapes independently. Note that this is acceptable currently because there is no `maxCount` on the functional group shape's `equipRef` path. 

In [None]:
# add Instance Equipment as target node to SHACL Functional Groups Shapes
# The sh:node matches are snapshotted with list() first, because the loop body
# adds triples to the same graph the lazy triples() generator is reading from.
for s1, p1, o1 in list(shapes_graph.triples((shape_name, SH.node, None))):
    shapes_graph.add((o1, SH.targetNode, target_node))
    print(f"\tadded '{target_node}' as target node to {o1}")

# 3) Validation

## PySHACL Validation

In [None]:
# ----------------------------------------
# Run pySHACL Validation
# ----------------------------------------
# Validate the post-processed data graph against the shapes graph (which now
# carries the sh:targetNode triples added above), with the ontology graph
# supplied as ont_graph.
result = validate(data_graph, shacl_graph=shapes_graph, ont_graph=ont_graph)
# The result unpacks into: the overall conformance flag, the validation report
# as a graph, and a report value that is later written to results.txt.
conforms, results_graph, results = result

print(f"Conforms: {conforms}")

In [None]:
# Show the pySHACL validation report graph as Turtle
print_graph(results_graph)

## Determine Missing Points
This implements a rudimentary logic for finding the missing points (simple shapes) from the pySHACL results graph. The process is as follows:
1. Find each "validation result" which represents one SHACL constraint that was not met. This is done by iterating through all the triples in the graph and finding the triple with a `rdf:type` of `sh:ValidationResult`. The subject of this match will be the URI of the "validation result" node.
2. For each of these "validation results" look at the `sh:sourceShape`
3. If it is a BNode (as opposed to a URI) then we assume this refers to one of the constraints on the functional group SHACL shape (and therefore one of the "simple shapes") and it will have a `sh:qualifiedValueShape` which should be a URI of one of the simple shapes.
4. Add this shape to the list of missing points

*Note: this logic likely needs to be refined*

In [None]:
# Shapes whose constraint failed, split by the severity of the failure
missing_points = []
optional_points = []

# Find the Validation Results
# Only the subject of each (?, rdf:type, sh:ValidationResult) triple is
# needed, so subjects() is used instead of unpacking whole triples - which
# also avoids rebinding the builtin name `object` at notebook scope.
for validation_result in results_graph.subjects(predicate=RDF.type, object=SH.ValidationResult):
    severity = results_graph.value(subject=validation_result, predicate=SH.resultSeverity)

    # check if Validation result points to a BNode
    for source_shape in results_graph.objects(subject=validation_result, predicate=SH.sourceShape):
        if not isinstance(source_shape, BNode):
            # URI source shapes are skipped; only blank-node constraints are
            # followed to their sh:qualifiedValueShape.
            continue

        point = results_graph.value(subject=source_shape, predicate=SH.qualifiedValueShape)

        if severity == SH.Violation:
            missing_points.append(point)
        elif severity == SH.Warning:
            optional_points.append(point)

if not missing_points:
    print("No Points Missing")
else:
    print(f"{len(missing_points)} Missing Points:")
    for point in missing_points:
        print(f"\t{point}")

if not optional_points:
    print("No Optional Points Missing")
else:
    print(f"{len(optional_points)} Missing Optional Points:")
    for point in optional_points:
        print(f"\t{point}")
            

## Print pySHACL Graphs and Results to File (Optional) 

In [None]:
# ----------------------------------------
# Print Output Files
# ----------------------------------------
# Create the output directory on demand so a missing directory does not abort
# the export with FileNotFoundError.
os.makedirs(output_directory, exist_ok=True)

# Print Results to File
fn = os.path.join(output_directory, "results.txt")
# The with-statement closes the file even if the write raises.
with open(fn, "w") as f:
    f.write(results)
print("...printed results")

# Print Graphs to File(s)
print_graph_to_file(data_graph, "data_graph")
print("...printed data graph")
print_graph_to_file(shapes_graph, "shapes_graph")
print("...printed shapes graph")
print_graph_to_file(results_graph, "results_graph")
print("...printed results graph")

## 3b) Brick Validation (Optional)
Brickschema uses pyshacl for validation, so it gives us the same result. In this case, we just passed in the shapes graph directly, so this is not actually testing conformance against an actual brick model or using the brick schema in any significant way.

In [None]:
# ----------------------------------------
# Run BrickSchema Validation
# ----------------------------------------

# Imported under an alias so it does not rebind the rdflib `Graph` name that
# is imported at the top of the file.
from brickschema import Graph as BrickGraph

# Set Up Graphs
# NOTE(review): both graphs are loaded fresh from disk, so the in-memory
# changes made earlier (tag filtering, first class point types, added
# sh:targetNode triples) are not part of this validation run.
dg = BrickGraph()
df = os.path.join(tasty_main_directory, data_graph_filename)
dg.load_file(df)

sg = BrickGraph()
sf = os.path.join(tasty_main_directory, shapes_graph_filename)
sg.load_file(sf)

valid, _, report = dg.validate(shape_graphs=[sg])
print(f"Brick Validation - Conforms: {valid}")
if not valid:
    print(report)