# Identifying the level of each housing characteristic

This notebook visualizes the ResStock project as a hierarchical graph, with the housing characteristics as nodes and the dependencies as directed edges. The housing characteristics at the top of the graph are high-level characteristics that have few dependencies, whereas housing characteristics at the bottom of the graph have many dependencies or depend on characteristics that have many dependencies.

## Python Version

In [None]:
import sys
# Print the interpreter version used to run this notebook
print(sys.version)

## Import Modules

In [None]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from glob import glob
from os.path import isfile, join
import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout

import eulpuq.bayesian_update.children as ch

## Functions

In [None]:
def load_adj_graph(path_adj):
    """
    Read the adjacency matrix stored as CSV at `path_adj` and build the
    corresponding directed graph.

    Returns a tuple `(adj_df, G)`: the adjacency data frame, whose row index
    has been replaced by a positional 0..n-1 index, and the networkx DiGraph
    created from the matrix as it was read.
    """
    # The first CSV column is used as the row index
    matrix = pd.read_csv(path_adj, index_col=0)

    # Build the directed graph from the matrix before touching its index
    graph = nx.from_pandas_adjacency(matrix, create_using=nx.DiGraph())

    # Replace the row index with positions for convenient integer lookups
    matrix = matrix.reset_index(drop=True)

    return matrix, graph

def get_level_df(adj_df,nlevels,n_nodes):
    """
    Compute the level of each housing characteristic (0 means no dependencies).

    The dependencies of a node are the rows holding a 1 in that node's column
    of the adjacency matrix. A node's level is one greater than the highest
    level among its dependencies.

    Parameters
    ----------
    adj_df : pandas.DataFrame
        Adjacency matrix. Rows and columns must be in the same order, because
        dependencies are matched to nodes by position.
    nlevels : int
        Maximum number of passes over the nodes. Levels are final once this is
        at least the longest path length plus one.
    n_nodes : int
        Number of columns (nodes) to process.

    Returns
    -------
    hc_level_df : pandas.DataFrame
        Columns `housing_characteristic` (the column names of `adj_df`) and
        `level`.
    level : numpy.ndarray
        Integer level of each node, in column order.
    """
    matrix = adj_df.to_numpy()

    # The dependency positions of a node never change between passes, so
    # resolve them once up front.
    dependencies = [np.flatnonzero(matrix[:, i] == 1) for i in range(n_nodes)]

    # Use an explicit integer dtype: inheriting the dtype of the adjacency
    # matrix would cap every level at True for a boolean matrix.
    level = np.zeros(matrix.shape[0], dtype=np.int64)

    # Iterate over nodes up to the longest path length
    for _ in range(int(nlevels)):
        changed = False
        for i, deps in enumerate(dependencies):
            # Nodes without dependencies stay at level 0
            if deps.size == 0:
                continue

            # This node's level must be one greater than the max dependency level
            new_level = level[deps].max() + 1
            if new_level != level[i]:
                level[i] = new_level
                changed = True

        # A pass without any change means the levels have converged
        if not changed:
            break

    # Create Data Frame
    hc_level_df = pd.DataFrame({
        'housing_characteristic': adj_df.columns,
        'level': level,
    })

    return hc_level_df, level

def isolate_levels(G,level,node_names):
    """
    Return the edges of `G` that do not jump across levels.

    An edge is kept when the levels of its two end nodes differ by at most
    one; edges spanning more than one level are dropped.

    Parameters
    ----------
    G : graph-like
        Object whose `edges` attribute iterates over `(source, target)` pairs.
    level : sequence of numbers
        Level of each node, ordered like `node_names`.
    node_names : sequence of str
        Node names; the position of a name is its index into `level`.

    Returns
    -------
    list
        The kept edges, in the iteration order of `G.edges`.

    Raises
    ------
    KeyError
        If an edge refers to a node that is not in `node_names`.
    """
    # Map each name to its first position once, instead of scanning
    # `node_names` for both end points of every edge.
    position = {}
    for idx, name in enumerate(node_names):
        position.setdefault(name, idx)

    edges_new = []
    for edge in G.edges:
        level_in = level[position[edge[0]]]
        level_out = level[position[edge[1]]]

        jump = int(abs(level_in - level_out))
        if jump <= 1:
            edges_new.append(edge)

    return edges_new


## Calculate adjacency_matrix

In [None]:
override_existing_adj_df = True # <-----

# Get all project directories
# project_dirs = glob(join('..','..','..','project_*'))
project_dirs = ["/Users/lliu2/Documents/GitHub/EULP-uncertainty-quantification/eulpuq"]

# Housing characteristics kept in the PUMS-only adjacency matrix
pums_vars = [
    "Building America Climate Zone",
    "AHS Region",
    "ASHRAE IECC Climate Zone 2004",
    "CEC Climate Zone",
    "Census Region",
    "Census Region RECS",
    "Census Division",
    "State",
    "PUMA",
    "County",

    "Geometry Building Type ACS",
    "Geometry Building Type RECS",
    "Vacancy Status",
    "Tenure",
    "Vintage",
    "Vintage ACS",
    "Heating Fuel",
    "Federal Poverty Levels",
    "Income RECS2015",
    "Occupants",
    "Income",
]

for project_dir in project_dirs:
    outdir = join(project_dir, 'util', 'dependency_wheel')
    os.makedirs(outdir, exist_ok=True)

    # Compute the adjacency matrix from the housing characteristics folder
    path_HCs = join(project_dir, "housing_characteristics")
    level_dct, adj_df = ch.level_calc(path_HCs)

    if override_existing_adj_df:
        # Full adjacency matrix
        outfile = join(outdir, "adjacency_matrix.csv")
        adj_df.to_csv(outfile, index=True)

        # Keep the order of `pums_vars` so the saved sub-matrix has the same
        # row/column order on every run (a set intersection would not).
        adj_columns = set(adj_df.columns)
        pums_vars_intersect = [var for var in pums_vars if var in adj_columns]

        # Sub-matrix restricted to the PUMS variables present in this project
        adj_df2 = adj_df.loc[pums_vars_intersect, pums_vars_intersect]

        outfile2 = join(outdir, 'adjacency_matrix_pums.csv')
        adj_df2.to_csv(outfile2, index=True)
        print(f"Saved adj_df for {project_dir} to:\n  {outfile}")
        print(f"Saved adj_df for {project_dir} to:\n  {outfile2}")

## Load data and plot

In [None]:
# for colors: https://graphviz.org/doc/info/colors.html
color_cycle = ["gainsboro", "darkslategray1", "yellow", "lightpink", "yellowgreen", "tan1", "plum",
               "antiquewhite1", "lightskyblue2", "gold", "thistle1", "palegreen1", "darksalmon", "mediumpurple1"]


def _draw_level_colored_graph(graph, out_file, level_dct, level_color_dct):
    """
    Render `graph` to `out_file` with the graphviz `dot` program, filling
    each node with the color assigned to its level.
    """
    A = nx.nx_agraph.to_agraph(graph)
    A.node_attr['style'] = 'filled'
    A.node_attr['shape'] = 'box'
    A.node_attr['fontsize'] = 8
    A.node_attr['height'] = 0.5
    A.node_attr['margin'] = 0.1
    A.edge_attr['color'] = 'darkgray'

    # color labels by level
    for node in A.nodes():
        node_level = level_dct[node]
        A.get_node(node).attr['fillcolor'] = level_color_dct[node_level]

    A.draw(out_file, prog="dot")


print('Creating a dependency graphs for project:')
print('----------------------------------------')
for project_dir in project_dirs:

    # Create path for adjacency matrix
    path_adj = join(project_dir,'util/dependency_wheel/adjacency_matrix.csv')

    # Load adjacency matrix and construct graph
    try:
        # Load adjacency matrix and graph
        adj_df, G = load_adj_graph(path_adj)

        # Get information about the graph
        print(project_dir)
        n_nodes = len(G.nodes)
        n_edges = len(G.edges)
        node_names = adj_df.columns.values
        longest_path_len = nx.dag_longest_path_length(G) # max level != num of levels
        edges_org = G.edges
        # nx.info is not available in NetworkX 3.x, so build the summary by hand
        print(f"{type(G).__name__} with {n_nodes} nodes and {n_edges} edges")
        print('Longest path length:\t %d' % longest_path_len)

        # Get levels
        nlevels = int(longest_path_len+1)
        hc_level_df, level = get_level_df(adj_df, nlevels, n_nodes)
        print('Maximum level:\t %d' % hc_level_df['level'].max() )
        assert nlevels == hc_level_df["level"].nunique()

        # Save the levels data frame
        ## Make directory if it doesn't exist
        outdir = join(project_dir,'util','dependency_graphs')
        os.makedirs(outdir, exist_ok=True)

        ## Save data frame
        hc_level_df.to_csv(join(outdir,'hc_levels.csv'),index=False)

        # Keep only the edges that connect the same or adjacent levels
        edges_new = isolate_levels(G,level,node_names)

        # Graph restricted to the level-to-level edges
        G_new = nx.DiGraph()
        G_new.add_nodes_from(node_names)
        G_new.add_edges_from(edges_new)

        # Grid position per node: x is the level, y spreads a level's nodes.
        # NOTE(review): `pos` is not consumed by the graphviz rendering below;
        # it is kept in case other cells rely on it.
        scale_x = 1.0
        scale_y = 1.0
        pos = {}
        for i in range(int(longest_path_len)+1):
            idx = np.where(level == i)[0][::-1]
            n_level = len(idx)

            for j, idx_i in enumerate(idx, start=1):
                y = n_level*scale_y*(float(j)/float(n_level)) - n_level/2.0
                x = i*scale_x
                pos[node_names[idx_i]] = (x,y)

        # make color dict (the color cycle repeats if there are more levels than colors)
        level_dct = hc_level_df.set_index("housing_characteristic")["level"]
        colors = np.tile(color_cycle, nlevels//len(color_cycle)+1)[:nlevels]
        level_color_dct = dict(zip(range(nlevels), colors))

        ### [1] dependency graph full
        out_file = join(outdir,'Dependency_Graph_Full.pdf')
        _draw_level_colored_graph(G, out_file, level_dct, level_color_dct)

        ### [2] dependency graph levels
        out_file = join(outdir,'Dependency_Graph_Levels.pdf')
        _draw_level_colored_graph(G_new, out_file, level_dct, level_color_dct)

        print(f"  >> Dependency graphs created, check graphs at {outdir}\n")
    except IOError as err:
        # Stay best-effort per project, but report the failure instead of hiding it
        print(f"  >> Skipped {project_dir}: {err}\n")
        

### Dependency graph for PUMS variables only

In [None]:
# for colors: https://graphviz.org/doc/info/colors.html

### PUMS only
color_cycle = ["gainsboro", "darkslategray1", "yellow", "lightpink", "yellowgreen", "tan1", "plum",
               "antiquewhite1", "lightskyblue2", "gold", "thistle1", "palegreen1", "darksalmon", "mediumpurple1"]


def _draw_level_colored_graph(graph, out_file, level_dct, level_color_dct):
    """
    Render `graph` to `out_file` with the graphviz `dot` program, filling
    each node with the color assigned to its level.
    """
    A = nx.nx_agraph.to_agraph(graph)
    A.node_attr['style'] = 'filled'
    A.node_attr['shape'] = 'box'
    A.node_attr['fontsize'] = 8
    A.node_attr['height'] = 0.5
    A.node_attr['margin'] = 0.1
    A.edge_attr['color'] = 'darkgray'

    # color labels by level
    for node in A.nodes():
        node_level = level_dct[node]
        A.get_node(node).attr['fillcolor'] = level_color_dct[node_level]

    A.draw(out_file, prog="dot")


print('Creating a dependency graphs for project:')
print('----------------------------------------')
for project_dir in project_dirs:

    # Create path for adjacency matrix
    path_adj = join(project_dir,'util/dependency_wheel/adjacency_matrix_pums.csv')

    # Load adjacency matrix and construct graph
    try:
        # Load adjacency matrix and graph
        adj_df, G = load_adj_graph(path_adj)

        # Get information about the graph
        print(project_dir)
        n_nodes = len(G.nodes)
        n_edges = len(G.edges)
        node_names = adj_df.columns.values
        longest_path_len = nx.dag_longest_path_length(G) # max level != num of levels
        edges_org = G.edges
        # nx.info is not available in NetworkX 3.x, so build the summary by hand
        print(f"{type(G).__name__} with {n_nodes} nodes and {n_edges} edges")
        print('Longest path length:\t %d' % longest_path_len)

        # Get levels
        nlevels = int(longest_path_len+1)
        hc_level_df, level = get_level_df(adj_df, nlevels, n_nodes)
        print('Maximum level:\t %d' % hc_level_df['level'].max() )
        assert nlevels == hc_level_df["level"].nunique()

        # Save the levels data frame
        ## Make directory if it doesn't exist
        outdir = join(project_dir,'util','dependency_graphs')
        os.makedirs(outdir, exist_ok=True)

        ## Save data frame under a PUMS-specific name so it does not overwrite
        ## the hc_levels.csv written for the full dependency graph
        hc_level_df.to_csv(join(outdir,'hc_levels_pums.csv'),index=False)

        # Keep only the edges that connect the same or adjacent levels
        edges_new = isolate_levels(G,level,node_names)

        # Graph restricted to the level-to-level edges
        G_new = nx.DiGraph()
        G_new.add_nodes_from(node_names)
        G_new.add_edges_from(edges_new)

        # Grid position per node: x is the level, y spreads a level's nodes.
        # NOTE(review): `pos` is not consumed by the graphviz rendering below;
        # it is kept in case other cells rely on it.
        scale_x = 1.0
        scale_y = 1.0
        pos = {}
        for i in range(int(longest_path_len)+1):
            idx = np.where(level == i)[0][::-1]
            n_level = len(idx)

            for j, idx_i in enumerate(idx, start=1):
                y = n_level*scale_y*(float(j)/float(n_level)) - n_level/2.0
                x = i*scale_x
                pos[node_names[idx_i]] = (x,y)

        # make color dict (the color cycle repeats if there are more levels than colors)
        level_dct = hc_level_df.set_index("housing_characteristic")["level"]
        colors = np.tile(color_cycle, nlevels//len(color_cycle)+1)[:nlevels]
        level_color_dct = dict(zip(range(nlevels), colors))

        ### [1] dependency graph full
        out_file = join(outdir,'Dependency_Graph_Full_PUMS.pdf')
        _draw_level_colored_graph(G, out_file, level_dct, level_color_dct)

        ### [2] dependency graph levels
        out_file = join(outdir,'Dependency_Graph_Levels_PUMS.pdf')
        _draw_level_colored_graph(G_new, out_file, level_dct, level_color_dct)

        print(f"  >> Dependency graphs created, check graphs at {outdir}\n")
    except IOError as err:
        # Stay best-effort per project, but report the failure instead of hiding it
        print(f"  >> Skipped {project_dir}: {err}\n")
        