In [47]:
import csv

from enbios2.const import BASE_DATA_PATH

# Working directory that holds all input and output files of this notebook.
base_path = BASE_DATA_PATH / "temp/miquel_upscaling"

# Mapping sheet "dendrogram-technology dict" (columns: 'Dendrogram name', 'tech name').
base_denodo_gram_file = base_path.joinpath("dendrogram_generation_base.csv")
# PNIEC dendrogram sheet (columns: 'Child', 'Parent', 'Dendrogram level').
dendogram_file = base_path.joinpath("dendrogram_generation_dendo.csv")

# Show for both input files whether they are present.
tuple(csv_path.exists() for csv_path in (base_denodo_gram_file, dendogram_file))

(True, True)

In [48]:
# Read both CSV sheets completely into memory as lists of dicts (one dict per row).
# The files are opened in `with` blocks so the handles are closed again, and with
# newline="" as the csv module documentation recommends for files it reads.
with base_denodo_gram_file.open(encoding="utf-8", newline="") as base_csv:
    dendo_base_reader = csv.DictReader(base_csv)
    # the header is read lazily, so this has to happen while the file is still open
    print(dendo_base_reader.fieldnames)
    all_base_rows = list(dendo_base_reader)

with dendogram_file.open(encoding="utf-8", newline="") as dendo_csv:
    dendo_reader = csv.DictReader(dendo_csv)
    print(dendo_reader.fieldnames)
    dendo_rows = list(dendo_reader)

['Dendrogram name', 'tech name']
['Child', 'Parent', 'Dendrogram level']


In [49]:
# Cross-check the two sheets: collect every 'Dendrogram name' of the mapping sheet and
# every 'Child' of the dendrogram sheet, so that the two sets can be compared.
base_dendo_names = {base_row['Dendrogram name'] for base_row in all_base_rows}
base_dendo_names_in_dendo = {
    dendo_row['Child']
    for dendo_row in dendo_rows
    if dendo_row["Child"]  # rows with an empty 'Child' cell are left out
}

In [50]:
# Names of the mapping sheet that never appear as 'Child' in the dendrogram sheet.
# An empty set is the good case: every 'Dendrogram name' is the child of something.
base_dendo_names.difference(base_dendo_names_in_dendo)

set()

In [51]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported project
# modules are picked up without restarting the kernel.
# NOTE(review): when the extension is already loaded, %load_ext only prints a hint
# (see the output below); this setup would normally live in a config cell at the top.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
from enbios2.experiment.sum_hierarchy import HierarchyNode

# Build the technology hierarchy (a tree) from the dendrogram sheet.
# "Dendrogram" is really the name of the plot; the data structure itself is a tree.
# Only the Child -> Parent relation is used, the "Dendrogram level" column is ignored.
#
# Rows have to come in top-down order: a parent must appear as "Child" in an earlier
# row than its own children, otherwise the row is reported and skipped.

base_tree: "HierarchyNode | None" = None  # root of the tree: the row without a Parent
all_nodes: list[HierarchyNode] = []  # every node created so far, in row order
nodes_by_name: dict[str, HierarchyNode] = {}  # lookup of the created nodes by name

for row in dendo_rows:
    # Strip both cells in the same way, so that stray whitespace in the sheet can not
    # break the parent lookup (`or ""` covers cells that DictReader fills with None).
    child_name = (row["Child"] or "").strip()
    parent_name = (row["Parent"] or "").strip()
    if not child_name:
        continue

    if not parent_name:
        # a row without a parent is the root of the tree
        if base_tree is not None:
            print(row, "is another root row and replaces", base_tree.name)
        node = HierarchyNode(child_name)
        base_tree = node
    else:
        parent = nodes_by_name.get(parent_name)
        if parent is None:
            print(row, "does not connect")
            print(all_nodes)
            continue
        node = HierarchyNode(child_name)
        parent.add_child(node)

    all_nodes.append(node)
    # setdefault: should a name occur twice, the first node stays the lookup target
    nodes_by_name.setdefault(child_name, node)

# fail here with a clear message instead of with an AttributeError in a later cell
assert base_tree is not None, "no root row (a row with an empty 'Parent') in the dendrogram sheet"

In [53]:
# Sanity check and export of the plain technology tree built above.
base_tree.assert_all_names_unique()
# NOTE(review): the result of as_dict() is discarded here (it is not the last expression
# of the cell) — presumably a leftover smoke test that the conversion runs through.
base_tree.as_dict()
# Write the tree to "base_tree.csv" in the working directory.
base_tree.to_csv(base_path / "base_tree.csv", include_attrs=[], merge_first_sub_row=True)

OK, this was just a test of whether the basic tree can be built. Now let's build a mega tree.
The root is called "impacts". It has 18 children, one for each indicator, and each indicator has 2 children: "Onsite" and "Offsite".
Each of these 2 nodes gets a subtree of the kind we built before.

In [54]:
# The generation table refers to technologies by their tech name, which is not in the
# tree yet: the tree nodes carry the "Dendrogram name" of "dendrogram_generation_base.csv".
# So find every "Dendrogram name" node and rename it to the value of the "tech name" column.
#
# NOTE: this cell mutates base_tree in place; running it a second time fails on the
# assert below, because the "Dendrogram name" nodes are gone after the first run.

# 1. collect the mapping; identical duplicate rows are tolerated, conflicting ones are not
tech_name_by_dendrogram_name: dict[str, str] = {}
for row in all_base_rows:
    dendrogram_name, tech_name = row['Dendrogram name'], row["tech name"]
    known_tech_name = tech_name_by_dendrogram_name.setdefault(dendrogram_name, tech_name)
    assert known_tech_name == tech_name, (
        f"'{dendrogram_name}' is mapped to both '{known_tech_name}' and '{tech_name}'"
    )

# 2. resolve all nodes BEFORE renaming any of them. Renaming while still searching would
#    let a freshly assigned tech name shadow a "Dendrogram name" that is looked up later,
#    which would make the result depend on the order of the rows.
nodes_to_rename = []
for dendrogram_name, tech_name in tech_name_by_dendrogram_name.items():
    node = base_tree.find_child_by_name(dendrogram_name)
    assert node, f"'{dendrogram_name}' not found in the base tree"
    nodes_to_rename.append((node, tech_name))

# 3. rename
for node, tech_name in nodes_to_rename:
    node.name = tech_name

In [55]:
# Load the PNIEC generation table and link the values of its "Technologies" column to
# the tech names used in the tree.
generation_file = base_path / "PNIEC_generation.csv"
assert generation_file.exists()
# `with` closes the file handle again; newline="" as recommended by the csv docs
with generation_file.open(encoding="utf-8", newline="") as generation_csv:
    generation_rows = list(csv.DictReader(generation_csv))

# first lets check if "Technologies" match the "tech name"
base_tech_names = {r["tech name"] for r in all_base_rows}
technologies = {r["Technologies"] for r in generation_rows}

# the tech names derived from the "Technologies" values
real_tech_names = set()

# map from Technologies -> tech name, needed later to find the tree node of a table row
technology2tree_tech_map: dict[str, str] = {}
# "el" seems to be a good split token in the Technologies column.
# sorted(): a set has no stable order, this keeps the printed diagnostics reproducible.
for technology in sorted(technologies):
    parts = technology.split("_")
    # "el" has to be a whole "_"-separated token. A plain substring test would also accept
    # names like "diesel_x" and then fail in parts.index() with an unhelpful ValueError.
    assert "el" in parts, f"no '_'-separated 'el' token in technology '{technology}'"
    el_index = parts.index("el")
    # the bits before el are the real tech name
    real_tech_name = "_".join(parts[:el_index])
    real_tech_names.add(real_tech_name)
    if real_tech_name in base_tech_names:
        technology2tree_tech_map[technology] = real_tech_name
    else:  # we are happy that this is actually never printed
        print(real_tech_name, "not in base tech names")


In [56]:
# Collect the indicator names, which become the 2nd layer of the complete tree.
# It is enough to walk over the rows of the very first technology and to stop as soon
# as another technology starts (assumes every technology lists all indicators).
# Surely this could also be done with a single pandas call.
first_tech = None
indicators = []

for generation_row in generation_rows:
    current_tech = generation_row["Technologies"]
    first_tech = first_tech or current_tech  # remember the technology of the first row
    if current_tech != first_tech:
        break
    indicators.append(generation_row["Indicators"])
indicators

['terrestrial acidification potential (TAP)',
 'global warming potential (GWP1000)',
 'freshwater ecotoxicity potential (FETP)',
 'marine ecotoxicity potential (METP)',
 'terrestrial ecotoxicity potential (TETP)',
 'fossil fuel potential (FFP)',
 'freshwater eutrophication potential (FEP)',
 'marine eutrophication potential (MEP)',
 'human toxicity potential (HTPc)',
 'human toxicity potential (HTPnc)',
 'ionising radiation potential (IRP)',
 'agricultural land occupation (LOP)',
 'surplus ore potential (SOP)',
 'ozone depletion potential (ODPinfinite)',
 'particulate matter formation potential (PMFP)',
 'photochemical oxidant formation potential: humans (HOFP)',
 'photochemical oxidant formation potential: ecosystems (EOFP)',
 'water consumption potential (WCP)']

In [57]:
# The top three layers of the complete tree:
#   1st layer: the root "impacts"
#   2nd layer: one node per indicator
#   3rd layer: an "Onsite" and an "Offsite" node below every indicator
root = HierarchyNode("impacts")

# the indicator nodes are also kept in a list, which makes them easier to find later
indicator_subtrees = [
    HierarchyNode(indicator_name, [HierarchyNode("Onsite"), HierarchyNode("Offsite")])
    for indicator_name in indicators
]
for indicator_subtree in indicator_subtrees:
    root.add_child(indicator_subtree)

root.as_dict()

{'impacts': {'name': 'impacts',
  'children': {'terrestrial acidification potential (TAP)': {'name': 'terrestrial acidification potential (TAP)',
    'children': {'Onsite': {'name': 'Onsite', 'children': {}, 'value': None},
     'Offsite': {'name': 'Offsite', 'children': {}, 'value': None}},
    'value': None},
   'global warming potential (GWP1000)': {'name': 'global warming potential (GWP1000)',
    'children': {'Onsite': {'name': 'Onsite', 'children': {}, 'value': None},
     'Offsite': {'name': 'Offsite', 'children': {}, 'value': None}},
    'value': None},
   'freshwater ecotoxicity potential (FETP)': {'name': 'freshwater ecotoxicity potential (FETP)',
    'children': {'Onsite': {'name': 'Onsite', 'children': {}, 'value': None},
     'Offsite': {'name': 'Offsite', 'children': {}, 'value': None}},
    'value': None},
   'marine ecotoxicity potential (METP)': {'name': 'marine ecotoxicity potential (METP)',
    'children': {'Onsite': {'name': 'Onsite', 'children': {}, 'value': None},

In [58]:
from copy import deepcopy

# Every "Onsite" and "Offsite" node (the lowest nodes so far) gets its own independent
# copy of the base tree that was built above.
for indicator_node in indicator_subtrees:
    onsite_node, offsite_node = indicator_node.children
    for site_node in (onsite_node, offsite_node):
        site_node.join_tree(deepcopy(base_tree))

Let's do some introspection...

In [59]:
root # the root has 18 children: one for each indicator

[impacts - 18 children ]

In [60]:
# the first indicator has 2 children: Onsite and Offsite
root.children[0], list(root.children[0].children)

([terrestrial acidification potential (TAP) - 2 children (impacts)],
 [[Onsite - 2 children (terrestrial acidification potential (TAP))],
  [Offsite - 2 children (terrestrial acidification potential (TAP))]])

In [61]:
# Going one step further down, we see "renewables" and "non-renewables":
# the top of the base tree that was joined below the Onsite node.
root.children[0].children[0], [kid.name for kid in root.children[0].children[0].children]

([Onsite - 2 children (terrestrial acidification potential (TAP))],
 ['renewables', 'non-renewables'])

In [62]:
# Fill all indicator subtrees, going through the generation table row by row: find the
# technology node below "Onsite" and below "Offsite" of the row's indicator and set its value.
# technology2tree_tech_map (created before) translates the "Technologies" value of a row
# into the name of the technology node in the tree.

# Index the indicator nodes by name once, instead of scanning the list for every row
# (reversed, so that the first node wins should a name ever occur twice).
indicator_subtree_by_name = {subtree.name: subtree for subtree in reversed(indicator_subtrees)}

for index, row in enumerate(generation_rows):
    # an indicator without a subtree raises a KeyError that names the indicator
    indicator_subtree: HierarchyNode = indicator_subtree_by_name[row["Indicators"]]

    # use get, so that a technology without a mapping can be skipped instead of raising
    tree_tech_name = technology2tree_tech_map.get(row["Technologies"], None)
    if not tree_tech_name:
        print(row["Technologies"], "will be ignored")
        # really ignore the row; without this the lookup below would run with None
        continue

    # the children are called Onsite and Offsite, exactly like the 2 columns with values
    for child in indicator_subtree.children:
        tech_node = child.find_child_by_name(tree_tech_name)
        if not tech_node:
            print("error", child.name, index, row["Technologies"], tree_tech_name)
            break
        tech_node.value = float(row[child.name])  # child.name is either Onsite or Offsite...

In [63]:
# Run calc() on each indicator subtree separately (not on the root).
# NOTE(review): presumably calc() aggregates the technology values set above up the
# hierarchy — confirm in HierarchyNode.calc.
for indicator_subtree in root.children:
    indicator_subtree.calc()

In [64]:
import json
# Write the complete tree to "complete.json".
# The dict is built first, so a failing conversion does not leave a truncated file
# behind, and the file is opened in a `with` block so that the handle is flushed and closed.
complete_tree_dict = root.as_dict()
with (base_path / "complete.json").open("w", encoding="utf-8") as complete_json:
    json.dump(complete_tree_dict, complete_json, indent=2)

In [65]:
# Export the complete tree to "complete.csv" in the working directory.
root.to_csv(base_path / "complete.csv")

In [69]:
# Write one sanky csv file per indicator into the "sanky" sub-folder.
# ! The hierarchies do not have unique names (the same subtree sits below Onsite and
#   Offsite), so the names are made unique before a subtree is written.
# NOTE(review): make_names_unique() is called on the nodes of `root` itself, so it
# presumably changes the tree in place — confirm before re-running this cell.
# NOTE(review): some indicator names contain ":" which ends up in the file name.
sanky_dir = base_path / "sanky"
sanky_dir.mkdir(exist_ok=True)
for subtree in root.children:
    subtree.make_names_unique()
    subtree.to_sanky_tree(sanky_dir / f"{subtree.name}_sanky.csv")