In [1]:
import csv

from enbios2.const import BASE_DATA_PATH

# Folder that holds all input sheets (exported as CSV) and receives the generated files.
base_path = BASE_DATA_PATH / "temp/miquel_upscaling"

# Sheet "dendrogram-technology dict": maps each "Dendrogram name" to a "tech name".
base_denodo_gram_file = base_path / "dendrogram_generation_base.csv"
# PNIEC dendrogram sheet: one Child/Parent relation per row (plus a "Dendrogram level" column).
dendogram_file = base_path / "dendrogram_generation_dendo.csv"
# Display whether both input files are present.
base_denodo_gram_file.exists(), dendogram_file.exists()

(True, True)

In [2]:
# Read both CSV sheets fully into memory.
# The files are opened in `with` blocks so the handles are closed again, and with
# newline="" as the csv module recommends for files handed to a reader.
with base_denodo_gram_file.open(encoding="utf-8", newline="") as base_fh:
    dendo_base_reader = csv.DictReader(base_fh)
    print(dendo_base_reader.fieldnames)
    all_base_rows = list(dendo_base_reader)

with dendogram_file.open(encoding="utf-8", newline="") as dendo_fh:
    dendo_reader = csv.DictReader(dendo_fh)
    print(dendo_reader.fieldnames)
    dendo_rows = list(dendo_reader)

['Dendrogram name', 'tech name']
['Child', 'Parent', 'Dendrogram level']


In [3]:
# Match the "Dendrogram name" column (base sheet) against the "Child" column (dendo_rows).
base_dendo_names = {base_row['Dendrogram name'] for base_row in all_base_rows}
# Rows with an empty "Child" cell are skipped.
base_dendo_names_in_dendo = {
    dendo_row['Child'] for dendo_row in dendo_rows if dendo_row["Child"]
}

In [4]:
# Good if this is empty: then every "Dendrogram name" appears somewhere as a Child (of something).
base_dendo_names.difference(base_dendo_names_in_dendo)

set()

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
from enbios2.experiment.sum_hierarchy import HierarchyNode

# Build the hierarchy (tree). "Dendrogram" is really the name of the plot, not of the
# data structure, so we simply call it a tree here.
# The "Dendrogram level" column is ignored; only the Child - Parent relationship is used.
# Note: a row can only be attached if its parent was created by an earlier row.

base_tree: "HierarchyNode | None" = None
all_nodes: list[HierarchyNode] = []

for row in dendo_rows:
    # Normalise both cells the same way. Previously only the child name was stripped,
    # so a parent cell (or the root name) with stray whitespace would not match.
    child_name = (row["Child"] or "").strip()
    parent_name = (row["Parent"] or "").strip()
    if not child_name:
        continue

    node = HierarchyNode(child_name)
    if not parent_name:
        # A row without parent is the root of the tree.
        base_tree = node
        all_nodes.append(node)
        continue

    # First node created so far that carries the parent's name.
    parent = next((candidate for candidate in all_nodes if candidate.name == parent_name), None)
    if parent is None:
        print(row, "does not connect")
        print(all_nodes)
        continue
    parent.add_child(node)
    all_nodes.append(node)

In [7]:
# OK, so far so good: sanity-check the base tree and export it.
# presumably fails if two nodes share a name -- TODO confirm against HierarchyNode
base_tree.assert_all_names_unique()
# NOTE(review): this result is neither stored nor displayed (only a cell's last
# expression is shown); it looks like a leftover from inspecting the tree.
base_tree.as_dict()
# Write the tree to base_tree.csv in the data folder.
base_tree.to_csv(base_path / "base_tree.csv", include_attrs=[], merge_first_sub_row=True)

OK, this was just a test of whether the basic tree can be built. Now let's build a mega tree.
The root is called "impacts". It has several children (ONE PER SCENARIO); each scenario has 18 children, one per indicator; and each indicator has 2 children: Onsite and Offsite.
Each of these 2 children holds a copy of the subtree we built before.

In [8]:
# The generation table uses technology names, which are not in the tree; the mapping
# is given by "dendrogram_generation_base.csv".
# So look up every "Dendrogram name" node and rename it to its "tech name".
for base_row in all_base_rows:
    dendrogram_name, tech_name = base_row['Dendrogram name'], base_row["tech name"]
    tree_node = base_tree.find_child_by_name(dendrogram_name)
    assert tree_node
    tree_node.name = tech_name

In [9]:
generation_file = base_path / "PNIEC_generation.csv"
assert generation_file.exists()
# Read all rows up front; the `with` block closes the file handle again.
with generation_file.open(encoding="utf-8", newline="") as generation_fh:
    generation_rows = list(csv.DictReader(generation_fh))

# First check whether the "Technologies" values match the "tech name" values.
base_tech_names = set(r["tech name"] for r in all_base_rows)
technologies = set(r["Technologies"] for r in generation_rows)

# What we extract from "Technologies" (whether or not it matches a tech name).
real_tech_names = set()

# Map from Technologies -> tech name; needed later to find the tree node for a row.
technology2tree_tech_map: dict[str, str] = {}
# "el" seems to be a good split token in the Technologies column.
for technology in technologies:
    parts = technology.split("_")
    # Check for "el" as a whole "_"-separated token. A plain substring test would
    # also accept e.g. "fuel" and then fail in parts.index() below.
    assert "el" in parts, f"no standalone 'el' token in {technology!r}"
    el_index = parts.index("el")
    # The bits before "el" are the real tech name.
    real_tech_name = "_".join(parts[:el_index])
    real_tech_names.add(real_tech_name)
    if real_tech_name in base_tech_names:
        technology2tree_tech_map[technology] = real_tech_name
    else:  # we are happy that this is actually never printed
        print(real_tech_name, "not in base tech names")


In [25]:
# Build the top layers (root -> scenario -> indicator) in a single pass over the
# generation rows, collecting scenarios and indicators in order of first appearance.
# The Onsite/Offsite layer is added in a later step.
root = HierarchyNode("impacts")

for row in generation_rows:
    if (scenario := row["Scenario"]) not in root:
        scenario_node = HierarchyNode(scenario)
        root.add_child(scenario_node)
    else:
        scenario_node = root[scenario]
    # This check must run for EVERY row, including the one that created the scenario.
    # Nested under the `else`, the first row's indicator was only added if a later
    # row of the same scenario happened to repeat it.
    if (indicator := row["Indicators"]) not in scenario_node:
        scenario_node.add_child(HierarchyNode(indicator))

root.as_dict()

{'impacts': {'name': 'impacts',
  'children': {'production_2015': {'name': 'production_2015',
    'children': {'global warming potential (GWP1000)': {'name': 'global warming potential (GWP1000)',
      'children': {},
      'value': None},
     'freshwater ecotoxicity potential (FETP)': {'name': 'freshwater ecotoxicity potential (FETP)',
      'children': {},
      'value': None},
     'marine ecotoxicity potential (METP)': {'name': 'marine ecotoxicity potential (METP)',
      'children': {},
      'value': None},
     'terrestrial ecotoxicity potential (TETP)': {'name': 'terrestrial ecotoxicity potential (TETP)',
      'children': {},
      'value': None},
     'fossil fuel potential (FFP)': {'name': 'fossil fuel potential (FFP)',
      'children': {},
      'value': None},
     'freshwater eutrophication potential (FEP)': {'name': 'freshwater eutrophication potential (FEP)',
      'children': {},
      'value': None},
     'marine eutrophication potential (MEP)': {'name': 'marine eut

In [26]:
# Add the Onsite/Offsite layer: every current leaf of `root` receives two independent
# copies of the base tree, whose top nodes are then renamed.
from copy import deepcopy

for leaf_node in root.get_leaves():
    # First attach both copies ...
    for _ in range(2):
        leaf_node.add_child(deepcopy(base_tree))
    # ... then rename them by position: child 0 -> Onsite, child 1 -> Offsite.
    for position, site_name in enumerate(("Onsite", "Offsite")):
        leaf_node[position].name = site_name

Let's do some introspection...

In [27]:
root, root.get_child_names() # root has 7 children: one per scenario

([impacts - 7 children],
 ['production_2015',
  'tendential_2020',
  'tendential_2025',
  'tendential_2030',
  'target_2020',
  'target_2025',
  'target_2030'])

In [28]:
# root[0] is the first scenario; its children are the 18 indicators
# (each indicator in turn has 2 children: Onsite and Offsite).
# Note: __getitem__ was added to BaseNode, so the [] operator can be used to get the children.
some_scenario = root[0]
some_scenario.get_child_names(), some_scenario.get_num_children()

(['global warming potential (GWP1000)',
  'freshwater ecotoxicity potential (FETP)',
  'marine ecotoxicity potential (METP)',
  'terrestrial ecotoxicity potential (TETP)',
  'fossil fuel potential (FFP)',
  'freshwater eutrophication potential (FEP)',
  'marine eutrophication potential (MEP)',
  'human toxicity potential (HTPc)',
  'human toxicity potential (HTPnc)',
  'ionising radiation potential (IRP)',
  'agricultural land occupation (LOP)',
  'surplus ore potential (SOP)',
  'ozone depletion potential (ODPinfinite)',
  'particulate matter formation potential (PMFP)',
  'photochemical oxidant formation potential: humans (HOFP)',
  'photochemical oxidant formation potential: ecosystems (EOFP)',
  'water consumption potential (WCP)',
  'terrestrial acidification potential (TAP)'],
 18)

In [29]:
# Going one step down: the first indicator has the children Onsite and Offsite,
# and below Onsite the first level of the copied base tree follows.
some_scenario[0].get_child_names(), some_scenario[0][0].get_child_names()

(['Onsite', 'Offsite'], ['renewables', 'non-renewables'])

In [16]:
# Fill all indicator subtrees row by row: for each generation row, find the matching
# technology node below Onsite and below Offsite and store the row's value there.
# technology2tree_tech_map (built earlier) translates the row's "Technologies" value
# into the technology name used in the tree.
for index, row in enumerate(generation_rows):
    # Walk down: scenario subtree -> indicator subtree.
    scenario_subtree = root[row["Scenario"]]
    indicator_subtree = scenario_subtree[row["Indicators"]]

    # Use .get() so that an unmapped technology does not raise a KeyError.
    tree_tech_name = technology2tree_tech_map.get(row["Technologies"], None)
    if not tree_tech_name:
        print(row["Technologies"], "will be ignored")
        # Actually skip the row. Without this, the lookup below ran with an empty
        # name and printed a misleading "error" line for a row meant to be ignored.
        continue

    # The children are called Onsite and Offsite, exactly like the 2 value columns.
    for child in indicator_subtree.children:
        tech_node = child.find_child_by_name(tree_tech_name)
        if not tech_node:
            print("error", child.name, index, row["Technologies"], tree_tech_name)
            break
        tech_node.value = float(row[child.name])  # child.name is either Onsite or Offsite

In [17]:
# Run calc() on every node at level 2 below the root (the indicator nodes).
# presumably this aggregates the leaf values upwards -- TODO confirm against HierarchyNode.calc
level_two_nodes = root.collect_all_nodes_at_level(2)
for indicator_node in level_two_nodes:
    indicator_node.calc()

In [18]:
import json

# Serialise the complete tree to JSON.
# The dict is built before the file is opened (same order as before), and the
# `with` block makes sure the file is flushed and closed after writing.
complete_tree_dict = root.as_dict()
with (base_path / "complete.json").open("w", encoding="utf-8") as json_fh:
    json.dump(complete_tree_dict, json_fh, indent=2)

In [19]:
# Export the complete tree as CSV next to the JSON file.
complete_csv_path = base_path / "complete.csv"
root.to_csv(complete_csv_path)

In [20]:
from enbios2.generic.util import safe_name

# Write one sanky csv file per indicator, grouped in one folder per scenario.
# ! These hierarchies do not have unique names (the same subtree sits below Onsite and
#   Offsite), so the names are made unique on a copy before anything is written.
sanky_root = base_path / "sanky"
sanky_root.mkdir(exist_ok=True)

# Work on a copy so that the names in `root` stay untouched.
root_cp = deepcopy(root)
root_cp.make_names_unique()

for scenario in root_cp.collect_all_nodes_at_level(1):
    scenario_folder = sanky_root / scenario.name
    scenario_folder.mkdir(exist_ok=True)

    # Level 1 below a scenario: its indicator subtrees.
    for indicator_subtree in scenario.collect_all_nodes_at_level(1):
        sanky_file_name = f"{safe_name(indicator_subtree.name)}_sanky.csv"
        indicator_subtree.to_sanky_tree(scenario_folder / sanky_file_name)