## Test the Ingestion class
*September 7, 2023*
This notebook is used both to test the Ingestion class and to demonstrate the use of the from_json spec.

Each cell is a separate test or example.


In [None]:
from os import walk
import pandas as pd
import re
import json
from ingestion import Ingestion

In [None]:
# You must run this cell first to define the utility function assert_value()
def assert_value(value, expected_value):
    """Check a test result against its expected value.

    Args:
        value: the value produced by the code under test.
        expected_value: the value the test expects.

    Prints "OK" when the two compare equal.

    Raises:
        Exception: when they differ; the message reports both the expected
            and the actual value.
    """
    if value != expected_value:
        # Include the actual value so a failing cell can be diagnosed
        # without re-running it by hand.
        raise Exception(f"Expected {expected_value} but got {value}")
    print("OK")


In [None]:
# Test Ingestion.get_value_from_filename()
#
# The column spec carries a regex with one capture group; for the sample
# filename below the assertion expects the extracted value to be "A".

test_filename = "Conventional egg production__Farm__A__003.xlsx-result-total-impacts"
test_column = {
    "column_name": "housingType",
    "source": "from_filename",
    "spec": r"^Conventional egg production__Farm__(.*)__.*\.xlsx-result-total-impacts$",
}
value = Ingestion.get_value_from_filename(test_column, test_filename)
assert_value(value, "A")

In [None]:
# Test Ingestion.extend_pivot_data()
#
# Inputs: a 3-element pivot and a 12-element pivot (the cycle 0..3, three times).
# The assertion expects each element of the shorter pivot repeated four times.

test_this_pivot_data = list(range(3))
test_next_pivot_data = [0, 1, 2, 3] * 3
expected_pivot_data = [0] * 4 + [1] * 4 + [2] * 4
extended_pivot_data = Ingestion.extend_pivot_data(test_this_pivot_data, test_next_pivot_data)
assert_value(extended_pivot_data, expected_pivot_data)

<a id='section_id1'></a>
### Tutorial on using the from_json spec

The following examples illustrate how the various tokens in the from_json spec string are used.

#### The `<key>` token
Try running the cell below.

In [None]:
# The <key> token
#
# Here the spec is a single key name applied to a JSON object.

json_data_str = """{
    "letter": "a",
    "fruit": "apple"
}"""
json_data = json.loads(json_data_str)

# Tokenise the dotted spec; the index passed alongside the tokens is 0.
spec = "fruit"
spec_tokens, spec_index = spec.split("."), 0
values, pivots = Ingestion.extract_values_by_spec(json_data, spec_tokens, spec_index)
print(f"values: {values}")


#### The `[].<key>` token pair
Try running the cell below.

In [None]:
# The [].<key> token pair
#
# Here the JSON document is a list of objects and the spec is "[]" followed
# by a key name.

json_data_str = """[
    {
        "letter": "a",
        "fruit": "apple"
    },
    {
        "letter": "b",
        "fruit": "banana"
    },
    {
        "letter": "c",
        "fruit": "cantaloupe"
    }
]"""
json_data = json.loads(json_data_str)
spec = "[].fruit"
# The dotted spec splits into the tokens "[]" and "fruit".
spec_tokens = spec.split(".")
spec_index = 0
(values, pivots) = Ingestion.extract_values_by_spec(json_data, spec_tokens, spec_index)
print(f"values: {values}")


#### The `[n].<key>` token pair
Try running the cell below.

In [None]:
# The [n].<key> token pair
#
# Same list-of-objects document as a fixture; the spec uses the index 1
# inside the brackets, followed by a key name.

json_data_str = """[
    {
        "letter": "a",
        "fruit": "apple"
    },
    {
        "letter": "b",
        "fruit": "banana"
    },
    {
        "letter": "c",
        "fruit": "cantaloupe"
    }
]"""
json_data = json.loads(json_data_str)

# Tokenise the dotted spec; the index passed alongside the tokens is 0.
spec = "[1].fruit"
spec_tokens, spec_index = spec.split("."), 0
values, pivots = Ingestion.extract_values_by_spec(json_data, spec_tokens, spec_index)
print(f"values: {values}")


#### The `__keys__[].<key>` token pair
Try running the cell below.

In [None]:
# The __keys__[].<key> token pair
#
# Here the JSON document is an object whose values are themselves objects
# (one per farm); the spec is "__keys__[]" followed by a key name.

json_data_str = """{
    "farm_a": {
        "letter": "a",
        "fruit": "apple"
    },
    "farm_b": {
        "letter": "b",
        "fruit": "banana"
    },
    "farm_c": {
        "letter": "c",
        "fruit": "cantaloupe"
    }
}"""
json_data = json.loads(json_data_str)

# Tokenise the dotted spec; the index passed alongside the tokens is 0.
spec = "__keys__[].letter"
spec_tokens, spec_index = spec.split("."), 0
values, pivots = Ingestion.extract_values_by_spec(json_data, spec_tokens, spec_index)
print(f"values: {values}")


#### The `__keys__[n].<key>` token pair
Try running the cell below.

In [None]:
# The __keys__[n].<key> token pair
#
# Same object-of-objects document as a fixture; the spec uses the index 1
# inside the "__keys__[...]" brackets, followed by a key name.

json_data_str = """{
    "farm_a": {
        "letter": "a",
        "fruit": "apple"
    },
    "farm_b": {
        "letter": "b",
        "fruit": "banana"
    },
    "farm_c": {
        "letter": "c",
        "fruit": "cantaloupe"
    }
}"""
json_data = json.loads(json_data_str)

# Tokenise the dotted spec; the index passed alongside the tokens is 0.
spec = "__keys__[1].letter"
spec_tokens, spec_index = spec.split("."), 0
values, pivots = Ingestion.extract_values_by_spec(json_data, spec_tokens, spec_index)
print(f"values: {values}")


#### A real-world example of using the from_json spec
You will see values and pivots - don't worry about pivots as they are used internally when merging.

Run the cell below first to define assert_value(), then run the example cell that follows it.

In [None]:
# You must run this cell first to define assert_value(), then you can run
# the examples below
#
def assert_value(value, expected_value):
    """Check a test result against its expected value.

    Args:
        value: the value produced by the code under test.
        expected_value: the value the test expects.

    Prints "OK" when the two compare equal.

    Raises:
        Exception: when they differ; the message reports both the expected
            and the actual value.
    """
    if value != expected_value:
        # Include the actual value so a failing cell can be diagnosed
        # without re-running it by hand.
        raise Exception(f"Expected {expected_value} but got {value}")
    print("OK")


In [None]:
# Test Ingestion.extract_values_by_spec()
#
# Fixture: a JSON list holding one object with a single key
# ("Human toxicity - CML 2 baseline 2000").  That key maps to a list with one
# record, and the record's "upstreamTechFlows" list has two entries, each with
# its own "result" value.

json_data_str = """[
    {
        "Human toxicity - CML 2 baseline 2000": [
            {
                "techFlow": {
                    "provider": {
                        "@type": "Process",
                        "@id": "c8af2d86-cec0-43b3-ad93-703c59cc5d20",
                        "name": "Conventional egg production__Farm__A__003.xlsx",
                        "processType": "UNIT_PROCESS",
                        "flowType": "PRODUCT_FLOW"
                    },
                    "flow": {
                        "@type": "Flow",
                        "@id": "75abe2d6-dcf6-42e0-994f-98d8f7a81958",
                        "name": "Egg-conventional cage layer",
                        "category": "Egg",
                        "flowType": "PRODUCT_FLOW",
                        "refUnit": "kg"
                    }
                },
                "result": 0.5578417652145193,
                "directContribution": 0.0,
                "requiredAmount": 1.0,
                "upstreamTechFlows": [
                    {
                        "techFlow": {
                            "provider": {
                                "@type": "Process",
                                "@id": "11936076-34c7-4748-8093-b7b560b3d015",
                                "name": "Feed mill-layers-conventional- Pelletier 2017, Pulse Canada",
                                "category": "Egg production processes/Updated Models using 2019  collected data/Feed production",
                                "processType": "UNIT_PROCESS",
                                "flowType": "PRODUCT_FLOW"
                            },
                            "flow": {
                                "@type": "Flow",
                                "@id": "ad4ec28d-72c8-4771-b633-893ceb1c2461",
                                "name": "Feed - layer",
                                "category": "Egg",
                                "flowType": "PRODUCT_FLOW",
                                "refUnit": "kg"
                            }
                        },
                        "result": 0.42890065687101325,
                        "directContribution": 0.0,
                        "requiredAmount": 1.5949158953683473,
                        "upstreamTechFlows": []
                    },
                    {
                        "techFlow": {
                            "provider": {
                                "@type": "Process",
                                "@id": "fbfa83d4-6625-4214-9e34-27cc15247e48",
                                "name": "Pullets-conventional",
                                "category": "Egg production processes/Pelletier 2017 Processes/Pullets, Breeder, and Hatchery Production",
                                "processType": "UNIT_PROCESS",
                                "flowType": "PRODUCT_FLOW"
                            },
                            "flow": {
                                "@type": "Flow",
                                "@id": "06425698-26e1-4efb-bfa0-e0fea0cdeb6f",
                                "name": "Pullets-conventional",
                                "category": "Egg",
                                "flowType": "PRODUCT_FLOW",
                                "refUnit": "Item(s)"
                            }
                        },
                        "result": 0.08983064766686122,
                        "directContribution": 0.0,
                        "requiredAmount": 0.04418184063273047,
                        "upstreamTechFlows": []
                    }
                ]
            }
        ]
    }
]"""
json_data = json.loads(json_data_str)
# The dotted spec splits into six tokens:
#   "[]", "__keys__[0]", "[0]", "upstreamTechFlows", "[]", "result"
spec = "[].__keys__[0].[0].upstreamTechFlows.[].result"
spec_tokens = spec.split(".")
spec_index = 0
(values, pivots) = Ingestion.extract_values_by_spec(json_data, spec_tokens, spec_index)
print(f"values: {values}")
# Expected: the "result" fields of the two upstreamTechFlows entries.
# (The stray trailing comma that used to follow this call has been removed;
# it turned the statement into a discarded one-element tuple.)
assert_value(values, [0.42890065687101325, 0.08983064766686122])
print(f"pivots: {pivots}")
# NOTE(review): the pivot names presumably refer to the positions of the two
# "[]" tokens (0 and 4) in spec_tokens - confirm against Ingestion.
assert_value(pivots, {'pivot_0': [0, 0], 'pivot_4': [0, 1]})


<a id='section_id2'></a>
### Tutorial on merging DataFrames

The following example illustrates how two dataframes are merged.  Run the cells sequentially.

In [None]:
# Left-hand frame for the merge tutorial: one row per farm.
left_d = {
    "farm": ["farm_a", "farm_b", "farm_c"],
    "letter": ["a", "b", "c"],
    "fruit": ["apple", "banana", "cantaloupe"],
}
left_df = pd.DataFrame(left_d)
left_df

In [None]:
# Right-hand frame for the merge tutorial: three characteristic rows
# (x, y, z) for each of the three farms.
right_d = {
    "farm": ["farm_a"] * 3 + ["farm_b"] * 3 + ["farm_c"] * 3,
    "characteristic": ["x", "y", "z"] * 3,
    "value": [
        0.1, 0.2, 0.3,  # farm_a
        0.2, 0.4, 0.6,  # farm_b
        0.3, 0.6, 0.9,  # farm_c
    ],
}
right_df = pd.DataFrame(right_d)
right_df

In [None]:
# Registry of the tutorial frames, keyed by the names the creation spec uses.
dataframes = {
    "left_df": left_df,
    "right_df": right_df,
}


In [None]:
# Creation spec: "merged_df" is built by merging left_df with right_df,
# joining on the "farm" column of each.
dataframe_creation = {
    "dataframe_name": "merged_df",
    "creation_method": "merge",
    "left_dataframe": "left_df",
    "right_dataframe": "right_df",
    "left_merge_columns": ["farm"],
    "right_merge_columns": ["farm"],
}


In [None]:
# This is the merge code, factored into its own function
#
def merge_dataframes(dataframes, dataframe_creation):
    """Build the DataFrame described by a "merge" creation spec.

    Args:
        dataframes: dict mapping dataframe names to pandas DataFrames; it
            must contain the two frames named by the spec.
        dataframe_creation: dict with the keys "dataframe_name",
            "left_dataframe", "right_dataframe", "left_merge_columns" and
            "right_merge_columns".

    Returns:
        The outer merge of the left frame with the right frame on the
        given merge columns.

    Raises:
        KeyError: if a spec key or a named dataframe is missing.
    """
    dataframe_name = dataframe_creation["dataframe_name"]
    left_df_name = dataframe_creation["left_dataframe"]
    right_df_name = dataframe_creation["right_dataframe"]
    print("\n")
    print(f"Dataframe {dataframe_name} will be merged from {left_df_name} and {right_df_name}")
    # Local names only: the notebook-level left_df / right_df defined in
    # earlier cells are deliberately not rebound here.
    left_frame = dataframes[left_df_name]
    right_frame = dataframes[right_df_name]
    return left_frame.merge(
        right_frame,
        how='outer',
        left_on=dataframe_creation["left_merge_columns"],
        right_on=dataframe_creation["right_merge_columns"],
    )


dataframe_name = dataframe_creation["dataframe_name"]
creation_method = dataframe_creation["creation_method"]
if creation_method != "merge":
    # Fail here with a clear message instead of falling through to a
    # KeyError (or stale data) on the display line below.
    raise ValueError(f"Unsupported creation_method: {creation_method!r}")
dataframes[dataframe_name] = merge_dataframes(dataframes, dataframe_creation)

print(f"Dataframe name: {dataframe_name}")
dataframes[dataframe_name]