### ToDos
- check if unit row lenght is correct

In [1]:
# set environment variable (only for jupyter notebook)
import os
os.environ["UNFCCC_GHG_ROOT_PATH"] = "/Users/danielbusch/Documents/UNFCCC_non-AnnexI_data"

In [2]:
import camelot
import primap2 as pm2
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

from UNFCCC_GHG_data.helper import downloaded_data_path, extracted_data_path

In [3]:
# ###
# configuration
# ###

input_folder = downloaded_data_path / 'UNFCCC' / 'Guinea' / 'BUR1'
output_folder = extracted_data_path / 'UNFCCC' / 'Guinea'
if not output_folder.exists():
    output_folder.mkdir()

pdf_file = "Rapport_IGES-Guinee-BUR1_VF.pdf"


In [4]:
# primap2 format conversion
coords_cols = {
    "category": "category",
    "entity": "entity",
    "unit": "unit",
}

coords_defaults = {
    "source": "GIN-GHG-Inventory",
    "provenance": "measured",
    "area": "GIN",
    "scenario": "BUR1",
}

coords_terminologies = {
    "area": "ISO3",
    # TODO check if this is correct
    "category": "IPCC1996_2006_GIN_Inv",
    "scenario": "PRIMAP",
}

### Q: How to choose gwp_to_use?
### Q: 'unit' and 'category' are 'PRIMAP1'. Are there other options?
### Q: Why are we mapping 'NMVOCs': 'NMVOC', wouldn't it be easier to name it NMVOC in the first place?

In [5]:
# Are we choosing this gwp
gwp_to_use = "AR4GWP100"
coords_value_mapping = {
    'main' : {
    "unit": "PRIMAP1",
    "category": "PRIMAP1",
    "entity": {
        'HFCs': f"HFCS ({gwp_to_use})",
        'PFCs': f"PFCS ({gwp_to_use})",
        'SF6' : f"SF6 ({gwp_to_use})",
        'NMVOCs': 'NMVOC',
    }
    },
    'energy' : {
    "unit": "PRIMAP1",
    "category": "PRIMAP1",
    "entity": {
        'NMVOCs': 'NMVOC',
    }
    },
    'lulucf' : {
    "unit": "PRIMAP1",
    "category": "PRIMAP1",
    "entity": {
        'NMVOCs': 'NMVOC',
    }
    },
    'waste' : {
    "unit": "PRIMAP1",
    "category": "PRIMAP1",
    "entity": {
        'NMVOCs': 'NMVOC',
    }
    },
    'trend' : {
    "unit": "PRIMAP1",
    "category": "PRIMAP1",
    },
    
}


filter_remove = {
    'f_memo': {"category": "MEMO"},
}

### Q: What to put under references and rights?

In [9]:
meta_data = {
    "references": "placeholder",
    "rights": "",
    "contact": "mail@johannes-guetschow.de",
    "title": "Guinea. Biennial update report (BUR). BUR1",
    "comment": "Read fom pdf by Daniel Busch",
    "institution": "UNFCCC",
}

In [7]:
page_def_templates = {
    '110': {
        "area": ['36,718,589,87'],
        "cols": ['290,340,368,392,425,445,465,497,535,564'],
    },
    '111': {
        "area": ['36,736,587,107'],
        "cols": ['293,335,369,399,424,445,468,497,535,565'],
    },
    '112': {
        "area": ['35,733,588,106'],
        "cols": ['293,335,369,399,424,445,468,497,535,565'],
    },
    '113': {
        "area": ['35,733,588,106'],
        "cols": ['293,335,365,399,424,445,468,497,535,565'],
    },
    '131' : {
                "area": ['36,718,590,83'],
                "cols": ['293,332,370,406,442,480,516,554'],
            },
}

# for main table
header_inventory = ['Greenhouse gas source and sink categories',
                   'CO2', 'CH4', "N2O", 'HFCs', 'PFCs', 'SF6', 'NOx', 'CO', 'NMVOCs','SO2'
                   ]
# TODO the extra '-' may be wrong here, check again!
unit_inventory = ['-'] + ['Gg'] * len(header_inventory) # one extra for the category columns
unit_inventory[4] = "GgCO2eq"
unit_inventory[5] = "GgCO2eq"
unit_inventory[6] = "GgCO2eq"

# for energy tables
header_energy = ['Greenhouse gas source and sink categories',
                   'CO2', 'CH4', "N2O", 'NOx', 'CO', 'NMVOCs','SO2'
                   ]
unit_energy = ['-'] + ['Gg'] * len(header_energy) # one extra for the category columns

# for lulucf tables
header_lulucf = ['Greenhouse gas source and sink categories', 'CO2', 'CH4', "N2O", 'NOx', 'CO', 'NMVOCs']
unit_lulucf = ['-'] + ['Gg'] * (len(header_lulucf) - 1)

# for waste table
header_waste = ['Greenhouse gas source and sink categories', 'CO2', 'CH4', "N2O", 'NOx', 'CO', 'NMVOCs', 'SO2']
unit_waste = ['-'] + ['Gg'] * (len(header_waste) - 1)

# for trend table (unit is always Gg for this table)
header_trend = ['data1990', 'data1995', "data2000", 'data2005', 'data2010', 'data2015', 'data2018', 'data2019']


# define config dict
inv_conf = {
    'header': header_inventory,
    'unit': unit_inventory,
    'header_energy' : header_energy,
    'unit_energy' : unit_energy,
    'header_lulucf' : header_lulucf,
    'unit_lulucf' : unit_lulucf,
    'header_waste' : header_waste,
    'unit_waste' : unit_waste,
    'header_trend' : header_trend,
    'entity_row': 0,
    'unit_row': 1,
    'index_cols': "Greenhouse gas source and sink categories",
    'year': {'110' : 1990,
             '111' : 2000,
             '112' : 2010,
             '113' : 2019,
             '116' : 1990,
             '117' : 2000,
             '118' : 2010,
             '119' : 2019,
             '124' : 1990,
             '125' : 2000,
             '126' : 2010,
             '127' : 2019,
            },
    'header_long': ["orig_cat_name", "entity", "unit", "time", "data"],
    "cat_code_regexp" : r'^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*'
}

## 1. Read main tables - pages 110, 111, 112, 113

In [19]:
pages = ['110', '111', '112', '113']
df_all_dict = {}
for page in pages:
    
    print("-"*45)
    print(f"Reading table from page {page}.")
    
    tables_inventory_original = camelot.read_pdf(
        str(input_folder / pdf_file),
        pages=page,
        table_areas=page_def_templates[page]["area"],
        columns=page_def_templates[page]["cols"],
        flavor="stream",
        split_text=True)
    
    print("Reading complete.")
    
    df_inventory = tables_inventory_original[0].df.copy()

    # move broken text in correct row (page 113 is fine)
    if page in ['110', '111', '112']:
        df_inventory.at[4, 0] = "1.A.1 - Industries énergétiques"
        df_inventory = df_inventory.drop(index=3)
        df_inventory.at[8, 0] = "1.A.4 - Autres secteurs"
        df_inventory = df_inventory.drop(index=7)

    # add header and unit
    df_header = pd.DataFrame([inv_conf["header"], inv_conf["unit"]])
    df_inventory = pd.concat([df_header, df_inventory], axis=0, join='outer').reset_index(drop=True)
    df_inventory = pm2.pm2io.nir_add_unit_information(df_inventory,
                                                  unit_row=inv_conf["unit_row"],
                                                  entity_row=inv_conf["entity_row"],
                                                  regexp_entity=".*",
                                                  regexp_unit=".*",
                                                  default_unit="Gg")
    
    print("Added unit information.")
    
    # set index
    df_inventory = df_inventory.set_index(inv_conf["index_cols"])

    # convert to long format
    df_inventory_long = pm2.pm2io.nir_convert_df_to_long(df_inventory, inv_conf["year"][page],
                                                     inv_conf["header_long"])

    # extract category from tuple
    df_inventory_long["orig_cat_name"] = df_inventory_long["orig_cat_name"].str[0] 

    # prep for conversion to PM2 IF and native format
    # make a copy of the categories row
    df_inventory_long["category"] = df_inventory_long["orig_cat_name"]

    # replace cat names by codes in col "category"
    # first the manual replacements
    # TODO: move this to config section
    inv_conf["cat_codes_manual"]['main'] = {
            'Éléments pour mémoire': 'MEMO',
            'Soutes internationales': 'M.BK',
            '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',
            '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',
            '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',
            'Total des émissions et absorptions nationales': "0",
            '2A5: Autre': '2A5', 
        }
    df_inventory_long["category"] = \
        df_inventory_long["category"].replace(inv_conf["cat_codes_manual"]['main'])  

    df_inventory_long["category"] = df_inventory_long["category"].str.replace(".", "")
    
    # then the regex replacements
    repl = lambda m: m.group('code')
    df_inventory_long["category"] = \
        df_inventory_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
                                              regex=True)

    df_inventory_long = df_inventory_long.reset_index(drop=True)


    
    df_inventory_long["data"] = df_inventory_long["data"].str.replace(",", ".")
    df_inventory_long["data"] = df_inventory_long["data"].str.replace("NE1", "NE")

    # make sure all col headers are str
    df_inventory_long.columns = df_inventory_long.columns.map(str)
    df_inventory_long = df_inventory_long.drop(columns=["orig_cat_name"])
    
    df_all_dict[page] = df_inventory_long

df_all = pd.concat([df_all_dict['110'], df_all_dict['111'], df_all_dict['112'], df_all_dict['113']],
                      axis=0,
                      join='outer').reset_index(drop=True)

print("Converting to interchange format.")
df_all_IF = pm2.pm2io.convert_long_dataframe_if(
    df_all,
    coords_cols=coords_cols,
    #add_coords_cols=add_coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,
    coords_value_mapping=coords_value_mapping['main'],
    #coords_value_filling=coords_value_filling,
    filter_remove=filter_remove,
    #filter_keep=filter_keep,
    meta_data=meta_data,
    convert_str=True,
    time_format="%Y",
    )

---------------------------------------------
Reading table from page 110.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 111.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 112.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 113.
Reading complete.
Added unit information.
Converting to interchange format.


In [12]:
### Test individual values from the tables ###
# TODO and note: this function is work in progress
# Use assert statements and print error message
# with category, entity, year, expected value and actual value

### Test individual values from the tables ###
def assert_individual_value(
    df,
    category_column,
    entity_column,
    category,
    entity,
    year,
    expected_value
):
    arr = df.loc[(df[category_column] == category) & (df[entity_column] == entity), year].values
    print(arr)
    if len(arr) > 1:
        print(f"More than one value found for {category}, {entity}, {year}!")

    # TODO: It looks like this will be true when the value equals 0
    if not arr:
        print((f"No value found for {category}, {entity}, {year}!"))
            
    if not arr[0] == expected_value:
        print(f"Expected value {expected_value}, actual value is {arr[0]}")

    if arr[0] == expected_value:
        print("Value matches expected value.")

    return


test_cases = {
    "1" : {
        "category" : "1.A.1",
        'entity' : "CO2",
        "year" : "2010",
        "expected_value" : 422.474,
    },
    "2" : {
        "category" : "2",
        'entity' : "SO2",
        "year" : "1990",
        "expected_value" : 0.097,
    },
    "3" : {
        "category" : "1.A.3.a.i",
        'entity' : "N2O",
        "year" : "2000",
        "expected_value" : 6e-5,
    },
    '4' : {
        "category" : "2.H.2",
        'entity' : "NMVOC",
        "year" : "2019",
        "expected_value" : 2.506,
    },
    '5' : {
        "category" : "1.A.1",
        'entity' : "CH4",
        "year" : "2019",
        "expected_value" : 0.0011,
    }
}

for key in test_cases.keys():
    print("-"*50)
    print(f"Testing combination {test_cases[key]["category"]}, {test_cases[key]["entity"]}, {test_cases[key]["year"]}.")
    assert_individual_value(
                    df = df_all_IF,
                    category_column = "category (IPCC1996_2006_GIN_Inv)",
                    entity_column = "entity",
                    category = test_cases[key]["category"],
                    entity = test_cases[key]["entity"],
                    year = test_cases[key]["year"],
                    expected_value = test_cases[key]["expected_value"])

--------------------------------------------------
Testing combination 1.A.1, CO2, 2010.
[422.474]
Value matches expected value.
--------------------------------------------------
Testing combination 2, SO2, 1990.
[0.097]
Value matches expected value.
--------------------------------------------------
Testing combination 1.A.3.a.i, N2O, 2000.
[6.e-05]
Value matches expected value.
--------------------------------------------------
Testing combination 2.H.2, NMVOC, 2019.
[2.506]
Value matches expected value.
--------------------------------------------------
Testing combination 1.A.1, CH4, 2019.
[0.0011]
Value matches expected value.


In [24]:
### check data for errors ###
# print a few things to see if it looks "normal"
for c in df_all_IF.columns:
    print('-'*50)
    print(f"Unique values in column {c}")
    print(df_all_IF[c].unique())

--------------------------------------------------
Unique values in column source
['GIN-GHG-Inventory']
--------------------------------------------------
Unique values in column scenario (PRIMAP)
['BUR1']
--------------------------------------------------
Unique values in column provenance
['measured']
--------------------------------------------------
Unique values in column area (ISO3)
['GIN']
--------------------------------------------------
Unique values in column entity
['CH4' 'CO' 'CO2' 'HFCS (AR4GWP100)' 'N2O' 'NMVOC' 'NOx'
 'PFCS (AR4GWP100)' 'SF6' 'SO2']
--------------------------------------------------
Unique values in column unit
['Gg CH4 / yr' 'Gg CO / yr' 'Gg CO2 / yr' 'Gg N2O / yr' 'Gg NMVOC / yr'
 'Gg NOx / yr' 'Gg SF6 / yr' 'Gg SO2 / yr']
--------------------------------------------------
Unique values in column category (IPCC1996_2006_GIN_Inv)
['0' '1' '1.A' '1.A.1' '1.A.2' '1.A.3' '1.A.4' '1.A.5' '1.B' '1.C' '2'
 '2.A' '2.A.1' '2.A.2' '2.A.3' '2.A.4' '2.A.5' '2.B' 

In [25]:
### convert to primap2 format ###
data_pm2_main = pm2.pm2io.from_interchange_format(df_all_IF)

[32m2024-03-21 16:58:31.197[0m | [34m[1mDEBUG   [0m | [36mprimap2.pm2io._interchange_format[0m:[36mfrom_interchange_format[0m:[36m320[0m - [34m[1mExpected array shapes: [[1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78], [1, 1, 1, 1, 10, 78]], resulting in size 7,800.[0m
[32m2024-03-21 16:58:31.323[0m | [1mINFO    [0m | [36mprimap2._data_format[0m:[36mensure_valid_attributes[0m:[36m292[0m - [1mReference information is not a DOI: 'placeholder'[0m


## 2. Read in sector tables for energy - pages 116, 117, 118, 119

In [27]:
pages = ['116', '117', '118', '119']
df_energy_dict = {}
for page in pages:
    print("-"*45)
    print(f"Reading table from page {page}.")
    
    tables_inventory_original = camelot.read_pdf(
        str(input_folder / pdf_file),
        pages=page,
        flavor="lattice",
        split_text=True
        )
    
    print("Reading complete.")

    # cut last two lines of second table to ignore additional information regarding biomass for energy production 
    df_energy_year = pd.concat([tables_inventory_original[0].df[2:],
                                tables_inventory_original[1].df[3:-2]],
                                axis=0,
                                join='outer').reset_index(drop=True)

    
    # drop duplicate lines - 1.A.3.d.i / 1.A.3.a.i / 1.A.5.c
    # TODO: better to find the index of the line and then drop it by the index
    df_energy_year = df_energy_year.drop(index=[27, 32, 50])  
    
    # add header and unit
    df_header = pd.DataFrame([inv_conf["header_energy"], inv_conf["unit_energy"]])

    df_energy_year = pd.concat([df_header, df_energy_year], axis=0, join='outer').reset_index(drop=True)
    
    df_energy_year = pm2.pm2io.nir_add_unit_information(df_energy_year,
                                                  unit_row=inv_conf["unit_row"],
                                                  entity_row=inv_conf["entity_row"],
                                                  regexp_entity=".*",
                                                  regexp_unit=".*",
                                                  default_unit="Gg")
    
    print("Added unit information.")
    # set index
    df_energy_year = df_energy_year.set_index(inv_conf["index_cols"])

    # convert to long format
    df_energy_year_long = pm2.pm2io.nir_convert_df_to_long(df_energy_year, inv_conf["year"][page],
                                                     inv_conf["header_long"])
    
    # extract from tuple
    df_energy_year_long["orig_cat_name"] = df_energy_year_long["orig_cat_name"].str[0] 

    # prep for conversion to PM2 IF and native format
    # make a copy of the categories row
    df_energy_year_long["category"] = df_energy_year_long["orig_cat_name"]

    # replace individual categories
    # TODO: move to config section
    inv_conf["cat_codes_manual"]['energy'] = {
            'International Bunkers': 'MEMO',
            '1.A.3.a.i - Aviation internationale (soutes internationales)': 'M.BK.A',
            '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',
            '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',
        }

    # replace cat names by codes in col "category"
    # first the manual replacements
    df_energy_year_long["category"] = df_energy_year_long["category"].str.replace('\n' ,'')
    df_energy_year_long["category"] = \
        df_energy_year_long["category"].replace(inv_conf["cat_codes_manual"]['energy'])

    df_energy_year_long["category"] = df_energy_year_long["category"].str.replace(".", "")
    
    inv_conf["cat_code_regexp"] = r'^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*'

    # then the regex replacements
    repl = lambda m: m.group('code')
    df_energy_year_long["category"] = \
        df_energy_year_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
                                              regex=True)

    df_energy_year_long = df_energy_year_long.reset_index(drop=True)

    
    df_energy_year_long["data"] = df_energy_year_long["data"].str.replace(",", ".")
    df_energy_year_long["data"] = df_energy_year_long["data"].str.replace("NE1", "NE")

    # make sure all col headers are str
    df_energy_year_long.columns = df_energy_year_long.columns.map(str)
    df_energy_year_long = df_energy_year_long.drop(columns=["orig_cat_name"])
    
    df_energy_dict[page] = df_energy_year_long

df_energy = pd.concat([df_energy_dict['116'], df_energy_dict['117'], df_energy_dict['118'], df_energy_dict['119']],
                      axis=0,
                      join='outer').reset_index(drop=True)

print("Converting to interchange format.")
df_energy_IF = pm2.pm2io.convert_long_dataframe_if(
    df_energy,
    coords_cols=coords_cols,
    #add_coords_cols=add_coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,
    coords_value_mapping=coords_value_mapping['energy'],
    #coords_value_filling=coords_value_filling,
    filter_remove=filter_remove,
    #filter_keep=filter_keep,
    meta_data=meta_data,
    convert_str=True,
    time_format="%Y",
    )
    
df_energy_IF

---------------------------------------------
Reading table from page 116.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 117.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 118.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 119.
Reading complete.
Added unit information.
Converting to interchange format.


Unnamed: 0,source,scenario (PRIMAP),provenance,area (ISO3),entity,unit,category (IPCC1996_2006_GIN_Inv),1990,2000,2010,2019
0,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1,6.465,6.489,4.849,5.821
1,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A,6.465,6.489,4.849,5.821
2,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A.1,0.032,0.024,0.016,0.001
3,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A.1.a,0.032,0.024,0.016,0.001
4,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A.1.a.i,0.032,0.024,0.016,0.001
...,...,...,...,...,...,...,...,...,...,...,...
373,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,1.A.5.b.iii,,,,
374,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,1.A.5.c,,,,
375,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,1.B,,,,
376,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,M.BK.M,,,,


In [30]:
test_cases = {
    "1" : {
        "category" : "1.A.2.k",
        'entity' : "CH4",
        "year" : "1990",
        "expected_value" : 3e-05,
    },
    "2" : {
        "category" : "1.A.4.c.i",
        'entity' : "CO",
        "year" : "1990",
        "expected_value" : 0.0016,
    },
    "3" : {
        "category" : "1.A.3.a.i",
        'entity' : "NMVOC",
        "year" : "2000",
        "expected_value" : 0.0002,
    },
    '4' : {
        "category" : "1",
        'entity' : "SO2",
        "year" : "2010",
        "expected_value" : 0,
    },
    '5' : {
        "category" : "1.A.2.k",
        'entity' : "N2O",
        "year" : "2019",
        "expected_value" : 7e-06,
    }
}

for key in test_cases.keys():
    print("-"*50)
    print(f"Testing combination {test_cases[key]["category"]}, {test_cases[key]["entity"]}, {test_cases[key]["year"]}.")
    assert_individual_value(
                    df = df_energy_IF,
                    category_column = "category (IPCC1996_2006_GIN_Inv)",
                    entity_column = "entity",
                    category = test_cases[key]["category"],
                    entity = test_cases[key]["entity"],
                    year = test_cases[key]["year"],
                    expected_value = test_cases[key]["expected_value"])

--------------------------------------------------
Testing combination 1.A.2.k, CH4, 1990.
[3.e-05]
Value matches expected value.
--------------------------------------------------
Testing combination 1.A.4.c.i, CO, 1990.
[0.0016]
Value matches expected value.
--------------------------------------------------
Testing combination 1.A.3.a.i, NMVOC, 2000.
[0.0002]
Value matches expected value.
--------------------------------------------------
Testing combination 1, SO2, 2010.
[0.]
No value found for 1, SO2, 2010!
Value matches expected value.
--------------------------------------------------
Testing combination 1.A.2.k, N2O, 2019.
[7.e-06]
Value matches expected value.


In [31]:
### convert to primap2 format ###
data_pm2_energy = pm2.pm2io.from_interchange_format(df_energy_IF)

[32m2024-03-21 17:25:29.863[0m | [34m[1mDEBUG   [0m | [36mprimap2.pm2io._interchange_format[0m:[36mfrom_interchange_format[0m:[36m320[0m - [34m[1mExpected array shapes: [[1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54], [1, 1, 1, 1, 7, 54]], resulting in size 2,646.[0m
[32m2024-03-21 17:25:29.940[0m | [1mINFO    [0m | [36mprimap2._data_format[0m:[36mensure_valid_attributes[0m:[36m292[0m - [1mReference information is not a DOI: 'placeholder'[0m


# 3. Read in LULUCF table - pages 124, 125, 126, 127

In [50]:
pages = ['124', '125', '126', '127']
df_lulucf_dict = {}
for page in pages:
    print("-"*45)
    print(f"Reading table from page {page}.")
    
    tables_inventory_original = camelot.read_pdf(
    str(input_folder / pdf_file),
    pages=page,
    flavor="lattice",
    split_text=True
    )
    print("Reading complete.")

    if page == '127':
        # table on page 127 has one extra row at the top
        # and one extra category 3.A.1.j
        df_lulucf_year = tables_inventory_original[0].df[3:]
        # rename duplicate categories in tables
        # TODO move to config section
        replace_categories = [(19, "3.A.2.a.i - Vaches laitières"),
                              (20, "3.A.2.a.ii - Autres bovins"),
                              (21, "3.A.2.b - Buffle"),
                              (22, "3.A.2.c - Ovins"),
                              (23, "3.A.2.d - Caprins"),
                              (24, "3.A.2.e - Chameaux"),
                              (25, "3.A.2.f - Chevaux"),
                              (26, "3.A.2.g - Mules et ânes"),
                              (27, "3.A.2.h - Porcins"),
                              (28, "3.A.2.i - Volailles"),
                              (29, "3.A.2.j - Autres (préciser)"),]
        for index, category_name in  replace_categories:
            df_lulucf_year.at[index, 0] = category_name
    else:
        # cut first two lines
        df_lulucf_year = tables_inventory_original[0].df[2:] 

        # TODO move to config section
        replace_categories = [(17, "3.A.2.a.i - Vaches laitières"),
                              (18, "3.A.2.a.ii - Autres bovins"),
                              (19, "3.A.2.b - Buffle"),
                              (20, "3.A.2.c - Ovins"),
                              (21, "3.A.2.d - Caprins"),
                              (22, "3.A.2.e - Chameaux"),
                              (23, "3.A.2.f - Chevaux"),
                              (24, "3.A.2.g - Mules et ânes"),
                              (25, "3.A.2.h - Porcins"),
                              (26, "3.A.2.i - Volailles"),]
        for index, category_name in  replace_categories:
            df_lulucf_year.at[index, 0] = category_name
    
    # add header and unit
    df_header = pd.DataFrame([inv_conf["header_lulucf"], inv_conf["unit_lulucf"]])

    df_lulucf_year = pd.concat([df_header, df_lulucf_year], axis=0, join='outer').reset_index(drop=True)

    df_lulucf_year = pm2.pm2io.nir_add_unit_information(df_lulucf_year,
                                                  unit_row=inv_conf["unit_row"],
                                                  entity_row=inv_conf["entity_row"],
                                                  regexp_entity=".*",
                                                  regexp_unit=".*",
                                                  default_unit="Gg")

    print("Added unit information.")
    
    # set index
    df_lulucf_year = df_lulucf_year.set_index(inv_conf["index_cols"])

    # convert to long format
    df_lulucf_year_long = pm2.pm2io.nir_convert_df_to_long(df_lulucf_year, inv_conf["year"][page],
                                                     inv_conf["header_long"])
    
    df_lulucf_year_long["orig_cat_name"] = df_lulucf_year_long["orig_cat_name"].str[0] # extract from tuple

    # prep for conversion to PM2 IF and native format
    # make a copy of the categories row
    df_lulucf_year_long["category"] = df_lulucf_year_long["orig_cat_name"]
   
    # regex replacements
    repl = lambda m: m.group('code')
    df_lulucf_year_long["category"] = \
        df_lulucf_year_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
                                              regex=True)
    
    df_lulucf_year_long = df_lulucf_year_long.reset_index(drop=True)
    
    df_lulucf_year_long["data"] = df_lulucf_year_long["data"].str.replace(",", ".")
    df_lulucf_year_long["data"] = df_lulucf_year_long["data"].str.replace("NE1", "NE")

    # make sure all col headers are str
    df_lulucf_year_long.columns = df_lulucf_year_long.columns.map(str)
    df_lulucf_year_long = df_lulucf_year_long.drop(columns=["orig_cat_name"])
    
    df_lulucf_dict[page] = df_lulucf_year_long

df_lulucf = pd.concat([df_lulucf_dict['124'], df_lulucf_dict['125'], df_lulucf_dict['126'], df_lulucf_dict['127']],
                      axis=0,
                      join='outer').reset_index(drop=True)

print("Converting to interchange format.")
df_lulucf_IF = pm2.pm2io.convert_long_dataframe_if(
    df_lulucf,
    coords_cols=coords_cols,
    #add_coords_cols=add_coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,
    coords_value_mapping=coords_value_mapping['lulucf'],
    #coords_value_filling=coords_value_filling,
    filter_remove=filter_remove,
    #filter_keep=filter_keep,
    meta_data=meta_data,
    convert_str=True,
    time_format="%Y",
    )
    
df_lulucf_IF

---------------------------------------------
Reading table from page 124.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 125.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 126.
Reading complete.
Added unit information.
---------------------------------------------
Reading table from page 127.
Reading complete.
Added unit information.
Converting to interchange format.


Unnamed: 0,source,scenario (PRIMAP),provenance,area (ISO3),entity,unit,category (IPCC1996_2006_GIN_Inv),1990,2000,2010,2019
0,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,3,56.987,110.568,187.617,299.503
1,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,3.A,55.634,107.911,186.769,298.533
2,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,3.A.1,53.796,104.298,180.454,288.239
3,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,3.A.1.a,49.050,94.967,161.753,256.319
4,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,3.A.1.a.i,10.488,17.802,27.091,31.905
...,...,...,...,...,...,...,...,...,...,...,...
469,GIN-GHG-Inventory,BUR1,measured,GIN,NOx,Gg NOx / yr,3.C.7,0.000,0.000,0.000,0.000
470,GIN-GHG-Inventory,BUR1,measured,GIN,NOx,Gg NOx / yr,3.C.8,0.000,0.000,0.000,0.000
471,GIN-GHG-Inventory,BUR1,measured,GIN,NOx,Gg NOx / yr,3.D,0.000,0.000,0.000,0.000
472,GIN-GHG-Inventory,BUR1,measured,GIN,NOx,Gg NOx / yr,3.D.1,0.000,0.000,0.000,0.000


In [51]:
### convert to primap2 format ###
data_pm2_lulucf = pm2.pm2io.from_interchange_format(df_lulucf_IF)

[32m2024-03-22 09:22:15.333[0m | [34m[1mDEBUG   [0m | [36mprimap2.pm2io._interchange_format[0m:[36mfrom_interchange_format[0m:[36m320[0m - [34m[1mExpected array shapes: [[1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79], [1, 1, 1, 1, 6, 79]], resulting in size 2,844.[0m
[32m2024-03-22 09:22:15.408[0m | [1mINFO    [0m | [36mprimap2._data_format[0m:[36mensure_valid_attributes[0m:[36m292[0m - [1mReference information is not a DOI: 'placeholder'[0m


# 3. Read in Waste tables - pages 128, 130

In [52]:
# There are three tables for three years on page 128
# and another tabel on page 130

# read three tables
page = '128'
tables_inventory_original_128 = camelot.read_pdf(
    str(input_folder / pdf_file),
    pages=page,
    flavor="lattice",
    split_text=True
)

# read last table
page = '130'
tables_inventory_original_130 = camelot.read_pdf(
    str(input_folder / pdf_file),
    pages=page,
    flavor="lattice",
    split_text=True
)

# save to dict
df_waste_years = {
    '1990' : tables_inventory_original_128[0].df,
    '2000' : tables_inventory_original_128[1].df,
    '2010' : tables_inventory_original_128[2].df,
    '2019' : tables_inventory_original_130[0].df,
}


In [57]:
df_waste_dict = {}
for year in df_waste_years.keys():
    print("-"*45)
    print(f"Processing table for {year}.")

    df_waste_year = df_waste_years[year][2:]
    
    # add header and unit
    df_header = pd.DataFrame([inv_conf["header_waste"], inv_conf["unit_waste"]])

    df_waste_year = pd.concat([df_header, df_waste_year], axis=0, join='outer').reset_index(drop=True)

    df_waste_year = pm2.pm2io.nir_add_unit_information(df_waste_year,
                                                  unit_row=inv_conf["unit_row"],
                                                  entity_row=inv_conf["entity_row"],
                                                  regexp_entity=".*",
                                                  regexp_unit=".*",
                                                  default_unit="Gg")

    print("Added unit information.")
    
    # set index
    df_waste_year = df_waste_year.set_index(inv_conf["index_cols"])

    # convert to long format
    df_waste_year_long = pm2.pm2io.nir_convert_df_to_long(df_waste_year, year,
                                                     inv_conf["header_long"])
    
    df_waste_year_long["orig_cat_name"] = df_waste_year_long["orig_cat_name"].str[0]

    # prep for conversion to PM2 IF and native format
    # make a copy of the categories row
    df_waste_year_long["category"] = df_waste_year_long["orig_cat_name"]

    # regex replacements
    repl = lambda m: m.group('code')
    df_waste_year_long["category"] = \
        df_waste_year_long["category"].str.replace(inv_conf["cat_code_regexp"], repl,
                                              regex=True)
    
    df_waste_year_long = df_waste_year_long.reset_index(drop=True)

    df_waste_year_long["category"] = df_waste_year_long["category"].str.replace(".", "")
    df_waste_year_long["data"] = df_waste_year_long["data"].str.replace(",", ".")
    df_waste_year_long["data"] = df_waste_year_long["data"].str.replace("NE1", "NE")

    # make sure all col headers are str
    df_waste_year_long.columns = df_waste_year_long.columns.map(str)
    df_waste_year_long = df_waste_year_long.drop(columns=["orig_cat_name"])
    
    df_waste_dict[year] = df_waste_year_long

df_waste = pd.concat([df_waste_dict['1990'], df_waste_dict['2000'], df_waste_dict['2010'], df_waste_dict['2019']],
                      axis=0,
                      join='outer').reset_index(drop=True)

print("Converting to interchange format.")
df_waste_IF = pm2.pm2io.convert_long_dataframe_if(
    df_waste,
    coords_cols=coords_cols,
    #add_coords_cols=add_coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,
    coords_value_mapping=coords_value_mapping['waste'],
    #coords_value_filling=coords_value_filling,
    filter_remove=filter_remove,
    #filter_keep=filter_keep,
    meta_data=meta_data,
    convert_str=True,
    time_format="%Y",
    )
    
df_waste_IF

---------------------------------------------
Processing table for 1990.
Added unit information.
---------------------------------------------
Processing table for 2000.
Added unit information.
---------------------------------------------
Processing table for 2010.
Added unit information.
---------------------------------------------
Processing table for 2019.
Added unit information.
Converting to interchange format.


Unnamed: 0,source,scenario (PRIMAP),provenance,area (ISO3),entity,unit,category (IPCC1996_2006_GIN_Inv),1990,2000,2010,2019
0,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,4,1.750,2.925,4.534,6.665
1,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,4.A,1.029,2.054,3.323,5.170
2,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,4.A.1,,,,
3,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,4.A.2,,,,
4,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,4.A.3,,,,
...,...,...,...,...,...,...,...,...,...,...,...
86,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,4.C.2,0.000,0.000,0.000,0.000
87,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,4.D,0.000,0.000,0.000,0.000
88,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,4.D.1,0.000,0.000,0.000,0.000
89,GIN-GHG-Inventory,BUR1,measured,GIN,SO2,Gg SO2 / yr,4.D.2,0.000,0.000,0.000,0.000


In [58]:
### convert to primap2 format ###
data_pm2_waste = pm2.pm2io.from_interchange_format(df_waste_IF)

[32m2024-03-22 09:27:11.859[0m | [34m[1mDEBUG   [0m | [36mprimap2.pm2io._interchange_format[0m:[36mfrom_interchange_format[0m:[36m320[0m - [34m[1mExpected array shapes: [[1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13], [1, 1, 1, 1, 7, 13]], resulting in size 637.[0m
[32m2024-03-22 09:27:11.898[0m | [1mINFO    [0m | [36mprimap2._data_format[0m:[36mensure_valid_attributes[0m:[36m292[0m - [1mReference information is not a DOI: 'placeholder'[0m


# 4. Read in trend tables - pages 131 - 137

In [67]:
#%matplotlib widget 
#camelot.plot(tables_inventory_original[0], kind='text')

df_main_dict = {}
pages = ['131', '132', '133', '134', '135', '136', '137']
entities = ['CO2', 'CH4', 'N2O', 'NOx', 'CO', 'NMVOCs', 'SO2']

# for this set of tables every page is a different entity
for page, entity in zip(pages, entities):

    print("-"*45)
    print(f"Reading table for page {page} and entity {entity}.")
    
    # first table needs to be read in with flavor="stream"
    # flavor="lattice" raises an error, maybe camelot issue
    # see https://github.com/atlanhq/camelot/issues/306
    # or because characters in first row almost reach
    # the table grid    
    if page == '131':
        tables_inventory_original = camelot.read_pdf(
            str(input_folder / pdf_file),
            pages=page,
            table_areas=page_def_templates[page]["area"],
            columns=page_def_templates[page]["cols"],
            flavor="stream",
            split_text=True
        )
        
        df_trend_entity = tables_inventory_original[0].df[1:]
    else:
        tables_inventory_original = camelot.read_pdf(
            str(input_folder / pdf_file),
            pages=page,
            flavor="lattice",
            split_text=True)
        df_trend_entity = tables_inventory_original[0].df[3:]

    print(f"Reading complete.")

    # add columns
    # 'data' prefix is needed for pd.wide_to_long() later
    columns_years = ['data1990', 'data1995', "data2000", 'data2005', 'data2010', 'data2015', 'data2018', 'data2019']
    df_trend_entity.columns = ['orig_cat_name'] + columns_years
    
    # unit is always Gg
    df_trend_entity['unit'] = 'Gg'
    
    # only one entity per table
    df_trend_entity['entity'] = entity
    
    df_trend_entity["category"] = df_trend_entity["orig_cat_name"]

    # delete rows that are just a headline or empty
    #row_to_delete = df_trend_entity.index[df_trend_entity['category'] == 'Éléments pour mémoire'][0]
    #df_trend_entity = df_trend_entity.drop(index = row_to_delete)

    # in the first table there is no empty line
    if page != '131':
        row_to_delete = df_trend_entity.index[df_trend_entity['category'] == ''][0]
        df_trend_entity = df_trend_entity.drop(index = row_to_delete)
        
    inv_conf["cat_code_regexp"] = r'^(?P<code>[a-zA-Z0-9\.]{1,11})[\s\.].*'

    df_trend_entity["category"] = df_trend_entity["category"].replace(
        {
         'Total des émissions et absorptions nationales': "0",
         '2A5: Autre' : '2A5',
         'Éléments pour mémoire': 'MEMO',
         'Soutes internationales' : 'M.BK',
         '1.A.3.a.i - Aviation internationale (soutes internationales)' : 'M.BK.A',
         '1.A.3.d.i - Navigation internationale (soutes internationales)' : 'M.BK.M',
         '1.A.5.c - Opérations multilatérales' : 'M.MULTIOP',
        })

    df_trend_entity["category"] = df_trend_entity["category"].str.replace(".", "")
    df_trend_entity["category"] = df_trend_entity["category"].str.replace("\n", "")
    
    
    repl = lambda m: m.group('code')
    df_trend_entity["category"] = \
        df_trend_entity["category"].str.replace(inv_conf["cat_code_regexp"], repl,
                                              regex=True)
    
    df_trend_entity = df_trend_entity.reset_index(drop=True)
    
    print(f"Created category codes.")
    
    for year in columns_years:
        df_trend_entity[year] = df_trend_entity[year].str.replace(",", ".")
        df_trend_entity[year] = df_trend_entity[year].str.replace("NE1", "NE")
    
    # make sure all col headers are str
    df_trend_entity.columns = df_trend_entity.columns.map(str)
    
    df_trend_entity = df_trend_entity.drop(columns=["orig_cat_name"])
    
    df_trend_entity_long = pd.wide_to_long(df_trend_entity, stubnames='data',  i='category', j='time')
    
    print(f"Converted to long format.")
    
    df_trend_entity_long = df_trend_entity_long.reset_index()
    
    df_main_dict[page] =  df_trend_entity_long

print("Converting to interchange format.")

df_trend_all = pd.concat([df_main_dict['131'], df_main_dict['132']], axis=0, join='outer').reset_index(drop=True)

df_trend_IF = pm2.pm2io.convert_long_dataframe_if(
    df_trend_all,
    coords_cols=coords_cols,
    #add_coords_cols=add_coords_cols,
    coords_defaults=coords_defaults,
    coords_terminologies=coords_terminologies,
    coords_value_mapping=coords_value_mapping['trend'],
    #coords_value_filling=coords_value_filling,
    filter_remove=filter_remove,
    #filter_keep=filter_keep,
    meta_data=meta_data,
    convert_str=True,
    time_format="%Y",
    )
    
df_trend_IF
       

---------------------------------------------
Reading table for page 131 and entity CO2.
Reading complete.
Created category codes.
Converted to long format.
---------------------------------------------
Reading table for page 132 and entity CH4.
Reading complete.
Created category codes.
Converted to long format.
---------------------------------------------
Reading table for page 133 and entity N2O.
Reading complete.
Created category codes.
Converted to long format.
---------------------------------------------
Reading table for page 134 and entity NOx.
Reading complete.
Created category codes.
Converted to long format.
---------------------------------------------
Reading table for page 135 and entity CO.
Reading complete.
Created category codes.
Converted to long format.
---------------------------------------------
Reading table for page 136 and entity NMVOCs.
Reading complete.
Created category codes.
Converted to long format.
---------------------------------------------
Reading ta

Unnamed: 0,source,scenario (PRIMAP),provenance,area (ISO3),entity,unit,category (IPCC1996_2006_GIN_Inv),1990,1995,2000,2005,2010,2015,2018,2019
0,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,0,65.202,93.368,119.981,152.272,196.057,253.025,296.416,312.034
1,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1,6.465,7.066,6.489,5.984,4.849,5.360,5.931,5.866
2,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A,6.465,7.066,6.489,5.984,4.849,5.360,5.931,5.866
3,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A.1,0.032,0.027,0.024,0.020,0.016,0.002,0.005,0.001
4,GIN-GHG-Inventory,BUR1,measured,GIN,CH4,Gg CH4 / yr,1.A.2,0.006,0.012,0.018,0.023,0.028,0.024,0.026,0.033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,GIN-GHG-Inventory,BUR1,measured,GIN,CO2,Gg CO2 / yr,5,,,,,,,,
152,GIN-GHG-Inventory,BUR1,measured,GIN,CO2,Gg CO2 / yr,M.BK,0.719,1.438,2.158,19.529,36.900,21.840,51.718,66.197
153,GIN-GHG-Inventory,BUR1,measured,GIN,CO2,Gg CO2 / yr,M.BK.A,0.719,1.438,2.158,19.529,36.900,21.840,51.718,66.197
154,GIN-GHG-Inventory,BUR1,measured,GIN,CO2,Gg CO2 / yr,M.BK.M,,,,,,,,


In [68]:
### convert to primap2 format ###
data_pm2_trend = pm2.pm2io.from_interchange_format(df_trend_IF)

[32m2024-03-22 09:52:43.765[0m | [34m[1mDEBUG   [0m | [36mprimap2.pm2io._interchange_format[0m:[36mfrom_interchange_format[0m:[36m320[0m - [34m[1mExpected array shapes: [[1, 1, 1, 1, 2, 78], [1, 1, 1, 1, 2, 78]], resulting in size 312.[0m
[32m2024-03-22 09:52:43.826[0m | [1mINFO    [0m | [36mprimap2._data_format[0m:[36mensure_valid_attributes[0m:[36m292[0m - [1mReference information is not a DOI: 'placeholder'[0m


# Combine tables and save to IF and native format

In [72]:
#### combine

#data_pm2_main
#data_pm2_trend
#data_pm2_energy
#data_pm2_lulucf
#data_pm2_waste

# tolerance needs to be high as rounding in trend tables leads to inconsistent data
data_pm2 = data_pm2_main.pr.merge(data_pm2_energy,tolerance=0.11)

[32m2024-03-22 10:09:36.801[0m | [34m[1mDEBUG   [0m | [36mprimap2._merge[0m:[36mmerge[0m:[36m230[0m - [34m[1mmerging for CH4[0m
[32m2024-03-22 10:09:37.026[0m | [34m[1mDEBUG   [0m | [36mprimap2._merge[0m:[36mmerge[0m:[36m230[0m - [34m[1mmerging for CO2[0m
[32m2024-03-22 10:09:37.187[0m | [34m[1mDEBUG   [0m | [36mprimap2._merge[0m:[36mmerge[0m:[36m230[0m - [34m[1mmerging for N2O[0m
[32m2024-03-22 10:09:37.351[0m | [34m[1mDEBUG   [0m | [36mprimap2._merge[0m:[36mmerge[0m:[36m230[0m - [34m[1mmerging for SO2[0m
[32m2024-03-22 10:09:37.448[0m | [34m[1mDEBUG   [0m | [36mprimap2._merge[0m:[36mmerge[0m:[36m230[0m - [34m[1mmerging for NMVOC[0m
[32m2024-03-22 10:09:37.533[0m | [31m[1mERROR   [0m | [36mprimap2._merge[0m:[36mmerge_with_tolerance_core[0m:[36m74[0m - [31m[1mpr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area 

MergeError: pr.merge error: found discrepancies larger than tolerance (11.00%) for source=GIN-GHG-Inventory, scenario (PRIMAP)=BUR1, provenance=measured, area (ISO3)=GIN, category (IPCC1996_2006_GIN_Inv)=1.A.2:
shown are relative discrepancies.
               NMVOC
time                
1990-01-01  0.800000
2000-01-01  0.800000
2010-01-01  0.869848

In [None]:
# convert back to IF to have units in the fixed format
data_if = data_pm2.pr.to_interchange_format()

# ###
# save data to IF and native format
# ###
pm2.pm2io.write_interchange_format(
    output_folder / (output_filename + coords_terminologies["category"] + "_raw"), data_if)

encoding = {var: compression for var in data_pm2.data_vars}
data_pm2.pr.to_netcdf(
    output_folder / (output_filename + coords_terminologies["category"] + "_raw.nc"),
    encoding=encoding)