In [3]:
# Suppress all warnings for the session, since they can be excessive. The filter is
# installed before the remaining imports, so it is already in effect when they run.
import warnings
warnings.filterwarnings("ignore")


from attribute_table import AttributeTable
import importlib
import matplotlib.pyplot as plt
import model_attributes as ma
import numpy as np
import os, os.path
import pandas as pd
import pathlib
import setup_analysis as sa
import sisepuede_file_structure as sfs
import sql_utilities as squ
import support_classes as sc
import support_functions as sf
import time



# 1. Load `sisepuede_data_pipeline` libraries and scripts
- I have found it convenient to work from the SISEPUEDE directory sometimes, but it does not matter 
- For now, to do this, you have to set the path to the `sisepuede_data_pipeline` git repository using `fp_lib` below
- This will be fixed in later iterations, once we determine the best installation path forward

In [None]:
# NOTE: the path to the local clone of the sisepuede_data_pipeline repository is needed here.
#   Set the SISEPUEDE_DATA_PIPELINE_PATH environment variable to point to your clone; if it
#   is unset, the original development path is used as a fallback. This will be replaced by
#   a proper installation in a later iteration.
import os
import sys

fp_lib = os.path.expanduser(
    os.environ.get(
        "SISEPUEDE_DATA_PIPELINE_PATH",
        "/Users/jsyme/Documents/Projects/git_jbus/sisepuede_data_pipeline",
    )
)

# make the repository's `lib` package importable; the membership check keeps re-running
# this cell from appending the same path more than once
if fp_lib not in sys.path:
    sys.path.append(fp_lib)

import lib.data_objects as do
import lib.process_utilities as pu
import lib.sisepuede_data_constructs as dc
import lib.data_construction_utilities as util

# 2. Instantiate a `SISEPUEDEDataConstructs` object to allow `sisepuede_data_pipeline` objects to access SISEPUEDE elements 
###  This is normally accessed from _within_ the `sisepuede_data_pipeline` repository
- sets up a SISEPUEDEFileStructure (`construct.sisepuede_file_struct`) object, which includes a `ModelAttributes` object (`construct.sisepuede_file_struct.model_attributes`)
- includes `sc.Regions` (`construct.regions`) and `sc.TimePeriods` (`construct.time_periods`) objects to support seamless region and time period integration with SISEPUEDE

In [4]:
# instantiate the construct; the cells that follow read the SISEPUEDE file structure
# (`construct.sisepuede_file_struct`), regions (`construct.regions`), and time periods
# (`construct.time_periods`) from it
construct = dc.SISEPUEDEDataConstructs()

MISSIONSEARCHNOTE: As of 2023-10-06, there is a temporary solution implemeted in ModelAttributes.get_variable_to_simplex_group_dictionary() to ensure that transition probability rows are enforced on a simplex.

FIX THIS ASAP TO DERIVE PROPERLY.


In [10]:
# the ModelAttributes object is reached through the construct's file structure
construct.sisepuede_file_struct.model_attributes

<model_attributes.ModelAttributes at 0x17a009350>

In [6]:
# the regions object (a support_classes.Regions instance) is available directly on the construct
construct.regions


<support_classes.Regions at 0x109b3c850>

In [9]:
# the time periods object (a support_classes.TimePeriods instance) is available directly on the construct
construct.time_periods

<support_classes.TimePeriods at 0x17a79ff10>

# 3. Set up a `process_utilities.Repository` object to access input raw data in another location
- Method lets us swap out local data for remote (AWS S3) repositories trivially
- e.g., I have input data stored at `/Users/jsyme/SISEPUEDE_DATA_REPOSITORY`
- Initialize with a dictionary (see `?pu.Repository` for information on initializing it)

In [14]:
# set up a repository backed by a local directory: the configuration dictionary maps the
# "local" key to the "path" of the directory that stores the raw input data
repo = pu.Repository(
    dict_config = {"local": {"path": "/Users/jsyme/SISEPUEDE_DATA_REPOSITORY"}},
)


In [13]:
# IPython help query: show the init signature and docstring of pu.Repository
?pu.Repository

Init signature:
pu.Repository(
    dict_config: Union[dict, str, support_classes.YAMLConfiguration],
    key_local: str = 'local',
    key_path: str = 'path',
    key_s3: str = 's3',
    key_s3_access: str = 'access_key',
    key_s3_bucket: str = 'bucket',
    key_s3_path: str = 'path',

# 4. Set up a `process_utilities.InputsReader` object for a group of variables that share a `config.yaml` file
- can initialize `InputsReader` with a `process_utilities.Repository` to access files from that repository
- `config.yaml` file includes keys nested in the following way:
    - `inputs`: information about inputs shared across variables in the group
        - `$DATANAME$` (e.g., `fao_land_cover`): name of a dataset to read into the `InputsReader` object. Suggested convention is that this should be all lower-case, and it should not contain any spaces.
            - `citation`: bibtex key 
            - `readas`: one of "faostat", "world_bank", or "iea"; each reads the dataset in as a corresponding `data_objects.AbstractDataset` object. Reading in datasets this way makes some shared shortcut methods available, such as standardized field names and subsetting
                - specify `"faostat"` to read an FAOSTAT dataset as a `data_objects.DatasetFAO` object (which contains some information shared across FAOSTAT datasets)
            - Below the name of the dataset, you must specify one of the following keys to define how to read in the dataset. The keys may require subkeys to define information about the read process. Note that CSV (via `pandas.read_csv`) and XLSX (via `pandas.read_excel`) files are currently read in automatically; additional file types can/will be added when necessary. Keyword arguments can be passed in the level below any of these keys using the `kwargs` key.
                
                - `local`: read a file from a local path
                    - `path`: full path to the file to read. CSV and XLSX are currently automatically read in, can/will add additional file types when necessary
                    - `kwargs`:
                        - specify keyword arguments to the pandas read method here; e.g., if `path` is an XLSX file, you can specify `sheet_name` here
                - `pipeline_output`: read from the pipeline's output database. NOTE: this should 
                - `repo`: read a path from a repository--only valid if the `process_utilities.InputsReader` object is initialized with a repository
                    - `path`: path _within_ repository to the file to read. 
                    - `kwargs`:
                        - specify keyword arguments to the pandas read method here
            - *EXAMPLE*: (1) Read an excel file from the repository used in the reader's initialization with sheet name "SHEET1" and assign it to the `df_energy_data` property and (2) read a remote crosswalk from the SISEPUEDE github repository 
            
            
            ```
            inputs: 
                df_energy_data:
                    repo:
                        path: "path_in_repo/energy_data.xlsx"
                        kwargs:
                            sheet_name: "SHEET1"
                
                fao_crosswalk_crop_type: 
                    remote: 
                      path: "https://raw.githubusercontent.com/jcsyme/sisepuede/main/ref/data_crosswalks/fao_crop_categories.csv"
            ```
            
            -    
                
    - `variables`: information about how to handle projection and interpolation for SISEPUEDE Model Variables handled in the group
        **UNDER DEVELOPMENT**

In [18]:
# build the path to the group's config.yaml from `fp_lib` (the path to the
# sisepuede_data_pipeline repository set in Section 1) rather than repeating the absolute
# path, so that the two cannot drift apart; then initialize the reader with the repository
# so that inputs specified with the `repo` key can be read
fp_config = os.path.join(
    fp_lib,
    "afolu",
    "initial_crops_and_land_use",
    "config.yaml",
)
reader = pu.InputsReader(
    fp_config,
    repository = repo,
)




##  Compare inputs in `"sisepuede_data_pipeline/afolu/initial_crops_and_land_use/config.yaml"` with properties in `reader`

In [21]:
# list the public (non-underscore) attribute names of the reader for comparison against
# the inputs named in the config file
list(filter(lambda attr_name: not attr_name.startswith("_"), dir(reader)))

['config',
 'fao_agrc_production',
 'fao_crosswalk_crop_type',
 'fao_crosswalk_land_cover',
 'fao_land_cover',
 'fao_land_use',
 'fp_config',
 'is_inputs_reader',
 'key_inputs',
 'key_kwargs',
 'key_local',
 'key_path',
 'key_readas',
 'key_remote',
 'key_repo',
 'key_s3',
 'repository',
 'valid_source_keys']

In [26]:
# `fao_agrc_production` is read in as a do.DatasetFAO object; the type check on the last
# line displays the class to confirm this
data_agrc = reader.fao_agrc_production
type(data_agrc)

lib.data_objects.DatasetFAO

In [33]:
# subset FAO data easily by passing filters as keyword arguments; keywords are "cleaned"
# field names (e.g., "Year" -> "year", "Year Code" -> "year_code")
data_agrc.get_subset(
    area = "Mexico", # specify as a single element (singleton) or list
    element = ["Yield"],
    item_code = [1729, 1735],
    year = [2020, 2021, 2022],
)

Unnamed: 0,Area Code,Area Code (M49),Area,Item Code,Item Code (CPC),Item,Element Code,Element,Year Code,Year,Unit,Value,Flag,Note
0,138,'484,Mexico,1729,'F1729,"Treenuts, Total",5419,Yield,2020,2020,100 g/ha,16733.0,E,
1,138,'484,Mexico,1729,'F1729,"Treenuts, Total",5419,Yield,2021,2021,100 g/ha,14843.0,E,
2,138,'484,Mexico,1729,'F1729,"Treenuts, Total",5419,Yield,2022,2022,100 g/ha,16564.0,E,
3,138,'484,Mexico,1735,'F1735,Vegetables Primary,5419,Yield,2020,2020,100 g/ha,217852.0,A,
4,138,'484,Mexico,1735,'F1735,Vegetables Primary,5419,Yield,2021,2021,100 g/ha,212879.0,A,
5,138,'484,Mexico,1735,'F1735,Vegetables Primary,5419,Yield,2022,2022,100 g/ha,218739.0,A,


In [24]:
# this input is read in as a dataframe (here, a crosswalk from FAO crop names to crop categories)
reader.fao_crosswalk_crop_type

Unnamed: 0,fao_crop,cat_1,``$CAT-AGRICULTURE$``,super_cat
0,"Abaca, manila hemp, raw",fibers,fibers,fibers
1,Agave fibres nes,fibers,fibers,fibers
2,"Agave fibres, raw, n.e.c.",fibers,fibers,fibers
3,"Almonds, in shell",nuts,nuts,woody_perennial
4,"Almonds, with shell",nuts,nuts,woody_perennial
...,...,...,...,...
244,Watermelons,vegetables,vegetables_and_vines,annual_crops
245,Wheat,cereals,cereals,cereals
246,Yams,tubers,tubers,tubers
247,Yautia,tubers,tubers,tubers
