In [28]:
from pyincore import IncoreClient, DataService
from pyincore.dataset import Dataset
import requests as rs
import os
from pymongo import MongoClient
import csv
import json
from dotenv import load_dotenv
from collections import defaultdict
from typing import List, Dict

# Load variables from a local .env file into the environment; later cells read
# PWDEV, PWPROD and TOKEN through os.getenv.
load_dotenv()

True

## Define clients and establish connections to the data service and MongoDB


In [2]:
# Client built with no arguments, so it uses pyincore's default service URL
# (presumably production -- confirm).
prod_client = IncoreClient()
# Client pointed explicitly at the incore-dev deployment.
dev_client = IncoreClient("https://incore-dev.ncsa.illinois.edu")

# One DataService per environment, each bound to its own client.
prod_dataservice = DataService(client=prod_client)
dev_dataservice = DataService(client=dev_client)

Connection successful to IN-CORE services. pyIncore version detected: 1.12.0
Connection successful to IN-CORE services. pyIncore version detected: 1.12.0


In [None]:
# Connection settings for the dev MongoDB instance.
mongo_username = "root"
mongo_password_dev = os.getenv("PWDEV")
host = "localhost"
port_dev = "27019"  # dev

# Fail fast with a clear message instead of silently authenticating with the
# literal string "None" when the variable is missing.
if not mongo_password_dev:
    raise RuntimeError(
        "PWDEV is not set; define it in the environment or in the .env file"
    )

# Credentials are passed as keyword arguments rather than interpolated into
# the URI: a password containing '@', ':' or '/' would otherwise have to be
# percent-escaped, or the URI is parsed incorrectly.
mongoclient_dev = MongoClient(
    host=host,
    port=int(port_dev),
    username=mongo_username,
    password=mongo_password_dev,
)

In [3]:
# Connection settings for the prod MongoDB instance.
mongo_username = "root"
mongo_password_prod = os.getenv("PWPROD")
host = "localhost"
port_prod = "27020"  # prod

# Fail fast with a clear message instead of silently authenticating with the
# literal string "None" when the variable is missing.
if not mongo_password_prod:
    raise RuntimeError(
        "PWPROD is not set; define it in the environment or in the .env file"
    )

# Credentials are passed as keyword arguments rather than interpolated into
# the URI: a password containing '@', ':' or '/' would otherwise have to be
# percent-escaped, or the URI is parsed incorrectly.
mongoclient_prod: MongoClient = MongoClient(
    host=host,
    port=int(port_prod),
    username=mongo_username,
    password=mongo_password_prod,
)

### Load the unique datatypes identified by Chen


In [4]:
# Read the unique list of datatypes: first column of every row after the header.
# newline="" is what the csv module expects for files handed to csv.reader.
with open("pyincore_unique_data_types.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; no error if the file is empty
    # Blank lines come back as empty rows; skip them instead of raising IndexError.
    unique_data_types = [row[0] for row in reader if row]

### Function to load dataset ids from MongoDB (no filtering by space)


In [29]:
def load_ids_from_mongodb(
    mongoclient: MongoClient, datatypes: List[str]
) -> Dict[str, List[Dict[str, str]]]:
    """Collect id, creator and format of the non-deleted datasets per datatype.

    Queries the ``Dataset`` collection of the ``datadb`` database once per
    datatype, matching documents whose ``dataType`` equals the datatype and
    whose ``deleted`` field is ``False``.

    Args:
        mongoclient: Client used to reach ``datadb.Dataset``.
        datatypes: Datatype names to look up.

    Returns:
        A ``defaultdict(list)`` mapping each datatype that had at least one
        match to a list of ``{"id", "creator", "format"}`` dicts, all values
        converted with ``str``. Datatypes without matches get no key.

    Raises:
        KeyError: If a matching document has no ``creator`` or ``format`` field.
    """
    # Resolve the collection once instead of on every loop iteration.
    collection = mongoclient["datadb"]["Dataset"]
    # Only these fields are read below (``_id`` is returned by default), so
    # avoid transferring the rest of each document.
    wanted_fields = {"creator": 1, "format": 1}

    unique_datatype_dataset_ids = defaultdict(list)
    for datatype in datatypes:
        cursor = collection.find(
            {"dataType": datatype, "deleted": False},
            projection=wanted_fields,
        )
        for document in cursor:
            unique_datatype_dataset_ids[datatype].append(
                {
                    "id": str(document["_id"]),
                    "creator": str(document["creator"]),
                    "format": str(document["format"]),
                }
            )
    return unique_datatype_dataset_ids

In [None]:
# Look up every known datatype in the prod database, ignoring spaces.
prod_unique_datatype_dataset_ids = load_ids_from_mongodb(
    mongoclient=mongoclient_prod,
    datatypes=unique_data_types,
)

### Function to load dataset ids from the API endpoint, filtered by space

The spaces considered are:

- ergo
- incore


In [6]:
# URL templates for the dataset listing endpoint. The two "{}" placeholders are
# filled, in order, with the space name and the datatype.
_datasets_query = "/data/api/datasets?space={}&type={}&limit=100000&skip=0"
prod_url = "https://incore.ncsa.illinois.edu" + _datasets_query
dev_url = "https://incore-dev.ncsa.illinois.edu" + _datasets_query

In [21]:
def load_ids_from_api(
    url: str,
    spaces: List[str],
    datatypes: List[str],
    timeout: float = 60.0,
) -> Dict[str, List[Dict[str, str]]]:
    """Collect id, creator, format and space of non-deleted datasets via the API.

    One GET request is issued per (datatype, space) pair. A request that
    fails, either with a network error or with a status code other than 200,
    is reported with ``print`` and skipped so the remaining pairs are still
    processed.

    Args:
        url: URL template with two positional ``{}`` placeholders, filled with
            the space and the datatype, in that order.
        spaces: Space names to query for every datatype.
        datatypes: Datatype names to query.
        timeout: Seconds handed to ``requests`` as the request timeout, so a
            stalled connection cannot hang the whole run.

    Returns:
        A ``defaultdict(list)`` mapping each datatype to a list of
        ``{"id", "creator", "format", "space"}`` dicts. A dataset returned for
        several spaces appears once per space.
    """
    unique_datatype_dataset_ids = defaultdict(list)

    # The bearer token comes from the TOKEN environment variable. Without it,
    # send no Authorization header at all rather than the bogus "bearer None".
    token = os.getenv("TOKEN")
    if token:
        header = {"Authorization": f"bearer {token}"}
    else:
        header = {}
        print("Warning: TOKEN is not set; requests are sent unauthenticated")

    for datatype in datatypes:
        for space in spaces:
            try:
                response = rs.get(
                    url.format(space, datatype),
                    headers=header,
                    timeout=timeout,
                )
            except rs.RequestException as error:
                # One unreachable request must not discard everything
                # collected so far.
                print(f"Error: {error} for {datatype} in {space}")
                continue
            if response.status_code != 200:
                print(
                    f"Error: {response.status_code} for {datatype} in {space}"
                )
                continue
            for dataset in response.json():
                # A record without a "deleted" field is treated as not deleted.
                if dataset.get("deleted"):
                    continue
                unique_datatype_dataset_ids[datatype].append(
                    {
                        "id": dataset["id"],
                        "creator": dataset["creator"],
                        "format": dataset["format"],
                        "space": space,
                    }
                )
    return unique_datatype_dataset_ids

### Skip the cell below if the ids were already extracted, and load them directly from file


In [25]:
# Query the prod API for every datatype in both the "ergo" and "incore" spaces.
space_filtered_dataset_ids_prod = load_ids_from_api(
    url=prod_url,
    spaces=["ergo", "incore"],
    datatypes=unique_data_types,
)

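### Skip to here for loading extracted ids

If you skipped the API cell above, also skip the first cell below (it writes the freshly fetched ids to disk) and run only the cell that reads the JSON file.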


In [26]:
# Save the ids fetched from the API so later sessions can reload them from disk.
with open("pyincore_prod_unique_datatypes_dataset_ids.json", "w") as f:
    f.write(json.dumps(space_filtered_dataset_ids_prod, indent=4))

In [27]:
# Reload the previously saved ids; the result is a plain dict keyed by datatype.
with open("pyincore_prod_unique_datatypes_dataset_ids.json", "r") as f:
    space_filtered_dataset_ids_prod = json.loads(f.read())

In [9]:
# Number of datatypes read from the CSV file.
len(unique_data_types)

157

### Get Dataset Objects from the dataset IDs collected

After fetching a dataset object, read its contents with the `Dataset` method that matches the dataset's format.

Ex:
| _Format_ | _Dataset Function to use_ |
| ---------- | --------------------------- |
| shapefile | `get_dataframe_from_shapefile` |
| shp-network | not sure |
| json | `get_json_reader` |
| table | `get_csv_reader` or `get_dataframe_from_csv` |


In [30]:
def get_datasets(
    datasets: List[dict], dataservice: DataService
) -> List[Dataset]:
    """Build a ``Dataset`` object for every entry in ``datasets``.

    Each entry must carry an ``"id"`` key; that id is passed together with
    ``dataservice`` to ``Dataset.from_data_service``. The returned list keeps
    the order of the input.
    """
    return [
        Dataset.from_data_service(entry["id"], dataservice)
        for entry in datasets
    ]

### Example usage


In [41]:
# Fetch all prod datasets recorded under the "ergo:buildingInventoryVer6" datatype.
ergo_bldg_inventory_ver6_datasets = get_datasets(
    datasets=space_filtered_dataset_ids_prod["ergo:buildingInventoryVer6"],
    dataservice=prod_dataservice,
)

Dataset already exists locally. Reading from local cached zip.
Unzipped folder found in the local cache. Reading from it...
Dataset already exists locally. Reading from local cached zip.
Unzipped folder found in the local cache. Reading from it...


In [43]:
# Show the format of the first dataset; it decides which reader method to use.
print(ergo_bldg_inventory_ver6_datasets[0].format)

shapefile


In [44]:
# Read the first two datasets with the shapefile reader.
# NOTE(review): assumes at least two datasets were returned and that the second
# one is also a shapefile (only the first one's format is printed) -- confirm.
gdf1 = ergo_bldg_inventory_ver6_datasets[0].get_dataframe_from_shapefile()
gdf2 = ergo_bldg_inventory_ver6_datasets[1].get_dataframe_from_shapefile()

In [45]:
# Column names of the first dataset, for comparison with the second.
gdf1.columns

Index(['parid', 'parid_card', 'bldg_id', 'struct_typ', 'str_prob',
       'year_built', 'no_stories', 'a_stories', 'b_stories', 'bsmt_type',
       'sq_foot', 'gsq_foot', 'occ_type', 'occ_detail', 'major_occ',
       'broad_occ', 'appr_bldg', 'repl_cst', 'str_cst', 'nstra_cst',
       'nstrd_cst', 'dgn_lvl', 'cont_val', 'efacility', 'dwell_unit',
       'str_typ2', 'occ_typ2', 'tract_id', 'guid', 'FID_NEW', 'origin',
       'stat_class', 'rmv_improv', 'rmv_land', 'elev', 'period', 'strctid',
       'geometry'],
      dtype='object')

In [46]:
# Column names of the second dataset; compare with gdf1.columns.
gdf2.columns

Index(['strctid', 'Lon', 'Lat', 'archtype', 'parid', 'struct_typ',
       'year_built', 'no_stories', 'a_stories', 'b_stories', 'bsmt_type',
       'sq_foot', 'gsq_foot', 'occ_type', 'occ_detail', 'major_occ',
       'broad_occ', 'appr_bldg', 'repl_cst', 'str_cst', 'nstra_cst',
       'nstrd_cst', 'dgn_lvl', 'cont_val', 'efacility', 'dwell_unit',
       'str_typ2', 'occ_typ2', 'appr_land', 'appr_tot', 'types', 'failure',
       'fun', 'guid', 'geometry'],
      dtype='object')

### Cache Cleanup


In [36]:
# Clear the cache files of the prod client (presumably the locally cached
# dataset downloads reported in the output above -- confirm).
prod_client.clear_cache()

In [None]:
# Clear the cache files of the dev client (presumably its locally cached
# dataset downloads -- confirm).
dev_client.clear_cache()