In [None]:
from pyincore import IncoreClient, DataService
from pyincore.dataset import Dataset
import requests as rs
import os
from pymongo import MongoClient
import csv
import json
from dotenv import load_dotenv
from collections import defaultdict
from typing import List, Dict

# Load settings from a .env file via python-dotenv. Later cells read PWDEV,
# PWPROD and TOKEN with os.getenv -- presumably those are supplied this way;
# confirm against your local .env.
load_dotenv()

## Define clients and establish connections to the data service and MongoDB


In [None]:
# Production: IncoreClient() is created without a URL, so it targets whatever
# service pyincore uses by default.
prod_client = IncoreClient()
prod_dataservice = DataService(client=prod_client)

# Development: addressed by its explicit service URL.
dev_client = IncoreClient("https://incore-dev.ncsa.illinois.edu")
dev_dataservice = DataService(client=dev_client)

In [None]:
# Dev MongoDB connection settings; the password is read from the PWDEV
# environment variable.
mongo_username = "root"
mongo_password_dev = os.getenv("PWDEV")
host = "localhost"
port_dev = "27019"  # dev

if mongo_password_dev is None:
    # os.getenv returns None when the variable is missing. Keep the cell
    # runnable, but surface the misconfiguration here rather than at the
    # first failed query.
    print("Warning: PWDEV is not set; dev MongoDB authentication will likely fail.")

# Credentials are passed as keyword arguments instead of being formatted into
# a URI: PyMongo requires user/password inside a URI to be percent-escaped, so
# a password containing '@', ':', '/' or '%' would make the URI invalid or be
# mis-parsed. Keyword values are used as-is.
mongoclient_dev = MongoClient(
    host=host,
    port=int(port_dev),
    username=mongo_username,
    password=mongo_password_dev or "",
)

In [None]:
# Prod MongoDB connection settings; the password is read from the PWPROD
# environment variable.
mongo_username = "root"
mongo_password_prod = os.getenv("PWPROD")
host = "localhost"
port_prod = "27020"  # prod

if mongo_password_prod is None:
    # os.getenv returns None when the variable is missing. Keep the cell
    # runnable, but surface the misconfiguration here rather than at the
    # first failed query.
    print("Warning: PWPROD is not set; prod MongoDB authentication will likely fail.")

# Credentials are passed as keyword arguments instead of being formatted into
# a URI: PyMongo requires user/password inside a URI to be percent-escaped, so
# a password containing '@', ':', '/' or '%' would make the URI invalid or be
# mis-parsed. Keyword values are used as-is.
mongoclient_prod: MongoClient = MongoClient(
    host=host,
    port=int(port_prod),
    username=mongo_username,
    password=mongo_password_prod or "",
)

### Load unique datatypes


In [None]:
# Read the unique list of datatypes: the first column of every row after the
# header row.
# newline="" is what the csv module asks for on the file object, so that
# quoted fields containing line breaks are parsed correctly.
with open("pyincore_unique_data_types.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no-op if the file is empty)
    # csv.reader yields [] for blank lines; skip them instead of failing with
    # IndexError on row[0].
    unique_data_types = [row[0] for row in reader if row]

### Function to load dataset IDs from MongoDB without filtering by space


In [None]:
def load_ids_from_mongodb(
    mongoclient: MongoClient, datatypes: List[str]
) -> Dict[str, List[Dict[str, str]]]:
    """Collect id, creator and format of every non-deleted dataset per datatype.

    Queries the ``Dataset`` collection of the ``datadb`` database once per
    datatype, matching documents whose ``dataType`` equals the datatype and
    whose ``deleted`` flag is ``False``.

    Args:
        mongoclient: Client connected to the MongoDB instance to query.
        datatypes: Datatype names to look up.

    Returns:
        A ``defaultdict(list)`` mapping each datatype that matched at least one
        document to a list of ``{"id", "creator", "format"}`` dicts (all values
        converted with ``str``). Datatypes without matches have no key.

    Raises:
        KeyError: If a matching document lacks ``creator`` or ``format``.
    """
    collection = mongoclient["datadb"]["Dataset"]
    # Only these fields are read below (``_id`` is returned by default), so
    # ask the server not to send the rest of each document.
    projection = {"creator": 1, "format": 1}

    unique_datatype_dataset_ids = defaultdict(list)
    for datatype in datatypes:
        cursor = collection.find(
            {"dataType": datatype, "deleted": False},
            projection=projection,
        )
        for document in cursor:
            unique_datatype_dataset_ids[datatype].append(
                {
                    "id": str(document["_id"]),
                    "creator": str(document["creator"]),
                    "format": str(document["format"]),
                }
            )
    return unique_datatype_dataset_ids

In [None]:
# Look up, directly in the prod MongoDB, the non-deleted datasets of every
# datatype read from the CSV.
prod_unique_datatype_dataset_ids = load_ids_from_mongodb(
    mongoclient=mongoclient_prod,
    datatypes=unique_data_types,
)

### Function to load dataset IDs from the API endpoint, filtered by space

The spaces considered are:

- ergo
- incore


In [None]:
# Both environments expose the same dataset-listing endpoint; only the host
# differs. The two "{}" placeholders are filled with (space, datatype).
_DATASETS_QUERY = "/data/api/datasets?space={}&type={}&limit=100000&skip=0"

prod_url = "https://incore.ncsa.illinois.edu" + _DATASETS_QUERY
dev_url = "https://incore-dev.ncsa.illinois.edu" + _DATASETS_QUERY

In [None]:
def load_ids_from_api(
    url: str, spaces: List[str], datatypes: List[str]
) -> Dict[str, List[Dict[str, str]]]:
    """Collect non-deleted dataset ids per datatype from the datasets API.

    One GET request is issued for every (datatype, space) combination. The
    bearer token is read from the ``TOKEN`` environment variable. Responses
    with a status other than 200 are reported with ``print`` and skipped.

    Args:
        url: URL template with two ``{}`` placeholders, filled in order with
            the space and the datatype.
        spaces: Space names to query for each datatype.
        datatypes: Datatype names to look up.

    Returns:
        A ``defaultdict(list)`` mapping each datatype that has at least one
        non-deleted dataset to a list of ``{"id", "creator", "format",
        "space"}`` dicts. A dataset returned for several spaces appears once
        per space.

    Raises:
        KeyError: If a returned dataset lacks ``deleted``, ``id``, ``creator``
            or ``format``.
    """
    # Function-scope import: nothing else in this notebook needs it.
    from urllib.parse import quote

    unique_datatype_dataset_ids = defaultdict(list)
    header = {
        "Authorization": f"bearer {os.getenv('TOKEN')}",
    }
    # All requests go to the same host, so share one Session: it reuses the
    # underlying connection instead of opening a new one per request.
    with rs.Session() as session:
        session.headers.update(header)
        for datatype in datatypes:
            for space in spaces:
                # Percent-encode the values so characters such as '&', '#',
                # '+' or spaces cannot corrupt the query string. ':' is kept
                # literal, so names like "ergo:bridges" are sent unchanged.
                request_url = url.format(
                    quote(space, safe=":"), quote(datatype, safe=":")
                )
                response = session.get(request_url)
                if response.status_code != 200:
                    print(
                        f"Error: {response.status_code} for {datatype} in {space}"
                    )
                    continue
                for dataset in response.json():
                    if dataset["deleted"]:
                        continue
                    unique_datatype_dataset_ids[datatype].append(
                        {
                            "id": dataset["id"],
                            "creator": dataset["creator"],
                            "format": dataset["format"],
                            "space": space,
                        }
                    )
    return unique_datatype_dataset_ids

### Optional: skip the API cells below and load the previously extracted IDs directly


In [None]:
# Fetch prod dataset ids per datatype, restricted to the "ergo" and "incore"
# spaces (one API request per datatype/space pair).
space_filtered_dataset_ids_prod = load_ids_from_api(
    url=prod_url,
    spaces=["ergo", "incore"],
    datatypes=unique_data_types,
)

In [None]:
# Fetch dev dataset ids per datatype, restricted to the "ergo" and "incore"
# spaces (one API request per datatype/space pair).
space_filtered_dataset_ids_dev = load_ids_from_api(
    url=dev_url,
    spaces=["ergo", "incore"],
    datatypes=unique_data_types,
)

### Skip to here to load the previously extracted IDs


### Write


In [None]:
# Save the prod ids as JSON so later sessions can read them back instead of
# querying the API again.
with open("pyincore_prod_unique_datatypes_dataset_ids.json", "w") as prod_ids_file:
    json.dump(
        obj=space_filtered_dataset_ids_prod,
        fp=prod_ids_file,
        indent=4,
    )

In [None]:
# Save the dev ids as JSON so later sessions can read them back instead of
# querying the API again.
with open("pyincore_dev_unique_datatypes_dataset_ids.json", "w") as dev_ids_file:
    json.dump(
        obj=space_filtered_dataset_ids_dev,
        fp=dev_ids_file,
        indent=4,
    )

### Read


In [None]:
# Reload the prod ids saved by the Write step; the result is a plain dict
# keyed by datatype.
with open("pyincore_prod_unique_datatypes_dataset_ids.json", "r") as prod_ids_file:
    space_filtered_dataset_ids_prod = json.loads(prod_ids_file.read())

In [None]:
# Reload the dev ids saved by the Write step; the result is a plain dict
# keyed by datatype.
with open("pyincore_dev_unique_datatypes_dataset_ids.json", "r") as dev_ids_file:
    space_filtered_dataset_ids_dev = json.loads(dev_ids_file.read())

### Prod


In [None]:
# Number of datatypes listed in the CSV vs. number of datatypes that have an
# entry in the prod result.
print(len(unique_data_types))
print(len(space_filtered_dataset_ids_prod))

# Datatypes from the CSV that have no entry in the prod result.
missing_in_prod = set(unique_data_types) - set(space_filtered_dataset_ids_prod)
datasets_of_datatype_not_in_incore_ergo_space = list(missing_in_prod)
print(datasets_of_datatype_not_in_incore_ergo_space)

### Dev


In [None]:
# Number of datatypes listed in the CSV vs. number of datatypes that have an
# entry in the dev result.
print(len(unique_data_types))
print(len(space_filtered_dataset_ids_dev))

# Datatypes from the CSV that have no entry in the dev result.
missing_in_dev = set(unique_data_types) - set(space_filtered_dataset_ids_dev)
datasets_of_datatype_not_in_incore_ergo_space = list(missing_in_dev)
print(datasets_of_datatype_not_in_incore_ergo_space)

### Get Dataset Objects from the dataset IDs collected

After fetching a dataset, read its contents with the loader that matches the dataset's format.

Ex:
| _Format_ | _Dataset Function to use_ |
| ---------- | --------------------------- |
| shapefile | `get_dataframe_from_shapefile` |
| shp-network | not sure |
| json | `get_json_reader` |
| table | `get_csv_reader` or `get_dataframe_from_csv` |


In [None]:
def get_datasets(
    datasets: List[dict], dataservice: DataService
) -> List[Dataset]:
    """Build a pyincore ``Dataset`` object for every dataset record given.

    Args:
        datasets: Records that each carry the dataset id under the ``"id"``
            key (the shape produced by the id-loading functions above).
        dataservice: Data service the datasets are obtained from.

    Returns:
        One ``Dataset`` per input record, in input order.
    """
    return [
        Dataset.from_data_service(record["id"], dataservice)
        for record in datasets
    ]

In [None]:
# Fetch a single dataset from prod by id and preview its first rows.
# NOTE(review): the shapefile loader is used, so this id is presumably a
# shapefile dataset -- confirm.
sample_dataset = Dataset.from_data_service(
    "5d25118eb9219c0692cd7527", prod_dataservice
)
ds_df = sample_dataset.get_dataframe_from_shapefile()
# Variant against dev using the CSV loader:
# ds_df = Dataset.from_data_service("5d25118eb9219c0692cd7527", dev_dataservice).get_dataframe_from_csv()
ds_df.head()

### Example usage


In [None]:
# Is the tornado windfield datatype present in the prod result, then in the
# dev result?
for ids_by_datatype in (
    space_filtered_dataset_ids_prod,
    space_filtered_dataset_ids_dev,
):
    print("incore:tornadoWindfield" in ids_by_datatype)

### Prod


In [None]:
# Build Dataset objects for every prod dataset of one datatype.
# Note: the Dev cell further down assigns to the same `dataset_objs` name.
dataset_objs = get_datasets(
    datasets=space_filtered_dataset_ids_prod["incore:epfDamageRatios"],
    dataservice=prod_dataservice,
)

### Dev


In [None]:
# Build Dataset objects for every dev dataset of one datatype.
# Note: this rebinds `dataset_objs`, replacing the value set in the Prod cell.
dataset_objs = get_datasets(
    datasets=space_filtered_dataset_ids_dev["ergo:bridges"],
    dataservice=dev_dataservice,
)

In [None]:
# How many datasets were fetched, and the format of the first one.
print(len(dataset_objs))
first_dataset = dataset_objs[0]
print(first_dataset.format)

In [None]:
# Name of the Dataset method used to load each supported format.
loader_by_format = {
    "shapefile": "get_dataframe_from_shapefile",
    "table": "get_dataframe_from_csv",
}

dataset_obj_dfs = []
for ds_ob in dataset_objs:
    loader_name = loader_by_format.get(ds_ob.format)
    if loader_name is not None:
        dataset_obj_dfs.append(getattr(ds_ob, loader_name)())
        continue
    # Unsupported format: store a message in the dataset's slot so the list
    # stays index-aligned with dataset_objs, and report it.
    msg = "Dataset format not in consideration for id {} in {} format".format(
        ds_ob.id, ds_ob.format
    )
    dataset_obj_dfs.append(msg)
    print(msg)

In [None]:
# List the format of every fetched dataset, one per line.
for dataset_format in (dataset.format for dataset in dataset_objs):
    print(dataset_format)

In [None]:
# Report the column dtypes of every loaded dataset. Entries that are strings
# are the "format not in consideration" messages and are printed as-is.
for position, frame_or_message in enumerate(dataset_obj_dfs):
    if not isinstance(frame_or_message, str):
        print("Dataset {}".format(dataset_objs[position].id))
        print(frame_or_message.dtypes)
        print("=" * 20)
    else:
        print(frame_or_message)

In [None]:
# Column labels of the first entry. If that entry is a "format not in
# consideration" message (a str) instead of a DataFrame, this raises
# AttributeError.
first_loaded = dataset_obj_dfs[0]
first_loaded.columns

In [None]:
# First rows of the first entry. If that entry is a "format not in
# consideration" message (a str) instead of a DataFrame, this raises
# AttributeError.
first_loaded = dataset_obj_dfs[0]
first_loaded.head()

### Cache Cleanup


In [None]:
# Clear the cache files kept for the prod client.
# NOTE(review): what exactly is removed is decided by IncoreClient.clear_cache;
# confirm before running this on a shared machine.
prod_client.clear_cache()

In [None]:
# Clear the cache files kept for the dev client.
# NOTE(review): what exactly is removed is decided by IncoreClient.clear_cache;
# confirm before running this on a shared machine.
dev_client.clear_cache()