### Scan the dataset specs of the pyincore analyses

In [None]:
from pyincore import IncoreClient, SpaceService
import pyincore.analyses
import os
import inspect
import importlib
from pymongo import MongoClient
import csv
import json

In [None]:
# Create the IN-CORE client against the incore-dev service URL.
client = IncoreClient("https://incore-dev.ncsa.illinois.edu")

In [None]:
# Alternative: create the client with IncoreClient's default arguments.
# Running this cell rebinds `client`, replacing any client created earlier.
client = IncoreClient()

In [None]:
def list_folders_in_path(path):
    """Return the names of the immediate sub-directories of ``path``.

    Only the bare entry names are returned (not full paths), in the order
    ``os.listdir`` yields them. Entries that are not directories are skipped.
    """
    subdirectories = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            subdirectories.append(entry)
    return subdirectories

In [None]:
def list_classes(module):
    """Return the names of every member of ``module`` that is a class.

    Names come back in the order produced by ``inspect.getmembers``
    (sorted by member name).
    """
    return [member_name for member_name, _ in inspect.getmembers(module, inspect.isclass)]

In [None]:
def instantiate_class(module, class_name):
    """Look up ``class_name`` on ``module`` and construct it with the global ``client``.

    Returns the new instance. The class is called with ``client`` as its only
    argument.
    """
    return getattr(module, class_name)(client)

In [None]:
# Walk every sub-folder of the pyincore.analyses package, instantiate the
# analysis class of each one and collect the distinct dataset types that the
# analyses declare in their input/output specs.
available_analyses = dir(pyincore.analyses)
words_to_check = ['util', 'results', 'components']

module_names = []
unique_dataset_types = []
type_keys = []


def _register_type(spec_type, description, io_kind, analysis_name):
    """Record ``spec_type`` the first time it is seen; later sightings are ignored."""
    if spec_type in type_keys:
        return
    type_keys.append(spec_type)
    unique_dataset_types.append({
        "type": spec_type,
        "description": description,
        "IO": io_kind,
        "analyses": analysis_name,
    })


analyses_root = pyincore.analyses.__path__[0]
for module_name in list_folders_in_path(analyses_root):
    module = importlib.import_module("pyincore.analyses." + module_name)
    for class_name in list_classes(module):
        class_key = class_name.lower()

        # Keep only classes whose name contains the module name ...
        if module_name.lower() not in class_key:
            continue
        # ... and drop helper classes (util / results / components).
        if any(word in class_key for word in words_to_check):
            continue

        instance = instantiate_class(module, class_name)

        # The "type" entry of an input spec is iterated, i.e. treated as a
        # collection of accepted type names.
        for spec_in in instance.get_spec()["input_datasets"]:
            for spec_type in spec_in["type"]:
                _register_type(spec_type, spec_in.get("description"), "input", class_key)

        # The "type" entry of an output spec is used as a single value.
        for spec_out in instance.get_spec()["output_datasets"]:
            _register_type(spec_out["type"], spec_out.get("description"), "output", class_key)

In [None]:
# Preview the first two collected entries as a quick sanity check.
unique_dataset_types[:2]

### Compare with what we have

In [None]:
# Connect to the MongoDB instance reachable on local port 27019 (dev).
# The password is read from the PW environment variable.
mongo_username = "root"
mongo_password = os.environ.get("PW")
host = "localhost"
port = "27019" # dev

# Fail early with a clear message instead of trying to authenticate with the
# literal string "None" when the variable is not set.
if mongo_password is None:
    raise RuntimeError("environment variable PW must hold the MongoDB password")

# Credentials are passed as keyword arguments rather than interpolated into a
# mongodb:// URI, so reserved characters in the password (@ : / %) need no
# percent-escaping.
mongoclient = MongoClient(host, int(port), username=mongo_username, password=mongo_password)

In [None]:
# Connect to the MongoDB instance reachable on local port 27020 (prod).
# The password is read from the PW environment variable. Running this cell
# rebinds `mongoclient`, so every later cell operates on this connection.
mongo_username = "root"
mongo_password = os.environ.get("PW")
host = "localhost"
port = "27020" # prod

# Fail early with a clear message instead of trying to authenticate with the
# literal string "None" when the variable is not set.
if mongo_password is None:
    raise RuntimeError("environment variable PW must hold the MongoDB password")

# Credentials are passed as keyword arguments rather than interpolated into a
# mongodb:// URI, so reserved characters in the password (@ : / %) need no
# percent-escaping.
mongoclient = MongoClient(host, int(port), username=mongo_username, password=mongo_password)

### Add description

In [None]:
# For every dataset type collected from pyincore, copy its description onto
# the semantics Type document whose "dc:title" equals the type name, and record
# in the entry's "exist" field whether such a document was found.
type_collection = mongoclient['semanticsdb']['Type']
for unique_data_type in unique_dataset_types:
    # One targeted update per type: the server locates the document by title
    # and sets only the description field, instead of streaming the whole
    # collection to the client and writing back a full replacement document.
    update_result = type_collection.update_one(
        {"dc:title": unique_data_type["type"]},
        {"$set": {"dc:description": unique_data_type["description"]}},
    )
    # matched_count is 1 when a document with that title exists, else 0.
    unique_data_type["exist"] = update_result.matched_count > 0

In [None]:
# Save the collected dataset types to pyincore_unique_data_types.csv.
# The column names are taken from the keys of the first entry.
if not unique_dataset_types:
    # Checked before opening the file so an existing CSV is not truncated
    # when there is nothing to write.
    print("no dataset types collected; nothing written")
else:
    csv_columns = list(unique_dataset_types[0].keys())
    # Explicit UTF-8 keeps the output independent of the platform's default encoding.
    with open("pyincore_unique_data_types.csv", 'w', newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, dialect="unix", fieldnames=csv_columns)
        writer.writeheader()
        # Write all data rows in one call.
        writer.writerows(unique_dataset_types)

### Move types into the space matching their prefix (ergo, incore, ncsa)

In [None]:
# Space service bound to the current client.
# NOTE(review): `spacesvc` is not referenced by the other visible cells, which
# write to the space collection through `mongoclient` directly.
spacesvc = SpaceService(client)

In [None]:
# Sort the ids of all semantics Type documents into one bucket per title
# prefix; ids whose title matches none of the prefixes are printed as orphans.
move_to_ergo_ids = []
move_to_incore_ids = []
move_to_ncsa_ids = []

ids_by_prefix = {
    "ergo:": move_to_ergo_ids,
    "incore:": move_to_incore_ids,
    "ncsa:": move_to_ncsa_ids,
}

for document in mongoclient['semanticsdb']['Type'].find():
    document_id = str(document["_id"])
    title = document["dc:title"]
    for prefix, bucket in ids_by_prefix.items():
        if title.startswith(prefix):
            bucket.append(document_id)
            break
    else:
        # No known prefix matched.
        print("orphan: " + document_id)

In [None]:
def add_to_space(space_name, move_to_space_ids):
    """Add ids to the ``members`` array of the space named ``space_name``.

    Ids that are already members are left alone, so calling this repeatedly
    does not create duplicates.

    Args:
        space_name: value matched against ``metadata.name`` of the Space documents.
        move_to_space_ids: iterable of id strings to add to the space.

    Raises:
        ValueError: if no Space document has that ``metadata.name``.
    """
    space_collection = mongoclient['spacedb']['Space']
    space_document = space_collection.find_one({"metadata.name": space_name})
    if space_document is None:
        # Previously this surfaced as an opaque TypeError on subscripting None.
        raise ValueError("space not found: " + str(space_name))

    new_member_ids = list(move_to_space_ids)
    if not new_member_ids:
        return

    # $addToSet appends only the ids that are not yet present, and does so
    # server-side, so the rest of the document is never overwritten with a
    # possibly stale client-side copy.
    space_collection.update_one(
        {'_id': space_document['_id']},
        {"$addToSet": {"members": {"$each": new_member_ids}}},
    )

In [None]:
# Add the collected "ncsa:"-titled type ids to the space named "ncsa".
add_to_space("ncsa", move_to_ncsa_ids)

In [None]:
# Add the collected "ergo:"-titled type ids to the space named "ergo".
add_to_space("ergo", move_to_ergo_ids)

In [None]:
# Add the collected "incore:"-titled type ids to the space named "incore".
add_to_space("incore", move_to_incore_ids)

#### Update dataType for each dataset

In [None]:
# For every dataset whose dataType is not one of the types collected from
# pyincore (type_keys), rename a leading "ergo:" or "incore:" prefix to "ncsa:".
# Types already starting with "ncsa:" are left alone; anything else is reported.
dataset_collection = mongoclient['datadb']['Dataset']
known_types = set(type_keys)  # constant-time membership test per dataset
legacy_prefixes = ("ergo:", "incore:")

for data_document in dataset_collection.find():
    data_type = data_document["dataType"]
    if data_type in known_types:
        continue

    for legacy_prefix in legacy_prefixes:
        if data_type.startswith(legacy_prefix):
            # Swap only the leading prefix; a later occurrence of the same
            # text inside the name is not touched.
            renamed_type = "ncsa:" + data_type[len(legacy_prefix):]
            # Write only when something changed, and only the changed field.
            dataset_collection.update_one(
                {'_id': data_document['_id']},
                {"$set": {"dataType": renamed_type}},
            )
            break
    else:
        if not data_type.startswith("ncsa:"):
            print("unrecognized data type:", data_type)