In [None]:
# NOTE(review): star import -- names used later in this notebook without an explicit
# import (e.g. `CrucibleClient`, `requests`) presumably come from here; confirm.
from pycrucible.pycrucible import *
from dotenv import load_dotenv
import os
# Load settings from a .env file so the API keys can be read from os.environ below.
load_dotenv()

In [None]:
# API keys come from the environment; each is None when its variable is unset.
# `apikey` is passed to the CrucibleClient, `old_apikey` is sent as the bearer
# token to the old API.
apikey = os.getenv('apikey')
old_apikey = os.getenv('old_apikey')

In [None]:
# Client bound to the Crucible "testapi" endpoint, authenticated with `apikey`.
crux = CrucibleClient(
    "https://crucible.lbl.gov/testapi",
    apikey,
)

In [None]:
# Pull all the dataset IDs from the old Crucible API (https://crucible.lbl.gov/api) using
# requests, compare them with the IDs known to the new deployment, and save the IDs that
# are missing from the new one to missing_ids.txt (one per line).
# NOTE(review): `requests` is not imported explicitly in this notebook; it is presumably
# brought in by the `from pycrucible.pycrucible import *` star import -- confirm.
old_url = "https://crucible.lbl.gov/api"
old_response = requests.get(f'{old_url}/list_datasets',
                            headers = {"Authorization": f"Bearer {old_apikey}"})
# Fail loudly on an HTTP error (bad key, server error) instead of trying to index
# into an error payload as if it were the dataset list.
old_response.raise_for_status()
old_dsids = [x['unique_id'] for x in old_response.json()]
new_ds = crux.list_datasets()
new_dsids = [x['unique_id'] for x in new_ds]


# For each dataset ID in the old database, check whether it exists in the new database.
# A set makes each membership test O(1) instead of scanning the whole list of new IDs.
new_dsid_lookup = set(new_dsids)
missing_ds = [dsid for dsid in old_dsids if dsid not in new_dsid_lookup]
print(len(missing_ds))
with open('missing_ids.txt', 'w') as f:
    # write() is the right call for a single string; writelines() would iterate it
    # character by character.
    f.write('\n'.join(missing_ds))

In [None]:
# Reload the missing dataset IDs from missing_ids.txt (one ID per line) so the
# notebook can resume from the saved list, then show the count and a short preview.
with open('missing_ids.txt', 'r') as ids_file:
    missing_ds = [line.replace('\n', '').strip() for line in ids_file]
print(len(missing_ds))
print(missing_ds[:5])

In [None]:
def build_placeholder_project(project_id, **kwargs):
    """Build a stand-in project record for ``project_id``.

    The record uses fixed lead/organization values, is marked ``'inactive'``,
    and reuses the project ID as its title.

    Args:
        project_id: Identifier stored under both ``'project_id'`` and ``'title'``.
        **kwargs: Accepted and ignored, so the function can be called with extra
            keyword arguments.

    Returns:
        dict: A new dictionary describing the placeholder project.
    """
    return {
        'project_id': project_id,
        'project_lead_name': 'Morgan Wall',
        'project_lead_email': 'mkwall@lbl.gov',
        'organization': "Molecular Foundry",
        'status': 'inactive',
        'title': project_id,
    }

In [None]:
# Running log of migration failures. The migration cell resets `errors` on every run
# and appends that run's failures here, so this list survives re-runs of that cell.
errors_copy = list()

In [None]:
# For every dataset ID that exists in the old database but not in the new one: load its
# metadata JSON, make sure its user and project exist in the new deployment, then
# re-create the dataset record together with its associated files and thumbnails.
from pycrucible.utils import run_shell
import sys
import json
# NOTE(review): absolute, user-specific path -- this cell only runs where that checkout exists.
sys.path.append("/home/jupyter-mkwall/git/")
from crucible_utils.mf_proposal_db_utils import build_mfuser, build_mfp_project
from crucible_utils.constants import propdb_api_url
from dotenv import load_dotenv

load_dotenv()
errors = []     # one {'dsid', 'error'} entry per dataset that could not be migrated in this run
no_json = []    # dataset IDs whose metadata JSON is still absent locally after the fetch attempt
for i,dsid in enumerate(missing_ds):
    local_json_path = f'./missing-ds-json/{dsid}.json'

    # Fetch the metadata JSON with rclone unless a local copy is already present.
    if not os.path.exists(local_json_path):
        run_shell(f"rclone copy mf-cloud-storage:/mf-storage-prod/{dsid}/{dsid}.json ./missing-ds-json/")

    # Lightweight progress indicator.
    if i%20==0:
        print(i)

    # if it still doesn't exist make a note
    if not os.path.exists(local_json_path):
        no_json.append(dsid)
        continue

    # An unreadable or malformed JSON file is recorded and skipped instead of aborting
    # the whole migration (json.JSONDecodeError is a ValueError).
    try:
        with open(local_json_path) as f:
            D = json.load(f)
    except (OSError, ValueError) as err:
        errors.append({'dsid':dsid, 'error':err})
        continue

    # User
    # .get() so a record without an 'orcid' key does not abort the loop here; the
    # missing key is then reported through `errors` when the dataset is created below.
    orcid = D.get('orcid')
    if isinstance(orcid, str) and orcid != 'XXXX-XXXX-XXXX-XXXX':
        try:
            user = crux.get_or_add_user(orcid = orcid, get_user_info_function = build_mfuser, propdb_api_url = propdb_api_url)
        except Exception as err:
            # Best effort: keep going, but say so instead of swallowing the failure silently.
            print(f"could not get or add user {orcid} for {dsid}: {err!r}")

    # Project
    # Some records carry the project under 'project' rather than 'proposal'.
    prop = D.get('proposal', None)
    if prop is None:
        prop = D.get('project', None)
        if prop is not None:
            print(f"found as project for {dsid}")

    if isinstance(prop, str) and prop not in ['', 'unknown']:
        try:
            # Use the resolved `prop`: indexing D['proposal'] raises KeyError for records
            # that only have a 'project' key.
            proj = crux.get_or_add_project(crucible_project_id = prop,
                                           get_project_info_function = build_mfp_project,
                                           propdb_api_url = propdb_api_url)
        except Exception:
            # Building the project from the proposal database failed: register a placeholder.
            try:
                proj = crux.get_or_add_project(crucible_project_id = prop,
                                               get_project_info_function = build_placeholder_project,
                                               propdb_api_url = propdb_api_url)
            except Exception as err:
                # Without a project, record the failure and move on to the next dataset
                # rather than letting one bad record stop the run.
                errors.append({'dsid':dsid, 'error':err})
                continue

    public_value = D.get("public", False)
    # Same project the registration step above used; 'unknown' when the record has none.
    project_value = prop if prop is not None else 'unknown'

    try:
        # Dataset Record, Instrument, Scientific Metadata, Keywords, Access Groups
        new_ds_record = crux.create_dataset(dataset_name = D['dataset_name'],
                                         unique_id = D['unique_id'],
                                         public = public_value,
                                         owner_orcid = D['orcid'],
                                         project_id= project_value,
                                         instrument_name= D['instrument_name'],
                                         measurement = D.get('measurement', None),
                                         session_name = D.get("session", None),
                                         creation_time = D['creation_time'],
                                         data_format = D['data_format'],
                                         scientific_metadata= D['metadata_dictionary'],
                                         keywords = D['keywords'],
                                         file_to_upload = D['file_to_upload'])

        # Associated Files
        # Two layouts are handled: a list of {'path', 'size', 'sha256_hash'} entries, or a
        # dict keyed by path. A failing file is printed and does not fail the dataset.
        if isinstance(D['associated_files'], list):
            for af in D['associated_files']:
                try:
                    response = crux.add_associated_file(dsid, af['path'], af['size'], af['sha256_hash'])
                except Exception:
                    print(af)
        elif isinstance(D['associated_files'], dict):
            for k,v in D['associated_files'].items():
                try:
                    response = crux.add_associated_file(dsid, k, v['size'], v['sha256_hash'])
                except Exception:
                    print(k,v)
        else:
            print(type(D['associated_files']))

        # Thumbnails
        for tn in D['thumbnails']:
            crux.add_thumbnail(dsid, tn['filepath'], tn['thumbnail'])

    except Exception as err:
        errors.append({'dsid':dsid, 'error':err})

# Keep this run's failures in the running log so they survive a re-run of this cell.
errors_copy += errors

In [None]:
# Number of datasets whose migration raised an error in the latest run.
len(errors)

In [None]:
# Inspect the recorded failures: one {'dsid', 'error'} entry per failed dataset.
errors

In [None]:
# Dataset IDs for which no metadata JSON was available locally, even after the rclone fetch.
no_json