---
---
# [0] Setup

In [1]:
# using env `box2` (env_box2.yaml)

from pprint import pp

import json
from typing import List, Callable
from datetime import datetime
from pathlib import Path, PurePosixPath

from box_sdk_gen import JWTConfig, BoxJWTAuth, BoxClient # box authentication
from box_sdk_gen import AddShareLinkToFileSharedLink, UpdateSharedLinkOnFileSharedLink, AddShareLinkToFileSharedLinkPermissionsField, UpdateSharedLinkOnFileSharedLinkPermissionsField # box link sharing (lol)

In [2]:
''' ———————————————————————————————— Inputs ———————————————————————————————— '''

root_folder_id : str = '270589643170'   # path: 'rubox:shared_permalinks/redplanet/redplanet_cache/'

# JWT app config: token from account:"iliketochacha", app:"redplanet_export_v4".
# The matches are sorted so the chosen file is deterministic when several configs
# are present, and an empty result fails with a readable message instead of a
# bare `IndexError`.
_jwt_config_candidates : List[Path] = sorted((Path.cwd() / '.secret/').glob('*.json'))
if not _jwt_config_candidates:
    raise FileNotFoundError(f"No JWT config file (*.json) found in '{Path.cwd() / '.secret'}'.")
fpath_jwt_config : Path = _jwt_config_candidates[0]

---
---
# [1] Authenticate

In [3]:
def authenticate_box(fpath_jwt_config : Path, user_id : str = "34633045408") -> BoxClient:
    """
    Build a Box client authenticated through a JWT app config, acting as a specific user.

    ARGS:
        fpath_jwt_config : path to the JWT config JSON file, passed to `JWTConfig.from_config_file`.
        user_id          : Box user id to impersonate via `with_user_subject`.
                           Defaults to the user from account:"iliketochacha" ("Kasane Teto").

    RETURN:
        `BoxClient` whose auth is the user-scoped JWT auth object.
    """
    jwt_config : JWTConfig  = JWTConfig.from_config_file(config_file_path=fpath_jwt_config)
    auth       : BoxJWTAuth = BoxJWTAuth(config=jwt_config)
    user_auth  : BoxJWTAuth = auth.with_user_subject(user_id=user_id)   # impersonate the requested user
    return BoxClient(auth=user_auth)


# Notebook-wide Box client, reused by every cell below.
client: BoxClient = authenticate_box(fpath_jwt_config=fpath_jwt_config)

---
---
# [2] Accessing Files

In [4]:
''' ————————————————————————————— FOLDER STUFF ————————————————————————————— '''

def _get_folder_dict(
    client             : BoxClient,
    folder_id          : str,
    entries_per_search : int = 100,
    offset             : int = 0,
) -> dict:
    """
    Fetch a single Box folder with `client.folders.get_folder_by_id` and return the response as a plain `dict`.

    ARGS:
        client             : Box client used for the request.
        folder_id          : Box id of the folder to fetch.
        entries_per_search : forwarded to the API as `limit`.
        offset             : forwarded to the API as `offset`.

    RETURN:
        `dict`, which looks like this (abridged; user details elided):
            {'id': '270589643170',
            'etag': '1',
            'type': 'folder',
            'sequence_id': '1',
            'name': 'redplanet_cache',
            'created_at': '2024-06-18T00:30:18-07:00',
            'modified_at': '2024-06-22T23:58:19-07:00',
            'description': '',
            'size': 11322836466,
            'path_collection': {'total_count': 2,
                                'entries': [{'id': '0', 'type': 'folder', 'name': 'All Files'},
                                            {'id': '270582896601', 'etag': '1', 'type': 'folder',
                                             'sequence_id': '1', 'name': 'redplanet'}]},
            'created_by':  {'id': ..., 'type': 'user', 'name': ..., 'login': ...},
            'modified_by': {'id': ..., 'type': 'user', 'name': ..., 'login': ...},
            'owned_by':    {'id': ..., 'type': 'user', 'name': ..., 'login': ...},
            'content_created_at': '2024-06-18T00:30:18-07:00',
            'content_modified_at': '2024-06-22T23:58:19-07:00',
            'parent': {'id': '270582896601', 'etag': '1', 'type': 'folder',
                       'sequence_id': '1', 'name': 'redplanet'},
            'item_status': 'active',
            'item_collection': {'limit': 100,
                                'total_count': 3,
                                'offset': 0,
                                'order': [{'by': 'type', 'direction': 'ASC'},
                                          {'by': 'name', 'direction': 'ASC'}],
                                'entries': [{'id': '271456434555', 'etag': '0', 'type': 'folder',
                                             'sequence_id': '0', 'name': 'Crust'},
                                            {'id': '271455606507', 'etag': '0', 'type': 'folder',
                                             'sequence_id': '0', 'name': 'GRS'},
                                            {'id': '271457846138', 'etag': '0', 'type': 'folder',
                                             'sequence_id': '0', 'name': 'Mag'}]}}

    """
    response = client.folders.get_folder_by_id(
        folder_id = folder_id,
        limit     = entries_per_search,
        offset    = offset,
    )
    return response.to_dict()


def _get_folder_name(client: BoxClient, folder_id: str) -> str:
    """
    Return the 'name' field of a Box folder.

    The folder is requested with `entries_per_search = 0`, since only the folder's own metadata is read here.
    """
    return _get_folder_dict(
        client             = client,
        folder_id          = folder_id,
        entries_per_search = 0,
    )['name']


def _get_folder_entries(client: BoxClient, folder_id: str) -> List[dict]:
    """
    Collect every direct child (file or folder) of a Box folder.

    A single `_get_folder_dict(...)['item_collection']['entries']` lookup is not enough, because the API call only returns up to 1,000 entries at a time. The folder is therefore requested repeatedly, advancing `offset` by one page each time, until `total_count` entries have been covered.

    RETURN:
        `List[dict]`, where FOLDERS look like:
            {
                'id'         : '271456434555',
                'etag'       : '0',
                'type'       : 'folder',
                'sequence_id': '0',
                'name'       : 'Crust'
            }
        and FILES look like:
            {
                'id'          : '1568917463274',
                'etag'        : '1',
                'type'        : 'file',
                'sequence_id' : '1',
                'name'        : '._Moho-Mars-Khan2022-18-2550-2550.sh',
                'sha1'        : '1abc429954206064f4ceed2886e22e1ffb65703a',
                'file_version': {
                    'id'  : '1724084185674',
                    'type': 'file_version',
                    'sha1': '1abc429954206064f4ceed2886e22e1ffb65703a'
                },
            }

    """
    # First request asks for zero entries: it is only used to read the total child count.
    total_children : int = _get_folder_dict(
        client             = client,
        folder_id          = folder_id,
        entries_per_search = 0,
    )['item_collection']['total_count']

    page_size : int        = 1_000
    collected : List[dict] = []

    page_start : int = 0
    while page_start < total_children:
        page : dict = _get_folder_dict(
            client             = client,
            folder_id          = folder_id,
            entries_per_search = page_size,
            offset             = page_start,
        )
        collected.extend(page['item_collection']['entries'])
        page_start += page_size

    return collected


''' —————————————————————————————— FILE STUFF —————————————————————————————— '''

def _get_file_dict(client: BoxClient, file_id: str) -> dict:
    """
    Fetch a single Box file with `client.files.get_file_by_id` and return the response as a plain `dict`.

    RETURN:
        `dict`, which looks like this (abridged; user details elided):
            {'id': '1568919222016',
            'etag': '1',
            'type': 'file',
            'sequence_id': '1',
            'name': '2022_Mars_Odyssey_GRS_Element_Concentration_Maps.zip',
            'sha1': '5aa4778a4600c5fb85df1c6e363bd429e9c9e096',
            'file_version': {'id': '1724085982816',
                            'type': 'file_version',
                            'sha1': '5aa4778a4600c5fb85df1c6e363bd429e9c9e096'},
            'description': '',
            'size': 208819,
            'path_collection': {'total_count': 4,
                                'entries': [{'id': '0', 'type': 'folder', 'name': 'All Files'},
                                            {'id': '270582896601', 'etag': '1', 'type': 'folder',
                                             'sequence_id': '1', 'name': 'redplanet'},
                                            {'id': '270589643170', 'etag': '1', 'type': 'folder',
                                             'sequence_id': '1', 'name': 'redplanet_cache'},
                                            {'id': '271455606507', 'etag': '0', 'type': 'folder',
                                             'sequence_id': '0', 'name': 'GRS'}]},
            'created_at': '2024-06-22T21:59:11-07:00',
            'modified_at': '2024-06-22T21:59:11-07:00',
            'content_created_at': '2024-03-08T02:44:12-08:00',
            'content_modified_at': '2024-03-08T02:44:12-08:00',
            'created_by':  {'id': ..., 'type': 'user', 'name': ..., 'login': ...},
            'modified_by': {'id': ..., 'type': 'user', 'name': ..., 'login': ...},
            'owned_by':    {'id': ..., 'type': 'user', 'name': ..., 'login': ...},
            'shared_link': {'url': 'https://rutgers.box.com/s/i1dy31or67y030yhof3c39ts19emigzd',
                            'effective_access': 'open',
                            'effective_permission': 'can_download',
                            'is_password_enabled': False,
                            'download_count': 0,
                            'preview_count': 0,
                            'download_url': 'https://rutgers.box.com/shared/static/i1dy31or67y030yhof3c39ts19emigzd.zip',
                            'access': 'open',
                            'permissions': {'can_download': True,
                                            'can_preview': True,
                                            'can_edit': False}},
            'parent': {'id': '271455606507', 'etag': '0', 'type': 'folder',
                       'sequence_id': '0', 'name': 'GRS'},
            'item_status': 'active'}

    """
    response = client.files.get_file_by_id(file_id=file_id)
    return response.to_dict()


def _get_file_name(client: BoxClient, file_id: str) -> str:
    """Return the 'name' field of a Box file, looked up through `_get_file_dict`."""
    return _get_file_dict(client=client, file_id=file_id)['name']


''' ——————————————————————— CONSTRUCT REGISTRY PART 1 —————————————————————— '''

def add_fileinfo_to_registry(
    client         : BoxClient,
    root_folder_id : str,
    func_include   : Callable[[PurePosixPath], bool] = lambda x: True,
    func_exclude   : Callable[[PurePosixPath], bool] = lambda x: False,
    registry       : dict = None,
) -> dict:
    if registry is None:
        registry = {}

    def _traverse_folder(
        this_folder_id : str,
        this_registry  : dict,
        path_stack     : List[str] = [],
    ) -> dict:
        path_stack.append(_get_folder_name(client, this_folder_id))
        
        for entry in _get_folder_entries(client, this_folder_id): # List[dict]

            this_entry_fullpath : PurePosixPath = PurePosixPath(*path_stack, entry['name'])

            if (not func_include(this_entry_fullpath)) or (func_exclude(this_entry_fullpath)):
                continue

            match entry['type']:
                case 'folder':
                    _traverse_folder(
                        this_folder_id = entry['id'],
                        this_registry  = this_registry,
                        path_stack     = path_stack,
                    )
                case 'file':
                    this_registry[str(this_entry_fullpath)] = {
                        'TEMP_box_id': entry['id'],
                        'sha1': entry['sha1'],
                    }
        
        path_stack.pop()
        return this_registry

    return _traverse_folder(root_folder_id, registry)

In [5]:
# Unfiltered registry: every file under the root folder.
registry_everything: dict = add_fileinfo_to_registry(
    client         = client,
    root_folder_id = root_folder_id,
)
pp(registry_everything)
pp(len(registry_everything))

{'redplanet_cache/Crust/dichotomy/dichotomy_coordinates-JAH-0-360.txt': {'TEMP_box_id': '1568920998501',
                                                                         'sha1': '74f912ddae101b6cd2dee5379f41cb6f7198555b'},
 'redplanet_cache/Crust/moho/shcoeffs/._Moho-Mars-Khan2022-18-2550-2550.sh': {'TEMP_box_id': '1568917463274',
                                                                              'sha1': '1abc429954206064f4ceed2886e22e1ffb65703a'},
 'redplanet_cache/Crust/moho/shcoeffs/._Moho-Mars-Khan2022-19-2550-2550.sh': {'TEMP_box_id': '1568919412079',
                                                                              'sha1': '1abc429954206064f4ceed2886e22e1ffb65703a'},
 'redplanet_cache/Crust/moho/shcoeffs/._Moho-Mars-Khan2022-19-2550-2600.sh': {'TEMP_box_id': '1568918454795',
                                                                              'sha1': '1abc429954206064f4ceed2886e22e1ffb65703a'},
 'redplanet_cache/Crust/moho/shcoeffs/._Moho-M

In [10]:
# Same walk, but skip any entry named 'shcoeffs'.
registry_noshcoeffs: dict = add_fileinfo_to_registry(
    client,
    root_folder_id,
    func_exclude=lambda entry_path: entry_path.name == 'shcoeffs',
)

pp(registry_noshcoeffs)
pp(len(registry_noshcoeffs))

{'redplanet_cache/Crust/dichotomy/dichotomy_coordinates-JAH-0-360.txt': {'TEMP_box_id': '1568920998501',
                                                                         'sha1': '74f912ddae101b6cd2dee5379f41cb6f7198555b'},
 'redplanet_cache/Crust/topo/Mars_HRSC_MOLA_BlendDEM_Global_200mp_v2.zarr.zip': {'TEMP_box_id': '1568920072138',
                                                                                 'sha1': 'dc7648a41bba9f5b7acd229560832840150d7ff7'},
 'redplanet_cache/Crust/topo/Mars_MGS_MOLA_DEM_mosaic_global_463m_reprojected.zarr.zip': {'TEMP_box_id': '1568918871944',
                                                                                          'sha1': '0d544f068937cedd946f18163aebc43105a78652'},
 'redplanet_cache/GRS/2022_Mars_Odyssey_GRS_Element_Concentration_Maps.zip': {'TEMP_box_id': '1568919222016',
                                                                              'sha1': '5aa4778a4600c5fb85df1c6e363bd429e9c9e096'},
 'redplanet_cach

---
---
# [3] Handling Share info

In [11]:
def _get_file_sharestatus(client: BoxClient, file_id: str) -> list:
    """
    Classify the shared-link state of a Box file.

    ARGS:
        client  : Box client used to fetch the file.
        file_id : Box id of the file.

    RETURNS:
        One of the following lists:
            ['not shared']
            ['incorrect sharing permissions', file_shareinfo_dict]
            ['correct sharing permissions',   file_shareinfo_dict, download_url]

        where `file_shareinfo_dict` is the file's 'shared_link' dict, which looks like this:
            {'url'                : 'https://rutgers.box.com/s/i1dy31or67y030yhof3c39ts19emigzd',
            'effective_access'    : 'open',
            'effective_permission': 'can_download',
            'is_password_enabled' : False,
            'download_count'      : 0,
            'preview_count'       : 0,
            'download_url'        : 'https://rutgers.box.com/shared/static/i1dy31or67y030yhof3c39ts19emigzd.zip',
            'access'              : 'open',
            'permissions'         : {'can_download': True,
                                     'can_preview' : True,
                                     'can_edit'    : False}}
    """
    file_dict : dict = _get_file_dict(client=client, file_id=file_id)

    # `None` when the file dict has no 'shared_link' entry.
    file_shareinfo_dict : dict | None = file_dict.get('shared_link')


    # ———————————————————————— CASE 1: Not shared ————————————————————————
    if file_shareinfo_dict is None:
        return ['not shared']


    # —————————————————— CASE 2: Bad sharing permissions —————————————————
    proper_perms = {
        'effective_access'    : 'open',
        'effective_permission': 'can_download',
        'is_password_enabled' : False,
        'access'              : 'open',
        'permissions'         :
            {
                'can_download': True,
                'can_preview' : True,
                'can_edit'    : False
            }
    }   # NOTE: only paid accounts can change time-until-unshare (optional key "unshared_at"), so those are undetected and must be resolved manually.

    def _is_subset_dict(subset_dict, main_dict):
        # True when every (key, value) pair of `subset_dict` is also present in `main_dict`.
        return all(item in main_dict.items() for item in subset_dict.items())

    if not _is_subset_dict(proper_perms, file_shareinfo_dict):
        return ['incorrect sharing permissions', file_shareinfo_dict]


    # ————————————————— CASE 3: Good sharing permissions —————————————————
    return ['correct sharing permissions', file_shareinfo_dict, file_shareinfo_dict['download_url']]


def _add_get_file_dlurl(client: BoxClient, file_id: str) -> str:
    """
    Call `add_share_link_to_file` on a Box file and return the resulting 'download_url'.

    The link is requested with access 'open', no password, no `unshared_at`, and
    permissions download=True / preview=True / edit=False.
    """
    link_permissions = AddShareLinkToFileSharedLinkPermissionsField(
        can_download = True,
        can_preview  = True,
        can_edit     = False,
    )
    link_settings = AddShareLinkToFileSharedLink(
        access      = 'open', # AddShareLinkToFileSharedLinkAccessField.OPEN.value
        password    = None,
        unshared_at = None,
        permissions = link_permissions,
    )
    response = client.shared_links_files.add_share_link_to_file(
        file_id     = file_id,
        fields      = "shared_link",
        shared_link = link_settings,
    )
    return response.to_dict()['shared_link']['download_url']


def _update_get_file_dlurl(client: BoxClient, file_id: str) -> str:
    """
    Call `update_shared_link_on_file` on a Box file and return the resulting 'download_url'.

    The link is updated to access 'open', no password, no `unshared_at`, and
    permissions download=True / preview=True / edit=False.

    ARGS:
        client  : Box client used for the request. (This parameter was previously misspelled
                  `clinet`, so the body silently used the notebook-global `client` instead.)
        file_id : Box id of the file whose shared link is updated.

    RETURN:
        The 'download_url' of the file's shared link.
    """
    file_dict : dict = client.shared_links_files.update_shared_link_on_file(
        file_id = file_id,
        fields = "shared_link",
        shared_link = UpdateSharedLinkOnFileSharedLink(
            access      = 'open',
            password    = None,
            unshared_at = None,
            permissions = UpdateSharedLinkOnFileSharedLinkPermissionsField(
                can_download = True,
                can_preview  = True,
                can_edit     = False,
            ),
        ),
    ).to_dict()
    file_download_url : str = file_dict['shared_link']['download_url']
    return file_download_url



''' ——————————————————————— CONSTRUCT REGISTRY PART 2 —————————————————————— '''

def add_filedlurls_to_registry(
    client         : BoxClient,
    registry_v1       : dict,
) -> dict:
    """
    Assume that `registry_v1` is dict of sub-dicts, where sub-dicts look like:
        'redplanet_cache/Mag/Langlais2019.sh': {
            'TEMP_box_id': '1568919057294',
            'sha1'       : '0625c76c9594d1bf22e3bfa6c17ca8ee36ac2d2a'
        }
    """

    registry_v2 : dict = {}

    for filepath, fileinfo in registry_v1.items():
        file_id : str = fileinfo['TEMP_box_id']
        file_sharestatus : list = _get_file_sharestatus(client, file_id)

        match file_sharestatus[0]:
            case 'not shared':
                file_dlurl : str = _add_get_file_dlurl(client, file_id)
            case 'incorrect sharing permissions':
                file_dlurl : str = _update_get_file_dlurl(client, file_id)
            case 'correct sharing permissions':
                file_dlurl : str = file_sharestatus[2]

        fileinfo['download_url'] = file_dlurl
        _ = fileinfo.pop('TEMP_box_id')

    return registry_v1

In [12]:
# Resolve a public download URL for every file in the filtered registry.
registry_noshcoeffs_dlurls: dict = add_filedlurls_to_registry(
    client,
    registry_noshcoeffs,
)
pp(registry_noshcoeffs_dlurls)
pp(len(registry_noshcoeffs_dlurls))

{'redplanet_cache/Crust/dichotomy/dichotomy_coordinates-JAH-0-360.txt': {'sha1': '74f912ddae101b6cd2dee5379f41cb6f7198555b',
                                                                         'download_url': 'https://rutgers.box.com/shared/static/tekd1w26h9mvfnyw8bpy4ko4v48931ri.txt'},
 'redplanet_cache/Crust/topo/Mars_HRSC_MOLA_BlendDEM_Global_200mp_v2.zarr.zip': {'sha1': 'dc7648a41bba9f5b7acd229560832840150d7ff7',
                                                                                 'download_url': 'https://rutgers.box.com/shared/static/ou32pr1v6d3osfpicqimn5p9j3tywzs6.zip'},
 'redplanet_cache/Crust/topo/Mars_MGS_MOLA_DEM_mosaic_global_463m_reprojected.zarr.zip': {'sha1': '0d544f068937cedd946f18163aebc43105a78652',
                                                                                          'download_url': 'https://rutgers.box.com/shared/static/u3y88syjopfwvs4rq6mxu6qar2r1aaah.zip'},
 'redplanet_cache/GRS/2022_Mars_Odyssey_GRS_Element_Concentration_Maps.

---
---
# [4] Export to JSON

In [None]:
def export_registry_to_json(
    registry : dict, 
    fpath    : Path = None
) -> None:
    """
    Write `registry` to a JSON file with 4-space indentation.

    ARGS:
        registry : JSON-serialisable dict to export.
        fpath    : destination file. When `None`, the file is written to `<cwd>/output/`
                   (created if missing) under a name stamped with the current date and time.
    """
    destination = fpath

    if destination is None:
        # Default location: ./output/redplanet_registry_<YYYY-MM-DD_HH-MM>.json
        output_dir = Path.cwd() / 'output'
        output_dir.mkdir(exist_ok=True)
        destination = output_dir / f"redplanet_registry_{datetime.now().strftime('%Y-%m-%d_%H-%M')}.json"

    with open(destination, 'w') as out_file:
        json.dump(registry, out_file, indent=4)