In [None]:
import boto3
import os
import fsspec

In [None]:
# Define variables for upload
# These module-level names are read by get_paths(), upload(), and the cells below.

dir = ''  # local directory (with trailing '/') where the curated .h5ad files are located
bucket = ''       # bucket to upload to: czi-psomagen or czi-novogene
project = ''   # project name; first component of every s3 key that is built
mode = ''  # 'upload' to actually upload to s3; any other value (e.g. 'test') is a dry run

s3client = boto3.client('s3')  # boto3 S3 client shared by the functions below
FS = fsspec.filesystem("s3")  # fsspec S3 filesystem, used for isdir checks on destination prefixes
errors = []  # problems collected across cells; a non-empty list after get_paths() blocks the upload


def get_paths(dir):
    '''
    Given the directory where curated matrices are, determine the s3 key to upload to.

    Each file ending in 'h5ad' is split on '__'; the first three fields are taken as
    group, sample, and order. The cellranger run-date folder is then looked up in s3
    under {project}/{order}/{group}/processed/cellranger/.

    Problems are printed and appended to the global errors list instead of raised:
      - fewer than three '__'-separated fields in the filename (file is skipped)
      - no run-date folder found in s3 (file is skipped)
      - more than one run-date folder (the first one listed is used for the key)

    Return dictionary of paths, where key is local file and value is s3 key to upload to.
    '''
    paths = {}
    for file in os.listdir(dir):
        if not file.endswith('h5ad'):
            continue

        # os.path.join gives a correct path whether or not dir ends with a separator
        local_file_path = os.path.join(dir, file)

        file_split = file.split("__")
        if len(file_split) < 3:
            msg = f'ERROR: cannot parse group__sample__order from {file}'
            errors.append(msg + '\n')
            print(msg)
            continue
        # NOTE(review): if a filename has exactly three fields, order keeps the
        # '.h5ad' suffix -- confirm the naming convention has a fourth field.
        group, sample, order = file_split[:3]

        # For each file, determine the cellranger run date
        prefix = f'{project}/{order}/{group}/processed/cellranger/'
        r = s3client.list_objects_v2(Bucket=bucket, Prefix=prefix, Delimiter='/')
        # 'CommonPrefixes' is missing from the response when nothing sits under prefix
        dates = [entry['Prefix'].replace(prefix, '') for entry in r.get('CommonPrefixes', [])]

        if not dates:
            msg = f'ERROR: no cellranger dates for {file}'
            errors.append(msg + '\n')
            print(msg)
            continue
        if len(dates) > 1:
            msg = f'ERROR: multiple cellranger dates for {file}'
            errors.append(msg + '\n')
            print(msg)

        # each date still carries its trailing '/', so 'outs' follows it directly
        new_obj_name = f'{prefix}{dates[0]}outs/per_sample_outs/{sample}/curated.h5ad'
        paths[local_file_path] = new_obj_name

    return paths


def upload(paths):
    '''
    Upload each local file in paths to its s3 key in the configured bucket.

    paths: dictionary where key is the local file path and value is the s3 key
           to upload to (the mapping returned by get_paths).

    A file is uploaded only when the parent prefix of its s3 key already exists
    as a directory in the bucket. Invalid keys and upload failures are printed and
    appended to the global errors list rather than raised, so one bad file does
    not stop the remaining uploads.
    '''
    for file, s3_key in paths.items():
        # everything before the last '/' is the destination "directory"
        parent = s3_key.rpartition("/")[0]
        if not FS.isdir(f'{bucket}/{parent}'):
            errors.append(f'ERROR: not valid s3_key {s3_key}\n')
            print(f'ERROR: not valid s3_key {s3_key}')
            continue

        try:
            s3client.upload_file(file, bucket, s3_key)
        except Exception as e:
            # store a string so errors stays a homogeneous list of messages
            errors.append(f'ERROR: uploading {file}: {e}\n')
            print(f'Error uploading file: {e}')
        else:
            # report success only after upload_file returned without raising
            print(f'SUCCESS: {os.path.basename(file)} uploaded to {s3_key}')


In [None]:
# Build the local file -> s3 key mapping; get_paths also flags multiple run dates in errors
# Only mode 'upload' calls upload(); any other mode just validates each s3 key (dry run)

paths = get_paths(dir)
if errors:
    print("ERROR: cannot upload")
    print(errors)
elif mode == 'upload':
    upload(paths)
else:
    for file, s3_key in paths.items():
        # the destination is valid when the key's parent prefix exists in the bucket
        if FS.isdir(f'{bucket}/{s3_key.rpartition("/")[0]}'):
            print(f'VALID:\t{file.split("/")[-1]}\t{s3_key}')
        else:
            errors.append(f'ERROR: not valid s3_key {s3_key}\n')
            print(f'ERROR: not valid s3_key {s3_key}\n')


In [None]:
# Check error logs: show every message collected in errors by the cells above

print(errors)