### 1. Isoseqsim

#### Most data

In [9]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("arabidopsis_isoseqsim_e000", "3cd69353-f222-4612-870c-573488385e9a"),
    ("arabidopsis_isoseqsim_e016", "d2905989-cf2d-49d3-8b9c-dd1575e4a6e6"),
    ("arabidopsis_isoseqsim_e050", "ac604e1c-02ce-4dba-a082-35ca9f3f7a0a"),
    ("arabidopsis_isoseqsim_e085", "479a7edf-8d61-45f5-a583-2a655b74e74e"),
    ("magnaporthe_isoseqsim_e000", "0cb9c79b-aee2-47fe-9ec2-1d8a99193f3a"),
    ("magnaporthe_isoseqsim_e016", "05fa9629-a0d7-4eb7-acf5-192796e472ec"),
    ("magnaporthe_isoseqsim_e050", "5d1754c3-f5f5-4365-9254-e2aaf58b55c1"),
    ("magnaporthe_isoseqsim_e085", "972c2b1f-b86b-4da0-a293-9d67bb4d5140"),
    ("mouse_isoseqsim_e000", "7f2a36fd-a9ec-465e-a09c-0e434dccf26a"),
    ("mouse_isoseqsim_e016", "452b6c4f-698d-4308-9e2f-1874cbd8d825"),
    ("mouse_isoseqsim_e050", "3e522436-95aa-4c32-89d0-f80a1be32f7a"),
    ("mouse_isoseqsim_e085", "5df50a98-bbce-4c0b-aeb0-1b77eec5e871")
]

# Define the base path for the folders
base_path = 'terra_outputs/1.isoseqsim_pb'

# Function to create folders and clear existing contents
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    if os.path.exists(entity_path):
        shutil.rmtree(entity_path)
    os.makedirs(entity_path, exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            os.rename(target_path, new_target_path)
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/83650cf1-a084-4e09-8211-51a16c122cdd/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


In [5]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("arabidopsis_isoseqsim_e000", "3cd69353-f222-4612-870c-573488385e9a"),
    ("arabidopsis_isoseqsim_e016", "d2905989-cf2d-49d3-8b9c-dd1575e4a6e6"),
    ("arabidopsis_isoseqsim_e050", "ac604e1c-02ce-4dba-a082-35ca9f3f7a0a"),
    ("arabidopsis_isoseqsim_e085", "479a7edf-8d61-45f5-a583-2a655b74e74e"),
    ("magnaporthe_isoseqsim_e000", "0cb9c79b-aee2-47fe-9ec2-1d8a99193f3a"),
    ("magnaporthe_isoseqsim_e016", "05fa9629-a0d7-4eb7-acf5-192796e472ec"),
    ("magnaporthe_isoseqsim_e050", "5d1754c3-f5f5-4365-9254-e2aaf58b55c1"),
    ("magnaporthe_isoseqsim_e085", "972c2b1f-b86b-4da0-a293-9d67bb4d5140"),
    ("mouse_isoseqsim_e000", "7f2a36fd-a9ec-465e-a09c-0e434dccf26a"),
    ("mouse_isoseqsim_e016", "452b6c4f-698d-4308-9e2f-1874cbd8d825"),
    ("mouse_isoseqsim_e050", "3e522436-95aa-4c32-89d0-f80a1be32f7a"),
    ("mouse_isoseqsim_e085", "5df50a98-bbce-4c0b-aeb0-1b77eec5e871")
]

# Define the base path for the folders
base_path = 'terra_outputs/1.isoseqsim_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    if not os.path.exists(entity_path):
        os.makedirs(entity_path, exist_ok=True)
        os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name in ['Oarfish_quant.tsv', 'IsoQuant_quant.tsv', 'Flair_quant.tsv']:
            target_folder = os.path.join(entity_path, "Quant")
            os.makedirs(target_folder, exist_ok=True)
            target_path = os.path.join(target_folder, file_name)
            if not os.path.exists(target_path):
                subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)
                print(f"Copied file to: {target_path}")
            else:
                print(f"File {file_name} already exists, skipping copy")
        else:
            print(f"Skipping file: {file_name}")

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/83650cf1-a084-4e09-8211-51a16c122cdd/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

Skipping file: :
Skipping file: :
Skipping file: :
Skipping file: :
Skipping file: :
Skipping file: bambuTask.log
Skipping file: gcs_delocalization.sh
Skipping file: gcs_localization.sh
Skipping file: gcs_transfer.sh
Skipping file: memory_retry_rc
Skipping file: monitoring.log
Skipping file: rc
Skipping file: script
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: Bambu.gtf
Skipping file: Bambu_ndr1_reduced.gtf
Skipping file: Bambu_quant.txt
Skipping file: Bambu_reduced.gtf
Skipping file: :
Skipping file: output
Skipping file: :
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping fi

In [6]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("arabidopsis_isoseqsim_e000", "3cd69353-f222-4612-870c-573488385e9a"),
    ("arabidopsis_isoseqsim_e016", "d2905989-cf2d-49d3-8b9c-dd1575e4a6e6"),
    ("arabidopsis_isoseqsim_e050", "ac604e1c-02ce-4dba-a082-35ca9f3f7a0a"),
    ("arabidopsis_isoseqsim_e085", "479a7edf-8d61-45f5-a583-2a655b74e74e"),
    ("magnaporthe_isoseqsim_e000", "0cb9c79b-aee2-47fe-9ec2-1d8a99193f3a"),
    ("magnaporthe_isoseqsim_e016", "05fa9629-a0d7-4eb7-acf5-192796e472ec"),
    ("magnaporthe_isoseqsim_e050", "5d1754c3-f5f5-4365-9254-e2aaf58b55c1"),
    ("magnaporthe_isoseqsim_e085", "972c2b1f-b86b-4da0-a293-9d67bb4d5140"),
    ("mouse_isoseqsim_e000", "7f2a36fd-a9ec-465e-a09c-0e434dccf26a"),
    ("mouse_isoseqsim_e016", "452b6c4f-698d-4308-9e2f-1874cbd8d825"),
    ("mouse_isoseqsim_e050", "3e522436-95aa-4c32-89d0-f80a1be32f7a"),
    ("mouse_isoseqsim_e085", "5df50a98-bbce-4c0b-aeb0-1b77eec5e871")
]

# Define the base path for the folders
base_path = 'terra_outputs/1.isoseqsim_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    if not os.path.exists(entity_path):
        os.makedirs(entity_path, exist_ok=True)
        os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name in ['Gffcompare_OUT_expression_matrix.tsv']:
            target_folder = os.path.join(entity_path, "Quant")
            os.makedirs(target_folder, exist_ok=True)
            target_path = os.path.join(target_folder, file_name)
            if not os.path.exists(target_path):
                subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)
                print(f"Copied file to: {target_path}")
            else:
                print(f"File {file_name} already exists, skipping copy")
        else:
            print(f"Skipping file: {file_name}")

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/83650cf1-a084-4e09-8211-51a16c122cdd/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

Skipping file: :
Skipping file: :
Skipping file: :
Skipping file: :
Skipping file: :
Skipping file: bambuTask.log
Skipping file: gcs_delocalization.sh
Skipping file: gcs_localization.sh
Skipping file: gcs_transfer.sh
Skipping file: memory_retry_rc
Skipping file: monitoring.log
Skipping file: rc
Skipping file: script
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: Bambu.gtf
Skipping file: Bambu_ndr1_reduced.gtf
Skipping file: Bambu_quant.txt
Skipping file: Bambu_reduced.gtf
Skipping file: :
Skipping file: output
Skipping file: :
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping file: :
Skipping file: stderr
Skipping file: stdout
Skipping fi

#### some of PB and Talon runs downloaded as following

In [10]:
import os
import subprocess

# Define the new data entities and workflow IDs
new_data = [
    ("magnaporthe_isoseqsim_e000", "4769aead-2de0-4404-8bac-18d54c4c8c92"),
    ("magnaporthe_isoseqsim_e016", "42110a00-bd49-4577-96ac-092d61a9cdd8"),
    ("magnaporthe_isoseqsim_e050", "cdb65062-9366-4538-8f1c-8f47243d69e4"),
    ("magnaporthe_isoseqsim_e085", "41c88228-554f-411c-b166-9c6c9cccfde7"),
    ("mouse_isoseqsim_e050", "0454732f-e60a-416b-b0f0-eddfa3a30f11"),
    ("mouse_isoseqsim_e085", "2a9edc2e-4a71-476c-9353-5f074d9f60f0")
]

# Define the base path for the folders
base_path = 'terra_outputs/1.isoseqsim_pb'

# Function to create folders if they do not exist
def create_folders_if_not_exist(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            os.rename(target_path, new_target_path)
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            os.rename(target_path, new_target_path)
            
# Main script for new additions
for data_entity, workflow_id in new_data:
    entity_path = create_folders_if_not_exist(data_entity)
    if data_entity.startswith("mouse_isoseqsim"):
        prefix = f"submissions/72afa895-b4f8-435b-9b48-b8832344dec1/LongReadRNABenchmark/{workflow_id}"
    else:
        prefix = f"submissions/8a7ce71b-3d5f-4e48-8f43-0f5fd1ce808a/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming for new additions is done.")

All downloading and renaming for new additions is done.


#### LRAA

In [11]:
import os
import subprocess

# Define the base path for the folders
base_path = 'terra_outputs/1.isoseqsim_pb'

# Define the sample data
samples = [
    ("arabidopsis_isoseqsim_e000", "6d455be3-a05b-41a7-a020-20cf21ad8729", "48ca747f-81b4-4f67-bd85-31831964623b", "45b5bb93-504e-43d3-b264-a67bf3aaf8a0"),
    ("arabidopsis_isoseqsim_e016", "1d968d49-0862-4f52-8653-c55b84b9b9c6", "8aab7bc1-a894-41c3-be08-bfae47613d98", "4dde33e5-55dc-4fec-98f0-884f5269cae3"),
    ("arabidopsis_isoseqsim_e050", "35d111ad-2d83-44a3-8e05-888624894b26", "14a80970-9446-4d7d-8535-ef2594956c16", "aa11b064-f905-400b-9c91-e6c04d83342b"),
    ("arabidopsis_isoseqsim_e085", "c1ab3f79-0afe-40cd-9b03-fa7af6a10868", "f0ca5281-8961-440d-a137-0fbacec20122", "8687a254-29f6-476f-a955-a56895b0e5e3"),
    ("magnaporthe_isoseqsim_e000", "c7586692-88bd-4327-b8d8-97d717e07358", "dd3659ea-604d-4cbb-b2f8-f1381d910522", "62658008-7391-46df-bdc6-31d8a9619be5"),
    ("magnaporthe_isoseqsim_e016", "8056f46a-6179-4ffb-98da-16d74f20b368", "cbf9ebe1-4fca-4f45-9338-55249eb2360e", "470a5ee7-31ac-4131-85d0-45c8798c2e55"),
    ("magnaporthe_isoseqsim_e050", "800e9f96-b062-4278-bf15-c50f92c24972", "28c76a41-4b41-4608-b08c-e58368671867", "891688de-78d5-492f-bc61-8922da7d126c"),
    ("magnaporthe_isoseqsim_e085", "ea426c29-3b6e-4f6b-a20e-93930fd95e73", "4e5d1609-1a82-4ec1-95a8-1a3d62f69939", "893567ba-72aa-4f9a-b79f-c452a7ddb157"),
    ("mouse_isoseqsim_e000", "7c03c9f7-372c-487a-9982-a3593a062dc6", "c3bb5db8-5961-48ad-bf03-e6988986885c", "2ecc9d7c-2cf3-4b7c-9767-4062d325e96c"),
    ("mouse_isoseqsim_e016", "6305f7d9-4e33-4bec-a1ee-2bfb71acd5cb", "6b7356a0-fc7c-4704-8b78-e5e0f1e2e9ef", "f3c9bbf7-5f2f-4ce2-907a-027ce459fb53"),
    ("mouse_isoseqsim_e050", "e27cdc31-5d3b-4f00-9dec-c1c1d2ddc5c4", "b9643569-2eba-45de-bcac-f5bbb201a38c", "b7162db0-7235-4619-a8c4-fd8ed072a776"),
    ("mouse_isoseqsim_e085", "116257fc-16c3-4b89-bc92-c1ba1f99e423", "c5623e91-b7c3-4458-ac4f-1c38589c858d", "3a8d518c-25ca-44d3-a922-2cf1d6007e9d")
]

# Function to create folders if they do not exist
def create_folders_if_not_exist(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to find the dynamic folder name
def find_dynamic_folder_name(bucket_name, prefix):
    result = subprocess.run(['gsutil', 'ls', f'gs://{bucket_name}/{prefix}/call-LRAA_direct/LRAA_runner/'], capture_output=True, text=True)
    folders = result.stdout.splitlines()
    for folder in folders:
        if folder.endswith('/'):
            return folder.split('/')[-2]
    return None

# Function to copy and rename files
def copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path):
    # Find dynamic folder names
    refguided_dynamic_folder = find_dynamic_folder_name(bucket_name, refguided_prefix)
    reffree_dynamic_folder = find_dynamic_folder_name(bucket_name, reffree_prefix)
    quant_dynamic_folder = find_dynamic_folder_name(bucket_name, quant_prefix)

    # Copy refguided gtf file
    refguided_gtf_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0216.LRAA.ref-guided.gtf'
    refguided_gtf_target = os.path.join(entity_path, "ID_refguided", "LRAA_0216.gtf")
    subprocess.run(['gsutil', 'cp', refguided_gtf_blob, refguided_gtf_target], capture_output=True, text=True)
    
    # Copy refguided quant file
    refguided_quant_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0216.LRAA.ref-guided.quant.expr'
    refguided_quant_target = os.path.join(entity_path, "Quant", "LRAA_0216.ref-guided.quant.expr")
    subprocess.run(['gsutil', 'cp', refguided_quant_blob, refguided_quant_target], capture_output=True, text=True)
    
    # Copy reffree gtf file
    reffree_gtf_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0216.LRAA.ref-free.gtf'
    reffree_gtf_target = os.path.join(entity_path, "ID_reffree", "LRAA_0216.gtf")
    subprocess.run(['gsutil', 'cp', reffree_gtf_blob, reffree_gtf_target], capture_output=True, text=True)
    
    # Copy reffree quant file
    reffree_quant_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0216.LRAA.ref-free.quant.expr'
    reffree_quant_target = os.path.join(entity_path, "Quant", "LRAA_0216.ref-free.quant.expr")
    subprocess.run(['gsutil', 'cp', reffree_quant_blob, reffree_quant_target], capture_output=True, text=True)
    
    # Copy quant-only quant file
    quant_only_blob = f'gs://{bucket_name}/{quant_prefix}/call-LRAA_direct/LRAA_runner/{quant_dynamic_folder}/call-LRAA_runner_task/LRAA_0216.LRAA.quant-only.quant.expr'
    quant_only_target = os.path.join(entity_path, "Quant", "LRAA_0216.quant-only.quant.expr")
    subprocess.run(['gsutil', 'cp', quant_only_blob, quant_only_target], capture_output=True, text=True)

# Main script for new additions
bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
for data_entity, refguided_id, reffree_id, quant_id in samples:
    entity_path = create_folders_if_not_exist(data_entity)
    refguided_prefix = f"submissions/fa76d3f1-6ea5-4d34-87cb-e7f815f7d2b4/LRAA_wf/{refguided_id}"
    reffree_prefix = f"submissions/88a6fd76-a088-4d13-bc4a-7046f9ea5856/LRAA_wf/{reffree_id}"
    quant_prefix = f"submissions/6d2d7f72-6e15-4fa6-ac65-b1d69930fd45/LRAA_wf/{quant_id}"
    copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path)

print("All copying and renaming for new additions is done.")

All copying and renaming for new additions is done.


In [12]:
import os
import subprocess

# Define the base path for the folders
base_path = 'terra_outputs/1.isoseqsim_pb'

# Define the sample data
samples = [
    ("arabidopsis_isoseqsim_e000", "05debfd7-71ef-45e1-b25b-553922d51f2e", "5cf188bc-cb7f-4eb0-a478-b3df835782cb", "b1d9e60c-e200-4fb9-be38-c8ba804b2d14"),
    ("arabidopsis_isoseqsim_e016", "57d11f10-c5ba-4dfc-8f17-1b4e6154c8e7", "7543f0f1-e617-419d-aa37-f3211b856ca0", "295accf6-70d7-4e2a-9cc5-996052615f9e"),
    ("arabidopsis_isoseqsim_e050", "37168fd5-fb89-4f3b-b6e5-f00de817ebed", "ac12e04f-dfcd-4c97-95e6-a9e94d27c3bb", "eec7282a-969a-4a08-9cb8-516d8bf400f2"),
    ("arabidopsis_isoseqsim_e085", "b76b21f4-87d0-4550-9684-18b4f428547c", "5b967c8e-3ba4-44e0-8a69-75375720f1d6", "2e9adca4-6914-4e76-9059-c6860e998239"),
    ("magnaporthe_isoseqsim_e000", "1635312c-b197-49ab-a483-12f2e3ab682a", "73cc1586-bd7a-435f-b509-c3a57fa9575b", "ae3931b2-0d4a-4de6-9066-0b3e72438308"),
    ("magnaporthe_isoseqsim_e016", "07e96289-be1b-4cc2-8724-9ab0db4a27d0", "a985fd10-81d7-4539-9728-0976471db7d3", "9fef3896-7723-4511-aca4-aae9d252f168"),
    ("magnaporthe_isoseqsim_e050", "d4bdfe20-5382-449a-80d9-5137639b1600", "ec25a3cc-8d3a-4f66-93ca-364bd53d1369", "364ac2b5-bb1b-40df-b762-f6005989ccf7"),
    ("magnaporthe_isoseqsim_e085", "c518aa93-89cf-4032-b54b-b3ec13cc5b8b", "4029761a-1460-4998-96a7-4ae36449a4c7", "9af86739-5412-44ee-b025-a536021ba355"),
    ("mouse_isoseqsim_e000", "6848615b-9052-4ace-a922-d32e03d3aa47", "12c8495e-8e32-4f66-9fae-2424582b5db2", "ac77d0b6-4d46-495d-9c4f-6c1cb2308d41"),
    ("mouse_isoseqsim_e016", "c8427222-6cc0-416a-bb18-ae32950ae214", "c4cbfa49-4d09-4b6c-b457-8c9a9e600b63", "c460342e-eb3e-4718-811a-049848c2b77f"),
    ("mouse_isoseqsim_e050", "26e0209e-cb82-4a9a-aeb3-ef50e4cc2b0d", "9370d3e6-6a6c-4dc3-8067-026a0e84ef11", "6fb60dc3-6f69-40d9-840f-7c3c0a38125e"),
    ("mouse_isoseqsim_e085", "2aceb0f1-908a-4860-80c4-c359da029d7e", "92564ab5-6229-4cbd-a187-66ea49365d8a", "a503b169-4181-4d72-a18c-9b61c468f1ec")
]

# Function to create folders if they do not exist
def create_folders_if_not_exist(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to find the dynamic folder name
def find_dynamic_folder_name(bucket_name, prefix):
    result = subprocess.run(['gsutil', 'ls', f'gs://{bucket_name}/{prefix}/call-LRAA_direct/LRAA_runner/'], capture_output=True, text=True)
    folders = result.stdout.splitlines()
    for folder in folders:
        if folder.endswith('/'):
            return folder.split('/')[-2]
    return None

# Function to copy and rename files
def copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path):
    # Find dynamic folder names
    refguided_dynamic_folder = find_dynamic_folder_name(bucket_name, refguided_prefix)
    reffree_dynamic_folder = find_dynamic_folder_name(bucket_name, reffree_prefix)
    quant_dynamic_folder = find_dynamic_folder_name(bucket_name, quant_prefix)

    # Copy refguided gtf file
    refguided_gtf_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-guided.gtf'
    refguided_gtf_target = os.path.join(entity_path, "ID_refguided", "LRAA_0223.gtf")
    subprocess.run(['gsutil', 'cp', refguided_gtf_blob, refguided_gtf_target], capture_output=True, text=True)
    
    # Copy refguided quant file
    refguided_quant_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-guided.quant.expr'
    refguided_quant_target = os.path.join(entity_path, "Quant", "LRAA_0223.ref-guided.quant.expr")
    subprocess.run(['gsutil', 'cp', refguided_quant_blob, refguided_quant_target], capture_output=True, text=True)
    
    # Copy reffree gtf file
    reffree_gtf_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-free.gtf'
    reffree_gtf_target = os.path.join(entity_path, "ID_reffree", "LRAA_0223.gtf")
    subprocess.run(['gsutil', 'cp', reffree_gtf_blob, reffree_gtf_target], capture_output=True, text=True)
    
    # Copy reffree quant file
    reffree_quant_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-free.quant.expr'
    reffree_quant_target = os.path.join(entity_path, "Quant", "LRAA_0223.ref-free.quant.expr")
    subprocess.run(['gsutil', 'cp', reffree_quant_blob, reffree_quant_target], capture_output=True, text=True)
    
    # Copy quant-only quant file
    quant_only_blob = f'gs://{bucket_name}/{quant_prefix}/call-LRAA_direct/LRAA_runner/{quant_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.quant-only.quant.expr'
    quant_only_target = os.path.join(entity_path, "Quant", "LRAA_0223.quant-only.quant.expr")
    subprocess.run(['gsutil', 'cp', quant_only_blob, quant_only_target], capture_output=True, text=True)

# Main script for new additions
bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
for data_entity, refguided_id, reffree_id, quant_id in samples:
    entity_path = create_folders_if_not_exist(data_entity)
    refguided_prefix = f"submissions/3acdb78b-6702-41e3-afc2-bfe890483b40/LRAA_wf/{refguided_id}"
    reffree_prefix = f"submissions/5773ce10-26b5-4ae6-ace7-97c6a9109cfe/LRAA_wf/{reffree_id}"
    quant_prefix = f"submissions/201fc35d-a33f-409e-9162-ae48c34bd5d9/LRAA_wf/{quant_id}"
    copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path)

print("All copying and renaming for new additions is done.")

All copying and renaming for new additions is done.


### 2. SIRVs

In [13]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_sirv", "ae7499f2-1065-4e6f-8429-e67a028fb5e9"),
    ("CL_BT474_E1_sirv", "6adc70e9-ba5a-4cb3-913f-8b19f8f39db1"),
    ("CL_BT474_E2_sirv", "9661d318-624f-4f94-9e2d-a38067912479"),
    ("CL_HG002_E0_sirv", "b0e3c89c-6be6-4d4f-ba47-aa73c9eb3bd1"),
    ("CL_HG002_E1_sirv", "598ed0c4-203f-4175-bb4e-b1ea95ff07ad"),
    ("CL_HG002_E2_sirv", "949f569a-fe3b-45ce-bfdf-7c13c7bbc84e"),
    ("CL_K562_E0_sirv", "f35eb79e-fa6a-4d3b-8f81-cd3f4640215a"),
    ("CL_K562_E1_sirv", "87877f18-7531-4340-90e2-29d7221dc1b5"),
    ("CL_K562_E2_sirv", "ec72a56c-4592-4d3f-a17f-3fe0b6d30764"),
    ("CL_UHRR_E0_sirv", "5c7b099c-dc97-471e-94f1-23958ff90187"),
    ("CL_UHRR_E1_sirv", "2475ff2d-80c0-4013-bca0-23ed131e5f94"),
    ("CL_UHRR_E2_sirv", "2bc2b13c-6c5a-4888-99c2-05f065185b18")
]

# Define the base path for the folders
base_path = 'terra_outputs/2.sirvs_pb'

# Function to create folders and clear existing contents
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    if os.path.exists(entity_path):
        shutil.rmtree(entity_path)
    os.makedirs(entity_path, exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            os.rename(target_path, new_target_path)
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            os.rename(target_path, new_target_path)
            
# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/49b8f003-2692-4ba0-a86b-f6a1f8d6adb5/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


In [14]:
import os
import subprocess

# Define the base path for the folders
base_path = 'terra_outputs/2.sirvs_pb'

# Define the sample data with flipped second and third columns
samples = [
    ("CL_BT474_E0_sirv", "76e21671-4697-48f8-a2d5-e0cd1b693734", "19e97607-fa9b-4e09-8098-027e6725b187", "0c39db16-93a2-4b56-98f4-49e3896b8d3d"),
    ("CL_BT474_E1_sirv", "0fd765c6-94fb-49dd-8743-4df7c3424ad8", "92f790e9-1c30-4bd5-a066-c7b3693c4cde", "6d1a19be-2630-4a76-8b7b-52bc6d2ca5b2"),
    ("CL_BT474_E2_sirv", "a9e9fa73-0f45-4c70-889a-8103c76ff474", "8a08564d-3d9f-4e50-95bf-e5407fc1d02f", "88b14a5d-36c8-46f4-8b6d-3828ab3652c4"),
    ("CL_HG002_E0_sirv", "06d0a5e0-c9f6-46e6-b937-9891c51bcae3", "6c6505fb-8209-449a-b89d-ca51c6c7dedf", "b56e1637-d412-4151-b2ee-c3e1d94e0c4f"),
    ("CL_HG002_E1_sirv", "ca11fc1c-06c4-488f-bf03-5e87b53a0dfd", "e03a4ea2-029b-443c-b90b-e06bec9d6b7b", "d89091ad-ba53-41e3-b99a-6005c9a1d91f"),
    ("CL_HG002_E2_sirv", "3000baee-fc22-4637-97af-f5b36adfa709", "366ee683-02ad-4f30-bf43-4d6dade451d8", "7c7b610f-60d4-4636-8236-fe6d28d42166"),
    ("CL_K562_E0_sirv", "e1773bfc-079c-44d0-abfe-69ad68f41358", "f9559cc4-319b-4546-9e15-2c87077b16a3", "527b8ebd-ec25-4dd7-882d-d55832783fbd"),
    ("CL_K562_E1_sirv", "fb6c4a0f-9ba6-4b88-93ad-019fed70cadf", "29e579a5-ea3c-426c-b935-e5d07f3b1e85", "b449ae0c-5131-4fbd-a702-af5db84c61f5"),
    ("CL_K562_E2_sirv", "ed2b46d5-4e91-4504-ba4a-4ee66c66352d", "fc0aa713-87e9-42ad-8c15-7dafb5ca8174", "e14c3783-56d1-4df7-8ffb-da059bbb185f"),
    ("CL_UHRR_E0_sirv", "f848304f-723e-40e0-9124-d6124ab4a567", "d4cb7e68-65f7-4428-9ef7-018b7c993ed7", "ac12fbd3-05a7-44b6-ae6d-2b6dc585e492"),
    ("CL_UHRR_E1_sirv", "f5f70d64-9963-4857-aaaa-4347e7c879e5", "ff717c6c-3746-4f37-8e50-14e92e869a8b", "87fda963-bfe7-494d-9822-d05fca1c63e8"),
    ("CL_UHRR_E2_sirv", "d0ddded5-ecd2-4994-bae7-c7762b384b80", "dc3aee96-03e6-4356-8a24-098c82f2fe5a", "204329bc-86da-4b4d-8c99-2d7de7499cff")
]

# Function to create folders if they do not exist
def create_folders_if_not_exist(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to find the dynamic folder name
def find_dynamic_folder_name(bucket_name, prefix):
    result = subprocess.run(['gsutil', 'ls', f'gs://{bucket_name}/{prefix}/call-LRAA_direct/LRAA_runner/'], capture_output=True, text=True)
    folders = result.stdout.splitlines()
    for folder in folders:
        if folder.endswith('/'):
            return folder.split('/')[-2]
    return None

# Function to copy and rename files
def copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path):
    # Find dynamic folder names
    refguided_dynamic_folder = find_dynamic_folder_name(bucket_name, refguided_prefix)
    reffree_dynamic_folder = find_dynamic_folder_name(bucket_name, reffree_prefix)
    quant_dynamic_folder = find_dynamic_folder_name(bucket_name, quant_prefix)

    # Copy refguided gtf file
    refguided_gtf_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-guided.gtf'
    refguided_gtf_target = os.path.join(entity_path, "ID_refguided", "LRAA_0223.gtf")
    subprocess.run(['gsutil', 'cp', refguided_gtf_blob, refguided_gtf_target], capture_output=True, text=True)
    
    # Copy refguided quant file
    refguided_quant_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-guided.quant.expr'
    refguided_quant_target = os.path.join(entity_path, "Quant", "LRAA_0223.ref-guided.quant.expr")
    subprocess.run(['gsutil', 'cp', refguided_quant_blob, refguided_quant_target], capture_output=True, text=True)
    
    # Copy reffree gtf file
    reffree_gtf_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-free.gtf'
    reffree_gtf_target = os.path.join(entity_path, "ID_reffree", "LRAA_0223.gtf")
    subprocess.run(['gsutil', 'cp', reffree_gtf_blob, reffree_gtf_target], capture_output=True, text=True)
    
    # Copy reffree quant file
    reffree_quant_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-free.quant.expr'
    reffree_quant_target = os.path.join(entity_path, "Quant", "LRAA_0223.ref-free.quant.expr")
    subprocess.run(['gsutil', 'cp', reffree_quant_blob, reffree_quant_target], capture_output=True, text=True)
    
    # Copy quant-only quant file
    quant_only_blob = f'gs://{bucket_name}/{quant_prefix}/call-LRAA_direct/LRAA_runner/{quant_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.quant-only.quant.expr'
    quant_only_target = os.path.join(entity_path, "Quant", "LRAA_0223.quant-only.quant.expr")
    subprocess.run(['gsutil', 'cp', quant_only_blob, quant_only_target], capture_output=True, text=True)

# Main script for new additions
bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
for data_entity, refguided_id, reffree_id, quant_id in samples:
    entity_path = create_folders_if_not_exist(data_entity)
    refguided_prefix = f"submissions/220eed48-3da5-40ad-bdeb-5a0365e73321/LRAA_wf/{refguided_id}"
    reffree_prefix = f"submissions/cb795aea-72d3-42d9-adbf-b507dfaa22c7/LRAA_wf/{reffree_id}"
    quant_prefix = f"submissions/e72c9ab4-9672-44a7-8cc6-fb9bf485424e/LRAA_wf/{quant_id}"
    copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path)
    print(f"Copying and renaming for {data_entity} is done.")

print("All copying and renaming for new additions is done.")

Copying and renaming for CL_BT474_E0_sirv is done.
Copying and renaming for CL_BT474_E1_sirv is done.
Copying and renaming for CL_BT474_E2_sirv is done.
Copying and renaming for CL_HG002_E0_sirv is done.
Copying and renaming for CL_HG002_E1_sirv is done.
Copying and renaming for CL_HG002_E2_sirv is done.
Copying and renaming for CL_K562_E0_sirv is done.
Copying and renaming for CL_K562_E1_sirv is done.
Copying and renaming for CL_K562_E2_sirv is done.
Copying and renaming for CL_UHRR_E0_sirv is done.
Copying and renaming for CL_UHRR_E1_sirv is done.
Copying and renaming for CL_UHRR_E2_sirv is done.
All copying and renaming for new additions is done.


### 3. MORFs

In [2]:
import os
import shutil
import subprocess
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the data entities and workflow IDs
data = [
    ("morf2_ont_merged_annot_compat_1isoform", "f9928ccc-b1c7-43ac-9a18-bb1a21fccd72"),
    ("morf2_pacbio_merged_annot_compat_1isoform", "9457a288-2ed7-4479-a40d-8aba2732c7ef")
]

# Define the base path for the folders
base_path = 'terra_outputs/3.morfs_pb_ont'

# Function to create folders and clear existing contents
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    try:
        if os.path.exists(entity_path):
            shutil.rmtree(entity_path)
        os.makedirs(entity_path, exist_ok=True)
        os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
        os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
        os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
        logging.info(f"Created folders for {data_entity}")
    except Exception as e:
        logging.error(f"Error creating folders for {data_entity}: {e}")
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    try:
        blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
        for blob in blobs:
            if not blob.startswith('gs://'):
                continue
            file_name = blob.split('/')[-1]
            if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
                target_folder = os.path.join(entity_path, "ID_reffree")
            elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
                target_folder = os.path.join(entity_path, "ID_refguided")
            elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
                target_folder = os.path.join(entity_path, "Quant")
            else:
                continue
            os.makedirs(target_folder, exist_ok=True)
            target_path = os.path.join(target_folder, file_name)
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

            # Remove _reduced from file names in ID_refguided folder
            if 'ID_refguided' in target_path and '_reduced' in file_name:
                new_file_name = file_name.replace('_reduced', '')
                new_target_path = os.path.join(target_folder, new_file_name)
                os.rename(target_path, new_target_path) 
            # Remove _de_novo from file names in ID_refguided folder
            if 'ID_refguided' in target_path and '_de_novo' in file_name:
                new_file_name = file_name.replace('_de_novo', '')
                new_target_path = os.path.join(target_folder, new_file_name)
                os.rename(target_path, new_target_path)
                
        logging.info(f"Processed files for prefix {prefix}")
    except Exception as e:
        logging.error(f"Error processing files for prefix {prefix}: {e}")

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/3403b60a-64f9-4717-aa10-cf1be2b41f48/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

logging.info("All downloading and renaming is done.")

2025-01-05 22:32:38,544 - INFO - Created folders for morf2_ont_merged_annot_compat_1isoform
2025-01-05 22:33:18,178 - INFO - Processed files for prefix submissions/3403b60a-64f9-4717-aa10-cf1be2b41f48/LongReadRNABenchmark/f9928ccc-b1c7-43ac-9a18-bb1a21fccd72
2025-01-05 22:33:18,180 - INFO - Created folders for morf2_pacbio_merged_annot_compat_1isoform
2025-01-05 22:34:05,860 - INFO - Processed files for prefix submissions/3403b60a-64f9-4717-aa10-cf1be2b41f48/LongReadRNABenchmark/9457a288-2ed7-4479-a40d-8aba2732c7ef
2025-01-05 22:34:05,861 - INFO - All downloading and renaming is done.


In [3]:
import os
import subprocess

# Define the base path for the folders
base_path = 'terra_outputs/3.morfs_pb_ont'

# Define the sample data with flipped second and third columns
samples = [
    ("morf2_ont_merged_annot_compat_1isoform", "84fdadcb-427c-448b-b292-461f374450de", "d2ef6d1a-d7bd-4e2a-a465-b2becadbbe14", "27157524-5544-4b16-9cb2-c36f0dfd7ef5"),
    ("morf2_pacbio_merged_annot_compat_1isoform", "1d9e75ab-8021-4cc2-a82b-c88f85194c91", "cb7c66dd-110c-4d76-bcb8-b13232ad9393", "1447b1ec-83f3-42a2-a19c-9fd25d5a0af5")
]

# Function to create folders if they do not exist
def create_folders_if_not_exist(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to find the dynamic folder name
def find_dynamic_folder_name(bucket_name, prefix):
    result = subprocess.run(['gsutil', 'ls', f'gs://{bucket_name}/{prefix}/call-LRAA_direct/LRAA_runner/'], capture_output=True, text=True)
    folders = result.stdout.splitlines()
    for folder in folders:
        if folder.endswith('/'):
            return folder.split('/')[-2]
    return None

# Function to copy and rename files
def copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path):
    # Find dynamic folder names
    refguided_dynamic_folder = find_dynamic_folder_name(bucket_name, refguided_prefix)
    reffree_dynamic_folder = find_dynamic_folder_name(bucket_name, reffree_prefix)
    quant_dynamic_folder = find_dynamic_folder_name(bucket_name, quant_prefix)

    # Copy refguided gtf file
    refguided_gtf_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-guided.gtf'
    refguided_gtf_target = os.path.join(entity_path, "ID_refguided", "LRAA_0223.gtf")
    subprocess.run(['gsutil', 'cp', refguided_gtf_blob, refguided_gtf_target], capture_output=True, text=True)
    
    # Copy refguided quant file
    refguided_quant_blob = f'gs://{bucket_name}/{refguided_prefix}/call-LRAA_direct/LRAA_runner/{refguided_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-guided.quant.expr'
    refguided_quant_target = os.path.join(entity_path, "Quant", "LRAA_0223.ref-guided.quant.expr")
    subprocess.run(['gsutil', 'cp', refguided_quant_blob, refguided_quant_target], capture_output=True, text=True)
    
    # Copy reffree gtf file
    reffree_gtf_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-free.gtf'
    reffree_gtf_target = os.path.join(entity_path, "ID_reffree", "LRAA_0223.gtf")
    subprocess.run(['gsutil', 'cp', reffree_gtf_blob, reffree_gtf_target], capture_output=True, text=True)
    
    # Copy reffree quant file
    reffree_quant_blob = f'gs://{bucket_name}/{reffree_prefix}/call-LRAA_direct/LRAA_runner/{reffree_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.ref-free.quant.expr'
    reffree_quant_target = os.path.join(entity_path, "Quant", "LRAA_0223.ref-free.quant.expr")
    subprocess.run(['gsutil', 'cp', reffree_quant_blob, reffree_quant_target], capture_output=True, text=True)
    
    # Copy quant-only quant file
    quant_only_blob = f'gs://{bucket_name}/{quant_prefix}/call-LRAA_direct/LRAA_runner/{quant_dynamic_folder}/call-LRAA_runner_task/LRAA_0223.LRAA.quant-only.quant.expr'
    quant_only_target = os.path.join(entity_path, "Quant", "LRAA_0223.quant-only.quant.expr")
    subprocess.run(['gsutil', 'cp', quant_only_blob, quant_only_target], capture_output=True, text=True)

# Main script for new additions
bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
for data_entity, refguided_id, reffree_id, quant_id in samples:
    entity_path = create_folders_if_not_exist(data_entity)
    refguided_prefix = f"submissions/00cad999-67e3-49eb-8762-f53e49acffd8/LRAA_wf/{refguided_id}"
    reffree_prefix = f"submissions/7d5530e5-7419-41d8-8304-2e48a101ed49/LRAA_wf/{reffree_id}"
    quant_prefix = f"submissions/76ea339b-300a-4769-b9f9-feec848de7ca/LRAA_wf/{quant_id}"
    copy_and_rename_files(bucket_name, refguided_prefix, reffree_prefix, quant_prefix, entity_path)

print("All copying and renaming for new additions is done.")

All copying and renaming for new additions is done.


In [4]:
import os
import shutil
import subprocess
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define the data entities and workflow IDs
data = [
    ("morf2_ont_merged_annot_compat_1isoform", "f9928ccc-b1c7-43ac-9a18-bb1a21fccd72"),
    ("morf2_pacbio_merged_annot_compat_1isoform", "9457a288-2ed7-4479-a40d-8aba2732c7ef")
]

# Define the base path for the folders
base_path = 'terra_outputs/3.morfs_pb_ont'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    try:
        if not os.path.exists(entity_path):
            os.makedirs(entity_path, exist_ok=True)
            os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
            logging.info(f"Created folders for {data_entity}")
        else:
            logging.info(f"Folders for {data_entity} already exist, skipping creation")
    except Exception as e:
        logging.error(f"Error creating folders for {data_entity}: {e}")
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    try:
        blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
        for blob in blobs:
            if not blob.startswith('gs://'):
                continue
            file_name = blob.split('/')[-1]
            logging.info(f"Processing file: {file_name}")
            if file_name in ['Oarfish_quant.tsv', 'IsoQuant_quant.tsv', 'Flair_quant.tsv']:
                target_folder = os.path.join(entity_path, "Quant")
                os.makedirs(target_folder, exist_ok=True)
                target_path = os.path.join(target_folder, file_name)
                if not os.path.exists(target_path):
                    subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)
                    logging.info(f"Copied file to: {target_path}")
                else:
                    logging.info(f"File {file_name} already exists, skipping copy")
            else:
                logging.info(f"Skipping file: {file_name}")
                
        logging.info(f"Processed files for prefix {prefix}")
    except Exception as e:
        logging.error(f"Error processing files for prefix {prefix}: {e}")

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/3403b60a-64f9-4717-aa10-cf1be2b41f48/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

logging.info("All downloading and renaming is done.")

2025-01-05 22:34:33,488 - INFO - Folders for morf2_ont_merged_annot_compat_1isoform already exist, skipping creation
2025-01-05 22:34:38,318 - INFO - Processing file: :
2025-01-05 22:34:38,319 - INFO - Skipping file: :
2025-01-05 22:34:38,319 - INFO - Processing file: :
2025-01-05 22:34:38,320 - INFO - Skipping file: :
2025-01-05 22:34:38,320 - INFO - Processing file: :
2025-01-05 22:34:38,321 - INFO - Skipping file: :
2025-01-05 22:34:38,322 - INFO - Processing file: :
2025-01-05 22:34:38,322 - INFO - Skipping file: :
2025-01-05 22:34:38,323 - INFO - Processing file: :
2025-01-05 22:34:38,323 - INFO - Skipping file: :
2025-01-05 22:34:38,324 - INFO - Processing file: :
2025-01-05 22:34:38,324 - INFO - Skipping file: :
2025-01-05 22:34:38,324 - INFO - Processing file: bambuTask.log
2025-01-05 22:34:38,325 - INFO - Skipping file: bambuTask.log
2025-01-05 22:34:38,325 - INFO - Processing file: monitoring.log
2025-01-05 22:34:38,326 - INFO - Skipping file: monitoring.log
2025-01-05 22:34:

### 4. Cell_lines

#### one gene one isoform run on 7 tools

In [17]:
import os
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "e049cc16-972e-4797-aee8-a00540a88590"),
    ("CL_BT474_E1_human", "39d2340d-db40-4d52-a2ec-16b02aa5383a"),
    ("CL_BT474_E2_human", "16cc1518-b5ee-492b-b07e-5cb171701113"),
    ("CL_HG002_E0_human", "53a6ab1d-6756-49a7-881a-714df6cb647e"),
    ("CL_HG002_E1_human", "93183632-e972-4e40-8076-4b7d366ffee8"),
    ("CL_HG002_E2_human", "3e18ed82-2c2a-4897-929e-d9a072368a4a"),
    ("CL_K562_E0_human", "576648d7-110e-4f91-9441-32d035b7cccb"),
    ("CL_K562_E1_human", "3433600f-293d-4771-bbab-471caf6fdabe"),
    ("CL_K562_E2_human", "e1128fe2-8f84-4728-b3db-798b6e313b63"),
    ("CL_UHRR_E0_human", "37c09cdc-0d6d-45ce-8ea4-938859ba265c"),
    ("CL_UHRR_E1_human", "8b4bbacf-7f79-4722-9d89-ce2ca45c8ecc"),
    ("CL_UHRR_E2_human", "77c480e8-9620-48c0-a669-91aca22036a2")
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        
        # Skip files containing 'LRAA' in their names
        if 'LRAA' in file_name:
            continue
        
        if file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/48f762a6-0132-4c3f-969d-d334701c69d6/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### the master run

In [7]:
import os
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "e194a948-c36c-4a6e-bb1b-68252ccd1d41"),
    ("CL_BT474_E1_human", "f7fd338d-9014-4cf5-8936-72f1d6b48186"),
    ("CL_BT474_E2_human", "be5cc1a2-f6cb-4cdc-b6ae-c2d6ae7b080e"),
    ("CL_HG002_E0_human", "c7a718ad-a842-4e0c-a503-389729bb054b"),
    ("CL_HG002_E1_human", "dd2ce545-8bfe-42d0-88ca-b099c19a4b96"),
    ("CL_HG002_E2_human", "3c72c3b6-f035-4d21-ab44-87f3006d5648"),
    ("CL_K562_E0_human", "ed694b67-5473-4795-86b1-92f5de418463"),
    ("CL_K562_E1_human", "ec956af7-f130-4fc7-ba83-551b04ce8883"),
    ("CL_K562_E2_human", "0afd94ec-c64e-4bd2-a44e-71558b232f9c"),
    ("CL_UHRR_E0_human", "5f1ab32c-342f-40c3-b20a-79ad02ce2e1d"),
    ("CL_UHRR_E1_human", "cdd7e121-74cb-4d65-a351-877e5b36af01"),
    ("CL_UHRR_E2_human", "7563083e-f21f-4de5-984b-dba239750bcf")
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    print(f"Listing blobs in gs://{bucket_name}/{prefix}")
    result = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Error listing blobs: {result.stderr}")
        return
    blobs = result.stdout.splitlines()
    print(f"Found {len(blobs)} blobs")
    for blob in blobs:
        print(f"Processing blob: {blob}")
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        
        # Skip files containing 'LRAA' in their names
        if 'LRAA' in file_name:
            continue
        
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            print(f"Copying {blob} to {target_path}")
            copy_result = subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)
            if copy_result.returncode != 0:
                print(f"Error copying {blob} to {target_path}: {copy_result.stderr}")
            else:
                print(f"Successfully copied {blob} to {target_path}")
        else:
            print(f"File {target_path} already exists, skipping.")

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/df5ee582-07fc-4c1b-a282-c467a962f23b/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

Listing blobs in gs://fc-070af439-57c5-4d06-af39-40284061e6f3/submissions/df5ee582-07fc-4c1b-a282-c467a962f23b/LongReadRNABenchmark/e194a948-c36c-4a6e-bb1b-68252ccd1d41
Found 1022 blobs
Processing blob: gs://fc-070af439-57c5-4d06-af39-40284061e6f3/submissions/df5ee582-07fc-4c1b-a282-c467a962f23b/LongReadRNABenchmark/e194a948-c36c-4a6e-bb1b-68252ccd1d41/:
Processing blob: 
Processing blob: gs://fc-070af439-57c5-4d06-af39-40284061e6f3/submissions/df5ee582-07fc-4c1b-a282-c467a962f23b/LongReadRNABenchmark/e194a948-c36c-4a6e-bb1b-68252ccd1d41/call-bambu/:
Processing blob: 
Processing blob: gs://fc-070af439-57c5-4d06-af39-40284061e6f3/submissions/df5ee582-07fc-4c1b-a282-c467a962f23b/LongReadRNABenchmark/e194a948-c36c-4a6e-bb1b-68252ccd1d41/call-bambu/bambuWorkflow/:
Processing blob: 
Processing blob: gs://fc-070af439-57c5-4d06-af39-40284061e6f3/submissions/df5ee582-07fc-4c1b-a282-c467a962f23b/LongReadRNABenchmark/e194a948-c36c-4a6e-bb1b-68252ccd1d41/call-bambu/bambuWorkflow/44a2796f-8ceb-46b

#### Espresso 11 runs

In [19]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E1_human", "4fba5a42-fe23-4395-8fbb-91dc4022b6be"),
    ("CL_BT474_E2_human", "2da5f51e-343b-413e-801d-19d717ee0e31"),
    ("CL_HG002_E0_human", "763e418c-0c62-4359-beb8-150d10e8f355"),
    ("CL_HG002_E1_human", "5e16ed89-5917-4201-b066-6b743282fa3e"),
    ("CL_HG002_E2_human", "c4dfed70-cf93-40d6-a404-a153324f5521"),
    ("CL_K562_E0_human", "cab32035-5e01-4a44-a76f-dc92525468c3"),
    ("CL_K562_E1_human", "2079af4c-82bc-4103-bb05-3d7ed065c1a7"),
    ("CL_K562_E2_human", "c2ebe4be-f3f2-4863-b569-42d696929b85"),
    ("CL_UHRR_E0_human", "027f6079-40db-4e10-869c-38e48def649e"),
    ("CL_UHRR_E1_human", "d487b968-a478-4f70-93ee-c102cc2e1b36"),
    ("CL_UHRR_E2_human", "d79d4ab7-1d92-4709-9bb1-aa998783631d")
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/2de9fdde-5143-40ba-b761-6cd8371a05cc/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### Espresso 1 run

In [20]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "75781bc1-adfe-414e-af0b-87735e76f8b6"),
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/e8c6564f-ccf6-4711-b2f5-28c03ac4c06e/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### flair 12 runs

In [21]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "3940dd5c-9f3a-4ed2-b4b8-4cfb1f644aef"),
    ("CL_BT474_E1_human", "6f935a0b-7c1f-48fc-8da0-02f4d7fee616"),
    ("CL_BT474_E2_human", "96f92cb0-c6eb-4865-b989-056ba214a3b8"),
    ("CL_HG002_E0_human", "29e82f71-21b9-4ce2-a2c4-95e1fee8fa05"),
    ("CL_HG002_E1_human", "58c2766b-a67e-4949-ba0a-3ce6c12e2e86"),
    ("CL_HG002_E2_human", "af8cc213-428a-4073-8dce-345ff8ef147f"),
    ("CL_K562_E0_human", "c3f8a213-90fc-405d-af72-d9780337dccb"),
    ("CL_K562_E1_human", "356489f4-0b7e-4156-87d3-dd6f7ffda291"),
    ("CL_K562_E2_human", "da39479a-f53b-4423-bb28-0a2dfaaccd37"),
    ("CL_UHRR_E0_human", "998ae1a4-feb6-4501-a6ab-77c187f8d516"),
    ("CL_UHRR_E1_human", "a8c5fcfb-9e84-4761-9909-1a143d8abd76"),
    ("CL_UHRR_E2_human", "e919d865-eb87-4ce2-b5f9-852bd74c83b3")
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/7aca010c-3355-467d-8ed9-445bd768604d/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### talon 11 runs

In [22]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E1_human", "2a01a6b6-4d9d-4bca-bc0b-7a9407754863"),
    ("CL_BT474_E2_human", "8ec700bb-aa98-4ebb-972e-1a0b25c4d73e"),
    ("CL_HG002_E0_human", "b8a2e5de-73a8-49b6-a70c-4cfc3d08aad4"),
    ("CL_HG002_E1_human", "a35de324-fa29-4dc6-8e93-baf41e505ac6"),
    ("CL_HG002_E2_human", "0912ee01-311d-4c36-b52d-1ab868d9dca3"),
    ("CL_K562_E0_human", "fb7948ce-22fa-4ce8-8ddc-ec32904c13c2"),
    ("CL_K562_E1_human", "d2fa602b-b6a8-48bf-92c4-abcc8ca427f7"),
    ("CL_K562_E2_human", "5adc67b8-9316-45f2-bb72-63d466e2001a"),
    ("CL_UHRR_E0_human", "74813f87-2db6-426e-bbf6-066d56b5001c"),
    ("CL_UHRR_E1_human", "c23a005c-eee8-4deb-ad72-f4331795948f"),
    ("CL_UHRR_E2_human", "dc494c01-5d72-4c3f-91f3-299a555fb342")
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/21c3e5c4-6fd8-4125-bda7-9eb923c35503/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### Isocelles - 12 runs

In [23]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "2f0a9b53-b03d-4b27-9dd8-87b8e96062aa"),
    ("CL_BT474_E1_human", "df0582bc-8bab-4316-8bfe-4e5489f6f1f3"),
    ("CL_BT474_E2_human", "3ee25508-59ab-4052-a41f-3480af4f35c3"),
    ("CL_HG002_E0_human", "4e799636-0fb3-4f32-95b2-20c02f7c6a6f"),
    ("CL_HG002_E1_human", "964c1cb9-b42f-4ff9-88d7-8b07da63c91b"),
    ("CL_HG002_E2_human", "de0693df-a0e6-4107-a6d0-43d9d5b12040"),
    ("CL_K562_E0_human", "2dec46e7-3b83-4af4-88fc-6747d5d71091"),
    ("CL_K562_E1_human", "de8b4959-7fc0-457a-9286-225ab48a571a"),
    ("CL_K562_E2_human", "b138c1f6-887d-4503-9ba2-7e0b6b0d1f94"),
    ("CL_UHRR_E0_human", "c05d6980-5c2a-4d5f-b24e-2260857a046e"),
    ("CL_UHRR_E1_human", "930b6aa6-fb1d-4cf3-a2b7-55d4d1103084"),
    ("CL_UHRR_E2_human", "9c9dfa06-494f-4973-89bc-1c5101678088")
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/b9440719-b66e-45ac-9d15-73ce1dd6f3b3/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### talon 1 run - waiting to complete

In [1]:
import os
import shutil
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "cecbd726-c185-4f7b-a41f-54dd54f11001"),
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    os.makedirs(os.path.join(entity_path, "Quant"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name.endswith(('gtf', 'gff3', 'gff')) and 'reduced' not in file_name and 'de_novo' not in file_name:
            target_folder = os.path.join(entity_path, "ID_reffree")
        elif file_name.endswith(('gtf', 'gff3', 'gff')) and ('reduced' in file_name or 'de_novo' in file_name):
            target_folder = os.path.join(entity_path, "ID_refguided")
        elif ('quant' in file_name and '.log' not in file_name) or 'Gffcompare' in file_name:
            target_folder = os.path.join(entity_path, "Quant")
        else:
            continue
        os.makedirs(target_folder, exist_ok=True)
        target_path = os.path.join(target_folder, file_name)
        
        # Only copy the file if it does not already exist
        if not os.path.exists(target_path):
            subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

        # Remove _reduced from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_reduced' in file_name:
            new_file_name = file_name.replace('_reduced', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)
            
        # Remove _de_novo from file names in ID_refguided folder
        if 'ID_refguided' in target_path and '_de_novo' in file_name:
            new_file_name = file_name.replace('_de_novo', '')
            new_target_path = os.path.join(target_folder, new_file_name)
            if not os.path.exists(new_target_path):
                os.rename(target_path, new_target_path)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/17893e15-d514-4ad7-a9da-1b9fa15424ed/LongReadRNABenchmark/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


#### Modify bambu format to be able to run gffcomapre

In [24]:
import os
import fnmatch
import shutil

# Define the base path
base_path = 'terra_outputs/1.isoseqsim_pb/'
backup_dir = 'data_backup/'

# Ensure the backup directory exists
os.makedirs(backup_dir, exist_ok=True)

# Flag to track if the first file has been processed
first_file_processed = False

# Walk through the directory tree
for root, dirs, files in os.walk(base_path):
    for file in files:
        if 'Bambu' in file and file.endswith('.gtf'):
            # Check if the path matches the specific pattern
            if fnmatch.fnmatch(root, 'terra_outputs/1.isoseqsim_pb/arabidopsis*/ID_refguided'):
                file_path = os.path.join(root, file)
                print(file_path)
                
                # Print the first six rows of the first file
                if not first_file_processed:
                    with open(file_path, 'r') as f:
                        for _ in range(6):
                            line = f.readline()
                            if line:
                                print(line.strip())
                            else:
                                break
                    first_file_processed = True
                
                # Create the new file name by replacing '/' with '_'
                new_file_name = file_path.replace('/', '_')
                new_file_path = os.path.join(backup_dir, new_file_name)
                
                # Copy the file to the backup directory with the new name
                shutil.copy(file_path, new_file_path)

terra_outputs/1.isoseqsim_pb/arabidopsis_isoseqsim_e016/ID_refguided/Bambu.gtf
Chr1	Bambu	transcript	3631	5899	.	+	.	gene_id "transcript_id AT1G01010.1; AT1G01010"; transcript_id "AT1G01010.1";
Chr1	Bambu	exon	3631	3913	.	+	.	gene_id "transcript_id AT1G01010.1; AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1";
Chr1	Bambu	exon	3996	4276	.	+	.	gene_id "transcript_id AT1G01010.1; AT1G01010"; transcript_id "AT1G01010.1"; exon_number "2";
Chr1	Bambu	exon	4486	4605	.	+	.	gene_id "transcript_id AT1G01010.1; AT1G01010"; transcript_id "AT1G01010.1"; exon_number "3";
Chr1	Bambu	exon	4706	5095	.	+	.	gene_id "transcript_id AT1G01010.1; AT1G01010"; transcript_id "AT1G01010.1"; exon_number "4";
Chr1	Bambu	exon	5174	5326	.	+	.	gene_id "transcript_id AT1G01010.1; AT1G01010"; transcript_id "AT1G01010.1"; exon_number "5";
terra_outputs/1.isoseqsim_pb/arabidopsis_isoseqsim_e016/ID_refguided/Bambu_ndr1.gtf
terra_outputs/1.isoseqsim_pb/arabidopsis_isoseqsim_e000/ID_refguided/Bambu.gtf
terra_outputs

In [25]:
import os
import fnmatch
import re

# Define the base path
base_path = 'terra_outputs/1.isoseqsim_pb/'

# Flag to track if the first file has been processed
first_file_processed = False

# Walk through the directory tree
for root, dirs, files in os.walk(base_path):
    for file in files:
        if 'Bambu' in file and file.endswith('.gtf'):
            # Check if the path matches the specific pattern
            if fnmatch.fnmatch(root, 'terra_outputs/1.isoseqsim_pb/arabidopsis*/ID_refguided'):
                file_path = os.path.join(root, file)
                print(file_path)
                
                # Modify the original file
                with open(file_path, 'r') as f:
                    lines = f.readlines()
                
                with open(file_path, 'w') as f:
                    for line in lines:
                        modified_line = re.sub(r'gene_id "transcript_id ([^;]+); ([^"]+)"', r'gene_id "transcript_id_\1_\2"', line)
                        f.write(modified_line)
                
                # Print the first six rows of the modified file
                if not first_file_processed:
                    with open(file_path, 'r') as f:
                        for _ in range(6):
                            line = f.readline()
                            if line:
                                print(line.strip())
                            else:
                                break
                    first_file_processed = True

terra_outputs/1.isoseqsim_pb/arabidopsis_isoseqsim_e016/ID_refguided/Bambu.gtf
Chr1	Bambu	transcript	3631	5899	.	+	.	gene_id "transcript_id_AT1G01010.1_AT1G01010"; transcript_id "AT1G01010.1";
Chr1	Bambu	exon	3631	3913	.	+	.	gene_id "transcript_id_AT1G01010.1_AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1";
Chr1	Bambu	exon	3996	4276	.	+	.	gene_id "transcript_id_AT1G01010.1_AT1G01010"; transcript_id "AT1G01010.1"; exon_number "2";
Chr1	Bambu	exon	4486	4605	.	+	.	gene_id "transcript_id_AT1G01010.1_AT1G01010"; transcript_id "AT1G01010.1"; exon_number "3";
Chr1	Bambu	exon	4706	5095	.	+	.	gene_id "transcript_id_AT1G01010.1_AT1G01010"; transcript_id "AT1G01010.1"; exon_number "4";
Chr1	Bambu	exon	5174	5326	.	+	.	gene_id "transcript_id_AT1G01010.1_AT1G01010"; transcript_id "AT1G01010.1"; exon_number "5";
terra_outputs/1.isoseqsim_pb/arabidopsis_isoseqsim_e016/ID_refguided/Bambu_ndr1.gtf
terra_outputs/1.isoseqsim_pb/arabidopsis_isoseqsim_e000/ID_refguided/Bambu.gtf
terra_outputs/1.iso

#### LRAA on cell lines

In [6]:
import os
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E2_human", "f2fa59c1-cf0c-4832-ab17-f66e8e4a41c5"),
    ("CL_HG002_E0_human", "eddc2d1b-c682-4819-a6fb-1e9052bd7d84"),
    ("CL_HG002_E1_human", "82e89938-140f-4415-9fb1-fed1c9c300fd"),
    ("CL_HG002_E2_human", "cf9ce49b-3685-48ee-a0f5-4b5d757c5160"),
    ("CL_UHRR_E0_human", "a4ff44c9-5d85-42a3-9d27-c5925b691cce"),
    ("CL_UHRR_E1_human", "acd6be06-e62c-48a4-b39d-ee502540274b"),
    ("CL_UHRR_E2_human", "8c49b474-ca80-4734-9b2c-47977f639811"),
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name == 'lraa_0214.LRAA_ref-guided.gtf':
            target_folder = os.path.join(entity_path, "ID_refguided")
            os.makedirs(target_folder, exist_ok=True)
            target_path = os.path.join(target_folder, 'LRAA_0214.gtf')
            
            # Only copy the file if it does not already exist
            if not os.path.exists(target_path):
                subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/c3562858-f039-4952-a36a-36b8fd83b309/CombinedWorkflow/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


In [7]:
import os
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "4413c5b8-cc49-4b23-a492-bd8109137ebc"),
    ("CL_BT474_E1_human", "ed183332-6b13-412d-9cb1-d40efb0167c1"),
    ("CL_K562_E0_human", "851ddcfa-aeb5-491a-8cce-dc680559d587"),
    ("CL_K562_E1_human", "703050a4-2c8a-46b0-8d8d-446bf4a98dea"),
    ("CL_K562_E2_human", "30c15145-5972-4d69-b7c9-98a6649a1b49"),
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_refguided"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name == 'lraa_0214.LRAA_ref-guided.gtf':
            target_folder = os.path.join(entity_path, "ID_refguided")
            os.makedirs(target_folder, exist_ok=True)
            target_path = os.path.join(target_folder, 'LRAA_0214.gtf')
            
            # Only copy the file if it does not already exist
            if not os.path.exists(target_path):
                subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/ce5983d5-1342-4e13-83b8-dc4e2a6affd4/CombinedWorkflow/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.


In [8]:
import os
import subprocess

# Define the data entities and workflow IDs
data = [
    ("CL_BT474_E0_human", "1bd49ae1-cd92-4d84-881f-a032050ba0e2"),
    ("CL_BT474_E1_human", "838fe4f8-4268-4cf1-bca8-83401b1bb220"),
    ("CL_BT474_E2_human", "b3a5b2de-0579-490c-93f5-e26f899f2e16"),
    ("CL_HG002_E0_human", "51cce60f-412f-4007-b0a9-484b8f465b19"),
    ("CL_HG002_E1_human", "131e88fa-0fc6-4a6d-902a-e0a82ac1a4ff"),
    ("CL_HG002_E2_human", "73439877-5071-4067-9c9b-dc56c801c666"),
    ("CL_K562_E0_human", "bb4a10f3-5932-44ad-acbe-21b77261d987"),
    ("CL_K562_E1_human", "229f4024-eeed-4bae-942d-2b37a182be71"),
    ("CL_K562_E2_human", "f9a493b9-ef9a-4632-bba2-58cd5d587857"),
    ("CL_UHRR_E0_human", "53664390-f3c0-4eee-9045-8774722adccc"),
    ("CL_UHRR_E1_human", "256a8995-2314-4e10-ba98-4e05267fee51"),
    ("CL_UHRR_E2_human", "b7c47d91-7b78-4481-9e67-8417bd7d8bc3"),
]

# Define the base path for the folders
base_path = 'terra_outputs/4.cell_lines_pb'

# Function to create folders if they do not exist
def create_folders(data_entity):
    entity_path = os.path.join(base_path, data_entity)
    os.makedirs(os.path.join(entity_path, "ID_reffree"), exist_ok=True)
    return entity_path

# Function to generate and run shell scripts
def generate_and_run_scripts(bucket_name, prefix, entity_path):
    blobs = subprocess.run(['gsutil', 'ls', '-r', f'gs://{bucket_name}/{prefix}'], capture_output=True, text=True).stdout.splitlines()
    for blob in blobs:
        if not blob.startswith('gs://'):
            continue
        file_name = blob.split('/')[-1]
        if file_name == 'lraa_0213.LRAA_ref-free.gtf':
            target_folder = os.path.join(entity_path, "ID_reffree")
            os.makedirs(target_folder, exist_ok=True)
            target_path = os.path.join(target_folder, 'LRAA_0213.gtf')
            
            # Only copy the file if it does not already exist
            if not os.path.exists(target_path):
                subprocess.run(['gsutil', 'cp', blob, target_path], capture_output=True, text=True)

# Main script
for data_entity, workflow_id in data:
    entity_path = create_folders(data_entity)
    prefix = f"submissions/5461d373-306f-4cef-9189-80127e113f4d/CombinedWorkflow/{workflow_id}"
    bucket_name = "fc-070af439-57c5-4d06-af39-40284061e6f3"
    generate_and_run_scripts(bucket_name, prefix, entity_path)

print("All downloading and renaming is done.")

All downloading and renaming is done.
