<a href="https://colab.research.google.com/github/vkt1414/Cloud-Resources-Workflows/blob/main/Terra_Cromwell_Workflow_Metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# An introduction to using the Fiss API in Python in BioData Catalyst

This notebook introduces users to the Firecloud API using a Python Jupyter notebook. The example covers how the API communicates between the data table and notebook. The user loads an existing Terra data table into the notebook, subsets the dataframe, and saves the new dataframe as a tsv to the workspace bucket or as a new Terra data table.

Note: a more scalable version of this process is available in the [terra_data_table_util](https://app.terra.bio/#workspaces/biodata-catalyst/BioData%20Catalyst%20Collection/notebooks/launch/terra_data_table_util.ipynb) notebook. 

## Notebook Runtime

We suggest using the default environment and the default compute settings.

# Load packages

In [None]:
!pip install firecloud

In [6]:
from firecloud import fiss
import firecloud.api as fapi
import os
import io
import pandas as pd

## Set the environment variables that the FISS API requires

In [None]:
!gcloud auth login --update-adc

In [None]:
!gcloud config set project terra-fd442854

In [None]:
# Terra workspace coordinates used by the API calls in the cells below:
# the Google billing project, the workspace name and the workspace bucket.
billing_project = "terra-billing-datester"
workspace = "TotalSegmentator"
bucket = "gs://fc-5af492dc-6993-4c91-bbf6-3e2747868642/"

# Echo the settings so a wrong value is spotted before any API call is made.
print(f"Billing project: {billing_project}")
print(f"Workspace: {workspace}")
print(f"Workspace storage bucket: {bucket}")

In [10]:
import time
import pandas as pd

def get_workflow_metadata_with_retry(billing_project, workspace, submission_id, workflow_id,
                                     max_attempts=3, retry_delay=30):
    """Fetch the metadata of one workflow, retrying on non-2xx responses.

    Args:
        billing_project: Billing project (namespace) that owns the workspace.
        workspace: Name of the workspace.
        submission_id: ID of the submission the workflow belongs to.
        workflow_id: ID of the workflow whose metadata is requested.
        max_attempts: Total number of requests to make before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The first response whose status code is in the 2xx range.

    Raises:
        Exception: If no attempt produced a 2xx response.
    """
    print(f'Getting workflow metadata for workflow {workflow_id} in submission {submission_id}...')
    for attempt_number in range(1, max_attempts + 1):
        print(f'Attempt {attempt_number} of {max_attempts}...')
        response = fapi.get_workflow_metadata(billing_project, workspace, submission_id, workflow_id)
        if 200 <= response.status_code < 300:
            print(f'Successfully retrieved workflow metadata for workflow {workflow_id} in submission {submission_id}.')
            return response
        if attempt_number < max_attempts:
            print(f'Received response with status code {response.status_code}. Retrying after {retry_delay} seconds...')
            time.sleep(retry_delay)
        else:
            # Nothing follows the last attempt, so sleeping here would only
            # delay the exception below.
            print(f'Received response with status code {response.status_code}. No attempts left.')
    raise Exception(f'Failed to get workflow metadata for workflow {workflow_id} after {max_attempts} attempts')


def get_submission_with_retry(billing_project, workspace, submission_id,
                              max_attempts=3, retry_delay=30):
    """Fetch one submission, retrying on non-2xx responses.

    Args:
        billing_project: Billing project (namespace) that owns the workspace.
        workspace: Name of the workspace.
        submission_id: ID of the submission to retrieve.
        max_attempts: Total number of requests to make before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The first response whose status code is in the 2xx range.

    Raises:
        Exception: If no attempt produced a 2xx response.
    """
    print(f'Getting submission {submission_id}...')
    for attempt_number in range(1, max_attempts + 1):
        print(f'Attempt {attempt_number} of {max_attempts}...')
        response = fapi.get_submission(billing_project, workspace, submission_id)
        if 200 <= response.status_code < 300:
            print(f'Successfully retrieved submission {submission_id}.')
            return response
        if attempt_number < max_attempts:
            print(f'Received response with status code {response.status_code}. Retrying after {retry_delay} seconds...')
            time.sleep(retry_delay)
        else:
            # Nothing follows the last attempt, so sleeping here would only
            # delay the exception below.
            print(f'Received response with status code {response.status_code}. No attempts left.')
    raise Exception(f'Failed to get submission {submission_id} after {max_attempts} attempts')
    
    
def list_submissions_with_retry(billing_project, workspace, max_attempts=3, retry_delay=30):
    """List the submissions of a workspace, retrying on non-2xx responses.

    Args:
        billing_project: Billing project (namespace) that owns the workspace.
        workspace: Name of the workspace.
        max_attempts: Total number of requests to make before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The first response whose status code is in the 2xx range.

    Raises:
        Exception: If no attempt produced a 2xx response.
    """
    print(f'Listing submissions for workspace {workspace} in billing project {billing_project}...')
    for attempt_number in range(1, max_attempts + 1):
        print(f'Attempt {attempt_number} of {max_attempts}...')
        response = fapi.list_submissions(billing_project, workspace)
        if 200 <= response.status_code < 300:
            print(f'Successfully retrieved list of submissions for workspace {workspace} in billing project {billing_project}.')
            return response
        if attempt_number < max_attempts:
            print(f'Received response with status code {response.status_code}. Retrying after {retry_delay} seconds...')
            time.sleep(retry_delay)
        else:
            # Nothing follows the last attempt, so sleeping here would only
            # delay the exception below.
            print(f'Received response with status code {response.status_code}. No attempts left.')
    raise Exception(f'Failed to list submissions for workspace {workspace} in billing project {billing_project} after {max_attempts} attempts')

In [None]:
# Retrieve every submission of the workspace and keep only the submission IDs.
submissions_list_response = list_submissions_with_retry(billing_project, workspace)
submissions_list = [
    entry['submissionId'] for entry in submissions_list_response.json()
]
# Bare expression so the notebook displays the list of IDs.
submissions_list

In [None]:
# Build one table with a row per execution event of every call attempt, across
# all workflows of all submissions listed in `submissions_list`.
columns = ['attempt', 'backend', 'cromwell-workflow-id', 'terra-submission-id', 'wdl-task-name', 'compressedDockerSize', 'dockerImageUsed', 'end', 'start', 'executionStatus', 'executionBucket', 'googleProject', 'instanceName', 'machineType', 'zone', 'runtimeAttributes', 'description', 'startTime', 'endTime']

# Divisor applied to 'compressedDockerSize' (1024**3; presumably bytes -> GiB,
# confirm against the Cromwell metadata documentation).
BYTES_PER_GIB = 1024 ** 3

# Rows are accumulated in a plain list and converted to a DataFrame once at the
# end. This avoids re-copying the whole table on every pd.concat call, and
# avoids pd.concat raising "No objects to concatenate" when a workflow reports
# an empty 'calls' mapping.
rows = []

for submission in submissions_list:
    submissions_response = get_submission_with_retry(billing_project, workspace, submission)
    # Workflow entries without a 'workflowId' are skipped.
    workflows_ids = [
        workflow_entry['workflowId']
        for workflow_entry in submissions_response.json()['workflows']
        if 'workflowId' in workflow_entry
    ]
    for workflow in workflows_ids:
        workflow_response = get_workflow_metadata_with_retry(billing_project, workspace, submission, workflow)
        workflow_data = workflow_response.json()
        # 'calls' maps each call name to the list of its attempts; a workflow
        # without that key contributes no rows.
        for attempts in workflow_data.get('calls', {}).values():
            for attempt in attempts:
                # Only attempts carrying backend labels are recorded.
                if 'backendLabels' not in attempt:
                    continue
                labels = attempt['backendLabels']
                # Fields are read with .get so an attempt that lacks one of
                # them yields an empty cell instead of a KeyError that would
                # abort the whole harvest after many API calls.
                jes = attempt.get('jes', {})
                attempt_fields = {
                    'attempt': attempt.get('attempt'),
                    'backend': attempt.get('backend'),
                    'cromwell-workflow-id': labels.get('cromwell-workflow-id'),
                    'terra-submission-id': labels.get('terra-submission-id'),
                    'wdl-task-name': labels.get('wdl-task-name'),
                    'compressedDockerSize': float(attempt.get('compressedDockerSize', 0)) / BYTES_PER_GIB,
                    'dockerImageUsed': attempt.get('dockerImageUsed', 'default_value'),
                    'end': attempt.get('end'),
                    'start': attempt.get('start'),
                    'executionStatus': attempt.get('executionStatus'),
                    'executionBucket': jes.get('executionBucket'),
                    'googleProject': jes.get('googleProject'),
                    'instanceName': jes.get('instanceName'),
                    'machineType': jes.get('machineType'),
                    'zone': jes.get('zone'),
                    'runtimeAttributes': attempt.get('runtimeAttributes'),
                }
                # One output row per execution event; the attempt-level fields
                # are repeated on each of them.
                for event in attempt.get('executionEvents', []):
                    rows.append({
                        **attempt_fields,
                        'description': event.get('description'),
                        'startTime': event.get('startTime'),
                        'endTime': event.get('endTime'),
                    })

# Passing `columns` fixes the column order and yields an empty, correctly
# labelled table when no rows were collected.
final = pd.DataFrame(rows, columns=columns)
final