In [10]:
import base64
import json

import requests

# Vantage6 API for the RAVEN
This notebook *should* contain all the code needed to interact with the vantage6 API 
from the RAVEN UI.

The following sections are included in this notebook:

  1. Authenticate with the vantage6 server - This will change as we got a last minute  
     request to use KeyCloak for authentication.
  2. Creating prerequisites - This is **static** content which should already be at the   
     vantage6 server. This is also not needed in case of the RAVEN UI, and you can skip  
     this section as I already have created the required content.
  3. Creating the cohorts from the patient IDs
  4. Running other analytics

...  

## 1. Authenticate with the vantage6 server

In [1]:
# I authenticated using the vantage6 client library. Since we are now using Keycloak,
# obtaining the token from the Keycloak server is not trivial because of the callback
# mechanism. I expect that you know how to authenticate with Keycloak.

# NOTE: The refresh mechanism is not working as expected now that we moved to Keycloak.
# I've worked around this by setting the expiration time to 1 day. We need to fix this.

# !pip install vantage6-client==5.0.0a22
from vantage6.client import UserClient

# Build a client for the vantage6 server. `auth_url`, `auth_client` and `auth_realm`
# point the client at the Keycloak instance that handles the login.
client = UserClient(
    "https://orchestrator.idea.lst.tfo.upm.es:443/server",
    auth_url="https://auth.vantage6.ai:8443",
    auth_client="public_client",
    auth_realm="vantage7",
    log_level="INFO"
)
# Authentication opens a browser window in which you log in.
client.authenticate()

 Welcome to
                  _                     __  
                 | |                   / /  
__   ____ _ _ __ | |_ __ _  __ _  ___ / /_  
\ \ / / _` | '_ \| __/ _` |/ _` |/ _ \ '_ \ 
 \ V / (_| | | | | || (_| | (_| |  __/ (_) |
  \_/ \__,_|_| |_|\__\__,_|\__, |\___|\___/ 
                            __/ |           
                           |___/            

 --> Join us on Discord! https://discord.gg/rwRvwyK
 --> Docs: https://docs.vantage6.ai
 --> Blog: https://vantage6.ai
------------------------------------------------------------
Cite us!
If you publish your findings obtained using vantage6, 
please cite the proper sources as mentioned in:
https://vantage6.ai/vantage6/references
------------------------------------------------------------
opening browser for login


127.0.0.1 - - [02/Jul/2025 11:10:21] "GET /callback?state=state&session_state=2b5d2eef-189b-4d2c-9a53-db08ba066c62&iss=https%3A%2F%2Fauth.vantage6.ai%3A8443%2Frealms%2Fvantage7&code=0200f4be-c121-4780-a612-3dddba52e994.2b5d2eef-189b-4d2c-9a53-db08ba066c62.40218515-930d-4f32-b4c8-4e650a2bfd46 HTTP/1.1" 200 -


 --> Succesfully authenticated
 --> Name: None (id=7)
 --> Organization: root (id=1)


In [2]:
# Bearer-token header that is passed to every raw `requests` call in this notebook.
# The token is read from the authenticated vantage6 client.
headers = {"Authorization": f"Bearer {client._access_token}"}

# Report which vantage6 version the server is running.
print("Server version: ", client.util.get_server_version())

Server version:  {'version': '5.0.0a22'}


## 2. Creating prerequisites (Static content)

In [6]:
# This is 'static' content which should already be at the vantage6 server. The vantage6
# UI can be used to manage the 'static' content. It is static from the point of view of
# RAVEN UI. All calls in this section use the vantage6 client library, as you don't
# need to implement these.
#
# It is also possible to use the vantage6 UI for this purpose. It is temporarily
# available at https://idea4rc.kube.franky.codes, and will also be available at the UPM
# server once the routing has been fixed.
#
# **YOU DO NOT NEED TO CREATE THESE, YOU CAN SKIP THIS SECTION.**

### 2.1 Create the organizations

In [59]:
# client.organization.create(
#     name="Example Organization 1",
#     address1="123 Main St",
#     address2="Apt 1",
#     zipcode="1234AB",
#     country="NL",
#     domain="example-organization-1.com",
# )

In [60]:
# client.organization.create(
#     name="Example Organization 2",
#     address1="123 Main St",
#     address2="Apt 2",
#     zipcode="1234AB",
#     country="NL",
#     domain="example-organization-2.com",
# )

In [3]:
# The organizations are created. Every organization has an ID which can be used to
# identify the organization at a later stage.
client.organization.list(fields=('id', 'name'))

[{'id': 3, 'name': 'Example Organization 2'},
 {'id': 2, 'name': 'Example Organization 1'},
 {'id': 1, 'name': 'root'}]

### 2.2 Create the users

In [4]:
# Users have certain permissions. These permissions are given in the form of `rules`. To
# make them easier to manage, rules are grouped into `roles`. We can use the ID of a
# role to assign it to a user.
client.role.list(fields=('id', 'name'))

[{'id': 2, 'name': 'container'},
 {'id': 1, 'name': 'Root'},
 {'id': 5, 'name': 'Researcher'},
 {'id': 6, 'name': 'Organization Admin'},
 {'id': 4, 'name': 'Viewer'},
 {'id': 3, 'name': 'node'},
 {'id': 7, 'name': 'Collaboration Admin'}]

In [63]:
# client.user.create(
#     username="user1",
#     password="Password123!",
#     email="user1@example-organization-1.com",
#     firstname="User 1",
#     lastname="User 1",
#     organization=2,
#     roles=[6]
# )

In [64]:
# client.user.create(
#     username="user2",
#     password="Password123!",
#     email="user2@example-organization-2.com",
#     firstname="User 2",
#     lastname="User 2",
#     organization=3,
#     roles=[6]
# )

In [65]:
# client.user.create(
#     username="raven",
#     password="Password123!",
#     email="raven@example-organization-2.com",
#     firstname="Raven",
#     lastname="Raven",
#     organization=2,
#     roles=[7]
# )

In [5]:
# List the users known to the vantage6 server (ID and username only).
# TODO FM: Check that the users are both in KeyCloak and in the vantage6 server.
client.user.list(fields=('id', 'username'))

[{'id': 7, 'username': 'admin'},
 {'id': 9, 'username': 'user_2'},
 {'id': 11, 'username': 'alejandro'},
 {'id': 4, 'username': 'raven'},
 {'id': 8, 'username': 'user_1'}]

### 2.3 Create the collaboration

In [5]:
# client.collaboration.create(
#     name="Example Collaboration 1",
#     organizations=[2, 3]
# )

In [6]:
# In vantage6 multiple collaborations can be present. A collaboration is a group of
# organizations that can collaborate on a certain task. In IDEA4RC we create one
# collaboration for all CoE and Research Center organizations. We have also created a
# testing collaboration for now. We need the collaboration ID to create a new session
# and also when we want to create a new task.
client.collaboration.list(fields=('id', 'name'), scope="global")

[{'id': 2, 'name': 'Testing'}, {'id': 1, 'name': 'Example Collaboration 1'}]

### 2.4 Create the nodes

In [9]:
# client.node.create(
#     collaboration=1,
#     organization=2,
#     name="Organization 2 Node 1",
# )

In [10]:
# client.node.create(
#     collaboration=1,
#     organization=3,
#     name="Organization 3 Node 1",
# )

In [7]:
# List the nodes with their ID, name and current status.
client.node.list(fields=("id", "name", "status"))

[{'id': 10, 'name': 'Testing-root-node', 'status': 'offline'},
 {'id': 6, 'name': 'Organization 3 Node 1', 'status': 'offline'},
 {'id': 5, 'name': 'Organization 2 Node 1', 'status': 'offline'}]

## 3. New Workspace
*New study in vantage6*

In [8]:
# Let's set the collaboration ID to that of the testing collaboration for now.
COLLABORATION_ID = 2

In [11]:
# Normally we expect all organizations to be part of the collaboration. However, our
# test collaboration does not include all of them. So we collect the organizations that
# are part of the collaboration and add them to the study.
#
# NOTE(review): the response is paginated (see `links` in the output) and only the
# first page is read here. Confirm this is sufficient once a collaboration contains
# many organizations.
response = requests.get(
    "https://orchestrator.idea.lst.tfo.upm.es/server/organization",
    # Let `requests` build and URL-encode the query string.
    params={"collaboration_id": COLLABORATION_ID},
    headers=headers,
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
response.json()

{'data': [{'studies': '/server/study?organization_id=1',
   'tasks': '/server/task?init_org_id=1',
   'collaborations': '/server/collaboration?organization_id=1',
   'zipcode': None,
   'country': None,
   'id': 1,
   'name': 'root',
   'users': '/server/user?organization_id=1',
   'nodes': '/server/node?organization_id=1',
   'domain': None,
   'public_key': '',
   'runs': '/server/run?organization_id=1',
   'address1': None,
   'address2': None}],
 'links': {'first': '/server/organization?collaboration_id=2&page=1',
  'self': '/server/organization?collaboration_id=2&page=1',
  'last': '/server/organization?collaboration_id=2&page=1'}}

In [12]:
# Keep only the vantage6 organization IDs from the response; these are the
# organizations that are added to the study.
ORGANIZATION_IDS = [organization["id"] for organization in response.json()["data"]]
ORGANIZATION_IDS

[1]

In [50]:
# To create a new study we need the organization IDs (the internal IDs in vantage6)
# that are included in this workspace. The name of the study needs to be unique.
# TODO: we need to decide on the unique name of the study, this could be the name of
# the workspace.
response = requests.post(
    "https://orchestrator.idea.lst.tfo.upm.es/server/study",
    headers=headers,
    json={
        # The collaboration ID is the vantage6 ID of the collaboration. This is the
        # same for all workspaces. It is taken from `COLLABORATION_ID`; the value can
        # still change while we are developing the platform.
        "collaboration_id": COLLABORATION_ID,
        # The name of the study needs to be unique. I guess the name of the workspace
        # is also unique, so we can use that.
        "name": "Example Study 11",
        # The organization IDs are the internal IDs of the organizations in vantage6.
        "organization_ids": ORGANIZATION_IDS,
    },
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
response.json()
# In the case that:
#
# - The name is not unique
# - The collaboration id is not valid (non existing)
# - The organization ids are not valid (non existing)
#
# The API will return a 4xx error with a message. It will be of the following format:
# {
#     "msg": "Error message",
# }

{'id': 11,
 'name': 'Example Study 11',
 'organizations': [{'address2': None,
   'collaborations': '/server/collaboration?organization_id=1',
   'runs': '/server/run?organization_id=1',
   'zipcode': None,
   'country': None,
   'name': 'root',
   'nodes': '/server/node?organization_id=1',
   'domain': None,
   'tasks': '/server/task?init_org_id=1',
   'studies': '/server/study?organization_id=1',
   'id': 1,
   'public_key': '',
   'address1': None,
   'users': '/server/user?organization_id=1'}],
 'tasks': '/server/task?study_id=11',
 'collaboration': {'id': 2,
  'link': '/server/collaboration/2',
  'methods': ['PATCH', 'GET', 'DELETE']}}

In [51]:
# Now that we have our study ID, let's save it so we can use it later.
# NOTE: this assumes the study was created successfully. According to the previous
# cell, an error response only carries a `msg` field, so there would be no `id`.
STUDY_ID = response.json()["id"]

In [52]:
# You can always view all studies. This endpoint is not necessarily needed for the
# RAVEN UI but I thought it would be useful to have it here.
response = requests.get(
    "https://orchestrator.idea.lst.tfo.upm.es/server/study",
    headers=headers,
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
response.json()["data"]

[{'organizations': '/server/organization?study_id=9',
  'name': 'Example Study 9',
  'tasks': '/server/task?study_id=9',
  'id': 9,
  'collaboration': {'id': 1,
   'link': '/server/collaboration/1',
   'methods': ['GET', 'PATCH', 'DELETE']}},
 {'organizations': '/server/organization?study_id=2',
  'name': 'Example Study 2',
  'tasks': '/server/task?study_id=2',
  'id': 2,
  'collaboration': {'id': 1,
   'link': '/server/collaboration/1',
   'methods': ['GET', 'PATCH', 'DELETE']}},
 {'organizations': '/server/organization?study_id=3',
  'name': 'Example Study 3',
  'tasks': '/server/task?study_id=3',
  'id': 3,
  'collaboration': {'id': 1,
   'link': '/server/collaboration/1',
   'methods': ['GET', 'PATCH', 'DELETE']}},
 {'organizations': '/server/organization?study_id=11',
  'name': 'Example Study 11',
  'tasks': '/server/task?study_id=11',
  'id': 11,
  'collaboration': {'id': 2,
   'link': '/server/collaboration/2',
   'methods': ['GET', 'PATCH', 'DELETE']}},
 {'organizations': '/ser

In [None]:
# You can also view the organizations that are part of a study. This endpoint is not
# necessarily needed for the RAVEN UI but I thought it would be useful to have it here.
response = requests.get(
    "https://orchestrator.idea.lst.tfo.upm.es/server/organization",
    # Let `requests` build and URL-encode the query string.
    params={"study_id": STUDY_ID},
    headers=headers,
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
response.json()

{'data': [{'address2': None,
   'collaborations': '/server/collaboration?organization_id=1',
   'runs': '/server/run?organization_id=1',
   'zipcode': None,
   'country': None,
   'name': 'root',
   'nodes': '/server/node?organization_id=1',
   'domain': None,
   'tasks': '/server/task?init_org_id=1',
   'studies': '/server/study?organization_id=1',
   'id': 1,
   'public_key': '',
   'address1': None,
   'users': '/server/user?organization_id=1'}],
 'links': {'first': '/server/organization?study_id=11&page=1',
  'self': '/server/organization?study_id=11&page=1',
  'last': '/server/organization?study_id=11&page=1'}}

## 4. New Analysis
*New session in vantage6*

In [54]:
# When a new analysis is created in RAVEN we need to create a new session in vantage6.
# A session is a file space on the data stations in which we can store dataframes (an
# extraction of the data from the OMOP database). We need the study ID, which should be
# stored in the workspace, in order to create the session.
response = requests.post(
    "https://orchestrator.idea.lst.tfo.upm.es/server/session",
    headers=headers,
    json={
        # The collaboration ID is the vantage6 ID of the collaboration. This is the
        # same for all workspaces. It is taken from `COLLABORATION_ID`; the value can
        # still change while we are developing the platform.
        "collaboration_id": COLLABORATION_ID,
        # The name of the session needs to be unique within the collaboration, so in the
        # case of IDEA4RC this needs to always be unique. I would use the analysis ID to
        # create a unique name.
        "name": "Example Session 8",
        # The study ID should be linked to the workspace.
        "study_id": STUDY_ID,
        # The scope is the scope of the session. In IDEA4RC we use the collaboration
        # scope. This means that other users can use the same session.
        "scope": "collaboration"
    },
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
response.json()
# In the case that:
#
# - The name is not unique
# - The study id is not valid (non existing)
# - The scope is not valid (only 'collaboration' should be used)
# - The collaboration id is not valid (non existing)
#
# The API will return a 4xx error with a message. It will be of the following format:
# {
#     "msg": "Error message",
# }

{'name': 'Example Session 8',
 'study': {'id': 11,
  'link': '/server/study/11',
  'methods': ['PATCH', 'GET', 'DELETE']},
 'dataframes': '/server/session/8/dataframe',
 'collaboration': {'id': 2,
  'link': '/server/collaboration/2',
  'methods': ['PATCH', 'GET', 'DELETE']},
 'owner': {'id': 7,
  'link': '/server/user/7',
  'methods': ['PATCH', 'GET', 'DELETE']},
 'last_used_at': '2025-07-01T06:36:52.874247',
 'scope': 'col',
 'tasks': '/server/task?session_id=8',
 'ready': True,
 'created_at': '2025-07-01T06:36:52.874168',
 'id': 8}

In [56]:
# Save the ID of the newly created session; it is used in the URL that creates a
# dataframe (cohort) inside this session.
SESSION_ID = response.json()["id"]
SESSION_ID

8

## 5. New cohort
*Create a new dataframe in vantage6*

In [68]:
# When a new cohort is created vantage6 needs to extract the data from the OMOP database
# and store it in the session as a dataframe. This is done by executing a vantage6
# extraction task.

#
# Static content
#
# Docker image that provides the extraction method.
# NOTE(review): elsewhere in this notebook the images are listed without a URL scheme
# (e.g. `harbor2.vantage6.ai/idea4rc/sessions:latest`). Confirm that the `https://`
# prefix is accepted here.
image = "https://harbor2.vantage6.ai/idea4rc/sessions:latest"
# Label of the source database to extract from (the OMOP database).
label = "omop"

#
# Dynamic content
#
# NOTE(review): these two aliases are not referenced by the cells visible in this
# notebook, which use `STUDY_ID` / `SESSION_ID` directly. Confirm they are needed.
study_id = STUDY_ID # related to the workspace
session_id = SESSION_ID # related to the analysis

# Cohort name; it has to be unique within a session. The name used in the RAVEN UI is
# a natural choice. It may also be left out, in which case vantage6 generates a name.
name = "Cohort_name_2"

# An `image` can expose several `methods`. Sarcoma and head and neck extract different
# features, so they need to be handled differently.
method = "create_cohort"

# Task input: the patient IDs of the cohort and the feature set to extract.
arguments = dict(
    # Placeholder IDs 1..10; in RAVEN these come from the cohort builder.
    patient_ids=list(range(1, 11)),
    # Use "head_neck" instead in case of head and neck.
    features="sarcoma",
)

In [61]:
# Before we can create a task we need to prepare the task instructions. In vantage6 we
# can (but we don't in IDEA4RC) use end-to-end encryption, therefore the input is stored
# for each organization individually.

# Every organization receives the same arguments, so serialize and base64-encode them
# once instead of once per organization.
encoded_input = base64.b64encode(
    json.dumps(arguments).encode("UTF-8")
).decode("UTF-8")

payload = {
    "label": label,
    "name": name,  # optional, v6 will generate a name if not provided
    "task": {
        "method": method,
        "image": image,
        "organizations": [
            # We always create a cohort for all organizations in the study. Even though
            # in a later stage we might send computation tasks to a subset of the
            # organizations.
            {"id": id_, "input": encoded_input}
            for id_ in ORGANIZATION_IDS
        ]
    }
}
payload

{'label': 'omop',
 'name': 'Cohort_name_2',
 'task': {'method': 'create_cohort',
  'image': 'https://harbor2.vantage6.ai/idea4rc/sessions:latest',
  'organizations': [{'id': 1,
    'input': 'eyJwYXRpZW50X2lkcyI6IFsxLCAyLCAzLCA0LCA1LCA2LCA3LCA4LCA5LCAxMF0sICJmZWF0dXJlcyI6ICJzYXJjb21hIn0='}]}}

In [62]:
# Create a vantage6 task to extract the data from the OMOP data source and store it
# in a dataframe that belongs to the session.
response = requests.post(
    f"https://orchestrator.idea.lst.tfo.upm.es/server/session/{SESSION_ID}/dataframe",
    headers=headers,
    json=payload,
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
response.json()

{'name': 'Cohort_name_2',
 'ready': True,
 'session': {'id': 8,
  'link': '/server/session/8',
  'methods': ['PATCH', 'DELETE', 'GET']},
 'tasks': '/server/task?dataframe_id=6',
 'columns': [],
 'id': 6,
 'db_label': 'omop',
 'last_session_task': {'created_at': '2025-07-01T10:17:26.402115',
  'job_id': 6,
  'depends_on': [],
  'finished_at': None,
  'name': 'Session initialization: Example Session 8',
  'study': {'id': 11,
   'link': '/server/study/11',
   'methods': ['PATCH', 'DELETE', 'GET']},
  'method': 'create_cohort',
  'init_org': {'id': 1,
   'link': '/server/organization/1',
   'methods': ['PATCH', 'DELETE', 'GET']},
  'status': 'awaiting',
  'id': 6,
  'dataframe': {'db_label': 'omop', 'name': 'Cohort_name_2', 'id': 6},
  'children': '/server/task?parent_id=6',
  'collaboration': {'id': 2,
   'link': '/server/collaboration/2',
   'methods': ['PATCH', 'DELETE', 'GET']},
  'databases': [{'label': 'omop',
    'type': 'source',
    'dataframe_id': None,
    'dataframe_name': None

In [63]:
# TODO:
#   - Poll for the dataframe status

## 6. Summary statistics (TODO)

In [64]:
# Before we can display the summary statistics we need to calculate them. This is done
# through a vantage6 algorithm. We first need to be sure the dataframe is ready to be
# used. Then we can execute the algorithm and await the results to be displayed.

# TODO:
#   - Compute summary statistics on the dataframe
#   - Poll for the summary statistics results

In [69]:
# image = "https://harbor2.vantage6.ai/idea4rc/analytics:latest"
# dataframe_id = None

## 7. Collect algorithm metadata
*Collect input arguments and their types*

In [None]:
# Obtain a list of all available algorithms in the algorithm store. For now two algorithms
# are in there:
#
# - `sessions` (`harbor2.vantage6.ai/idea4rc/sessions:latest`). We've used this
#   algorithm already when creating the cohort.
# - `analytics` (`harbor2.vantage6.ai/idea4rc/analytics:latest`). This algorithm is
#   used to create the analytics. We've already used this algorithm for the computation
#   of the summary statistics. But this package currently also contains the crosstab
#   statistics and will be extended in the future with all the other analytics.
#
# In the response, each algorithm has one or more `functions`. The `functions` are
# actual Python functions that are executed on the data stations. A `function` in
# vantage6 expects a specific set of attributes that can be modified by the user. You
# should visualize these in a form in the RAVEN UI.
#
# 1. `databases`. A list of databases (typically only one) that will be supplied by the
#    node based on the `label` or `dataframe_id`. A `label` refers to the OMOP
#    database (in the IDEA4RC case) and is only used for the extraction algorithm (=
#    `create_cohort`). The `dataframe_id` refers to the cohort dataframe which can be
#    used for the analysis. In the extraction job we do not let the user select the
#    database, as we always use the OMOP database. So no need to visualize this. In the
#    analysis job we do let the user select the database, this happens when the user
#    selects a set of cohorts.
# 2. `arguments`. A list of arguments that can be modified by the user. In the case of
#    the extraction algorithm we have two arguments: `patient_ids` and `features`. The
#    `patient_ids` should be the list of patients that are coming from the cohort
#    builder and the `features` should be the tumor type that was selected in the
#    RAVEN workspace.
#
# The other important metadata are:
#
# 1. `name`. The method name, depending on which method the user selects in the UI
#    different arguments need to be provided. You also need to provide this `name` in
#    the `method` when creating a vantage6 task.
# 2. `image`. The image is the docker image that will be used to execute the function.
#    You also need to supply this when creating a vantage6 task.
#
# You should be able to use this metadata to create the interface in the RAVEN UI in
# order to create a task.
# Fetch the algorithm listing from the algorithm store. `algorithms` holds the raw
# HTTP response; the parsed JSON body is displayed below.
algorithms = requests.get(
    "https://orchestrator.idea.lst.tfo.upm.es/store/algorithm",
    headers=headers,
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    timeout=30,
)
algorithms.json()

{'data': [{'developer_id': 3,
   'name': 'v6-idea4rc-analytics',
   'invalidated_at': None,
   'reviews': '/store/review?algorithm_id=3',
   'approved_at': '2025-07-01T08:59:34.075847',
   'submission_comments': 'Updated the display names',
   'id': 3,
   'status': 'approved',
   'partitioning': 'horizontal',
   'image': 'harbor2.vantage6.ai/idea4rc/analytics',
   'digest': 'sha256:dd35b7dcc89a9b25ffee86e9ca67c055b935f65af756427bfb2509cb231a41d8',
   'documentation_url': 'https://github.com/idea4rc/v6-analytics',
   'description': 'This algorithm computes summary statistics for a set of cohorts',
   'vantage6_version': '5.0.0a22',
   'functions': [{'name': 'summary',
     'id': 4,
     'display_name': 'Summary statistics',
     'ui_visualizations': [],
     'arguments': [{'conditional_value': None,
       'conditional_operator': None,
       'is_frontend_only': False,
       'name': 'columns',
       'id': 10,
       'display_name': 'Columns',
       'default_value': '',
       'condit

## 8. Create analytics (TODO)
- Hardcoded algorithms
- algorithm metadata (parameters, etc.)

In [None]:
#   - Create analytics
#   - Poll for analytics results
#   - Start preprocessing task
#   - Poll for the preprocessing task to finish