This notebook collects the submitting organization name and the evidence status for every evidence item associated with variants in the "Transcript Variants" category of not supported variants.

This notebook was run on November 16, 2023 (20231116)

In [1]:
import csv
import os
import sys
from typing import Optional

import requests
from civicpy import civic as civicpy

# Put the directory two levels above the current working directory on the
# import path (only once) so the `utils` import below can be resolved.
module_path = os.path.abspath("../..")
if module_path not in sys.path:
    sys.path.append(module_path)

from utils import load_civicpy_cache, NotSupportedVariantCategory  # noqa: E402

In [2]:
# Get set of not supported variant IDs
# Column positions used from each tab-delimited row: the value at index 0 is
# collected as the variant ID, the value at index 4 is compared to the category.
VARIANT_ID_COL = 0
CATEGORY_COL = 4

not_supported_vids = set()

# newline="" is what the csv module expects for file objects it reads from;
# without it, newlines embedded in quoted fields can be misinterpreted.
with open("not_supported_variants.csv", newline="") as rf:
    reader = csv.reader(rf, delimiter="\t")
    next(reader)  # discard the first row (presumably the header - verify against the file)

    for row in reader:
        if row[CATEGORY_COL] == NotSupportedVariantCategory.TRANSCRIPT_VAR:
            # IDs are kept exactly as read from the file, i.e. as strings
            not_supported_vids.add(row[VARIANT_ID_COL])

num_transcript_vars = len(not_supported_vids)
f"There are {num_transcript_vars} not supported {NotSupportedVariantCategory.TRANSCRIPT_VAR}"

'There are 366 not supported Transcript Variants'

In [3]:
# Use latest cache that has been pushed to the repo
# NOTE(review): load_civicpy_cache is a project helper imported from utils; presumably
# it points civicpy at a cache file stored in the repo - confirm in utils.
load_civicpy_cache()

Using cache-20230803.pkl for civicpy cache


In [4]:
# Get not supported variants from IDs
# NOTE(review): not_supported_vids holds the ID values as strings read from the
# TSV file - confirm civicpy resolves string IDs the same way as integer IDs.
not_supported_variants = civicpy.get_variants_by_ids(not_supported_vids)

In [5]:
# civicpy at the moment does not capture submission events, so we must hit the graphql api
def get_organization_name_for_evidence(
    evidence_id: int, timeout: float = 30.0
) -> Optional[str]:
    """Get organization name for an evidence item

    Sends one GraphQL query to the CIViC API asking for the organization
    attached to the evidence item's submission event.

    :param evidence_id: CIViC evidence item ID
    :param timeout: Seconds to wait for the API before giving up, so that a
        stalled connection cannot hang the caller indefinitely
    :raises requests.HTTPError: If the API answers with an HTTP error status
    :raises RuntimeError: If the response body carries no ``data`` payload
        (for example when the GraphQL request itself failed)
    :return: Organization name for evidence item if found
    """
    query = f"""
        {{
            evidenceItem(id: {evidence_id}) {{
                submissionEvent {{
                    organization {{
                        name
                    }}
                }}
            }}
        }}
    """

    resp = requests.post(
        "https://civicdb.org/api/graphql",
        json={"query": query},
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page
    resp.raise_for_status()
    payload = resp.json()

    data = payload.get("data")
    if data is None:
        # Previously this case surfaced as an opaque AttributeError on None
        raise RuntimeError(
            f"CIViC GraphQL returned no data for evidence item {evidence_id}: "
            f"{payload.get('errors')}"
        )

    # Any level of the nested result may be null; treat each null as "not found"
    ev_data = data.get("evidenceItem") or {}
    submission_event = ev_data.get("submissionEvent") or {}
    org = submission_event.get("organization") or {}
    return org.get("name")

In [6]:
# Status keys every tally starts with (all zero), in display order
STATUS_KEYS = ("submitted", "accepted", "rejected")


def _record_status(counts_by_key: dict, key, status: str) -> None:
    """Add one evidence item with ``status`` to the tally stored under ``key``.

    A tally is created on first use with every key in STATUS_KEYS set to 0
    plus a ``"total"`` of 0. A status outside STATUS_KEYS is counted under its
    own key instead of raising KeyError.

    :param counts_by_key: Mapping of key -> status tally, updated in place
    :param key: Organization name or evidence ID the tally belongs to
    :param status: Evidence item status to count
    """
    counts = counts_by_key.setdefault(
        key, {**dict.fromkeys(STATUS_KEYS, 0), "total": 0}
    )
    counts[status] = counts.get(status, 0) + 1
    counts["total"] += 1


org_name_mapping = {}  # organization name with status count mapping
eids_no_org_name = {}  # evidence ids and associated status mapping
checked_eids = set()

for v in not_supported_variants:
    for mp in v.molecular_profiles:
        for e in mp.evidence_items:
            if e.id in checked_eids:
                # Do not want to have duplicate counts
                continue

            checked_eids.add(e.id)
            e_status = e.status
            # One API request per distinct evidence item
            org_name = get_organization_name_for_evidence(e.id)

            if org_name:
                _record_status(org_name_mapping, org_name, e_status)
            else:
                # Each evidence ID reaches this point at most once because of
                # checked_eids, so every entry here ends up with a total of 1
                _record_status(eids_no_org_name, e.id, e_status)

In [7]:
# Organizations ordered from most to fewest evidence items ("total")
sorted(
    org_name_mapping.items(),
    key=lambda name_and_counts: name_and_counts[1]["total"],
    reverse=True,
)

[('University Health Network (Toronto)',
  {'submitted': 314, 'accepted': 12, 'rejected': 6, 'total': 332}),
 ('The McDonnell Genome Institute',
  {'submitted': 61, 'accepted': 35, 'rejected': 0, 'total': 96}),
 ('Illumina', {'submitted': 0, 'accepted': 3, 'rejected': 0, 'total': 3}),
 ('BCCA (POGS)', {'submitted': 1, 'accepted': 1, 'rejected': 0, 'total': 2}),
 ('Treehouse Childhood Cancer Initiative',
  {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 ('NTRK SC-VCEP', {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 ('Pediatric Cancer Taskforce',
  {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 ('ClinGen Somatic ',
  {'submitted': 0, 'accepted': 1, 'rejected': 0, 'total': 1})]

In [8]:
# Evidence items for which no organization name was found, ordered by "total" (descending)
sorted(
    eids_no_org_name.items(),
    key=lambda eid_and_counts: eid_and_counts[1]["total"],
    reverse=True,
)

[(6225, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8066, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8067, {'submitted': 0, 'accepted': 1, 'rejected': 0, 'total': 1}),
 (8614, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8650, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8651, {'submitted': 0, 'accepted': 1, 'rejected': 0, 'total': 1}),
 (11475, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8661, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8051, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (11466, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (11393, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (11461, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (8655, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (7230, {'submitted': 1, 'accepted': 0, 'rejected': 0, 'total': 1}),
 (11462, {'submitted': 1, 'acc

In [9]:
# Number of distinct evidence items visited in the tallying loop (checked_eids holds each ID once)
f"There are {len(checked_eids)} evidence items associated to {NotSupportedVariantCategory.TRANSCRIPT_VAR}"

'There are 452 evidence items associated to Transcript Variants'