# COSMIC Cancer Gene Census and OncoKB Annotation Test

This notebook tests the `maf_COSMIC_OncoKB_annotation` function with all available COSMIC columns and OncoKB annotations.


In [1]:
import sys
import pandas as pd
from pathlib import Path
import logging

# Put the repository's src/ folder first on the import path so the in-repo
# pyMut package is importable. The path is relative, so this
# assumes the kernel's working directory is the notebook's folder — TODO confirm.
sys.path.insert(0, '../../src')

from pyMut.annotate.cosmic_cancer_annotate import maf_COSMIC_OncoKB_annotation

# Raise the root logger's threshold to WARNING so lower-severity records
# (INFO/DEBUG) that reach the root logger are not emitted into the cell outputs.
logging.getLogger().setLevel(logging.WARNING)


## File Paths and Validation


In [2]:
# Locations of the three inputs used throughout the notebook:
# the example MAF, the COSMIC Cancer Gene Census table and the OncoKB gene list.
maf_file = Path("../../src/pyMut/data/examples/tcga_laml.maf.gz")
cosmic_file = Path(
    "../../src/pyMut/data/resources/Cosmic_CancerGeneCensus_Tsv_v102_GRCh38/Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz")
oncokb_file = Path("../../src/pyMut/data/resources/OncoKb/cancerGeneList.tsv")

# Report, one line per input, whether the file is present on disk.
for input_label, input_path in (
    ("MAF", maf_file),
    ("COSMIC", cosmic_file),
    ("OncoKB", oncokb_file),
):
    print(f"{input_label} file exists: {input_path.exists()}")


MAF file exists: True
COSMIC file exists: True
OncoKB file exists: True


## Preview COSMIC Data Columns


In [3]:
# Preview COSMIC data structure.
# pandas decompresses gzip input itself, so the file is read directly instead of
# through a manual gzip.open() wrapper (which also decoded text with the
# locale-dependent default encoding). Only the first 3 rows are loaded because
# this cell just inspects the column layout.
cosmic_df = pd.read_csv(cosmic_file, sep='\t', nrows=3, compression="gzip")

print(f"COSMIC data shape: {cosmic_df.shape}")
print(f"COSMIC columns ({len(cosmic_df.columns)}):")
# Numbered listing of the column names, 1-based.
for i, col in enumerate(cosmic_df.columns, 1):
    print(f"  {i:2d}. {col}")


COSMIC data shape: (3, 21)
COSMIC columns (21):
   1. GENE_SYMBOL
   2. NAME
   3. COSMIC_GENE_ID
   4. CHROMOSOME
   5. GENOME_START
   6. GENOME_STOP
   7. CHR_BAND
   8. SOMATIC
   9. GERMLINE
  10. TUMOUR_TYPES_SOMATIC
  11. TUMOUR_TYPES_GERMLINE
  12. CANCER_SYNDROME
  13. TISSUE_TYPE
  14. MOLECULAR_GENETICS
  15. ROLE_IN_CANCER
  16. MUTATION_TYPES
  17. TRANSLOCATION_PARTNER
  18. OTHER_GERMLINE_MUT
  19. OTHER_SYNDROME
  20. TIER
  21. SYNONYMS


## Preview OncoKB Data Columns


In [4]:
# Load just the first 3 rows of the OncoKB gene list to inspect its layout
oncokb_df = pd.read_csv(oncokb_file, sep='\t', nrows=3)

# Summarise the preview: overall shape, then a 1-based numbered column listing
oncokb_column_names = list(oncokb_df.columns)
print(f"OncoKB data shape: {oncokb_df.shape}")
print(f"OncoKB columns ({len(oncokb_column_names)}):")
for position, column_name in enumerate(oncokb_column_names, start=1):
    print(f"  {position:2d}. {column_name}")


OncoKB data shape: (3, 17)
OncoKB columns (17):
   1. Hugo Symbol
   2. Entrez Gene ID
   3. GRCh37 Isoform
   4. GRCh37 RefSeq
   5. GRCh38 Isoform
   6. GRCh38 RefSeq
   7. Is Oncogene
   8. Is Tumor Suppressor Gene
   9. # of occurrence within resources (Column J-P)
  10. OncoKB Annotated
  11. MSK-IMPACT
  12. MSK-HEME
  13. FOUNDATION ONE
  14. FOUNDATION ONE HEME
  15. Vogelstein
  16. COSMIC CGC (v99)
  17. Gene Aliases


## Run COSMIC + OncoKB Annotation (Gzip-Compressed `.maf.gz` Output)


In [5]:
# Run the combined COSMIC + OncoKB annotation on the example MAF.
# No output options are passed, so the function's defaults decide where and how
# the result is written; in the recorded run the output path ends in ".maf.gz".
# The call returns the annotated table together with the path it was written to.
result_df, output_path = maf_COSMIC_OncoKB_annotation(
    maf_file=maf_file,
    annotation_table=cosmic_file,
    oncokb_table=oncokb_file,
    synonyms_column="SYNONYMS",
)

print("Annotation completed!")
print(f"Output file: {output_path}")
print(f"Result shape: {result_df.shape}")


Annotation completed!
Output file: ../../src/pyMut/data/examples/tcga_laml_COSMIC_OncoKB_annotated.maf.gz
Result shape: (2207, 53)


## Results Summary


In [6]:
# Collect the annotation columns by the prefix each source puts on its columns
cosmic_columns = [name for name in result_df.columns if name.startswith('COSMIC_')]
oncokb_columns = [name for name in result_df.columns if name.startswith('OncoKB_')]

# Print each group as a numbered list, separated by one empty line
column_groups = [("COSMIC", cosmic_columns), ("OncoKB", oncokb_columns)]
for group_index, (source, added_columns) in enumerate(column_groups):
    if group_index > 0:
        print()  # Empty line for separation
    print(f"Added {len(added_columns)} {source} columns:")
    for position, name in enumerate(added_columns, start=1):
        print(f"  {position:2d}. {name}")


Added 20 COSMIC columns:
   1. COSMIC_NAME
   2. COSMIC_COSMIC_GENE_ID
   3. COSMIC_CHROMOSOME
   4. COSMIC_GENOME_START
   5. COSMIC_GENOME_STOP
   6. COSMIC_CHR_BAND
   7. COSMIC_SOMATIC
   8. COSMIC_GERMLINE
   9. COSMIC_TUMOUR_TYPES_SOMATIC
  10. COSMIC_TUMOUR_TYPES_GERMLINE
  11. COSMIC_CANCER_SYNDROME
  12. COSMIC_TISSUE_TYPE
  13. COSMIC_MOLECULAR_GENETICS
  14. COSMIC_ROLE_IN_CANCER
  15. COSMIC_MUTATION_TYPES
  16. COSMIC_TRANSLOCATION_PARTNER
  17. COSMIC_OTHER_GERMLINE_MUT
  18. COSMIC_OTHER_SYNDROME
  19. COSMIC_TIER
  20. COSMIC_SYNONYMS

Added 16 OncoKB columns:
   1. OncoKB_Entrez Gene ID
   2. OncoKB_GRCh37 Isoform
   3. OncoKB_GRCh37 RefSeq
   4. OncoKB_GRCh38 Isoform
   5. OncoKB_GRCh38 RefSeq
   6. OncoKB_Is Oncogene
   7. OncoKB_Is Tumor Suppressor Gene
   8. OncoKB_# of occurrence within resources (Column J-P)
   9. OncoKB_OncoKB Annotated
  10. OncoKB_MSK-IMPACT
  11. OncoKB_MSK-HEME
  12. OncoKB_FOUNDATION ONE
  13. OncoKB_FOUNDATION ONE HEME
  14. OncoKB_Vogelstein
  15. OncoKB_COSMIC CGC (v99)
  16. OncoKB_Gene Aliases

In [7]:
# Check annotation coverage


def _has_annotation(frame, columns):
    """Flag rows that carry at least one real value in ``columns``.

    A value counts as an annotation when it is not missing (NaN/None) and its
    string form is not blank. Casting to ``str`` before stripping keeps the
    check valid for non-string columns, where the ``.str`` accessor would
    raise; the ``notna`` guard keeps missing values from being counted.

    Returns a boolean Series aligned with ``frame``'s index.
    """
    values = frame[columns]
    non_blank = values.astype(str).apply(lambda column: column.str.strip() != "")
    return (values.notna() & non_blank).any(axis=1)


def _coverage_pct(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


total_count = len(result_df)
print("Annotation coverage:")
print(f"  Total mutations: {total_count}")

if cosmic_columns:
    # Rows with at least one non-empty COSMIC annotation value
    has_cosmic_annotation = _has_annotation(result_df, cosmic_columns)
    cosmic_annotated_count = has_cosmic_annotation.sum()
    print(f"  With COSMIC annotation: {cosmic_annotated_count}")
    print(f"  COSMIC coverage: {_coverage_pct(cosmic_annotated_count, total_count):.1f}%")

if oncokb_columns:
    # Rows with at least one non-empty OncoKB annotation value
    has_oncokb_annotation = _has_annotation(result_df, oncokb_columns)
    oncokb_annotated_count = has_oncokb_annotation.sum()
    print(f"  With OncoKB annotation: {oncokb_annotated_count}")
    print(f"  OncoKB coverage: {_coverage_pct(oncokb_annotated_count, total_count):.1f}%")

# Rows annotated by both sources: reuse the two masks computed above instead of
# rebuilding them (both exist here because both column lists are non-empty).
if cosmic_columns and oncokb_columns:
    has_both_annotations = has_cosmic_annotation & has_oncokb_annotation
    both_annotated_count = has_both_annotations.sum()
    print(f"  With both COSMIC and OncoKB annotation: {both_annotated_count}")
    print(f"  Both annotations coverage: {_coverage_pct(both_annotated_count, total_count):.1f}%")


Annotation coverage:
  Total mutations: 2207
  With COSMIC annotation: 513
  COSMIC coverage: 23.2%
  With OncoKB annotation: 565
  OncoKB coverage: 25.6%
  With both COSMIC and OncoKB annotation: 485
  Both annotations coverage: 22.0%


## Sample Annotated Data


In [8]:
# Show sample of annotated data
all_annotation_columns = cosmic_columns + oncokb_columns
if all_annotation_columns:
    # A cell counts as annotated when it is not missing (NaN/None) and its string
    # form is not blank. Casting to str keeps the blank check valid for
    # non-string columns; the notna guard stops missing values from matching.
    annotation_values = result_df[all_annotation_columns]
    has_annotation_mask = (
        annotation_values.notna()
        & annotation_values.astype(str).apply(lambda column: column.str.strip() != "")
    ).any(axis=1)
    annotated_rows = result_df[has_annotation_mask]

    if len(annotated_rows) > 0:
        print("Sample annotated mutations (COSMIC + OncoKB):")

        # Key columns to show from each source; any that are absent from the
        # result are skipped rather than raising a KeyError.
        cosmic_sample_cols = ['COSMIC_ROLE_IN_CANCER', 'COSMIC_TIER']
        oncokb_sample_cols = ['OncoKB_Is Oncogene', 'OncoKB_Is Tumor Suppressor Gene', 'OncoKB_OncoKB Annotated']

        # Gene symbol first, then the available COSMIC and OncoKB columns in order
        sample_cols = ['Hugo_Symbol'] + [
            col
            for col in cosmic_sample_cols + oncokb_sample_cols
            if col in annotated_rows.columns
        ]

        display_df = annotated_rows[sample_cols].head(5)
        display(display_df)
    else:
        print("No mutations found with annotations")


Sample annotated mutations (COSMIC + OncoKB):


Unnamed: 0,Hugo_Symbol,COSMIC_ROLE_IN_CANCER,COSMIC_TIER,OncoKB_Is Oncogene,OncoKB_Is Tumor Suppressor Gene,OncoKB_OncoKB Annotated
8,ABL1,"oncogene, fusion",1.0,Yes,No,Yes
40,AFF4,"oncogene, fusion",1.0,Yes,No,Yes
58,ALOX5,,,Yes,Yes,Yes
70,ANKRD26,,,Yes,No,Yes
91,ARHGAP5,oncogene,2.0,,,
