# COSMIC Cancer Gene Census Annotation Test

This notebook tests the `maf_COSMIC_annotation` function on the TCGA LAML example MAF, annotating it with all available COSMIC Cancer Gene Census columns.


In [8]:
import gzip
import logging
import sys
from pathlib import Path

import pandas as pd

# Add src to path so the in-repo pyMut package can be imported.
# The path is relative, so it resolves against the kernel's working directory.
# The membership check keeps the cell idempotent: re-running it no longer
# stacks duplicate entries at the front of sys.path.
SRC_DIR = '../../src'
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

from pyMut.annotate.cosmic_cancer_annotate import maf_COSMIC_annotation

# Suppress verbose logging: raise the root logger threshold to WARNING so
# lower-severity records are not emitted into the cell outputs.
logging.getLogger().setLevel(logging.WARNING)


## File Paths and Validation


In [9]:
# Input locations: relative paths, resolved against the current working directory.
maf_file = Path("../../src/pyMut/data/examples/tcga_laml.maf.gz")
cosmic_file = Path(
    "../../src/pyMut/data/resources/Cosmic_CancerGeneCensus_Tsv_v102_GRCh38/Cosmic_CancerGeneCensus_v102_GRCh38.tsv.gz")

# Report (without raising) whether each input is present on disk.
for label, path in (("MAF", maf_file), ("COSMIC", cosmic_file)):
    print(f"{label} file exists: {path.exists()}")


MAF file exists: True
COSMIC file exists: True


## Preview COSMIC Data Columns


In [10]:
# Preview COSMIC data structure.
# `gzip` is imported in the notebook's import cell rather than mid-notebook.
# Only the first three rows are parsed: enough to list the column names
# without reading the rest of the table.
with gzip.open(cosmic_file, 'rt') as f:
    cosmic_df = pd.read_csv(f, sep='\t', nrows=3)

print(f"COSMIC data shape: {cosmic_df.shape}")
print(f"COSMIC columns ({len(cosmic_df.columns)}):")
# Number the columns from 1, right-aligned to two digits.
for i, col in enumerate(cosmic_df.columns, start=1):
    print(f"  {i:2d}. {col}")


COSMIC data shape: (3, 21)
COSMIC columns (21):
   1. GENE_SYMBOL
   2. NAME
   3. COSMIC_GENE_ID
   4. CHROMOSOME
   5. GENOME_START
   6. GENOME_STOP
   7. CHR_BAND
   8. SOMATIC
   9. GERMLINE
  10. TUMOUR_TYPES_SOMATIC
  11. TUMOUR_TYPES_GERMLINE
  12. CANCER_SYNDROME
  13. TISSUE_TYPE
  14. MOLECULAR_GENETICS
  15. ROLE_IN_CANCER
  16. MUTATION_TYPES
  17. TRANSLOCATION_PARTNER
  18. OTHER_GERMLINE_MUT
  19. OTHER_SYNDROME
  20. TIER
  21. SYNONYMS


## Run COSMIC Annotation (Default Settings)


In [11]:
# Annotate the example MAF with the COSMIC Cancer Gene Census table.
# Only the two input files and the synonyms column are specified; every other
# option is left at whatever default maf_COSMIC_annotation defines.
# NOTE(review): in the recorded run the returned path ends in ".maf.gz", so
# the annotated file appears to be written compressed with these arguments.
result_df, output_path = maf_COSMIC_annotation(
    maf_file=maf_file,
    annotation_table=cosmic_file,
    synonyms_column="SYNONYMS",
)

print("Annotation completed!")
print(f"Output file: {output_path}")
print(f"Result shape: {result_df.shape}")


Annotation completed!
Output file: ../../src/pyMut/data/examples/tcga_laml_COSMIC_annotated.maf.gz
Result shape: (2207, 37)


## Results Summary


In [12]:
# Collect the annotation columns, identified by their 'COSMIC_' name prefix,
# and print them as a 1-based numbered list.
cosmic_columns = list(filter(lambda name: name.startswith('COSMIC_'), result_df.columns))
print(f"Added {len(cosmic_columns)} COSMIC columns:")
for position, column_name in enumerate(cosmic_columns, start=1):
    print(f"  {position:2d}. {column_name}")


Added 20 COSMIC columns:
   1. COSMIC_NAME
   2. COSMIC_COSMIC_GENE_ID
   3. COSMIC_CHROMOSOME
   4. COSMIC_GENOME_START
   5. COSMIC_GENOME_STOP
   6. COSMIC_CHR_BAND
   7. COSMIC_SOMATIC
   8. COSMIC_GERMLINE
   9. COSMIC_TUMOUR_TYPES_SOMATIC
  10. COSMIC_TUMOUR_TYPES_GERMLINE
  11. COSMIC_CANCER_SYNDROME
  12. COSMIC_TISSUE_TYPE
  13. COSMIC_MOLECULAR_GENETICS
  14. COSMIC_ROLE_IN_CANCER
  15. COSMIC_MUTATION_TYPES
  16. COSMIC_TRANSLOCATION_PARTNER
  17. COSMIC_OTHER_GERMLINE_MUT
  18. COSMIC_OTHER_SYNDROME
  19. COSMIC_TIER
  20. COSMIC_SYNONYMS


In [13]:
# Check annotation coverage: the share of rows carrying at least one
# COSMIC value.
if cosmic_columns:
    cosmic_values = result_df[cosmic_columns]
    # A cell counts as annotated only when it is non-null AND non-blank.
    # Casting to str first keeps the check working on numeric columns (where
    # the .str accessor raises AttributeError); the notna() term stops
    # NaN/None from being counted as annotations.
    has_annotation = (
        cosmic_values.notna()
        & cosmic_values.astype(str).apply(lambda col: col.str.strip()).ne("")
    ).any(axis=1)
    annotated_count = has_annotation.sum()
    total_count = len(result_df)
    # Guard the percentage against an empty result frame.
    coverage_pct = annotated_count / total_count * 100 if total_count else 0.0

    print("Annotation coverage:")
    print(f"  Total mutations: {total_count}")
    print(f"  With COSMIC annotation: {annotated_count}")
    print(f"  Coverage: {coverage_pct:.1f}%")


Annotation coverage:
  Total mutations: 2207
  With COSMIC annotation: 513
  Coverage: 23.2%


## Sample Annotated Data


In [14]:
# Show a sample of annotated data: one row per gene, first five genes.
if cosmic_columns:
    cosmic_values = result_df[cosmic_columns]
    # Keep rows with at least one non-null, non-blank COSMIC value.
    # Casting to str first avoids the AttributeError the .str accessor raises
    # on numeric columns; notna() keeps NaN/None from counting as annotations.
    has_annotation_mask = (
        cosmic_values.notna()
        & cosmic_values.astype(str).apply(lambda col: col.str.strip()).ne("")
    ).any(axis=1)
    annotated_rows = result_df[has_annotation_mask]

    if not annotated_rows.empty:
        print("Sample annotated mutations:")
        sample_cols = ['Hugo_Symbol', 'COSMIC_ROLE_IN_CANCER', 'COSMIC_TIER']
        # drop_duplicates keeps the first row seen for each gene symbol.
        display_df = annotated_rows[sample_cols].drop_duplicates(subset=['Hugo_Symbol']).head(5)
        display(display_df)
    else:
        print("No mutations found with COSMIC annotations")


Sample annotated mutations:


Unnamed: 0,Hugo_Symbol,COSMIC_ROLE_IN_CANCER,COSMIC_TIER
8,ABL1,"oncogene, fusion",1.0
40,AFF4,"oncogene, fusion",1.0
91,ARHGAP5,oncogene,2.0
93,ARHGEF10L,TSG,2.0
99,ARID1A,"TSG, fusion",1.0
