# Exploration of the Genomic Mutation Dataset

Here, I perform an initial exploration of the mutation dataset.
I inspect its:
* structure
* column meanings
* missing data
* statistics



In [50]:
# Data handling libraries
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
# Raise pandas' display limits so wide/long tables are shown with more rows and columns
pd.options.display.max_rows = 400
pd.options.display.max_columns = 400
pd.options.display.width = 1000

# Optional plot styling: select seaborn's "whitegrid" style
sns.set(style="whitegrid")

In [51]:
# Relative location of the raw, tab-separated mutation table
file_path = "../data/raw/data_mutations.txt"

# Load the table into a DataFrame.
# - read_table splits on tabs by default
# - comment="#" makes the parser ignore anything following a "#" on a line
# - low_memory=False reads the file in one pass instead of in chunks
df = pd.read_table(file_path, comment="#", low_memory=False)

In [52]:
# Peek at the first ten records to get a feel for the columns and their values
df.head(n=10)

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,t_ref_count,t_alt_count,n_ref_count,n_alt_count,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,RefSeq,Protein_position,Codons,Exon_Number,1000G_AF,1000G_AFR_AF,1000G_AMR_AF,1000G_EAS_AF,1000G_EUR_AF,1000G_SAS_AF,APPRIS,Allele,Amino_acids,BIOTYPE,CANONICAL,CCDS,CDS_position,CLIN_SIG,CONTEXT,COSMIC,DISTANCE,DOMAINS,ENSP,ESP_AA_AF,ESP_EA_AF,EXON,Existing_variation,FLAGS,Feature,Feature_type,GDC_FILTER,GENE_PHENO,Gene,HGNC_ID,HGVS_OFFSET,HIGH_INF_POS,IMPACT,INTRON,MANE,MAX_AF,MAX_AF_POPS,MOTIF_NAME,MOTIF_POS,MOTIF_SCORE_CHANGE,One_Consequence,PHENO,PICK,PUBMED,PolyPhen,RNA_Support,RNA_alt_count,RNA_depth,RNA_ref_count,SIFT,SOMATIC,SWISSPROT,SYMBOL,SYMBOL_SOURCE,TRANSCRIPTION_FACTORS,TRANSCRIPT_STRAND,TREMBL,TSL,UNIPARC,UNIPROT_ISOFORM,VARIANT_CLASS,all_effects,cDNA_position,callers,case_id,genomic_location_explanation,gnomAD_AF,gnomAD_AFR_AF,gnomAD_AMR_AF,gnomAD_ASJ_AF,gnomAD_EAS_AF,gnomAD_FIN_AF,gnomAD_NFE_AF,gnomAD_OTH_AF,gnomAD_SAS_AF,gnomAD_non_cancer_AF,gnomAD_non_cancer_AFR_AF,gnomAD_non_cancer_AMI_AF,gnomAD_non_cancer_AMR_AF,gnomAD_non_cancer_ASJ_AF,gnomAD_non_cancer_EAS_AF,gnomAD_non_cancer_FIN_AF,gnomAD_non_cancer_MAX_AF_POPS_adj,gnomAD_non_cancer_MAX_AF_adj,gnomAD_non_cancer_MID_AF,gnomAD_non_cancer_NFE_AF,gnomAD_non_cancer_OTH_AF,gnomAD_non_cancer_SAS_AF,hotspot,miRNA,n_depth,normal_bam_uuid,t_depth,tumor_bam_uuid,Annotation_Status
0,MTCL1,23255,BI,GRCh38,18,8806948,8806948,+,missense_variant,Missense_Mutation,SNP,G,G,A,rs199934266,,TCGA-AB-2881-03A,TCGA-AB-2881-11A,,,,,,,,,Somatic,,,,,,,26,6,,,ENST00000306329.15:c.3449G>A,p.Arg1150His,p.R1150H,ENST00000306329,NM_001378206.1,1150,cGt/cAt,9/14,,,,,,,P3,A,R/H,protein_coding,YES,,3449/5718,,CTTCCGTGCGG,COSM1318381,,Coiled-coils_(Ncoils):Coil;PANTHER:PTHR15742;P...,ENSP00000305027,0.0,0.000116,9/14,rs199934266,,ENST00000306329,Transcript,,,ENSG00000168502,HGNC:29121,,,MODERATE,,,0.000124,gnomAD_NFE,,,,missense_variant,,1.0,,benign(0.045),Unknown,,,,tolerated(0.16),,Q9Y4B5.139,MTCL1,HGNC,,1,,5.0,UPI0001AE65C5,Q9Y4B5-1,SNV,"MTCL1,missense_variant,p.R831H,ENST00000359865...",3449/5718,mutect2;varscan2,fff35c80-88cd-4923-80c1-0273ba5bed0f,,7.2e-05,0.0,2.9e-05,0.0,0.0,9.5e-05,0.000124,0.0,3.3e-05,4.7e-05,0.0,0.0,0.0,0.0,0.0,0.0,nfe,0.000108,0.0,0.000108,0.0,0.0,0,,39,ae212e50-9fb6-47c6-b334-836962553043,32,d1373e1b-d60d-405d-aeb4-ae045f03e331,SUCCESS
1,HSH2D,84941,BI,GRCh38,19,16157257,16157257,+,inframe_insertion,In_Frame_Ins,INS,G,G,CTCC,novel,,TCGA-AB-2881-03A,TCGA-AB-2881-11A,,,,,,,,,Somatic,,,,,,,89,9,,,ENST00000613986.4:c.522delinsCTCC,p.Ser175dup,p.S175dup,ENST00000613986,NM_001352265.2,175,ccG/ccCTCC,6/6,,,,,,,P1,CTCC,P/PS,protein_coding,,CCDS74304.1,522/1059,,AAGCCGTCAGC,,,PANTHER:PTHR14388;PANTHER:PTHR14388:SF3;MobiDB...,ENSP00000483354,,,6/6,,,ENST00000613986,Transcript,,,ENSG00000196684,HGNC:24920,,,MODERATE,,NM_001382417.1,,,,,,inframe_insertion,,,,,Unknown,,,,,,Q96JZ2.142,HSH2D,HGNC,,1,,2.0,UPI0000073F82,Q96JZ2-1,indel,"HSH2D,inframe_insertion,p.S175dup,ENST00000616...",639/1957,pindel;varscan2*,fff35c80-88cd-4923-80c1-0273ba5bed0f,,,,,,,,,,,,,,,,,,,,,,,,0,,129,ae212e50-9fb6-47c6-b334-836962553043,98,d1373e1b-d60d-405d-aeb4-ae045f03e331,SUCCESS
2,PRDM16,63976,BI,GRCh38,1,3411959,3411959,+,missense_variant,Missense_Mutation,SNP,G,G,A,rs768733228,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,25,35,,,ENST00000270722.10:c.1762G>A,p.Val588Met,p.V588M,ENST00000270722,NM_022114.4,588,Gtg/Atg,9/17,,,,,,,P4,A,V/M,protein_coding,YES,CCDS41236.2,1762/3831,uncertain_significance,CCTGTGTGGAG,COSM1317542,,PANTHER:PTHR24393;PANTHER:PTHR24393:SF69;MobiD...,ENSP00000270722,,,9/17,rs768733228;COSV54594630,,ENST00000270722,Transcript,,1.0,ENSG00000142611,HGNC:14000,,,MODERATE,,NM_022114.4,0.000612,gnomAD_EAS,,,,missense_variant,1;1,1.0,,benign(0.051),Unknown,,,,tolerated(0.23),0;1,Q9HAZ2.187,PRDM16,HGNC,,1,,1.0,UPI0000458A29,Q9HAZ2-1,SNV,"PRDM16,missense_variant,p.V588M,ENST0000027072...",1819/8698,muse;mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,4.4e-05,0.0,0.0,0.0,0.000612,0.0,0.0,0.0,0.0,2.7e-05,0.0,0.0,0.0,0.0,0.00081,0.0,eas,0.00081,0.0,0.0,0.0,0.0,0,,87,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,61,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
3,DCAF8,50717,BI,GRCh38,1,160240206,160240206,+,missense_variant,Missense_Mutation,SNP,A,A,G,rs1208295999,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,44,35,,,ENST00000326837.6:c.214T>C,p.Ser72Pro,p.S72P,ENST00000326837,,72,Tca/Cca,4/14,,,,,,,P1,G,S/P,protein_coding,,CCDS1200.1,214/1794,,ACCTGAGCTCT,COSM1317642;COSM1317643,,MobiDB_lite:mobidb-lite;MobiDB_lite:mobidb-lite,ENSP00000318227,,,4/14,rs1208295999;COSV58784424,,ENST00000326837,Transcript,,1.0,ENSG00000132716,HGNC:24891,,,MODERATE,,,,,,,,missense_variant,0;1,,,benign(0.006),Unknown,,,,tolerated(0.11),0;1,Q5TAQ9.150,DCAF8,HGNC,,-1,,5.0,UPI0000141A39,Q5TAQ9-1,SNV,"DCAF8,missense_variant,p.S72P,ENST00000368073,...",515/3972,muse;mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,,,,,,,,,,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,1.5e-05,0.0,0.0,0,,70,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,79,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
4,DNMT3A,1788,BI,GRCh38,2,25234373,25234373,+,missense_variant,Missense_Mutation,SNP,C,C,T,rs147001633,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,63,45,,,ENST00000264709.7:c.2645G>A,p.Arg882His,p.R882H,ENST00000264709,NM_175629.2,882,cGc/cAc,23/23,,,,,,,P1,T,R/H,protein_coding,YES,CCDS33157.1,2645/2739,uncertain_significance;pathogenic;likely_patho...,CCAAGCGGCTC,COSM442676;COSM52944,,PDB-ENSP_mappings:2qrv.A;PDB-ENSP_mappings:2qr...,ENSP00000264709,0.000908,0.000581,23/23,rs147001633;COSV53036153;COSV53040144;COSV5304...,,ENST00000264709,Transcript,gdc_pon,1.0,ENSG00000119772,HGNC:2978,,,MODERATE,,,0.000908,AA,,,,missense_variant,1;1;1;1,1.0,26619011;22160010;24606448;22898539;21067377;2...,benign(0.067),Unknown,,,,deleterious(0),0;1;1;1,Q9Y6K1.183,DNMT3A,HGNC,,-1,,1.0,UPI000000DA70,Q9Y6K1-1,SNV,"DNMT3A,missense_variant,p.R882H,ENST0000026470...",2983/9501,muse;mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,0.000219,0.000616,0.0,0.000298,0.000544,0.000277,0.000159,0.000163,0.000229,0.000366,0.000512,0.0,6.6e-05,0.0,0.000202,9.5e-05,afr,0.000512,0.0,0.000433,0.0,0.000417,0,,117,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,108,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
5,CLASP2,23122,BI,GRCh38,3,33560868,33560869,+,frameshift_variant,Frame_Shift_Ins,INS,-,-,T,,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,144,90,,,ENST00000682230.1:c.2869dup,p.Met957AsnfsTer4,p.M957Nfs*4,ENST00000682230,NM_001400405.1,957,atg/aAtg,28/39,,,,,,,A2,T,M/NX,protein_coding,YES,,2896-2897/4548,,CACCCATTTTT,COSM1422446;COSM4615334;COSM4615335,,Gene3D:1.25.10.10;PANTHER:PTHR21567;PANTHER:PT...,ENSP00000352581,,,29/40,,,ENST00000359576,Transcript,,,ENSG00000163539,HGNC:17078,,,HIGH,,,,,,,,frameshift_variant,,1.0,,,Unknown,,,,,,O75122.175,CLASP2,HGNC,,-1,,5.0,UPI0001747A09,O75122-3,insertion,"CLASP2,frameshift_variant,p.M964Nfs*4,ENST0000...",3107-3108/7141,mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,,,,,,,,,,,,,,,,,,,,,,,0,,216,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,234,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
6,SH3TC2,79628,BI,GRCh38,5,149026590,149026590,+,missense_variant,Missense_Mutation,SNP,C,C,T,rs140666774,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,146,38,,,ENST00000515425.6:c.3035G>A,p.Arg1012Gln,p.R1012Q,ENST00000515425,NM_024577.4,1012,cGg/cAg,12/17,,,,,,,P2,T,R/Q,protein_coding,YES,CCDS4293.1,3035/3867,,GGTTCCGATAA,COSM1319240;COSM1319241,,PANTHER:PTHR22647;PANTHER:PTHR22647:SF2;Gene3D...,ENSP00000423660,0.000454,0.0,12/17,rs140666774,,ENST00000515425,Transcript,,1.0,ENSG00000169247,HGNC:29427,,,MODERATE,,NM_024577.4,0.000454,AA,,,,missense_variant,,1.0,,benign(0.015),Unknown,,,,tolerated(0.42),,Q8TF17.161,SH3TC2,HGNC,,-1,A0A514TP98.4,1.0,UPI00001DFBEE,Q8TF17-1,SNV,"SH3TC2,missense_variant,p.R1012Q,ENST000005154...",3075/26468,muse;mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,2.4e-05,0.000308,0.0,0.0,0.0,0.0,9e-06,0.0,0.0,6.8e-05,0.000243,0.0,0.0,0.0,0.0,0.0,afr,0.000243,0.0,0.0,0.0,0.0,0,,113,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,185,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
7,ATP10B,23120,BI,GRCh38,5,160686133,160686133,+,missense_variant,Missense_Mutation,SNP,T,T,C,,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,52,44,,,ENST00000327245.10:c.416A>G,p.Lys139Arg,p.K139R,ENST00000327245,NM_001366652.1,139,aAg/aGg,6/26,,,,,,,P4,C,K/R,protein_coding,YES,CCDS43394.1,416/4386,,GTCTCTTGAAG,COSM1319228,,CDD:cd02073;PANTHER:PTHR24092:SF79;PANTHER:PTH...,ENSP00000313600,,,6/26,COSV58779196,,ENST00000327245,Transcript,,,ENSG00000118322,HGNC:13543,,,MODERATE,,NM_025153.3,,,,,,missense_variant,1,1.0,,benign(0.005),Unknown,,,,tolerated(1),1,O94823.175,ATP10B,HGNC,,-1,,1.0,UPI0000191DAE,O94823-1,SNV,"ATP10B,missense_variant,p.K139R,ENST0000032724...",1262/7565,muse;mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,,,,,,,,,,,,,,,,,,,,,,,0,,108,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,96,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
8,DOCK2,1794,BI,GRCh38,5,169698450,169698450,+,splice_donor_variant,Splice_Site,SNP,G,G,A,rs1429834722,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,66,66,,,ENST00000520908.7:c.1055+1G>A,,p.X352_splice,ENST00000520908,NM_004946.3,352,,,,,,,,,P1,A,,protein_coding,YES,CCDS4371.1,,,CACCCGTAAGA,COSM355254,,,ENSP00000429283,,,,rs1429834722;COSV56944648,,ENST00000520908,Transcript,NonExonic,1.0,ENSG00000134516,HGNC:2988,,,HIGH,11/51,NM_004946.3,4.6e-05,gnomAD_FIN,,,,splice_donor_variant,0;1,1.0,,,Unknown,,,,,0;1,Q92608.184,DOCK2,HGNC,,1,,2.0,UPI00001A38CC,Q92608-1,SNV,"DOCK2,splice_donor_variant,p.X48_splice,ENST00...",,muse;mutect2;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,8e-06,0.0,0.0,0.0,0.0,4.6e-05,9e-06,0.0,0.0,,,,,,,,,,,,,,0,,98,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,132,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS
9,NPM1,4869,BI,GRCh38,5,171410539,171410540,+,frameshift_variant,Frame_Shift_Ins,INS,-,-,TCTG,rs587776806,,TCGA-AB-2895-03A,TCGA-AB-2895-11A,,,,,,,,,Somatic,,,,,,,48,29,,,ENST00000296930.10:c.860_863dup,p.Trp288CysfsTer12,p.W288Cfs*12,ENST00000296930,NM_002520.7,287,ctc/cTCTGtc,11/11,,,,,,,P1,TCTG,L/LCX,protein_coding,,CCDS4376.1,859-860/885,pathogenic,AAGATCTCTGG,COSM158604,,PDB-ENSP_mappings:2llh.A;PDB-ENSP_mappings:2vx...,ENSP00000296930,,,11/11,rs587776806;COSV51548899;COSV51555098;COSV5155...,,ENST00000296930,Transcript,,1.0,ENSG00000181163,HGNC:7910,4.0,,HIGH,,NM_002520.7,,,,,,frameshift_variant,1;1;1;1;1,,15659725;20026798,,Unknown,,,,,0;1;1;1;1,P06748.243,NPM1,HGNC,,1,A0A0S2Z491.28,1.0,UPI00001303ED,P06748-1,insertion,"NPM1,frameshift_variant,p.W224Cfs*12,ENST00000...",959-960/1320,mutect2;pindel;varscan2,137dfe0a-2a2b-423c-9a8a-78233b6ead74,,,,,,,,,,,,,,,,,,,,,,,,0,,76,3fef7a26-431a-4ff5-8d6f-2ad825c500cf,77,598ec85a-72ce-40dc-b1df-876611eb4eeb,SUCCESS


In [53]:
# Report the table dimensions, then list every column name on the following line
print("shape", df.shape)
print("\nColumns:", df.columns.tolist(), sep="\n")

shape (569, 140)

Columns:
['Hugo_Symbol', 'Entrez_Gene_Id', 'Center', 'NCBI_Build', 'Chromosome', 'Start_Position', 'End_Position', 'Strand', 'Consequence', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2', 'dbSNP_RS', 'dbSNP_Val_Status', 'Tumor_Sample_Barcode', 'Matched_Norm_Sample_Barcode', 'Match_Norm_Seq_Allele1', 'Match_Norm_Seq_Allele2', 'Tumor_Validation_Allele1', 'Tumor_Validation_Allele2', 'Match_Norm_Validation_Allele1', 'Match_Norm_Validation_Allele2', 'Verification_Status', 'Validation_Status', 'Mutation_Status', 'Sequencing_Phase', 'Sequence_Source', 'Validation_Method', 'Score', 'BAM_File', 'Sequencer', 't_ref_count', 't_alt_count', 'n_ref_count', 'n_alt_count', 'HGVSc', 'HGVSp', 'HGVSp_Short', 'Transcript_ID', 'RefSeq', 'Protein_position', 'Codons', 'Exon_Number', '1000G_AF', '1000G_AFR_AF', '1000G_AMR_AF', '1000G_EAS_AF', '1000G_EUR_AF', '1000G_SAS_AF', 'APPRIS', 'Allele', 'Amino_acids', 'BIOTYPE', 'CANONICAL', 'CCD

In [54]:
# Concise structural summary of the frame: index range, column count, dtype breakdown, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Columns: 140 entries, Hugo_Symbol to Annotation_Status
dtypes: float64(62), int64(9), object(69)
memory usage: 622.5+ KB


In [55]:
# Fraction of missing entries per column (1.0 = entirely empty), most-missing first
missing_values = (
    df.isnull()
    .mean()
    .sort_values(ascending=False)
)
missing_values.head(n=20)

genomic_location_explanation     1.0
TRANSCRIPTION_FACTORS            1.0
n_ref_count                      1.0
Sequencer                        1.0
BAM_File                         1.0
Score                            1.0
Validation_Method                1.0
Sequence_Source                  1.0
Sequencing_Phase                 1.0
Validation_Status                1.0
Verification_Status              1.0
Match_Norm_Validation_Allele2    1.0
Match_Norm_Validation_Allele1    1.0
Tumor_Validation_Allele2         1.0
Tumor_Validation_Allele1         1.0
Match_Norm_Seq_Allele2           1.0
Match_Norm_Seq_Allele1           1.0
MOTIF_NAME                       1.0
MOTIF_POS                        1.0
RNA_ref_count                    1.0
dtype: float64

In [56]:
# Summary statistics of the numeric columns, transposed so each column becomes one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Entrez_Gene_Id,569.0,605141.9,7505472.0,43.0,4869.0,22999.0,80005.0,109729100.0
Start_Position,569.0,76214910.0,55983500.0,900111.0,32878440.0,61735620.0,116852100.0,248593000.0
End_Position,569.0,76214910.0,55983500.0,900111.0,32878440.0,61735620.0,116852100.0,248593000.0
dbSNP_Val_Status,0.0,,,,,,,
Match_Norm_Seq_Allele1,0.0,,,,,,,
Match_Norm_Seq_Allele2,0.0,,,,,,,
Tumor_Validation_Allele1,0.0,,,,,,,
Tumor_Validation_Allele2,0.0,,,,,,,
Match_Norm_Validation_Allele1,0.0,,,,,,,
Match_Norm_Validation_Allele2,0.0,,,,,,,


In [69]:
# Columns whose meaning is not yet clear: print the first five values of each one
unknown_columns = [
    "HGVSp_Short",
    "Allele",
    "One_Consequence",
    "PolyPhen",
    "SIFT",
    "SOMATIC",
    "VARIANT_CLASS",
]
for col in unknown_columns:
    if col not in df.columns:
        continue  # column absent from this table; nothing to show
    print(f"\nColumn: {col}")
    print(df[col].head(5))


Column: HGVSp_Short
0     p.R1150H
1    p.S175dup
2      p.V588M
3       p.S72P
4      p.R882H
Name: HGVSp_Short, dtype: object

Column: Allele
0       A
1    CTCC
2       A
3       G
4       T
Name: Allele, dtype: object

Column: One_Consequence
0     missense_variant
1    inframe_insertion
2     missense_variant
3     missense_variant
4     missense_variant
Name: One_Consequence, dtype: object

Column: PolyPhen
0    benign(0.045)
1              NaN
2    benign(0.051)
3    benign(0.006)
4    benign(0.067)
Name: PolyPhen, dtype: object

Column: SIFT
0    tolerated(0.16)
1                NaN
2    tolerated(0.23)
3    tolerated(0.11)
4     deleterious(0)
Name: SIFT, dtype: object

Column: SOMATIC
0        NaN
1        NaN
2        0;1
3        0;1
4    0;1;1;1
Name: SOMATIC, dtype: object

Column: VARIANT_CLASS
0      SNV
1    indel
2      SNV
3      SNV
4      SNV
Name: VARIANT_CLASS, dtype: object


In [66]:
# Line up each allele with the gene it belongs to.
# `gene` and `allele` are standalone Series; they are not used further in this cell.
gene, allele = df["Hugo_Symbol"], df["Allele"]
gene_and_allele = df.loc[:, ["Hugo_Symbol", "Allele"]]
print(gene_and_allele.head(n=10))

  Hugo_Symbol Allele
0       MTCL1      A
1       HSH2D   CTCC
2      PRDM16      A
3       DCAF8      G
4      DNMT3A      T
5      CLASP2      T
6      SH3TC2      T
7      ATP10B      C
8       DOCK2      A
9        NPM1   TCTG


## Interesting columns
* HGVSp_Short
* Allele
* One_Consequence
* PolyPhen
* SIFT
* SOMATIC
* VARIANT_CLASS
* Variant_Classification

### Key Columns Used in This Analysis



- **Hugo_Symbol:** which gene is affected by a mutation.
  The official gene name (from the HUGO Gene Nomenclature Committee).  
  *Example*: `TP53`, `NPM1`, `DNMT3A`.  
  

- **Variant_Classification:** what kind of mutation it is.
  The type of mutation in terms of its biological effect.  
  *Examples*:  
  - `Missense_Mutation` → amino acid change  
  - `Silent` → no change to the protein  
  - `Nonsense_Mutation` → introduces a stop codon  
  - `Frame_Shift_Ins/Del` → disrupts protein reading frame  
  - `Splice_Site` → alters RNA splicing  
  

- **Tumor_Sample_Barcode:** links each mutation to an individual patient, which allows the mutation burden per sample to be computed.
  A unique identifier for each tumor sample (patient).  
  *Example*: `TCGA-AB-2935-03A`.  
  

- **t_ref_count**  
  Number of sequencing reads supporting the **reference (normal)** allele in the tumor sample.  
  *Example*: `63` means 63 reads match the reference base.

- **t_alt_count**  
  Number of sequencing reads supporting the **alternate (mutated)** allele in the tumor sample.  
  *Example*: `45` means 45 reads match the mutation.  


### Summary of mutation

- Total rows: 569
- Unique tumor samples: 72

**Top mutated genes:**
- NPM1, TP53, DNMT3A, IDH2, and RUNX1 appear most frequently (FLT3 and KRAS are also among the top ten), consistent with known cancer drivers.

**Mutation classes:**
- Missense mutations dominate (365), followed by silent mutations (110).
- Frameshift and nonsense mutations occur less frequently but often have higher impact.

**Mutations per sample:**
- Some patients have >30 mutations, others fewer than 10.
- This suggests heterogeneity in mutational burden.




In [79]:
# Restrict the table to the columns this analysis focuses on and show the first ten rows
key_columns = [
    "Hugo_Symbol",
    "Variant_Classification",
    "t_ref_count",
    "t_alt_count",
    "hotspot",
]
df_key = df.loc[:, key_columns]
print(df_key.head(n=10))

  Hugo_Symbol Variant_Classification  t_ref_count  t_alt_count  hotspot
0       MTCL1      Missense_Mutation           26            6        0
1       HSH2D           In_Frame_Ins           89            9        0
2      PRDM16      Missense_Mutation           25           35        0
3       DCAF8      Missense_Mutation           44           35        0
4      DNMT3A      Missense_Mutation           63           45        0
5      CLASP2        Frame_Shift_Ins          144           90        0
6      SH3TC2      Missense_Mutation          146           38        0
7      ATP10B      Missense_Mutation           52           44        0
8       DOCK2            Splice_Site           66           66        0
9        NPM1        Frame_Shift_Ins           48           29        0


In [77]:
# Explore mutation-related columns: print the ten most frequent values of each,
# skipping any column that is not present in the table
for column, heading in (
    ("Variant_Classification", "Top mutation classifications:"),
    ("Hugo_Symbol", "\nTop mutated genes:"),
):
    if column in df.columns:
        print(heading)
        print(df[column].value_counts().head(10))


Top mutation classifications:
Variant_Classification
Missense_Mutation    365
Silent               110
Frame_Shift_Ins       30
Nonsense_Mutation     22
Splice_Site           13
Frame_Shift_Del        9
Splice_Region          6
In_Frame_Ins           4
Intron                 4
5'Flank                2
Name: count, dtype: int64

Top mutated genes:
Hugo_Symbol
NPM1      8
TP53      7
DNMT3A    6
IDH2      5
RUNX1     4
TTN       3
GOLGA4    3
FLT3      3
FREM2     3
KRAS      3
Name: count, dtype: int64
