In [2]:
import pandas as pd

In [24]:
df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/annotated_normalized_tumor_outputt2t.vcf",
                 sep="\t",
                 comment="#",
                 dtype="str",
                 header=None,
                 names=["chr","pos","id","ref","alt","qual","filter","info","format","sample"])
df

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample
0,chr1,4427,.,A,C,12.2,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV
1,chr1,4438,.,C,T,10.8,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:10:3:3:1:DV
2,chr1,4440,.,G,A,11.3,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV
3,chr1,4464,.,C,T,6.2,PASS,.,GT:GQ:DP:AD:VAF:C,0/1:6:3:2:0.666667:DV
4,chr1,4501,.,T,C,13.7,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:13:3:3:1:DV
...,...,...,...,...,...,...,...,...,...,...
7598646,chrY,62456021,.,G,T,6.5,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:5:2:2:1:DV
7598647,chrY,62456075,.,G,T,13.4,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:6:2:2:1:DV
7598648,chrY,62456143,.,AG,A,2.6,refCall,.,GT:GQ:DP:AD:VAF:C,./.:3:2:2:1:DV
7598649,chrY,62456186,.,AG,A,6.1,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:4:2:2:1:DV


In [25]:
print("INFO field of first variant:")
print(df.loc[0, "info"])
df_filtered = df.copy()

INFO field of first variant:
.


In [26]:
def parse_sample_format(format_str, sample_str):
    """
    Parses FORMAT and SAMPLE fields of a VCF into a dictionary
    Handles numeric parsing and multi-allelic values robustly
    """
    keys = format_str.split(":")
    values = sample_str.split(":")
    sample_dict = dict(zip(keys, values))

    # Parse GT
    sample_dict["GT"] = sample_dict.get("GT", "")

    # Parse DP
    try:
        sample_dict["DP"] = int(sample_dict.get("DP", 0))
    except ValueError:
        sample_dict["DP"] = 0

    # Parse AD
    ad_val = sample_dict.get("AD", "0")
    if "," in ad_val:
        try:
            ad_list = [int(a) for a in ad_val.split(",") if a.isdigit()]
            sample_dict["AD"] = sum(ad_list[1:])  # Only ALT allele depth(s)
        except:
            sample_dict["AD"] = 0
    else:
        sample_dict["AD"] = int(ad_val) if ad_val.isdigit() else 0

    # Parse VAF (handle multiple values by taking max)
    vaf_val = sample_dict.get("VAF", "0.0")
    try:
        if "," in vaf_val:
            vaf_list = [float(v) for v in vaf_val.split(",") if v]
            sample_dict["VAF"] = max(vaf_list) if vaf_list else 0.0
        else:
            sample_dict["VAF"] = float(vaf_val)
    except:
        sample_dict["VAF"] = 0.0

    return sample_dict


def get_genotype_from_GT(gt_str):
    """
    Determines genotype type based on GT field
    """
    alleles = gt_str.replace('|', '/').split('/')
    if len(alleles) == 2:
        if alleles[0] == alleles[1]:
            return "homozygous"
        else:
            return "heterozygous"
    return "unknown"

def extract_gt(sample_str, format_str):
    keys = format_str.split(":")
    values = sample_str.split(":")
    format_dict = dict(zip(keys, values))
    gt = format_dict.get("GT", "")
    return get_genotype_from_GT(gt)

def filter_variant_by_sample(sample_dict, min_dp=10, min_ad=3, min_vaf=0.1):
    """
    Apply basic quality filters on depth, allele depth, and VAF
    """
    dp = sample_dict.get("DP", 0)
    ad = sample_dict.get("AD", 0)
    vaf = sample_dict.get("VAF", 0.0)
    
    return (dp >= min_dp) and (ad >= min_ad) and (vaf >= min_vaf)

def get_cosmic_annotation(info_str):
    """
    Parses a COSMIC-style INFO field and returns a dictionary of annotations.
    This assumes the INFO string is formatted as key=value;key=value;...
    """
    info_dict = {}
    for entry in info_str.split(";"):
        if "=" in entry:
            key, value = entry.split("=", 1)
            info_dict[key] = value
    return info_dict


def extract_gene(annotation):
    return annotation.get("GENE_SYMBOL")

def extract_protein_position(annotation):
    return annotation.get("MUTATION_AA")

def extract_mutation_description(annotation):
    return annotation.get("MUTATION_DESCRIPTION") or annotation.get("rsid")


In [27]:
# Extract sample info
df_filtered["sample_info"] = df_filtered.apply(lambda row: parse_sample_format(row["format"], row["sample"]), axis=1)

# Add specific fields as separate columns
df_filtered["DP"]  = df_filtered["sample_info"].apply(lambda x: x["DP"])
df_filtered["AD"]  = df_filtered["sample_info"].apply(lambda x: x["AD"])
df_filtered["VAF"] = df_filtered["sample_info"].apply(lambda x: x["VAF"])
df_filtered["GT"]  = df_filtered["sample_info"].apply(lambda x: x["GT"])
df_filtered["genotype"] = df_filtered["GT"].apply(get_genotype_from_GT)

# Apply filtering
df_filtered["pass_sample_filters"] = df_filtered["sample_info"].apply(filter_variant_by_sample)

# Extract VEP annotations
df_filtered["cosmic_ann"] = df_filtered["info"].apply(get_cosmic_annotation)

df_filtered["gene"] = df_filtered["cosmic_ann"].apply(extract_gene)
df_filtered["protein_position"] = df_filtered["cosmic_ann"].apply(extract_protein_position)
df_filtered["mutation_description"] = df_filtered["cosmic_ann"].apply(extract_mutation_description)


In [28]:
df_filtered

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
0,chr1,4427,.,A,C,12.2,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
1,chr1,4438,.,C,T,10.8,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:10:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
2,chr1,4440,.,G,A,11.3,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
3,chr1,4464,.,C,T,6.2,PASS,.,GT:GQ:DP:AD:VAF:C,0/1:6:3:2:0.666667:DV,...,3,2,0.666667,0/1,heterozygous,False,{},,,
4,chr1,4501,.,T,C,13.7,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:13:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598646,chrY,62456021,.,G,T,6.5,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:5:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598647,chrY,62456075,.,G,T,13.4,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:6:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598648,chrY,62456143,.,AG,A,2.6,refCall,.,GT:GQ:DP:AD:VAF:C,./.:3:2:2:1:DV,...,2,2,1.000000,./.,homozygous,False,{},,,
7598649,chrY,62456186,.,AG,A,6.1,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:4:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,


In [29]:
df_intronless = df_filtered[df_filtered["mutation_description"] != "intron_variant"]
df_intronless

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
0,chr1,4427,.,A,C,12.2,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
1,chr1,4438,.,C,T,10.8,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:10:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
2,chr1,4440,.,G,A,11.3,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
3,chr1,4464,.,C,T,6.2,PASS,.,GT:GQ:DP:AD:VAF:C,0/1:6:3:2:0.666667:DV,...,3,2,0.666667,0/1,heterozygous,False,{},,,
4,chr1,4501,.,T,C,13.7,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:13:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598646,chrY,62456021,.,G,T,6.5,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:5:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598647,chrY,62456075,.,G,T,13.4,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:6:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598648,chrY,62456143,.,AG,A,2.6,refCall,.,GT:GQ:DP:AD:VAF:C,./.:3:2:2:1:DV,...,2,2,1.000000,./.,homozygous,False,{},,,
7598649,chrY,62456186,.,AG,A,6.1,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:4:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,


In [30]:
df_syn = df_intronless[df_intronless["mutation_description"] != "synonymous_variant"]
df_syn

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
0,chr1,4427,.,A,C,12.2,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
1,chr1,4438,.,C,T,10.8,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:10:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
2,chr1,4440,.,G,A,11.3,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
3,chr1,4464,.,C,T,6.2,PASS,.,GT:GQ:DP:AD:VAF:C,0/1:6:3:2:0.666667:DV,...,3,2,0.666667,0/1,heterozygous,False,{},,,
4,chr1,4501,.,T,C,13.7,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:13:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598646,chrY,62456021,.,G,T,6.5,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:5:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598647,chrY,62456075,.,G,T,13.4,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:6:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598648,chrY,62456143,.,AG,A,2.6,refCall,.,GT:GQ:DP:AD:VAF:C,./.:3:2:2:1:DV,...,2,2,1.000000,./.,homozygous,False,{},,,
7598649,chrY,62456186,.,AG,A,6.1,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:4:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,


In [31]:
df3 = df_syn[df_syn["mutation_description"] != "3_prime_UTR_variant"]
df3

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
0,chr1,4427,.,A,C,12.2,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
1,chr1,4438,.,C,T,10.8,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:10:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
2,chr1,4440,.,G,A,11.3,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
3,chr1,4464,.,C,T,6.2,PASS,.,GT:GQ:DP:AD:VAF:C,0/1:6:3:2:0.666667:DV,...,3,2,0.666667,0/1,heterozygous,False,{},,,
4,chr1,4501,.,T,C,13.7,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:13:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598646,chrY,62456021,.,G,T,6.5,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:5:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598647,chrY,62456075,.,G,T,13.4,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:6:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598648,chrY,62456143,.,AG,A,2.6,refCall,.,GT:GQ:DP:AD:VAF:C,./.:3:2:2:1:DV,...,2,2,1.000000,./.,homozygous,False,{},,,
7598649,chrY,62456186,.,AG,A,6.1,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:4:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,


In [32]:
df5 = df3[df3["mutation_description"] != "5_prime_UTR_variant"]
df5

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
0,chr1,4427,.,A,C,12.2,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
1,chr1,4438,.,C,T,10.8,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:10:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
2,chr1,4440,.,G,A,11.3,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:11:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
3,chr1,4464,.,C,T,6.2,PASS,.,GT:GQ:DP:AD:VAF:C,0/1:6:3:2:0.666667:DV,...,3,2,0.666667,0/1,heterozygous,False,{},,,
4,chr1,4501,.,T,C,13.7,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:13:3:3:1:DV,...,3,3,1.000000,1/1,homozygous,False,{},,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7598646,chrY,62456021,.,G,T,6.5,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:5:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598647,chrY,62456075,.,G,T,13.4,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:6:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,
7598648,chrY,62456143,.,AG,A,2.6,refCall,.,GT:GQ:DP:AD:VAF:C,./.:3:2:2:1:DV,...,2,2,1.000000,./.,homozygous,False,{},,,
7598649,chrY,62456186,.,AG,A,6.1,PASS,.,GT:GQ:DP:AD:VAF:C,1/1:4:2:2:1:DV,...,2,2,1.000000,1/1,homozygous,False,{},,,


In [33]:
df5["qual"] = pd.to_numeric(df5["qual"]).astype(int)

df_fin = df5[
    (df5["filter"] == "PASS") &
    (df5["qual"] > 5) &
    (df5["AD"] > 5) &
    (df5["mutation_description"].notna())
]

df_fin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5["qual"] = pd.to_numeric(df5["qual"]).astype(int)


Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
48927,chr1,14950750,.,T,C,24,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TR...,GT:GQ:DP:AD:VAF:C,1/1:24:29:29:1:P,...,29,29,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.Q221R,missense_variant
49000,chr1,14968816,.,G,A,23,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TR...,GT:GQ:DP:AD:VAF:C,1/1:23:27:26:0.963:P,...,27,26,0.963,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.A28V,missense_variant
102498,chr1,36149695,.,C,T,23,PASS,GENE_SYMBOL=THRAP3;COSMIC_GENE_ID=COSG77463;TR...,GT:GQ:DP:AD:VAF:C,0/1:23:30:18:0.6:P,...,30,18,0.600,0/1,heterozygous,True,"{'GENE_SYMBOL': 'THRAP3', 'COSMIC_GENE_ID': 'C...",THRAP3,p.A201V,missense_variant
126001,chr1,47138842,.,T,C,19,PASS,GENE_SYMBOL=STIL;COSMIC_GENE_ID=COSG80911;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:19:29:11:0.379:P,...,29,11,0.379,0/1,heterozygous,True,"{'GENE_SYMBOL': 'STIL', 'COSMIC_GENE_ID': 'COS...",STIL,p.H985R,missense_variant
126115,chr1,47180683,.,G,A,22,PASS,GENE_SYMBOL=STIL;COSMIC_GENE_ID=COSG80911;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:22:25:12:0.48:P,...,25,12,0.480,0/1,heterozygous,True,"{'GENE_SYMBOL': 'STIL', 'COSMIC_GENE_ID': 'COS...",STIL,p.A86V,missense_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291436,chr9,127266169,.,C,T,24,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:24:21:12:0.571:P,...,21,12,0.571,0/1,heterozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.R1066H,missense_variant
7291475,chr9,127278662,.,T,C,25,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRAN...,GT:GQ:DP:AD:VAF:C,1/1:25:32:32:1:P,...,32,32,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.Q539R,missense_variant
7303539,chr9,133284599,.,G,A,24,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TR...,GT:GQ:DP:AD:VAF:C,1/1:24:35:35:1:P,...,35,35,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.V56I,missense_variant
7303577,chr9,133294439,.,C,T,27,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TR...,GT:GQ:DP:AD:VAF:C,1/1:27:31:31:1:P,...,31,31,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.P216L,missense_variant


In [34]:
print(df_fin["mutation_description"].unique())
print(df_fin["gene"].unique())

['missense_variant' 'intron_variant,splice_region_variant'
 'missense_variant,splice_region_variant'
 'splice_region_variant,synonymous_variant' 'stop_gained']
['CASP9' 'THRAP3' 'STIL' 'TENT5C' 'NOTCH2' 'BCL9' 'NTRK1' 'FCRL4' 'FCGR2B'
 'PBX1' 'RGS7' 'BMPR1A' 'NUTM2D' 'MUC6' 'NUP98' 'FAT3' 'DDX10' 'KDM5A'
 'ERC1' 'CHD4' 'COL2A1' 'ATF1' 'NACA' 'HMGA2' 'PTPRB' 'SETD1B' 'NCOR2'
 'POLE' 'BAZ1A' 'FOXA1' 'NIN' 'TRIP11' 'GOLGA5' 'BUB1B' 'KNL1' 'NTRK3'
 'FES' 'CHD2' 'GRIN2A' 'CDH11' 'ZFHX3' 'RFWD3' 'CBFA2T3' 'FANCA' 'USP6'
 'RABEP1' 'PER1' 'NCOR1' 'SPECC1' 'SUZ12' 'ERBB2' 'RNF43' 'RNF213'
 'ASPSCR1' 'SETBP1' 'MUC16' 'CEP89' 'ALK' 'BIRC6' 'EPAS1' 'MSH6' 'RGPD3'
 'RANBP2' 'LRP1B' 'ITGAV' 'ERBB4' 'BARD1' 'CRNKL1' 'PLCG1' 'TMPRSS2' 'ISX'
 'MRTFA' 'MLH1' 'MITF' 'GATA2' 'ATR' 'WWTR1' 'GMPS' 'MLF1' 'MUC4' 'N4BP2'
 'KIT' 'AFF1' 'TET2' 'FAT4' 'CASP3' 'FAT1' 'SDHA' 'DROSHA' 'IL7R' 'IL6ST'
 'RAD17' 'ACSL6' 'FGFR4' 'FLT4' 'HLA-A' 'DAXX' 'CCND3' 'ECT2L' 'EZR'
 'PMS2' 'EGFR' 'ELN' 'CUX1' 'MET' 'POT1' 'CNTNAP

In [35]:
df_fin.to_csv("/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/normalized_annotated_normalized_tumort2t_filtered_mutated.vcf", sep="\t")

In [36]:
import pandas as pd
df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/normalized_annotated_normalized_tumort2t_filtered_mutated.vcf",
                 sep="\t",
                 comment="#",
                 dtype="str",
                 header=None,
                 names=["no", "chr","pos","id","ref","alt","qual","filter","info","format","sample"])
df

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,no,chr,pos,id,ref,alt,qual,filter,info,format,sample
,chr,pos,id,ref,alt,qual,filter,info,format,sample,sample_info,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
48927,chr1,14950750,.,T,C,24,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TRANSCRIPT_ACCESSION=ENST00000333868.9;COSMIC_SAMPLE_ID=COSS2785973;COSMIC_PHENOTYPE_ID=COSO36004837;GENOMIC_MUTATION_ID=COSV61600812;LEGACY_MUTATION_ID=COSM6281180;MUTATION_ID=110430602;MUTATION_CDS=c.662A>G;MUTATION_AA=p.Q221R;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=15506048;GENOME_STOP=15506048;STRAND=-;PUBMED_PMID=27175599;COSMIC_STUDY_ID;HGVSP=ENSP00000330237.5:p.Gln221Arg;HGVSC=ENST00000333868.9:c.662A>G;HGVSG=1:g.15506048T>C;GENOMIC_WT_ALLELE=T;GENOMIC_MUT_ALLELE=C,GT:GQ:DP:AD:VAF:C,1/1:24:29:29:1:P,"{'GT': '1/1', 'GQ': '24', 'DP': 29, 'AD': 29, ...",29,29,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.Q221R,missense_variant
49000,chr1,14968816,.,G,A,23,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TRANSCRIPT_ACCESSION=ENST00000333868.9;COSMIC_SAMPLE_ID=COSS2296299;COSMIC_PHENOTYPE_ID=COSO29324830;GENOMIC_MUTATION_ID=COSV61600760;LEGACY_MUTATION_ID=COSM3750476;MUTATION_ID=110430846;MUTATION_CDS=c.83C>T;MUTATION_AA=p.A28V;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=15524118;GENOME_STOP=15524118;STRAND=-;PUBMED_PMID=25275298;COSMIC_STUDY_ID;HGVSP=ENSP00000330237.5:p.Ala28Val;HGVSC=ENST00000333868.9:c.83C>T;HGVSG=1:g.15524118G>A;GENOMIC_WT_ALLELE=G;GENOMIC_MUT_ALLELE=A,GT:GQ:DP:AD:VAF:C,1/1:23:27:26:0.963:P,"{'GT': '1/1', 'GQ': '23', 'DP': 27, 'AD': 26, ...",27,26,0.963,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.A28V,missense_variant
102498,chr1,36149695,.,C,T,23,PASS,GENE_SYMBOL=THRAP3;COSMIC_GENE_ID=COSG77463;TRANSCRIPT_ACCESSION=ENST00000354618.9;COSMIC_SAMPLE_ID=COSS2955773;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV100663623;LEGACY_MUTATION_ID=COSM9180497;MUTATION_ID=113249093;MUTATION_CDS=c.602C>T;MUTATION_AA=p.A201V;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=36286832;GENOME_STOP=36286832;STRAND=+;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000346634.5:p.Ala201Val;HGVSC=ENST00000354618.9:c.602C>T;HGVSG=1:g.36286832C>T;GENOMIC_WT_ALLELE=C;GENOMIC_MUT_ALLELE=T,GT:GQ:DP:AD:VAF:C,0/1:23:30:18:0.6:P,"{'GT': '0/1', 'GQ': '23', 'DP': 30, 'AD': 18, ...",30,18,0.6,0/1,heterozygous,True,"{'GENE_SYMBOL': 'THRAP3', 'COSMIC_GENE_ID': 'C...",THRAP3,p.A201V,missense_variant
126001,chr1,47138842,.,T,C,19,PASS,GENE_SYMBOL=STIL;COSMIC_GENE_ID=COSG80911;TRANSCRIPT_ACCESSION=ENST00000371877.7;COSMIC_SAMPLE_ID=COSS2185970;COSMIC_PHENOTYPE_ID=COSO36284888;GENOMIC_MUTATION_ID=COSV54551803;LEGACY_MUTATION_ID=COSM4144058;MUTATION_ID=118445599;MUTATION_CDS=c.2954A>G;MUTATION_AA=p.H985R;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=1;GENOME_START=47260415;GENOME_STOP=47260415;STRAND=-;PUBMED_PMID;COSMIC_STUDY_ID=COSU589;HGVSP=ENSP00000360944.3:p.His985Arg;HGVSC=ENST00000371877.7:c.2954A>G;HGVSG=1:g.47260415T>C;GENOMIC_WT_ALLELE=T;GENOMIC_MUT_ALLELE=C,GT:GQ:DP:AD:VAF:C,0/1:19:29:11:0.379:P,"{'GT': '0/1', 'GQ': '19', 'DP': 29, 'AD': 11, ...",29,11,0.379,0/1,heterozygous,True,"{'GENE_SYMBOL': 'STIL', 'COSMIC_GENE_ID': 'COS...",STIL,p.H985R,missense_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291436,chr9,127266169,.,C,T,24,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRANSCRIPT_ACCESSION=ENST00000350763.8;COSMIC_SAMPLE_ID=COSS2955809;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV60785681;LEGACY_MUTATION_ID=COSM5009671;MUTATION_ID=113141353;MUTATION_CDS=c.3197G>A;MUTATION_AA=p.R1066H;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=115073620;GENOME_STOP=115073620;STRAND=-;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000265131.4:p.Arg1066His;HGVSC=ENST00000350763.8:c.3197G>A;HGVSG=9:g.115073620C>T;GENOMIC_WT_ALLELE=C;GENOMIC_MUT_ALLELE=T,GT:GQ:DP:AD:VAF:C,0/1:24:21:12:0.571:P,"{'GT': '0/1', 'GQ': '24', 'DP': 21, 'AD': 12, ...",21,12,0.571,0/1,heterozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.R1066H,missense_variant
7291475,chr9,127278662,.,T,C,25,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRANSCRIPT_ACCESSION=ENST00000350763.8;COSMIC_SAMPLE_ID=COSS2955802;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV107406352;LEGACY_MUTATION_ID=COSM10541708;MUTATION_ID=113142731;MUTATION_CDS=c.1616A>G;MUTATION_AA=p.Q539R;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=115086115;GENOME_STOP=115086115;STRAND=-;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000265131.4:p.Gln539Arg;HGVSC=ENST00000350763.8:c.1616A>G;HGVSG=9:g.115086115T>C;GENOMIC_WT_ALLELE=T;GENOMIC_MUT_ALLELE=C,GT:GQ:DP:AD:VAF:C,1/1:25:32:32:1:P,"{'GT': '1/1', 'GQ': '25', 'DP': 32, 'AD': 32, ...",32,32,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.Q539R,missense_variant
7303539,chr9,133284599,.,G,A,24,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TRANSCRIPT_ACCESSION=ENST00000373855.5;COSMIC_SAMPLE_ID=COSS2955803;COSMIC_PHENOTYPE_ID=COSO36605381;GENOMIC_MUTATION_ID=COSV53042713;LEGACY_MUTATION_ID=COSM6248842;MUTATION_ID=116323577;MUTATION_CDS=c.166G>A;MUTATION_AA=p.V56I;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=121088492;GENOME_STOP=121088492;STRAND=+;PUBMED_PMID=31636198;COSMIC_STUDY_ID;HGVSP=ENSP00000362962.1:p.Val56Ile;HGVSC=ENST00000373855.5:c.166G>A;HGVSG=9:g.121088492G>A;GENOMIC_WT_ALLELE=G;GENOMIC_MUT_ALLELE=A,GT:GQ:DP:AD:VAF:C,1/1:24:35:35:1:P,"{'GT': '1/1', 'GQ': '24', 'DP': 35, 'AD': 35, ...",35,35,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.V56I,missense_variant
7303577,chr9,133294439,.,C,T,27,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TRANSCRIPT_ACCESSION=ENST00000373855.5;COSMIC_SAMPLE_ID=COSS2385246;COSMIC_PHENOTYPE_ID=COSO27984932;GENOMIC_MUTATION_ID=COSV53045057;LEGACY_MUTATION_ID=COSM4407606;MUTATION_ID=116323581;MUTATION_CDS=c.647C>T;MUTATION_AA=p.P216L;MUTATION_DESCRIPTION=missense_variant;MUTATION_ZYGOSITY;LOH=.;CHROMOSOME=9;GENOME_START=121098411;GENOME_STOP=121098411;STRAND=+;PUBMED_PMID;COSMIC_STUDY_ID=COSU533;HGVSP=ENSP00000362962.1:p.Pro216Leu;HGVSC=ENST00000373855.5:c.647C>T;HGVSG=9:g.121098411C>T;GENOMIC_WT_ALLELE=C;GENOMIC_MUT_ALLELE=T,GT:GQ:DP:AD:VAF:C,1/1:27:31:31:1:P,"{'GT': '1/1', 'GQ': '27', 'DP': 31, 'AD': 31, ...",31,31,1.0,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.P216L,missense_variant


In [37]:
print(df["sample"].unique())
print(df["info"].unique())

['mutation_description' 'missense_variant'
 'intron_variant,splice_region_variant'
 'missense_variant,splice_region_variant'
 'splice_region_variant,synonymous_variant' 'stop_gained']
['gene' 'CASP9' 'THRAP3' 'STIL' 'TENT5C' 'NOTCH2' 'BCL9' 'NTRK1' 'FCRL4'
 'FCGR2B' 'PBX1' 'RGS7' 'BMPR1A' 'NUTM2D' 'MUC6' 'NUP98' 'FAT3' 'DDX10'
 'KDM5A' 'ERC1' 'CHD4' 'COL2A1' 'ATF1' 'NACA' 'HMGA2' 'PTPRB' 'SETD1B'
 'NCOR2' 'POLE' 'BAZ1A' 'FOXA1' 'NIN' 'TRIP11' 'GOLGA5' 'BUB1B' 'KNL1'
 'NTRK3' 'FES' 'CHD2' 'GRIN2A' 'CDH11' 'ZFHX3' 'RFWD3' 'CBFA2T3' 'FANCA'
 'USP6' 'RABEP1' 'PER1' 'NCOR1' 'SPECC1' 'SUZ12' 'ERBB2' 'RNF43' 'RNF213'
 'ASPSCR1' 'SETBP1' 'MUC16' 'CEP89' 'ALK' 'BIRC6' 'EPAS1' 'MSH6' 'RGPD3'
 'RANBP2' 'LRP1B' 'ITGAV' 'ERBB4' 'BARD1' 'CRNKL1' 'PLCG1' 'TMPRSS2' 'ISX'
 'MRTFA' 'MLH1' 'MITF' 'GATA2' 'ATR' 'WWTR1' 'GMPS' 'MLF1' 'MUC4' 'N4BP2'
 'KIT' 'AFF1' 'TET2' 'FAT4' 'CASP3' 'FAT1' 'SDHA' 'DROSHA' 'IL7R' 'IL6ST'
 'RAD17' 'ACSL6' 'FGFR4' 'FLT4' 'HLA-A' 'DAXX' 'CCND3' 'ECT2L' 'EZR'
 'PMS2' 'EGFR' 'E

In [38]:
mlem = (df["info"].value_counts())
print(mlem[mlem > 1])

info
MUC16     46
FANCA      9
FAT1       6
FGFR4      5
RNF213     4
CRNKL1     4
RGPD3      4
CSMD3      4
MUC6       3
SPECC1     3
ALK        3
SDHA       3
TNC        3
ZFHX3      2
PTPRB      2
CUX1       2
HLA-A      2
KIT        2
PER1       2
RFWD3      2
USP6       2
STIL       2
NTRK1      2
NACA       2
NUTM2D     2
CASP9      2
LRP1B      2
MLF1       2
N4BP2      2
IL6ST      2
PMS2       2
NBN        2
CNTRL      2
CCND3      2
PCM1       2
Name: count, dtype: int64


In [None]:
"/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/normalized_annotated_shared_germline_new_filtered_mutated.vcf"
"/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/normalized_annotated_tumor_somatic_new_filtered_mutated.vcf"
"/lustre/scratch126/casm/team274sb/lr26/pepper-tumor38/normalized_annotated_shared_germline_new_filtered_mutated.vcf"
"/lustre/scratch126/casm/team274sb/lr26/pepper-tumor38/normalized_annotated_tumor_somatic_new_filtered_mutated.vcf"

In [39]:
colnames = ["unnamed", "chr", "pos", "ref", "alt", "count", "gene_symbol", "mutation_aa", "mutation_description"]
t2t_df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/T2T/cosmic_summary_hotspots_t2t.tsv", sep="\t", dtype=str, names = colnames)  # Adjust path/columns as needed
hg38_df = pd.read_csv("/lustre/scratch126/casm/team274sb/lr26/hg38/cosmic_summary_hotspots_hg38.tsv", sep="\t", dtype=str, names = colnames)  # Adjust path/columns as needed

In [40]:
t2t_df

Unnamed: 0,unnamed,chr,pos,ref,alt,count,gene_symbol,mutation_aa,mutation_description
0,,CHROM,POS,REF,ALT,COUNT,GENE_SYMBOL,MUTATION_AA,MUTATION_DESCRIPTION
1,704,chr1,1998343,G,A,12,TNFRSF14,p.W12*,stop_gained
2,753,chr1,1999405,C,T,12,TNFRSF14,p.Y35=,synonymous_variant
3,893,chr1,2001410,C,T,31,TNFRSF14,p.?,intron_variant
4,1119,chr1,2004535,G,A,22,TNFRSF14,p.V241I,missense_variant
...,...,...,...,...,...,...,...,...,...
5499,1077914,chrX,132740258,G,A,16,PHF6,p.R274Q,missense_variant
5500,1078007,chrX,132742440,C,T,23,PHF6,p.R319*,stop_gained
5501,1078505,chrX,151822337,GCTGGT,G,13,ATP2B3,p.L425_V426del,inframe_deletion
5502,1078857,chrX,151833356,C,T,14,ATP2B3,p.T901M,missense_variant


In [41]:
df_fin

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,DP,AD,VAF,GT,genotype,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description
48927,chr1,14950750,.,T,C,24,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TR...,GT:GQ:DP:AD:VAF:C,1/1:24:29:29:1:P,...,29,29,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.Q221R,missense_variant
49000,chr1,14968816,.,G,A,23,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TR...,GT:GQ:DP:AD:VAF:C,1/1:23:27:26:0.963:P,...,27,26,0.963,1/1,homozygous,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.A28V,missense_variant
102498,chr1,36149695,.,C,T,23,PASS,GENE_SYMBOL=THRAP3;COSMIC_GENE_ID=COSG77463;TR...,GT:GQ:DP:AD:VAF:C,0/1:23:30:18:0.6:P,...,30,18,0.600,0/1,heterozygous,True,"{'GENE_SYMBOL': 'THRAP3', 'COSMIC_GENE_ID': 'C...",THRAP3,p.A201V,missense_variant
126001,chr1,47138842,.,T,C,19,PASS,GENE_SYMBOL=STIL;COSMIC_GENE_ID=COSG80911;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:19:29:11:0.379:P,...,29,11,0.379,0/1,heterozygous,True,"{'GENE_SYMBOL': 'STIL', 'COSMIC_GENE_ID': 'COS...",STIL,p.H985R,missense_variant
126115,chr1,47180683,.,G,A,22,PASS,GENE_SYMBOL=STIL;COSMIC_GENE_ID=COSG80911;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:22:25:12:0.48:P,...,25,12,0.480,0/1,heterozygous,True,"{'GENE_SYMBOL': 'STIL', 'COSMIC_GENE_ID': 'COS...",STIL,p.A86V,missense_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7291436,chr9,127266169,.,C,T,24,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:24:21:12:0.571:P,...,21,12,0.571,0/1,heterozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.R1066H,missense_variant
7291475,chr9,127278662,.,T,C,25,PASS,GENE_SYMBOL=TNC;COSMIC_GENE_ID=COSG105617;TRAN...,GT:GQ:DP:AD:VAF:C,1/1:25:32:32:1:P,...,32,32,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'TNC', 'COSMIC_GENE_ID': 'COSG...",TNC,p.Q539R,missense_variant
7303539,chr9,133284599,.,G,A,24,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TR...,GT:GQ:DP:AD:VAF:C,1/1:24:35:35:1:P,...,35,35,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.V56I,missense_variant
7303577,chr9,133294439,.,C,T,27,PASS,GENE_SYMBOL=CNTRL;COSMIC_GENE_ID=COSG105745;TR...,GT:GQ:DP:AD:VAF:C,1/1:27:31:31:1:P,...,31,31,1.000,1/1,homozygous,True,"{'GENE_SYMBOL': 'CNTRL', 'COSMIC_GENE_ID': 'CO...",CNTRL,p.P216L,missense_variant


In [42]:
annotated_df = pd.merge(df_fin, t2t_df, on=["chr", "pos", "ref", "alt"], how="left")
# Merge VCF and annotation TSV
annotated_df = annotated_df[annotated_df["count"].notna()]


In [43]:
annotated_df


Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,pass_sample_filters,cosmic_ann,gene,protein_position,mutation_description_x,unnamed,count,gene_symbol,mutation_aa,mutation_description_y
0,chr1,14950750,.,T,C,24,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TR...,GT:GQ:DP:AD:VAF:C,1/1:24:29:29:1:P,...,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.Q221R,missense_variant,15376,12,CASP9,p.Q221R,missense_variant
1,chr1,14968816,.,G,A,23,PASS,GENE_SYMBOL=CASP9;COSMIC_GENE_ID=COSG102681;TR...,GT:GQ:DP:AD:VAF:C,1/1:23:27:26:0.963:P,...,True,"{'GENE_SYMBOL': 'CASP9', 'COSMIC_GENE_ID': 'CO...",CASP9,p.A28V,missense_variant,15518,26,CASP9,p.A28V,missense_variant
16,chr10,88248713,.,G,A,20,PASS,GENE_SYMBOL=NUTM2D;COSMIC_GENE_ID=COSG61484;TR...,GT:GQ:DP:AD:VAF:C,0/1:20:38:22:0.579:P,...,True,"{'GENE_SYMBOL': 'NUTM2D', 'COSMIC_GENE_ID': 'C...",NUTM2D,p.E474K,missense_variant,90416,25,NUTM2D,p.E474K,missense_variant
17,chr11,1076512,.,T,G,24,PASS,GENE_SYMBOL=MUC6;COSMIC_GENE_ID=COSG58094;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:24:35:20:0.571:P,...,True,"{'GENE_SYMBOL': 'MUC6', 'COSMIC_GENE_ID': 'COS...",MUC6,p.Q1735H,missense_variant,106493,23,MUC6,p.Q1735H,missense_variant
19,chr11,1097695,.,C,T,22,PASS,GENE_SYMBOL=MUC6;COSMIC_GENE_ID=COSG58094;TRAN...,GT:GQ:DP:AD:VAF:C,0/1:22:35:20:0.571:P,...,True,"{'GENE_SYMBOL': 'MUC6', 'COSMIC_GENE_ID': 'COS...",MUC6,p.V619M,missense_variant,107325,21,MUC6,p.V619M,missense_variant
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,chr8,91069483,.,T,C,25,PASS,GENE_SYMBOL=NBN;COSMIC_GENE_ID=COSG81202;TRANS...,GT:GQ:DP:AD:VAF:C,1/1:25:26:26:1:P,...,True,"{'GENE_SYMBOL': 'NBN', 'COSMIC_GENE_ID': 'COSG...",NBN,p.?,"intron_variant,splice_region_variant",940092,13,NBN,p.?,"('intron_variant', 'splice_region_variant')"
210,chr8,91101432,.,C,G,26,PASS,GENE_SYMBOL=NBN;COSMIC_GENE_ID=COSG81202;TRANS...,GT:GQ:DP:AD:VAF:C,1/1:26:27:27:1:P,...,True,"{'GENE_SYMBOL': 'NBN', 'COSMIC_GENE_ID': 'COSG...",NBN,p.E185Q,missense_variant,940509,27,NBN,p.E185Q,missense_variant
216,chr8,145685942,.,T,A,17,PASS,GENE_SYMBOL=RECQL4;COSMIC_GENE_ID=COSG81638;TR...,GT:GQ:DP:AD:VAF:C,0/1:17:13:7:0.538:P,...,True,"{'GENE_SYMBOL': 'RECQL4', 'COSMIC_GENE_ID': 'C...",RECQL4,p.?,"intron_variant,splice_region_variant",986030,25,RECQL4,p.?,"('intron_variant', 'splice_region_variant')"
219,chr9,105426425,.,G,A,24,PASS,GENE_SYMBOL=WNK2;COSMIC_GENE_ID=COSG97812;TRAN...,GT:GQ:DP:AD:VAF:C,1/1:24:25:25:1:P,...,True,"{'GENE_SYMBOL': 'WNK2', 'COSMIC_GENE_ID': 'COS...",WNK2,p.V828M,missense_variant,1023119,18,WNK2,p.V828M,missense_variant


In [44]:
annotated_df.to_csv("/lustre/scratch126/casm/team274sb/lr26/pepper-tumor1B01/normalized_annotated_tumort2t_filtered_mutated_hotspots.vcf", sep="\t")