In [2]:
import pandas as pd

In [2]:
# Input selection: the two commented-out lines are alternative sources kept for reference;
# only the final read_csv call executes.
# data = pd.read_excel('E:/OneDrive_Outlook/OneDrive/Documents/GitHub/MAF File/patho_only_finalsubset.xlsx', sheet_name= 'Sheet1')
#data = pd.read_csv(r'H:\My Drive\Pathogenic_Landscape\data\indigene\Indig_patho_only_df.csv')
# NOTE(review): the following cell assigns `data` again from an Excel workbook, so on a
# top-to-bottom run this frame is replaced before anything reads it — confirm which input is intended.
data = pd.read_csv(r'H:\My Drive\Pathogenic_Landscape\data\absolute\clinical_research_filtered_combined\dataframes\patho_only_df.csv')

In [3]:
# Load the workbook into `data`. This rebinds the name, discarding the frame read from
# patho_only_df.csv in the previous cell; everything below is built from this frame.
# NOTE(review): presumably the combined absolute + indigene table (output is later saved as
# Combined_Patho_MAF.maf) — confirm.
data = pd.read_excel(r'H:\My Drive\Pathogenic_Landscape\assets\absolute_indie_patho\absolute_Indie_patho.xlsx')

In [6]:
#Variant_Type
def determine_variant_type(ref, alt):
    """Classify a variant into a MAF ``Variant_Type`` from its two alleles.

    Parameters
    ----------
    ref : str
        Reference allele. A lone ``"-"`` is treated as an absent (zero-length) allele.
    alt : str
        Alternate allele. A lone ``"-"`` is treated as an absent (zero-length) allele.

    Returns
    -------
    str
        ``"INS"`` when the alternate allele is longer, ``"DEL"`` when it is shorter,
        otherwise ``"SNP"``, ``"DNP"`` or ``"TNP"`` for equal lengths of 1, 2 or 3,
        and ``"ONP"`` for any other equal length.

    Raises
    ------
    TypeError
        If an allele has no length (for example a float NaN from a missing cell).
    """
    # "-" is a placeholder for "no bases", not a one-base allele. Counting it as
    # length 1 would label "-" -> "A" (an insertion) and "A" -> "-" (a deletion) as SNP.
    ref_len = 0 if ref == "-" else len(ref)
    alt_len = 0 if alt == "-" else len(alt)

    if ref_len < alt_len:
        return "INS"
    if ref_len > alt_len:
        return "DEL"

    # Equal lengths: a substitution, named by how many bases are replaced.
    return {1: "SNP", 2: "DNP", 3: "TNP"}.get(ref_len, "ONP")

def extract_exon(entry):
    """Return the exon field of the first annotation in a comma-separated string.

    The input is split on ``","`` and only the first item is inspected. That item is
    split on ``":"`` and its third field is returned (e.g. ``"exon9"`` from
    ``"CFTR:ENST00000426809:exon9:c.T1277C:p.V426A"``).

    Parameters
    ----------
    entry : str or any
        Annotation string. Non-string values (such as a float NaN from a missing
        cell) are treated as "no annotation".

    Returns
    -------
    str or None
        The third colon-separated field, or ``None`` when the input is not a string
        or the first annotation has fewer than three fields.
    """
    # Explicit guard instead of a blanket try/except: missing cells are expected input,
    # so they yield None quietly rather than printing one line per row.
    if not isinstance(entry, str):
        return None

    fields = entry.split(",")[0].split(":")
    return fields[2] if len(fields) >= 3 else None

GDC-compliant values for the MAF `Variant_Classification` column (partial list):
- Frame_Shift_Del
- Frame_Shift_Ins
- In_Frame_Del
- In_Frame_Ins
- Missense_Mutation
- Nonsense_Mutation
- Silent
- Splice_Site
- Translation_Start_Site
- Nonstop_Mutation
- RNA
- Targeted_Region

In [10]:
class VariantMapper:
    """Translate exonic-function labels into MAF ``Variant_Classification`` terms.

    The lookup is case-insensitive and ignores surrounding whitespace. Labels with
    no entry in the table are returned unchanged.
    """

    def __init__(self):
        # Keys are lower-case because map_variant lower-cases its input before lookup.
        self.variant_mapping = {
            "stopgain": "Nonsense_Mutation",
            "nonsynonymous snv": "Missense_Mutation",
            "synonymous snv": "Silent",
            "frameshift deletion": "Frame_Shift_Del",
            "frameshift insertion": "Frame_Shift_Ins",
            "nonframeshift insertion": "In_Frame_Ins",
            "nonframeshift deletion": "In_Frame_Del",
            "stoploss": "Nonstop_Mutation",
            "startloss": "Translation_Start_Site",
            # NOTE(review): presumably "." marks rows without an exonic function
            # (e.g. splicing variants) — verify against the annotation source.
            ".": "Splice_Site",
        }

    def map_variant(self, variant):
        """Return the MAF classification for ``variant``.

        Parameters
        ----------
        variant : any
            Label to translate. It is converted with ``str()``, stripped and
            lower-cased for the lookup only.

        Returns
        -------
        any
            The mapped classification string, or the original ``variant`` object
            (including NaN or other non-string values) when no mapping exists.
        """
        key = str(variant).strip().lower()
        return self.variant_mapping.get(key, variant)

mapper = VariantMapper()

# Annotation column that the transcript, exon, cDNA-change and protein-change
# fields are all parsed from.
aa_change = data["AAChange.ensGene"]

# Column name -> values, in the order the columns appear in the resulting table.
maf_columns = {
    # Sample and locus identifiers copied straight from the input frame.
    "Tumor_Sample_Barcode": data["Sample_Name"],
    "Hugo_Symbol": data["Ref.Gene"],
    "Chromosome": data["CHROM_x"],
    "Start_Position": data["POS_x"],
    "End_Position": data["End_x"],
    # Derived classification and type.
    "Variant_Classification": data["ExonicFunc.ensGene"].apply(mapper.map_variant),
    "Variant_Type": data.apply(
        lambda record: determine_variant_type(record["REF_x"], record["ALT_x"]),
        axis=1,
    ),
    # Allele1 repeats the reference allele; Allele2 carries the alternate allele.
    "Reference_Allele": data["REF_x"],
    "Tumor_Seq_Allele1": data["REF_x"],
    "Tumor_Seq_Allele2": data["ALT_x"],
    # Each pattern captures its first match in the annotation string.
    "tx": aa_change.str.extract(r'(ENST[0-9]+)', expand=False),
    "exon": aa_change.apply(extract_exon),
    "txChange": aa_change.str.extract(r'(c\.[^:]+)', expand=False),
    "aaChange": aa_change.str.extract(r'(p\.[^,]+)', expand=False),
    "avsnp150": data["avsnp150"],
}

maf_data = pd.DataFrame(maf_columns)


In [11]:
# Preview the first rows of the assembled MAF table as a quick sanity check.
maf_data.head()

Unnamed: 0,Tumor_Sample_Barcode,Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,tx,exon,txChange,aaChange,avsnp150
0,IN-423-TKKC-F-Merged,CFTR,chr7,117188852,117188852,Missense_Mutation,SNP,T,T,C,ENST00000426809,exon9,c.T1277C,p.V426A,rs193922500
1,IN-423-TKKC-F-Merged,CTNS,chr17,3543516,3543519,Frame_Shift_Del,DEL,GCTGA,GCTGA,G,ENST00000576979,exon1,c.16_19del,p.L6fs,rs786204501
2,IN-423-TKXA-F2-IE2-RE,PEX12,chr17,33904992,33904992,Nonsense_Mutation,SNP,G,G,A,ENST00000225873,exon1,c.C49T,p.Q17X,rs888633730
3,IN-423-TKXA-F2-IE2-RE,BMPR2,chr2,203383717,203383717,Missense_Mutation,SNP,A,A,G,ENST00000374574,exon6,c.A794G,p.E265G,rs1085307259
4,IN-423-TKXA-F2-IE2-RE,FYCO1,chr3,45965238,45965238,Missense_Mutation,SNP,C,C,T,ENST00000438446,exon5,c.G284A,p.R95Q,rs140159323


In [9]:
# Write the MAF table as tab-separated text without the index column.
# The commented-out line is an alternative output path under the indigene folder.
#maf_data.to_csv(r"H:\My Drive\Pathogenic_Landscape\data\indigene\Indig_Patho_MAF.maf", sep="\t", index=False)
# NOTE(review): on a top-to-bottom run `maf_data` is built from the frame loaded by the
# read_excel cell, so this call would save that table under the "Absolute" filename —
# confirm this cell should still run (its execution count predates the cell that builds maf_data).
maf_data.to_csv(r"H:\My Drive\Pathogenic_Landscape\data\absolute\clinical_research_filtered_combined\Absolute_Patho_MAF.maf", sep="\t", index=False)

In [12]:
# Write the same `maf_data` table to the absolute_indie_patho assets folder
# (tab-separated, index column omitted).
maf_data.to_csv(r"H:\My Drive\Pathogenic_Landscape\assets\absolute_indie_patho\Combined_Patho_MAF.maf", sep="\t", index=False)