## This notebook filters COSMIC-annotated variants
It is applied to the annotated somatic tumor variant calls aligned to the hg38 reference.

In [None]:
# import packages
import pandas as pd

In [None]:
## Read the decompressed VCF into a dataframe with one row per record.
# The file carries no usable header row for pandas (header=None), so the ten
# columns are named here; every column is kept as a string.
# NOTE(review): comment="#" makes pandas stop parsing a line at the first "#"
# wherever it occurs - confirm no data field contains "#".
vcf_path = "/lustre/scratch126/casm/team274sb/lr26/PacBio-deepvariant-tumor-hg38/annotated_tumor_somatic_cosmic_new38.vcf"
vcf_columns = [
    "chr",
    "pos",
    "id",
    "ref",
    "alt",
    "qual",
    "filter",
    "info",
    "format",
    "sample",
]
df = pd.read_csv(
    vcf_path,
    sep="\t",
    comment="#",
    dtype="str",
    header=None,
    names=vcf_columns,
)

In [None]:
# Show the INFO value of the first record (it was "." in the recorded run),
# then continue on a copy so the loaded dataframe itself is never modified.
print("INFO field of first variant:")
first_info = df.loc[0, "info"]
print(first_info)
df_filtered = df.copy()

INFO field of first variant:
.


In [None]:
### here are the main parsing functions
def parse_sample_format(format_str, sample_str):
    """
    Parse the FORMAT and sample columns of one VCF record into a dict.

    Keys are taken from ``format_str`` and values from ``sample_str``, both
    split on ":". Four keys are always present in the result, normalised as
    follows; every other FORMAT key keeps its raw string value.

    - "GT":  genotype string as found, "" if absent
    - "DP":  depth as int, 0 if absent or not an integer
    - "AD":  alternative allele depth as int. For a comma-separated value the
             first entry (reference) is skipped and the remaining numeric
             entries are summed; a single value is used as is. 0 if absent or
             not numeric.
    - "VAF": float. For a comma-separated value, the maximum of the entries
             that parse as a number. 0.0 if absent or nothing parses.
    """
    sample_dict = dict(zip(format_str.split(":"), sample_str.split(":")))

    # genotype: kept verbatim, only guaranteed to exist
    sample_dict["GT"] = sample_dict.get("GT", "")

    # depth: missing values such as "." fall back to 0
    try:
        sample_dict["DP"] = int(sample_dict.get("DP", 0))
    except ValueError:
        sample_dict["DP"] = 0

    # allele depth
    ad_val = sample_dict.get("AD", "0")
    if "," in ad_val:
        # Drop the reference entry by position *before* discarding
        # non-numeric entries, so a missing reference depth (".") cannot
        # shift an alternative depth into the reference slot.
        alt_depths = ad_val.split(",")[1:]
        # isdecimal() guarantees int() succeeds, so no except is needed.
        sample_dict["AD"] = sum(int(a) for a in alt_depths if a.isdecimal())
    else:
        sample_dict["AD"] = int(ad_val) if ad_val.isdecimal() else 0

    # variant allele fraction: take the maximal parsable entry, else 0.0.
    # Entries are converted one by one so a single missing value (".")
    # does not discard the valid ones next to it.
    vaf_values = []
    for raw_vaf in sample_dict.get("VAF", "0.0").split(","):
        try:
            vaf_values.append(float(raw_vaf))
        except ValueError:
            continue
    sample_dict["VAF"] = max(vaf_values) if vaf_values else 0.0

    return sample_dict

# Genotypes are classed as homozygous or heterozygous by comparing the two
# called alleles. Anything that is not a fully called two-allele genotype
# (one allele, more than two alleles, or a missing "." allele) is "unknown".
def get_genotype_from_GT(gt_str):
    """
    Classify a VCF GT string as "homozygous", "heterozygous" or "unknown".

    Phased ("|") and unphased ("/") separators are treated alike.
    "homozygous" means both alleles are identical (this includes "0/0");
    "heterozygous" means they differ. "unknown" is returned when the string
    does not hold exactly two alleles or when either allele is missing
    ("." or empty), since a missing call cannot be compared.
    """
    alleles = gt_str.replace("|", "/").split("/")
    # "./." would otherwise compare equal and be reported as homozygous
    if len(alleles) != 2 or any(allele in ("", ".") for allele in alleles):
        return "unknown"
    return "homozygous" if alleles[0] == alleles[1] else "heterozygous"

# Pair FORMAT keys with sample values, then classify the GT entry
def extract_gt(sample_str, format_str):
    """
    Return the genotype class for one record's sample column.

    Note the argument order: the sample string comes first, the FORMAT
    string second. A record without a GT key is classified from "".
    """
    fields = dict(zip(format_str.split(":"), sample_str.split(":")))
    return get_genotype_from_GT(fields.get("GT", ""))

# Quality thresholds; the defaults are the values used in previous projects
def filter_variant_by_sample(sample_dict, min_dp=10, min_ad=3, min_vaf=0.1):
    """
    Tell whether a parsed sample meets the depth, allele depth and VAF minimums.

    Reads "DP", "AD" and "VAF" from ``sample_dict`` (missing keys count as
    0 / 0 / 0.0) and returns True only if each one is greater than or equal
    to its minimum.
    """
    observed_vs_minimum = (
        (sample_dict.get("DP", 0), min_dp),
        (sample_dict.get("AD", 0), min_ad),
        (sample_dict.get("VAF", 0.0), min_vaf),
    )
    return all(observed >= minimum for observed, minimum in observed_vs_minimum)

# Turn the INFO column into a key -> value lookup
def get_cosmic_annotation(info_str):
    """
    Split an INFO string into a dictionary of its key=value entries.

    Entries are separated by ";". Each entry is split at its first "=", so
    values may themselves contain "=". Entries without "=" are skipped, and
    when a key occurs more than once the last value is kept.
    """
    key_value_pairs = (
        field.split("=", 1) for field in info_str.split(";") if "=" in field
    )
    return dict(key_value_pairs)

# Gene symbol of an annotated record
def extract_gene(annotation):
    """Return the "GENE_SYMBOL" entry of a parsed annotation dict, or None if absent."""
    gene_symbol = annotation.get("GENE_SYMBOL")
    return gene_symbol
# Protein-level change (position and mutation) of an annotated record
def extract_protein_position(annotation):
    """Return the "MUTATION_AA" entry of a parsed annotation dict, or None if absent."""
    protein_change = annotation.get("MUTATION_AA")
    return protein_change
# Mutation description of an annotated record, with rsid as the fallback
def extract_mutation_description(annotation):
    """
    Return the "MUTATION_DESCRIPTION" entry of a parsed annotation dict.

    When that entry is absent or empty, the "rsid" entry is returned
    instead (None if that is absent too).
    """
    description = annotation.get("MUTATION_DESCRIPTION")
    if description:
        return description
    return annotation.get("rsid")


In [None]:
# Parse the FORMAT/sample pair of every record into one dict per row
df_filtered["sample_info"] = df_filtered.apply(
    lambda record: parse_sample_format(record["format"], record["sample"]),
    axis=1,
)

# Promote the normalised sample fields to columns of their own
for field in ("DP", "AD", "VAF", "GT"):
    df_filtered[field] = df_filtered["sample_info"].apply(lambda info, key=field: info[key])

# Classify each genotype string
df_filtered["genotype"] = df_filtered["GT"].apply(get_genotype_from_GT)

# Flag the records that meet the default depth / allele depth / VAF minimums
df_filtered["pass_sample_filters"] = df_filtered["sample_info"].apply(filter_variant_by_sample)

# Parse the INFO column, then pull gene, protein change and mutation
# description out of the parsed annotation
df_filtered["cosmic_ann"] = df_filtered["info"].apply(get_cosmic_annotation)
for column, extractor in (
    ("gene", extract_gene),
    ("protein_position", extract_protein_position),
    ("mutation_description", extract_mutation_description),
):
    df_filtered[column] = df_filtered["cosmic_ann"].apply(extractor)

In [None]:
### Filtering steps: records whose mutation description is exactly
### intron_variant, synonymous_variant, 3_prime_UTR_variant or
### 5_prime_UTR_variant are removed.
# The comparison is an exact string match, so records with any other
# description (and records without one) are kept at this stage.
df_intronless = df_filtered[df_filtered["mutation_description"] != "intron_variant"]
df_syn = df_intronless[df_intronless["mutation_description"] != "synonymous_variant"]
df3 = df_syn[df_syn["mutation_description"] != "3_prime_UTR_variant"]
# .copy() makes df5 an independent frame, so the column assignment below does
# not write into a slice of df_filtered (avoids SettingWithCopyWarning).
df5 = df3[df3["mutation_description"] != "5_prime_UTR_variant"].copy()
# NOTE(review): astype(int) drops the fractional part of QUAL, so the
# "qual > 5" test below is effectively QUAL >= 6 - confirm this is intended.
df5["qual"] = pd.to_numeric(df5["qual"]).astype(int)
# Final table: calls that passed the caller's FILTER, with qual above 5 and
# alternative allele depth above 5 (lower values are likely low quality), and
# with a mutation description present, i.e. carrying a COSMIC annotation.
# NOTE(review): the pass_sample_filters column is not used in this mask -
# confirm whether it should be.
df_fin = df5[
    (df5["filter"] == "PASS") &
    (df5["qual"] > 5) &
    (df5["AD"] > 5) &
    (df5["mutation_description"].notna())
]
df_fin

In [None]:
# Show the distinct mutation descriptions and genes left after filtering
# (the author's note records that only GRIN2A remained)
for column in ("mutation_description", "gene"):
    print(df_fin[column].unique())

In [None]:
# Write the filtered table as tab-separated text.
# to_csv keeps its default index=True here, so the row index is written as
# the first column.
filtered_path = "/lustre/scratch126/casm/team274sb/lr26/PacBio-deepvariant-tumor-hg38/annotated_tumor_somatic_cosmic_new38_filtered.vcf"
df_fin.to_csv(filtered_path, sep="\t")