In [2]:
import sys
import glob
import os
from os import listdir
from os.path import isfile, join
import pysam 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.pyplot as plt # plots
import scipy.sparse as sp
from scipy.sparse import csr_matrix # matrices
print(sys.executable)

/software/cellgen/team274/lr26/miniforge3/envs/my-python/bin/python


In [3]:
base_dir = "/lustre/scratch126/casm/team274sb/lr26/"

In [5]:
df = pd.read_csv(base_dir + "pepper-tumor38/tumor_vep_annotated_cosmic_clinvar_dbsnp.vcf",
                 sep="\t",
                 comment="#",
                 dtype="str",
                 header=None,
                 names=["chr","pos","id","ref","alt","qual","filter","info","format","sample"])
df

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample
0,chr1,10039,.,A,ACC,7,PASS,CSQ=CC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,0/1:4:5:2:0.4:DV
1,chr1,10050,.,A,AAC,6.5,PASS,CSQ=AC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,0/1:4:6:2:0.333333:DV
2,chr1,10093,.,A,ACC,6.7,PASS,CSQ=CC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,1/1:3:6:2:0.333333:DV
3,chr1,10105,.,A,G,0.1,refCall,CSQ=G|upstream_gene_variant|MODIFIER|DDX11L1|E...,GT:GQ:DP:AD:VAF:C,./.:18:6:2:0.333333:DV
4,chr1,10108,.,C,CT,11.9,PASS,CSQ=T|upstream_gene_variant|MODIFIER|DDX11L1|E...,GT:GQ:DP:AD:VAF:C,1/1:8:6:5:0.833333:DV
...,...,...,...,...,...,...,...,...,...,...
6986154,chrY,56887636,.,T,G,24,PASS,CSQ=G|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:24:26:26:1:P
6986155,chrY,56887659,.,T,G,23,PASS,CSQ=G|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:23:26:26:1:P
6986156,chrY,56887837,.,G,A,17,PASS,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:17:26:26:1:P
6986157,chrY,56887844,.,T,C,21,PASS,CSQ=C|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:21:26:26:1:P


In [4]:
def parse_sample_format(format_str, sample_str):
    """
    Parses FORMAT and SAMPLE fields of a VCF into a dictionary
    Handles numeric parsing and multi-allelic values robustly
    """
    keys = format_str.split(":")
    values = sample_str.split(":")
    sample_dict = dict(zip(keys, values))

    # Parse GT
    sample_dict["GT"] = sample_dict.get("GT", "")

    # Parse DP
    try:
        sample_dict["DP"] = int(sample_dict.get("DP", 0))
    except ValueError:
        sample_dict["DP"] = 0

    # Parse AD
    ad_val = sample_dict.get("AD", "0")
    if "," in ad_val:
        try:
            ad_list = [int(a) for a in ad_val.split(",") if a.isdigit()]
            sample_dict["AD"] = sum(ad_list[1:])  # Only ALT allele depth(s)
        except:
            sample_dict["AD"] = 0
    else:
        sample_dict["AD"] = int(ad_val) if ad_val.isdigit() else 0

    # Parse VAF (handle multiple values by taking max)
    vaf_val = sample_dict.get("VAF", "0.0")
    try:
        if "," in vaf_val:
            vaf_list = [float(v) for v in vaf_val.split(",") if v]
            sample_dict["VAF"] = max(vaf_list) if vaf_list else 0.0
        else:
            sample_dict["VAF"] = float(vaf_val)
    except:
        sample_dict["VAF"] = 0.0

    return sample_dict


def get_genotype_from_GT(gt_str):
    """
    Determines genotype type based on GT field
    """
    alleles = gt_str.replace('|', '/').split('/')
    if len(alleles) == 2:
        if alleles[0] == alleles[1]:
            return "homozygous"
        else:
            return "heterozygous"
    return "unknown"

def extract_gt(sample_str, format_str):
    keys = format_str.split(":")
    values = sample_str.split(":")
    format_dict = dict(zip(keys, values))
    gt = format_dict.get("GT", "")
    return get_genotype_from_GT(gt)

def filter_variant_by_sample(sample_dict, min_dp=10, min_ad=3, min_vaf=0.1):
    """
    Apply basic quality filters on depth, allele depth, and VAF
    """
    dp = sample_dict.get("DP", 0)
    ad = sample_dict.get("AD", 0)
    vaf = sample_dict.get("VAF", 0.0)
    
    return (dp >= min_dp) and (ad >= min_ad) and (vaf >= min_vaf)

def get_vep_annotations(info_str):
    info_list = info_str.split(";")
    info_dict = {}
    csq_field_name = "CSQ"

    csq_field_order = [
        "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type",
        "Feature", "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position",
        "CDS_position", "Protein_position", "Amino_acids", "Codons",
        "Existing_variation", "DISTANCE", "STRAND", "FLAGS", "SYMBOL_SOURCE",
        "HGNC_ID", "SOURCE", "dbSNP", "dbSNP_ID", "ClinVar", "ClinVar_CLINSIG",
        "cosmic", "cosmic_COSMIC_ID", "gencode.sorted.gff3.gz"
    ]

    annotations = []

    for entry in info_list:
        if "=" in entry:
            key, value = entry.split("=", 1)
            if key == csq_field_name:
                for ann in value.split(","):
                    fields = ann.split("|")
                    ann_dict = dict(zip(csq_field_order, fields))
                    annotations.append(ann_dict)

    return annotations

def get_impact_list(ann_list):
    impact_list = []
    for annotation in ann_list:
        impact = annotation["IMPACT"]
        impact_list.append(impact)
    impact_list = list(set(impact_list))
    return impact_list

def get_impact_type(impact_list):
    relevant = ["MODERATE","HIGH"]
    impact_type = ""
    if (len(impact_list) != 0):
        for impact in impact_list:
            if (impact in(relevant)):
                impact_type = "relevant"
                break
            else:
                impact_type = "non-relevant"
    else:
        impact_type = "relevant"
    return impact_type

def get_clin_type(ann_list):
    clin_sig_list = []
    for annotation in ann_list:
        clin_ann_list = annotation["ClinVar_CLINSIG"].split("&")
        for clin_sig in clin_ann_list:
            clin_ann_sub_list = clin_sig.split("/")
            for clin_sub_sig in clin_ann_sub_list:
                clin_sig_list.append(clin_sub_sig)
    clin_sig_list = set(clin_sig_list)
    benign_list = ["Benign","Likely_benign"]
    benign_list = set(benign_list)
    if (len(clin_sig_list.intersection(benign_list)) > 0):
        clin_type = "Benign"
    else:
        clin_type = list(clin_sig_list)
    return clin_type

def extract_genes(annotations):
    return list(set(
        ann["SYMBOL"] for ann in annotations if ann.get("SYMBOL")
    ))

def extract_protein_positions(annotations):
    return [
        ann["Protein_position"] for ann in annotations if ann.get("Protein_position")
    ]

def extract_dbsnp_ids(annotations):
    return list(set(
        ann.get("dbSNP") or ann.get("rsid") for ann in annotations if ann.get("dbSNP") or ann.get("rsid")
    ))

def extract_consequences(annotations):
    return list(set(
        ann.get("Consequence") or ann.get("Conseq") for ann in annotations if ann.get("Consequence") or ann.get("Conseq")))

In [None]:
def parse_sample_format(format_str, sample_str):
    """
    Parses FORMAT and SAMPLE fields of a VCF into a dictionary
    Handles numeric parsing and multi-allelic values robustly
    """
    keys = format_str.split(":")
    values = sample_str.split(":")
    sample_dict = dict(zip(keys, values))

    # Parse GT
    sample_dict["GT"] = sample_dict.get("GT", "")

    # Parse DP
    try:
        sample_dict["DP"] = int(sample_dict.get("DP", 0))
    except ValueError:
        sample_dict["DP"] = 0

    # Parse AD
    ad_val = sample_dict.get("AD", "0")
    if "," in ad_val:
        try:
            ad_list = [int(a) for a in ad_val.split(",") if a.isdigit()]
            sample_dict["AD"] = sum(ad_list[1:])  # Only ALT allele depth(s)
        except:
            sample_dict["AD"] = 0
    else:
        sample_dict["AD"] = int(ad_val) if ad_val.isdigit() else 0

    # Parse VAF (handle multiple values by taking max)
    vaf_val = sample_dict.get("VAF", "0.0")
    try:
        if "," in vaf_val:
            vaf_list = [float(v) for v in vaf_val.split(",") if v]
            sample_dict["VAF"] = max(vaf_list) if vaf_list else 0.0
        else:
            sample_dict["VAF"] = float(vaf_val)
    except:
        sample_dict["VAF"] = 0.0

    return sample_dict


def get_genotype_from_GT(gt_str):
    """
    Determines genotype type based on GT field
    """
    alleles = gt_str.replace('|', '/').split('/')
    if len(alleles) == 2:
        if alleles[0] == alleles[1]:
            return "homozygous"
        else:
            return "heterozygous"
    return "unknown"

def extract_gt(sample_str, format_str):
    keys = format_str.split(":")
    values = sample_str.split(":")
    format_dict = dict(zip(keys, values))
    gt = format_dict.get("GT", "")
    return get_genotype_from_GT(gt)

def filter_variant_by_sample(sample_dict, min_dp=10, min_ad=3, min_vaf=0.1):
    """
    Apply basic quality filters on depth, allele depth, and VAF
    """
    dp = sample_dict.get("DP", 0)
    ad = sample_dict.get("AD", 0)
    vaf = sample_dict.get("VAF", 0.0)
    
    return (dp >= min_dp) and (ad >= min_ad) and (vaf >= min_vaf)

def get_vep_annotations(info_str):
    info_list = info_str.split(";")
    info_dict = {}
    csq_field_name = "CSQ"

    csq_field_order = [
        "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type",
        "Feature", "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position",
        "CDS_position", "Protein_position", "Amino_acids", "Codons",
        "Existing_variation", "DISTANCE", "STRAND", "FLAGS", "SYMBOL_SOURCE",
        "HGNC_ID", "SOURCE", "ClinVar", "ClinVar_ID", "dbSNP", "dbSNP_ID",
        "chm13v2.0_RefSeq_Liftoff_v5.1.sorted.gff3.gz"
    ]

    annotations = []

    for entry in info_list:
        if "=" in entry:
            key, value = entry.split("=", 1)
            if key == csq_field_name:
                for ann in value.split(","):
                    fields = ann.split("|")
                    ann_dict = dict(zip(csq_field_order, fields))
                    annotations.append(ann_dict)

    return annotations

def get_impact_list(ann_list):
    impact_list = []
    for annotation in ann_list:
        impact = annotation["IMPACT"]
        impact_list.append(impact)
    impact_list = list(set(impact_list))
    return impact_list

def get_impact_type(impact_list):
    relevant = ["MODERATE","HIGH"]
    impact_type = ""
    if (len(impact_list) != 0):
        for impact in impact_list:
            if (impact in(relevant)):
                impact_type = "relevant"
                break
            else:
                impact_type = "non-relevant"
    else:
        impact_type = "relevant"
    return impact_type

def get_clin_type(ann_list):
    clin_sig_list = []
    for annotation in ann_list:
        clin_ann_list = annotation["ClinVar_ID"].split("&")
        for clin_sig in clin_ann_list:
            clin_ann_sub_list = clin_sig.split("/")
            for clin_sub_sig in clin_ann_sub_list:
                clin_sig_list.append(clin_sub_sig)
    clin_sig_list = set(clin_sig_list)
    benign_list = ["Benign","Likely_benign"]
    benign_list = set(benign_list)
    if (len(clin_sig_list.intersection(benign_list)) > 0):
        clin_type = "Benign"
    else:
        clin_type = list(clin_sig_list)
    return clin_type

def extract_genes(annotations):
    return list(set(
        ann["Gene"] for ann in annotations if ann.get("Gene")
    ))

def extract_protein_positions(annotations):
    return [
        ann["Protein_position"] for ann in annotations if ann.get("Protein_position")
    ]

def extract_dbsnp_ids(annotations):
    return list(set(
        ann.get("dbSNP") or ann.get("rsid") for ann in annotations if ann.get("dbSNP") or ann.get("rsid")
    ))

def extract_consequences(annotations):
    return list(set(
        ann.get("Consequence") or ann.get("Conseq") for ann in annotations if ann.get("Consequence") or ann.get("Conseq")))

In [5]:
df_filt = df.copy()
df_filt["qual"] = pd.to_numeric(df_filt["qual"], errors='coerce')  # Convert in the copy

df_filtered = df_filt[
    (df_filt["filter"] == "PASS") &
    (df_filt["qual"] > 5)
]


In [6]:
df_filtered

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample
0,chr1,10039,.,A,ACC,7.0,PASS,CSQ=CC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,0/1:4:5:2:0.4:DV
1,chr1,10050,.,A,AAC,6.5,PASS,CSQ=AC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,0/1:4:6:2:0.333333:DV
2,chr1,10093,.,A,ACC,6.7,PASS,CSQ=CC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,1/1:3:6:2:0.333333:DV
4,chr1,10108,.,C,CT,11.9,PASS,CSQ=T|upstream_gene_variant|MODIFIER|DDX11L1|E...,GT:GQ:DP:AD:VAF:C,1/1:8:6:5:0.833333:DV
14,chr1,10217,.,A,C,8.0,PASS,CSQ=C|upstream_gene_variant|MODIFIER|DDX11L1|E...,GT:GQ:DP:AD:VAF:C,1/1:4:6:2:0.333333:DV
...,...,...,...,...,...,...,...,...,...,...
6986153,chrY,56887631,.,C,A,26.0,PASS,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:26:26:26:1:P
6986154,chrY,56887636,.,T,G,24.0,PASS,CSQ=G|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:24:26:26:1:P
6986155,chrY,56887659,.,T,G,23.0,PASS,CSQ=G|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:23:26:26:1:P
6986156,chrY,56887837,.,G,A,17.0,PASS,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:17:26:26:1:P


In [7]:
# Extract sample info
df_filtered["sample_info"] = df_filtered.apply(lambda row: parse_sample_format(row["format"], row["sample"]), axis=1)

# Add specific fields as separate columns
df_filtered["DP"]  = df_filtered["sample_info"].apply(lambda x: x["DP"])
df_filtered["AD"]  = df_filtered["sample_info"].apply(lambda x: x["AD"])
df_filtered["VAF"] = df_filtered["sample_info"].apply(lambda x: x["VAF"])
df_filtered["GT"]  = df_filtered["sample_info"].apply(lambda x: x["GT"])
df_filtered["genotype"] = df_filtered["GT"].apply(get_genotype_from_GT)

# Apply filtering
df_filtered["pass_sample_filters"] = df_filtered["sample_info"].apply(filter_variant_by_sample)

# Extract VEP annotations
df_filtered["vep_ann"] = df_filtered["info"].apply(get_vep_annotations)

# Impact info
df_filtered["impact_list"] = df_filtered["vep_ann"].apply(get_impact_list)
df_filtered["impact_type"] = df_filtered["impact_list"].apply(get_impact_type)

# Clinical significance
df_filtered["clin_sig"] = df_filtered["vep_ann"].apply(get_clin_type)

df_filtered["genes"] = df_filtered["vep_ann"].apply(extract_genes)
df_filtered["protein_positions"] = df_filtered["vep_ann"].apply(extract_protein_positions)
df_filtered["dbsnp_ids"] = df_filtered["vep_ann"].apply(extract_dbsnp_ids)
df_filtered["consequences"] = df_filtered["vep_ann"].apply(extract_consequences)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["sample_info"] = df_filtered.apply(lambda row: parse_sample_format(row["format"], row["sample"]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["DP"]  = df_filtered["sample_info"].apply(lambda x: x["DP"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["A

In [8]:
df_filtered

Unnamed: 0,chr,pos,id,ref,alt,qual,filter,info,format,sample,...,genotype,pass_sample_filters,vep_ann,impact_list,impact_type,clin_sig,genes,protein_positions,dbsnp_ids,consequences
0,chr1,10039,.,A,ACC,7.0,PASS,CSQ=CC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,0/1:4:5:2:0.4:DV,...,heterozygous,False,"[{'Allele': 'CC', 'Consequence': 'upstream_gen...",[MODIFIER],non-relevant,[],"[DDX11L2, DDX11L1, WASH7P]",[],[],"[upstream_gene_variant, downstream_gene_variant]"
1,chr1,10050,.,A,AAC,6.5,PASS,CSQ=AC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,0/1:4:6:2:0.333333:DV,...,heterozygous,False,"[{'Allele': 'AC', 'Consequence': 'upstream_gen...",[MODIFIER],non-relevant,[],"[DDX11L2, DDX11L1, WASH7P]",[],[],"[upstream_gene_variant, downstream_gene_variant]"
2,chr1,10093,.,A,ACC,6.7,PASS,CSQ=CC|upstream_gene_variant|MODIFIER|DDX11L1|...,GT:GQ:DP:AD:VAF:C,1/1:3:6:2:0.333333:DV,...,homozygous,False,"[{'Allele': 'CC', 'Consequence': 'upstream_gen...",[MODIFIER],non-relevant,[],"[DDX11L2, DDX11L1, WASH7P]",[],[],"[upstream_gene_variant, downstream_gene_variant]"
4,chr1,10108,.,C,CT,11.9,PASS,CSQ=T|upstream_gene_variant|MODIFIER|DDX11L1|E...,GT:GQ:DP:AD:VAF:C,1/1:8:6:5:0.833333:DV,...,homozygous,False,"[{'Allele': 'T', 'Consequence': 'upstream_gene...",[MODIFIER],non-relevant,[],"[DDX11L2, DDX11L1, WASH7P]",[],[],"[upstream_gene_variant, downstream_gene_variant]"
14,chr1,10217,.,A,C,8.0,PASS,CSQ=C|upstream_gene_variant|MODIFIER|DDX11L1|E...,GT:GQ:DP:AD:VAF:C,1/1:4:6:2:0.333333:DV,...,homozygous,False,"[{'Allele': 'C', 'Consequence': 'upstream_gene...",[MODIFIER],non-relevant,[],"[DDX11L2, DDX11L1, WASH7P]",[],[],"[upstream_gene_variant, downstream_gene_variant]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6986153,chrY,56887631,.,C,A,26.0,PASS,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:26:26:26:1:P,...,homozygous,True,"[{'Allele': 'A', 'Consequence': 'intergenic_va...",[MODIFIER],non-relevant,[],[],[],[],[intergenic_variant]
6986154,chrY,56887636,.,T,G,24.0,PASS,CSQ=G|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:24:26:26:1:P,...,homozygous,True,"[{'Allele': 'G', 'Consequence': 'intergenic_va...",[MODIFIER],non-relevant,[],[],[],[rs111327212],[intergenic_variant]
6986155,chrY,56887659,.,T,G,23.0,PASS,CSQ=G|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:23:26:26:1:P,...,homozygous,True,"[{'Allele': 'G', 'Consequence': 'intergenic_va...",[MODIFIER],non-relevant,[],[],[],[rs376828276],[intergenic_variant]
6986156,chrY,56887837,.,G,A,17.0,PASS,CSQ=A|intergenic_variant|MODIFIER|||||||||||||...,GT:GQ:DP:AD:VAF:C,1/1:17:26:26:1:P,...,homozygous,True,"[{'Allele': 'A', 'Consequence': 'intergenic_va...",[MODIFIER],non-relevant,[],[],[],[],[intergenic_variant]


In [None]:
df_filtered.to_csv(base_dir + "pepper-blood/blood_vep_annotated_filtered_new.tsv", sep="\t")

In [9]:
df_fin = df_filtered[
    (df_filtered["filter"] == "PASS") &
    (df_filtered["qual"] > 5) &
    (df_filtered["AD"] > 5) &
    (df_filtered["impact_type"] == "relevant") &
    (df_filtered["clin_sig"].apply(lambda x: any(s.strip() for s in x if isinstance(s, str))))
]

In [None]:
df_fin

In [10]:
df_fin.to_csv(base_dir + "pepper-tumor38/tumor_all_clinvar_dbsnp_filtered_hg38.tsv", sep="\t")

In [None]:
# Load the PanelApp gene panel file
panel_df = pd.read_csv("/nfs/users/nfs_l/lr26/shells/panelapp_childhood_solid_tumors_420.tsv", sep="\t")

# Extract unique gene names from the first column (assuming it’s called "Entity Name")
cancer_genes = panel_df["Entity Name"].dropna().unique().tolist()
cancer_genes

def get_affected_genes(ann_list):
    gene_list = []
    for annotation in ann_list:
        symbol = annotation["SYMBOL"]
        gene_list.append(symbol)
    gene_list = list(set(gene_list))
    return gene_list


In [None]:
gene_df              = df_filtered.copy()
gene_df["gene_list"] = gene_df["vep_ann"].apply(lambda x: get_affected_genes(x))

In [None]:
df_cancer = gene_df[gene_df["genes"].apply(lambda gene_list: any(gene in cancer_genes for gene in gene_list))]


In [None]:
df_cancer

In [None]:
df_cancer.to_csv(base_dir + "pepper-tumor1B01/tumor_vep_annotated_filtered_final_new_cancer_panel.tsv", sep="\t")