In [1]:
#!/usr/bin/env python3

### Imports

import pandas as pd
import numpy as np
import subprocess
import os
import pysam


In [3]:
### Base table code

def hb_maker(phased_vars_file):
    df_vcf = pd.read_csv(phased_vars_file, sep="\t", names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT', "sample"])
    df_vcf["GENE"] = df_vcf['INFO'].str.split("|").str[4]
    df_splitted = df_vcf['sample'].str.split(":")
    df_vcf["GT"] = df_splitted.str[0]
    df_vcf["DP"] = df_splitted.str[2]
    df_vcf["PS"] = df_splitted.str[5]
    # Min Quality of 10, this means there is no more than 1/10000 chance of an incorrect base call (>99.9% accuracy)
    # & (pd.to_numeric(df_vcf['DP']) >= 20
    
    phased = df_vcf.loc[df_vcf['PS'].notna(), ['CHROM','POS','QUAL','GENE','GT','DP','PS','REF','ALT']]
    # phased["DP"] = pd.to_numeric(phased["DP"], errors="coerce")
    # phased = phased.sort_values(by="DP")

    grouped = phased.groupby(['PS']).agg({'POS': ['min','max', 'count'],
                                                    'CHROM': 'first',
                                                    'QUAL': 'mean',
                                                    'GENE': 'unique'}).reset_index()
    return grouped


def phased_GOI(regions_file, phased_hps, sample):
    folder = "/hpc/umc_laat/gvandersluis/data/"
    all_vars = f"{folder}Ont_data_nhung/HG00{sample}/SAMPLE_renamed.vcf.gz"
    genes_of_interest = pd.read_csv(regions_file, sep="\t", names=["CHROM","START","END","GENE","INFO","STRAND"])
    goi = genes_of_interest["GENE"].unique()
    genes_of_interest["POSITION"] = genes_of_interest["CHROM"].astype(str)+":"+genes_of_interest["START"].astype(str)+"-"+genes_of_interest["END"].astype(str)
    #print(genes_of_interest)
    # Make sure each GENE entry is a Python list
    phased_hps["GENE"] = phased_hps[("GENE","unique")].apply(lambda x: x.tolist())
    
    phased_hps.columns = ['_'.join([c for c in col if c]).strip() if isinstance(col, tuple) else col for col in phased_hps.columns.values]
    phased_hps.rename(columns={"CHROM_first":"CHROM"}, inplace=True)

    merged = pd.DataFrame(columns=["GENE", "POSITION_ROI", "HB_START", "HB_END", "HB_LENGTH", "PHASED_VARIANTS", "TOTAL_VARIANTS", "REMOVED_VARIANTS", "AVG_QUAL"])
    for idx, item in phased_hps.iterrows():
        reg = item["CHROM"]+":"+str(item["POS_min"])+"-"+str(item["POS_max"])
        # Build the command as a single string
        cmd = f"apptainer exec -B /hpc/:/hpc/ /hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif bcftools view -r {reg} {all_vars} | wc -l"
        # Run the command
        result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
        # Get the count as integer
        count = int(result.stdout.strip())        
        #count = subprocess.run(cmd, check=True, capture_output=True)
        for idx, genereg in genes_of_interest.iterrows():
            if item["CHROM"] == genereg["CHROM"] and item["POS_min"] >= genereg["START"] and item["POS_max"] <= genereg["END"]:
                merged.loc[len(merged)] = [genereg["GENE"], genereg["POSITION"], item["POS_min"], item["POS_max"], item["POS_max"]-item["POS_min"]+1, item["POS_count"], count, count-item["POS_count"], item["QUAL_mean"]]


    merged = merged.sort_values(by="GENE")
    






    ### ORIGINAL CODE (only get info for haploblock of interest)
    # # Filter rows where at least one gene in 'unique' is in GOI
    # filtered = phased_hps[phased_hps[("GENE", "unique")].apply(lambda genes: any(g in goi for g in genes))].copy()
    # # Now, inside each row, keep only the genes that match the GOI
    # filtered[("GENE", "unique")] = filtered[("GENE", "unique")].apply(lambda genes: ", ".join([str(g) for g in genes if g in goi]))
    # # Remove sub columns
    # filtered.columns = ['_'.join([c for c in col if c]).strip() if isinstance(col, tuple) else col for col in filtered.columns.values]
    # # Now rename for clarity
    # filtered = filtered.rename(columns={'PS': 'PS',
    #                                     'POS_min': 'POS_START',
    #                                     'POS_max': 'POS_END',
    #                                     'POS_count': 'PHASED_VARIANTS',
    #                                     'CHROM_first': 'chromosome',
    #                                     'QUAL_mean': 'QUAL_MEAN',
    #                                     'GENE_unique': 'GENE'})

    # filtered["HB_LENGTH"] = filtered["POS_END"]-filtered["POS_START"]

    # merged = pd.merge(filtered, genes_of_interest, on="GENE", how="right")[["GENE","POSITION_ROI","HB_START","HB_END","HB_LENGTH","PHASED_VARIANTS"]]
    # int_cols = ["POS_START", "POS_END", "HB_LENGTH", "PHASED_VARIANTS"]

    # for c in int_cols:
    #     merged[c] = pd.to_numeric(merged[c], errors="coerce").astype("Int64")
    # merged = merged.astype(str).replace("<NA>", "NaN")
    return merged






In [26]:
### Code Calculate Switch Errors

def switch_errors(bs_df, ph_vars, sample, version):
    folder = "/hpc/umc_laat/gvandersluis/data/"    
    BM_vars = f"{folder}Ont_data_nhung/HG00{sample}/HG00{sample}_BM_SSANDT_rn.vcf"
    BM_ROI = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/ROI_eval.tsv"
    open(BM_ROI, "w").close()
    switches_ROI = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/switches_ROI.bed"
    open(switches_ROI, "w").close()

    for idx, region in bs_df.iterrows():
        reg = region["POSITION_ROI"].split(":")[0]+":"+str(region["HB_START"])+"-"+str(region["HB_END"])
        gen = region["GENE"]+"_"+str(region["HB_START"])
        if "NaN" not in reg:
            out_vcf = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{gen}_BM.vcf"
            cmd = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "view", "-r", reg, BM_vars, "-Oz", "-o", out_vcf]
            subprocess.run(cmd, check=True)
            indexer = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "index", "-t", out_vcf]
            subprocess.run(indexer, check=True)
            indexed = out_vcf+".tbi"
            tsv_out = f"{gen}_eval.tsv"
            open(tsv_out, "a+").write("")
            if sum(1 for _ in pysam.VariantFile(out_vcf)) != 0:
                bed_out = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{gen}_switch.bed"
                cmd2 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/whatshap_v1.sif", "whatshap", "compare","--switch-error-bed",bed_out,"--tsv-pairwise", tsv_out, "--names", f"BENCHMARK,{gen}", out_vcf, ph_vars]
                subprocess.run(cmd2, check=True, capture_output=True)
                if os.path.getsize(BM_ROI) == 0:
                    with open(BM_ROI, "w") as out:
                        with open(tsv_out, "r") as inp:
                            out.write(inp.read())
                else:
                    with open(BM_ROI, "a") as out:
                        with open(tsv_out, "r") as inp:
                            next(inp)
                            out.write(inp.read())
                with open(switches_ROI, "a+") as out_b:
                    with open(bed_out, "r") as inp_b:
                        out_b.write(inp_b.read())
                os.remove(bed_out)
            os.remove(out_vcf)
            os.remove(indexed)
            os.remove(tsv_out)
        print(gen, "\t Done")
    return BM_ROI

In [31]:
### ALGEMENE FUNCTIE

def main(variants, ph_variants, goi, version, samp):
    ### Make hapbloblocks in ROI
    phased_hb = hb_maker(variants)
    basis_df = phased_GOI(goi, phased_hb, samp)
    
    ### Run Switch Errors
    bm_roi = switch_errors(basis_df, ph_variants, samp, version)

    ### Format the switch error file
    sw_e = pd.read_csv(bm_roi, sep="\t")
    sw_e = sw_e[["dataset_name1","het_variants0","all_switches","all_switch_rate","all_switchflips","all_switchflip_rate","blockwise_hamming_rate"]]
    sw_e["Accuracy"] = (100-sw_e["blockwise_hamming_rate"]).astype(str)+"%"
    sw_e.drop(columns="blockwise_hamming_rate")
    sw_e.rename(columns={"dataset_name1": "GENE_l"}, inplace=True)

    ### Merge Hapoblocks with their switch errors
    basis_df["GENE_l"] = basis_df["GENE"]+"_"+basis_df["HB_START"].astype(str)    
    switch_df = pd.merge(basis_df, sw_e, on="GENE_l", how="left")
    switch_df["difference_het-vars_BM_vs_data"] = abs(switch_df["PHASED_VARIANTS"]-switch_df["het_variants0"])
    switch_df = switch_df.drop(columns="GENE_l")

    ### Write merged dataframe to file
    switch_df.to_csv(f"/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG00{samp}/{version}_ROI/final_df.csv", index=False)
    
    return basis_df, switch_df, sw_e

In [33]:

### File Inputs

# Regions of interest
goi_file = "/hpc/umc_laat/gvandersluis/data/Ref_HG/HG_annotation_ROI.bed"

## VCF Files

# vcf filtered on qual > 40, PASS, and regions of interest
filtered_v = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI_nh.vcf"
ph_filtered_v = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf"

# vcf filtered on regions of interest
ununfiltered_v = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/un_unphased_ROI_nh.vcf"
ph_ununfiltered_v = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf"


# Raw VCF data, only filtered on ROI
ONT = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/unphased_ROI/Sample_un_unfiltered_ROI_nh.vcf.gz"
ph_ONT = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/unphased_ROI/Sample_un_unfiltered_ROI.vcf.gz"

sample = "2"


### RUN CODE
print("VCF filtered on PASS, QUAL >= 40 & ROI")
filtered_Q_ROI = [filtered_v, ph_filtered_v, "filtered"]
basis_f_q_roi, sw_f_q_roi, switch_f_q_roi = main(filtered_Q_ROI[0], filtered_Q_ROI[1], goi_file, filtered_Q_ROI[2], sample)

print("VCF filtered on ROI")
filtered_ROI = [ununfiltered_v, ph_ununfiltered_v ,"un_unfiltered"]
basis_f_roi, sw_f_roi, switch_f_roi = main(filtered_ROI[0], filtered_ROI[1], goi_file, filtered_ROI[2], sample)

print("RAW VCF from ONT filtered on ROI")
RAW_ROI = [ONT, ph_ONT, "unphased"]
raw_f_roi, sw_raw_f_roi, switch_raw_roi = main(RAW_ROI[0], RAW_ROI[1], goi_file, RAW_ROI[2], sample)

VCF filtered on PASS, QUAL >= 40 & ROI
BRCA1_43183966 	 Done
BRCA1_42861321 	 Done
BRCA1_42808235 	 Done
BRCA1_42607430 	 Done
BRCA1_43322424 	 Done
BRCA1_43010006 	 Done
BRCA2_31865037 	 Done
BRCA2_32712242 	 Done
BRCA2_32095084 	 Done
BRCA2_32180531 	 Done
BRCA2_32508214 	 Done
BRCA2_32602810 	 Done
CFTR_117002433 	 Done
CFTR_117777011 	 Done
CRTAP_33204384 	 Done
CRTAP_32635859 	 Done
CRTAP_32934214 	 Done
CYP21A2_32488753 	 Done
CYP21A2_32000387 	 Done
CYP21A2_31542190 	 Done
CYP21A2_32515992 	 Done
CYP21A2_32185426 	 Done
HBA1_27803 	 Done
HBB_4727925 	 Done
HBB_5273948 	 Done
HBB_5679844 	 Done
MUSK_110240520 	 Done
MUSK_110997014 	 Done
PEX7_136636685 	 Done
PEX7_136574402 	 Done
POLG_88820301 	 Done
POLG_89660113 	 Done
SMN1_70485532 	 Done
SMN1_71217981 	 Done
SMN1_71016257 	 Done
SMN1_71067872 	 Done
SMN1_71317230 	 Done
TSEN54_75022811 	 Done
TSEN54_75977471 	 Done
VCF filtered on ROI
BRCA1_42607430 	 Done
BRCA1_43183966 	 Done
BRCA1_43240781 	 Done
BRCA2_32712242 	 Done
BRC

In [34]:
sw_f_q_roi

Unnamed: 0,GENE,POSITION_ROI,HB_START,HB_END,HB_LENGTH,PHASED_VARIANTS,TOTAL_VARIANTS,REMOVED_VARIANTS,AVG_QUAL,het_variants0,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,blockwise_hamming_rate,Accuracy,difference_het-vars_BM_vs_data
0,BRCA1,chr17:42607268-43607268,43183966,43231858,47893,8,315,307,54.3075,6.0,0.0,0.0,0/0,0.0,0.0,100.0%,2.0
1,BRCA1,chr17:42607268-43607268,42861321,42921612,60292,13,287,274,55.360769,13.0,0.0,0.0,0/0,0.0,0.0,100.0%,0.0
2,BRCA1,chr17:42607268-43607268,42808235,42811886,3652,4,269,265,54.315,4.0,0.0,0.0,0/0,0.0,0.0,100.0%,0.0
3,BRCA1,chr17:42607268-43607268,42607430,42767578,160149,71,482,411,58.045211,86.0,0.0,0.0,0/0,0.0,0.0,100.0%,15.0
4,BRCA1,chr17:42607268-43607268,43322424,43607251,284828,272,743,471,56.124081,330.0,0.0,0.0,0/0,0.0,0.0,100.0%,58.0
5,BRCA1,chr17:42607268-43607268,43010006,43132602,122597,7,306,299,56.21,12.0,0.0,0.0,0/0,0.0,0.0,100.0%,5.0
6,BRCA2,chr13:31857677-32857677,31865037,32011930,146894,69,558,489,58.36942,88.0,0.0,0.0,0/0,0.0,0.0,100.0%,19.0
7,BRCA2,chr13:31857677-32857677,32712242,32834472,122231,125,446,321,57.10584,147.0,0.0,0.0,0/0,0.0,0.0,100.0%,22.0
8,BRCA2,chr13:31857677-32857677,32095084,32123384,28301,4,311,307,58.92,4.0,0.0,0.0,0/0,0.0,0.0,100.0%,0.0
9,BRCA2,chr13:31857677-32857677,32180531,32415657,235127,99,594,495,56.383434,118.0,0.0,0.0,0/0,0.0,0.0,100.0%,19.0


In [11]:
## NIET GOED MEER

final_df = pd.merge(switch_df, switch_df_uf[["POSITION_ROI","PHASED_VARIANTS"]], how="left", on="POSITION_ROI")
final_df["FILTERED_VARIANTS"] = (pd.to_numeric(final_df["PHASED_VARIANTS_y"], errors="coerce") - pd.to_numeric(final_df["PHASED_VARIANTS_x"], errors="coerce"))
final_df = final_df[["GENE", "POSITION_ROI", "HB_START", "HB_END", "HB_LENGTH", "PHASED_VARIANTS_x", "FILTERED_VARIANTS", "all_switches"]]
final_df.rename(columns={"PHASED_VARIANTS_x": "PHASED_VARIANTS"}, inplace=True)

int_cols = ["HB_START", "HB_END", "HB_LENGTH", "PHASED_VARIANTS", "FILTERED_VARIANTS", "all_switches"]

for c in int_cols:
    final_df[c] = pd.to_numeric(final_df[c], errors="coerce").astype("Int64")
final_df = final_df.astype(str).replace("<NA>", "NaN")

final_df

NameError: name 'switch_df' is not defined

In [24]:
pd.concat([switch_df_uf,switch_ONT]).drop_duplicates(keep=False)


Unnamed: 0,GENE,POSITION_ROI,HB_START,HB_END,HB_LENGTH,PHASED_VARIANTS,TOTAL_VARIANTS,REMOVED_VARIANTS,AVG_QUAL,het_variants0,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,Accuracy,difference_het-vars_BM_vs_data
17,CYP21A2,chr6:31540309-32540309,32501153,32540235,39083,1472,3251,1779,19.056848,,,,,,,
28,SMN1,chr5:70439486-71439486,70463648,70538381,74734,5,1342,1337,17.664,1.0,0.0,,0/0,,nan%,4.0
29,SMN1,chr5:70439486-71439486,70440650,70553669,113020,861,1436,575,34.538792,2.0,0.0,,0/0,,nan%,859.0
16,CYP21A2,chr6:31540309-32540309,32501153,32540235,39083,1471,3251,1780,19.05877,,,,,,,
28,SMN1,chr5:70439486-71439486,70440650,70553669,113020,866,1436,570,34.441363,2.0,0.0,,0/0,,nan%,864.0


In [23]:
### Interesting Code Lines

# find how many phased variant per selected region
bcftools view -r 'chr3:32934214-33148829' phased_ROI.vcf.gz | grep -E '0\|1|1\|1|1\|0|0\|0' | wc

bcftools view -H  -i 'FORMAT/PS="."' -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc
bcftools view -H  -i 'FORMAT/GT!~"/"'  -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc
bcftools view -H  -i 'FORMAT/PS!="."' -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc
bcftools view -H  -i 'FORMAT/GT~"/"'  -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc

SyntaxError: invalid syntax (1453991483.py, line 4)