In [1]:
#!/usr/bin/env python3

### Imports

import pandas as pd
import numpy as np
import subprocess
import os

In [2]:
### File Inputs

# Regions of interest
goi_file = "/hpc/umc_laat/gvandersluis/data/Ref_HG/HG_annotation_ROI.bed"

## VCF Files

# vcf filtered on qual > 40, PASS, and regions of interest
filtered_v = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI_nh.vcf"

# vcf filtered on regions of interest
ununfiltered_v = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/un_unphased_ROI_nh.vcf"


In [7]:
### Base table code

def hb_maker(phased_vars_file):
    df_vcf = pd.read_csv(phased_vars_file, sep="\t", names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT', "sample"])
    df_vcf["GENE"] = df_vcf['INFO'].str.split("|").str[4]
    df_splitted = df_vcf['sample'].str.split(":")
    df_vcf["GT"] = df_splitted.str[0]
    df_vcf["DP"] = df_splitted.str[2]
    df_vcf["PS"] = df_splitted.str[5]
    # Min Quality of 10, this means there is no more than 1/10000 chance of an incorrect base call (>99.9% accuracy)
    # & (pd.to_numeric(df_vcf['DP']) >= 20
    phased = df_vcf.loc[df_vcf['PS'].notna(), ['CHROM','POS','QUAL','GENE','GT','DP','PS','REF','ALT']]
    # phased["DP"] = pd.to_numeric(phased["DP"], errors="coerce")
    # phased = phased.sort_values(by="DP")


    grouped = phased.groupby(['PS']).agg({'POS': ['min','max', 'count'],
                                                    'CHROM': 'first',
                                                    'QUAL': 'mean',
                                                    'GENE': 'unique'}).reset_index()

    return grouped


def phased_GOI(regions_file, phased_hps):
    genes_of_interest = pd.read_csv(regions_file, sep="\t", names=["CHROM","START","END","GENE","INFO","STRAND"])
    goi = genes_of_interest["GENE"].unique()
    genes_of_interest["POSITION"] = genes_of_interest["CHROM"].astype(str)+":"+genes_of_interest["START"].astype(str)+"-"+genes_of_interest["END"].astype(str)
    #print(genes_of_interest)
    # Make sure each GENE entry is a Python list
    phased_hps["GENE"] = phased_hps[("GENE","unique")].apply(lambda x: x.tolist())
    #print(phased_hps)

    # Filter rows where at least one gene in 'unique' is in GOI
    filtered = phased_hps[phased_hps[("GENE", "unique")].apply(lambda genes: any(g in goi for g in genes))].copy()

    # Now, inside each row, keep only the genes that match the GOI
    filtered[("GENE", "unique")] = filtered[("GENE", "unique")].apply(lambda genes: ", ".join([str(g) for g in genes if g in goi]))
    # Remove sub columns
    filtered.columns = ['_'.join([c for c in col if c]).strip() if isinstance(col, tuple) else col for col in filtered.columns.values]
    # Now rename for clarity
    filtered = filtered.rename(columns={'PS': 'PS',
                                        'POS_min': 'POS_START',
                                        'POS_max': 'POS_END',
                                        'POS_count': 'PHASED_VARIANTS',
                                        'CHROM_first': 'chromosome',
                                        'QUAL_mean': 'QUAL_MEAN',
                                        'GENE_unique': 'GENE'})

    filtered["HB_LENGTH"] = filtered["POS_END"]-filtered["POS_START"]

    merged = pd.merge(filtered, genes_of_interest, on="GENE", how="right")[["GENE","POSITION","POS_START","POS_END","HB_LENGTH","PHASED_VARIANTS"]]
    int_cols = ["POS_START", "POS_END", "HB_LENGTH", "PHASED_VARIANTS"]

    for c in int_cols:
        merged[c] = pd.to_numeric(merged[c], errors="coerce").astype("Int64")
    merged = merged.astype(str).replace("<NA>", "NaN")
    return merged

In [64]:
### Code Calculate Switch Errors

def switch_errors(bs_df, sample, version):
    folder = "/hpc/umc_laat/gvandersluis/data/"
    ph_vars = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/phased_ROI.vcf.gz"
    BM_vars = f"{folder}Ont_data_nhung/HG00{sample}/HG00{sample}_BM_SSANDT_rn.vcf"

    BM_ROI = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/ROI_eval.tsv"
    open(BM_ROI, "w").close()
    switches_ROI = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/switches_ROI.bed"
    open(switches_ROI, "w").close()

    for idx, region in bs_df.iterrows():
        reg = region["POSITION"].split(":")[0]+":"+region["POS_START"]+"-"+region["POS_END"]
        gen = region["GENE"]
        if "NaN" not in reg:
            out_vcf = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{gen}_BM.vcf"
            cmd = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "view", "-r", reg, BM_vars, "-Oz", "-o", out_vcf]
            subprocess.run(cmd, check=True)
            tsv_out = f"{gen}_eval.tsv"
            bed_out = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{gen}_switch.bed"
            cmd2 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/whatshap_v1.sif", "whatshap", "compare","--switch-error-bed",bed_out,"--tsv-pairwise", tsv_out, "--names", f"BENCHMARK,{gen}", out_vcf, ph_vars]
            subprocess.run(cmd2, check=True)
            print("HII")
            print(open(bed_out, "r").read())
            if os.path.getsize(BM_ROI) == 0:
                with open(BM_ROI, "w") as out:
                    with open(tsv_out, "r") as inp:
                        out.write(inp.read())
            else:
                with open(BM_ROI, "a") as out:
                    with open(tsv_out, "r") as inp:
                        next(inp)
                        out.write(inp.read())
            #if os.path.getsize(switches_ROI) == 0 and os.path.getsize(bed_out) != 0:
            with open(switches_ROI, "a+") as out_b:
                with open(bed_out, "r") as inp_b:
                    out_b.write(inp_b.read())
            os.remove(out_vcf)
            os.remove(tsv_out)
            os.remove(bed_out)
        print(gen, "\t Done")
    return BM_ROI

In [65]:
### Run Filtered Version

print("VCF filtered on PASS, QUAL >= 40 & ROI")
phased_hb = hb_maker(filtered_v)
basis_df = phased_GOI(goi_file, phased_hb)

basis_df

VCF filtered on PASS, QUAL >= 40 & ROI


Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,209.0
1,SMN1,chr5:70439486-71439486,,,,
2,CYP21A2,chr6:31540309-32540309,32000387.0,32137224.0,136837.0,83.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,603.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,416.0
5,MUSK,chr9:110237668-111237668,110240520.0,110955599.0,715079.0,769.0
6,HBB,chr11:4727429-5727429,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32415657.0,235126.0,99.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,562.0
9,HBA1,chr16:1-677101,27803.0,646952.0,619149.0,614.0


In [66]:
### Run Switch Errors

bm_roi = switch_errors(basis_df, "2", "filtered")
sw_e = pd.read_csv(bm_roi, sep="\t")

sw_e = sw_e[["dataset_name1","all_switches"]]
sw_e.rename(columns={"dataset_name1": "GENE"}, inplace=True)
switch_df = pd.merge(basis_df, sw_e, on="GENE", how="left")
switch_df

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CRTAP_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CRTAP_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr3


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CRTAP_BM.vcf
      CRTAP = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr3 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       264 /       424
              CRTAP:       738 /       774
              UNION:       773 /       957
       INTERSECTION:       229 /       241
PAIRWISE COMPARISON: BENCHMARK <--> CRTAP:
         common heterozygous variants:       229
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:        90
        non-singleton blocks in CRTAP:         1
                 --> covered variants:       204
    non-singleton intersection blocks:         1
                 --> covered variants:        83
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants a

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CYP21A2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CYP21A2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr6


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CYP21A2_BM.vcf
    CYP21A2 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr6 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       115 /       178
            CYP21A2:      1907 /      2282
              UNION:      1935 /      2368
       INTERSECTION:        87 /        92
PAIRWISE COMPARISON: BENCHMARK <--> CYP21A2:
         common heterozygous variants:        87
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:        87
      non-singleton blocks in CYP21A2:         1
                 --> covered variants:        83
    non-singleton intersection blocks:         1
                 --> covered variants:        83
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of varian

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/PEX7_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/PEX7_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr6


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/PEX7_BM.vcf
       PEX7 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr6 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       731 /      1151
               PEX7:      1907 /      2282
              UNION:      1998 /      2748
       INTERSECTION:       640 /       685
PAIRWISE COMPARISON: BENCHMARK <--> PEX7:
         common heterozygous variants:       640
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       576
         non-singleton blocks in PEX7:         1
                 --> covered variants:       593
    non-singleton intersection blocks:         1
                 --> covered variants:       534
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants ass

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CFTR_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CFTR_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr7


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/CFTR_BM.vcf
       CFTR = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr7 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       514 /       707
               CFTR:       610 /       644
              UNION:       673 /       881
       INTERSECTION:       451 /       470
PAIRWISE COMPARISON: BENCHMARK <--> CFTR:
         common heterozygous variants:       451
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       416
         non-singleton blocks in CFTR:         1
                 --> covered variants:       402
    non-singleton intersection blocks:         1
                 --> covered variants:       376
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants ass

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/MUSK_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/MUSK_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr9


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/MUSK_BM.vcf
       MUSK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr9 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:      1004 /      1345
               MUSK:      1181 /      1243
              UNION:      1348 /      1709
       INTERSECTION:       837 /       879
PAIRWISE COMPARISON: BENCHMARK <--> MUSK:
         common heterozygous variants:       837
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       644
         non-singleton blocks in MUSK:         1
                 --> covered variants:       737
    non-singleton intersection blocks:         1
                 --> covered variants:       575
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants ass

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/BRCA2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/BRCA2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr13


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/BRCA2_BM.vcf
      BRCA2 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr13 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       118 /       309
              BRCA2:       354 /       415
              UNION:       367 /       606
       INTERSECTION:       105 /       118
PAIRWISE COMPARISON: BENCHMARK <--> BRCA2:
         common heterozygous variants:       105
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:        84
        non-singleton blocks in BRCA2:         1
                 --> covered variants:        98
    non-singleton intersection blocks:         1
                 --> covered variants:        79
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants 

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/POLG_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/POLG_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr15


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/POLG_BM.vcf
       POLG = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr15 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       666 /       985
               POLG:       707 /       798
              UNION:       798 /      1173
       INTERSECTION:       575 /       610
PAIRWISE COMPARISON: BENCHMARK <--> POLG:
         common heterozygous variants:       575
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       469
         non-singleton blocks in POLG:         1
                 --> covered variants:       528
    non-singleton intersection blocks:         1
                 --> covered variants:       433
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants as

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/HBA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/HBA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr16


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/HBA1_BM.vcf
       HBA1 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr16 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       543 /       989
               HBA1:       674 /       734
              UNION:       791 /      1269
       INTERSECTION:       426 /       454
PAIRWISE COMPARISON: BENCHMARK <--> HBA1:
         common heterozygous variants:       426
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       309
         non-singleton blocks in HBA1:         1
                 --> covered variants:       388
    non-singleton intersection blocks:         1
                 --> covered variants:       282
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants as

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/BRCA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/BRCA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr17


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/BRCA1_BM.vcf
      BRCA1 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr17 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:        12 /        27
              BRCA1:      1062 /      1188
              UNION:      1066 /      1205
       INTERSECTION:         8 /        10
PAIRWISE COMPARISON: BENCHMARK <--> BRCA1:
         common heterozygous variants:         8
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:         6
        non-singleton blocks in BRCA1:         1
                 --> covered variants:         7
    non-singleton intersection blocks:         1
                 --> covered variants:         5
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variants 

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/TSEN54_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/TSEN54_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr17


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/TSEN54_BM.vcf
     TSEN54 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/filtered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr17 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       676 /      1303
             TSEN54:      1062 /      1188
              UNION:      1149 /      1864
       INTERSECTION:       589 /       627
PAIRWISE COMPARISON: BENCHMARK <--> TSEN54:
         common heterozygous variants:       589
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       475
       non-singleton blocks in TSEN54:         1
                 --> covered variants:       528
    non-singleton intersection blocks:         1
                 --> covered variants:       423
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of variant

Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS,all_switches
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,209.0,0.0
1,SMN1,chr5:70439486-71439486,,,,,
2,CYP21A2,chr6:31540309-32540309,32000387.0,32137224.0,136837.0,83.0,0.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,603.0,0.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,416.0,0.0
5,MUSK,chr9:110237668-111237668,110240520.0,110955599.0,715079.0,769.0,0.0
6,HBB,chr11:4727429-5727429,,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32415657.0,235126.0,99.0,0.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,562.0,0.0
9,HBA1,chr16:1-677101,27803.0,646952.0,619149.0,614.0,0.0


In [67]:
### Run Ununfiltered version

print("VCF filtered on ROI")
phased_hb_uf = hb_maker(ununfiltered_v)
basis_df_uf = phased_GOI(goi_file, phased_hb_uf)

basis_df_uf

VCF filtered on ROI


Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,231.0
1,SMN1,chr5:70439486-71439486,70929044.0,71437569.0,508525.0,399.0
2,CYP21A2,chr6:31540309-32540309,31991875.0,32137224.0,145349.0,102.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,663.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,462.0
5,MUSK,chr9:110237668-111237668,110239493.0,110955599.0,716106.0,893.0
6,HBB,chr11:4727429-5727429,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32430770.0,250239.0,106.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,633.0
9,HBA1,chr16:1-677101,17889.0,648721.0,630832.0,949.0


In [68]:
bm_roi_uf = switch_errors(basis_df_uf, "2", "un_unfiltered")

sw_e_uf = pd.read_csv(bm_roi_uf, sep="\t")

sw_e_uf = sw_e_uf[["dataset_name1","all_switches"]]
sw_e_uf.rename(columns={"dataset_name1": "GENE"}, inplace=True)

switch_df_uf = pd.merge(basis_df_uf, sw_e_uf, on="GENE", how="left")
switch_df_uf

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CRTAP_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CRTAP_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr3


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CRTAP_BM.vcf
      CRTAP = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr3 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       264 /       424
              CRTAP:       907 /      1394
              UNION:       913 /      1402
       INTERSECTION:       258 /       416
PAIRWISE COMPARISON: BENCHMARK <--> CRTAP:
         common heterozygous variants:       258
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:        99
        non-singleton blocks in CRTAP:         1
                 --> covered variants:       222
    non-singleton intersection blocks:         1
                 --> covered variants:        89
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of 

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/SMN1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/SMN1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr5


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/SMN1_BM.vcf
       SMN1 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr5 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:        74 /       108
               SMN1:      1931 /      2496
              UNION:      1934 /      2499
       INTERSECTION:        71 /       105
PAIRWISE COMPARISON: BENCHMARK <--> SMN1:
         common heterozygous variants:        71
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:         7
         non-singleton blocks in SMN1:         1
                 --> covered variants:        64
    non-singleton intersection blocks:         1
                 --> covered variants:         6
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of va

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CYP21A2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CYP21A2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr6


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CYP21A2_BM.vcf
    CYP21A2 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr6 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       118 /       187
            CYP21A2:      4048 /      7613
              UNION:      4054 /      7620
       INTERSECTION:       112 /       180
PAIRWISE COMPARISON: BENCHMARK <--> CYP21A2:
         common heterozygous variants:       112
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       112
      non-singleton blocks in CYP21A2:         1
                 --> covered variants:       102
    non-singleton intersection blocks:         1
                 --> covered variants:       102
              ALL INTERSECTION BLOCKS: ---------
    phased pairs

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/PEX7_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/PEX7_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr6


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/PEX7_BM.vcf
       PEX7 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr6 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       731 /      1151
               PEX7:      4048 /      7613
              UNION:      4063 /      7629
       INTERSECTION:       716 /      1135
PAIRWISE COMPARISON: BENCHMARK <--> PEX7:
         common heterozygous variants:       716
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       639
         non-singleton blocks in PEX7:         1
                 --> covered variants:       635
    non-singleton intersection blocks:         1
                 --> covered variants:       570
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of va

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CFTR_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CFTR_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr7


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/CFTR_BM.vcf
       CFTR = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr7 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       514 /       707
               CFTR:       727 /      1062
              UNION:       738 /      1082
       INTERSECTION:       503 /       687
PAIRWISE COMPARISON: BENCHMARK <--> CFTR:
         common heterozygous variants:       503
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       455
         non-singleton blocks in CFTR:         1
                 --> covered variants:       431
    non-singleton intersection blocks:         1
                 --> covered variants:       401
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of va

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/MUSK_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/MUSK_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr9


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/MUSK_BM.vcf
       MUSK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr9 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:      1005 /      1346
               MUSK:      1436 /      1979
              UNION:      1466 /      2011
       INTERSECTION:       975 /      1314
PAIRWISE COMPARISON: BENCHMARK <--> MUSK:
         common heterozygous variants:       975
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       743
         non-singleton blocks in MUSK:         1
                 --> covered variants:       834
    non-singleton intersection blocks:         1
                 --> covered variants:       647
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of va

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/BRCA2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/BRCA2_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr13


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/BRCA2_BM.vcf
      BRCA2 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr13 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       119 /       318
              BRCA2:       428 /      1167
              UNION:       430 /      1175
       INTERSECTION:       117 /       310
PAIRWISE COMPARISON: BENCHMARK <--> BRCA2:
         common heterozygous variants:       117
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:        93
        non-singleton blocks in BRCA2:         1
                 --> covered variants:       105
    non-singleton intersection blocks:         1
                 --> covered variants:        84
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/POLG_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/POLG_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr15


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/POLG_BM.vcf
       POLG = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr15 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       666 /       985
               POLG:       880 /      1774
              UNION:       895 /      1796
       INTERSECTION:       651 /       963
PAIRWISE COMPARISON: BENCHMARK <--> POLG:
         common heterozygous variants:       651
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       528
         non-singleton blocks in POLG:         1
                 --> covered variants:       574
    non-singleton intersection blocks:         1
                 --> covered variants:       471
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of v

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/HBA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/HBA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr16


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/HBA1_BM.vcf
       HBA1 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr16 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       544 /       992
               HBA1:      1111 /      1897
              UNION:      1136 /      1928
       INTERSECTION:       519 /       961
PAIRWISE COMPARISON: BENCHMARK <--> HBA1:
         common heterozygous variants:       519
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       354
         non-singleton blocks in HBA1:         1
                 --> covered variants:       464
    non-singleton intersection blocks:         1
                 --> covered variants:       320
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of v

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/BRCA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/BRCA1_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr17


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/BRCA1_BM.vcf
      BRCA1 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr17 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       119 /       361
              BRCA1:      1392 /      3033
              UNION:      1396 /      3043
       INTERSECTION:       115 /       351
PAIRWISE COMPARISON: BENCHMARK <--> BRCA1:
         common heterozygous variants:       115
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       101
        non-singleton blocks in BRCA1:         1
                 --> covered variants:       100
    non-singleton intersection blocks:         1
                 --> covered variants:        88
              ALL INTERSECTION BLOCKS: ---------
    phased pairs of

[E::idx_find_and_load] Could not retrieve index file for '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/TSEN54_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/TSEN54_BM.vcf'
Reading phasing from '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz'
Chromosomes present in all VCFs: chr17


Comparing phasings for sample hg002
FILENAMES
  BENCHMARK = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/TSEN54_BM.vcf
     TSEN54 = /hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/un_unfiltered_ROI/phased_ROI.vcf.gz
---------------- Chromosome chr17 ----------------
VARIANT COUNTS (heterozygous / all): 
          BENCHMARK:       679 /      1307
             TSEN54:      1392 /      3033
              UNION:      1412 /      3070
       INTERSECTION:       659 /      1270
PAIRWISE COMPARISON: BENCHMARK <--> TSEN54:
         common heterozygous variants:       659
         (restricting to these below)
    non-singleton blocks in BENCHMARK:         1
                 --> covered variants:       521
       non-singleton blocks in TSEN54:         1
                 --> covered variants:       569
    non-singleton intersection blocks:         1
                 --> covered variants:       452
              ALL INTERSECTION BLOCKS: ---------
    phased pairs 

Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS,all_switches
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,231.0,0.0
1,SMN1,chr5:70439486-71439486,70929044.0,71437569.0,508525.0,399.0,0.0
2,CYP21A2,chr6:31540309-32540309,31991875.0,32137224.0,145349.0,102.0,0.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,663.0,0.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,462.0,0.0
5,MUSK,chr9:110237668-111237668,110239493.0,110955599.0,716106.0,893.0,0.0
6,HBB,chr11:4727429-5727429,,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32430770.0,250239.0,106.0,0.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,633.0,0.0
9,HBA1,chr16:1-677101,17889.0,648721.0,630832.0,949.0,0.0


In [69]:
# int_cols = ["POS_START", "POS_END", "HB_LENGTH", "PHASED_VARIANTS"]

# for c in int_cols:
#     switch_df_uf[c] = pd.to_numeric(switch_df_uf[c], errors="coerce").astype("Int64")
# switch_df_uf = switch_df_uf.astype(str).replace("<NA>", "NaN")


switch_df_uf

Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS,all_switches
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,231.0,0.0
1,SMN1,chr5:70439486-71439486,70929044.0,71437569.0,508525.0,399.0,0.0
2,CYP21A2,chr6:31540309-32540309,31991875.0,32137224.0,145349.0,102.0,0.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,663.0,0.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,462.0,0.0
5,MUSK,chr9:110237668-111237668,110239493.0,110955599.0,716106.0,893.0,0.0
6,HBB,chr11:4727429-5727429,,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32430770.0,250239.0,106.0,0.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,633.0,0.0
9,HBA1,chr16:1-677101,17889.0,648721.0,630832.0,949.0,0.0


In [14]:
switch_df

Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS,all_switches
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,209.0,0.0
1,SMN1,chr5:70439486-71439486,,,,,
2,CYP21A2,chr6:31540309-32540309,32000387.0,32137224.0,136837.0,83.0,0.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,603.0,0.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,416.0,0.0
5,MUSK,chr9:110237668-111237668,110240520.0,110955599.0,715079.0,769.0,0.0
6,HBB,chr11:4727429-5727429,,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32415657.0,235126.0,99.0,0.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,562.0,0.0
9,HBA1,chr16:1-677101,27803.0,646952.0,619149.0,614.0,0.0


In [15]:
final_df = pd.merge(switch_df, switch_df_uf[["POSITION","PHASED_VARIANTS"]], how="left", on="POSITION")
final_df["FILTERED_VARIANTS"] = (pd.to_numeric(final_df["PHASED_VARIANTS_y"], errors="coerce") - pd.to_numeric(final_df["PHASED_VARIANTS_x"], errors="coerce"))
final_df = final_df[["GENE", "POSITION", "POS_START", "POS_END", "HB_LENGTH", "PHASED_VARIANTS_x", "FILTERED_VARIANTS", "all_switches"]]
final_df.rename(columns={"PHASED_VARIANTS_x": "PHASED_VARIANTS"}, inplace=True)

int_cols = ["POS_START", "POS_END", "HB_LENGTH", "PHASED_VARIANTS", "FILTERED_VARIANTS", "all_switches"]

for c in int_cols:
    final_df[c] = pd.to_numeric(final_df[c], errors="coerce").astype("Int64")
final_df = final_df.astype(str).replace("<NA>", "NaN")

final_df

Unnamed: 0,GENE,POSITION,POS_START,POS_END,HB_LENGTH,PHASED_VARIANTS,FILTERED_VARIANTS,all_switches
0,CRTAP,chr3:32630864-33630864,32934214.0,33148829.0,214615.0,209.0,22.0,0.0
1,SMN1,chr5:70439486-71439486,,,,,,
2,CYP21A2,chr6:31540309-32540309,32000387.0,32137224.0,136837.0,83.0,19.0,0.0
3,PEX7,chr6:136368573-137368573,136636685.0,137361584.0,724899.0,603.0,60.0,0.0
4,CFTR,chr7:117001545-118001545,117002433.0,117727876.0,725443.0,416.0,46.0,0.0
5,MUSK,chr9:110237668-111237668,110240520.0,110955599.0,715079.0,769.0,124.0,0.0
6,HBB,chr11:4727429-5727429,,,,,,
7,BRCA2,chr13:31857677-32857677,32180531.0,32415657.0,235126.0,99.0,7.0,0.0
8,POLG,chr15:88820029-89820029,88820301.0,89569212.0,748911.0,562.0,71.0,0.0
9,HBA1,chr16:1-677101,27803.0,646952.0,619149.0,614.0,335.0,0.0


In [16]:
pd.read_csv(bm_roi, sep="\t").drop(["#sample", "dataset_name0", "file_name0", "intersection_blocks","largestblock_diff_genotypes","only_snvs","largestblock_diff_genotypes_rate","file_name1"],axis=1)

Unnamed: 0,chromosome,dataset_name1,covered_variants,all_assessed_pairs,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,blockwise_hamming,blockwise_hamming_rate,blockwise_diff_genotypes,blockwise_diff_genotypes_rate,largestblock_assessed_pairs,largestblock_switches,largestblock_switch_rate,largestblock_switchflips,largestblock_switchflip_rate,largestblock_hamming,largestblock_hamming_rate,het_variants0
0,chr3,CRTAP,83,82,0,0.0,0/0,0.0,0,0.0,0,0.0,82,0,0.0,0/0,0.0,0,0.0,264
1,chr6,CYP21A2,83,82,0,0.0,0/0,0.0,0,0.0,0,0.0,82,0,0.0,0/0,0.0,0,0.0,115
2,chr6,PEX7,534,533,0,0.0,0/0,0.0,0,0.0,0,0.0,533,0,0.0,0/0,0.0,0,0.0,731
3,chr7,CFTR,376,375,0,0.0,0/0,0.0,0,0.0,0,0.0,375,0,0.0,0/0,0.0,0,0.0,514
4,chr9,MUSK,575,574,0,0.0,0/0,0.0,0,0.0,0,0.0,574,0,0.0,0/0,0.0,0,0.0,1004
5,chr13,BRCA2,79,78,0,0.0,0/0,0.0,0,0.0,0,0.0,78,0,0.0,0/0,0.0,0,0.0,118
6,chr15,POLG,433,432,0,0.0,0/0,0.0,0,0.0,0,0.0,432,0,0.0,0/0,0.0,0,0.0,666
7,chr16,HBA1,282,281,0,0.0,0/0,0.0,0,0.0,0,0.0,281,0,0.0,0/0,0.0,0,0.0,543
8,chr17,BRCA1,5,4,0,0.0,0/0,0.0,0,0.0,0,0.0,4,0,0.0,0/0,0.0,0,0.0,12
9,chr17,TSEN54,423,422,2,0.004739,0/1,0.00237,1,0.002364,0,0.0,422,2,0.004739,0/1,0.00237,1,0.002364,676


In [17]:
pd.read_csv(bm_roi_uf, sep="\t").drop(["#sample", "dataset_name0", "file_name0", "intersection_blocks","largestblock_diff_genotypes","only_snvs","largestblock_diff_genotypes_rate","file_name1"],axis=1)


Unnamed: 0,chromosome,dataset_name1,covered_variants,all_assessed_pairs,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,blockwise_hamming,blockwise_hamming_rate,blockwise_diff_genotypes,blockwise_diff_genotypes_rate,largestblock_assessed_pairs,largestblock_switches,largestblock_switch_rate,largestblock_switchflips,largestblock_switchflip_rate,largestblock_hamming,largestblock_hamming_rate,het_variants0
0,chr3,CRTAP,89,88,0,0.0,0/0,0.0,0,0.0,0,0.0,88,0,0.0,0/0,0.0,0,0.0,264
1,chr5,SMN1,6,5,0,0.0,0/0,0.0,0,0.0,0,0.0,5,0,0.0,0/0,0.0,0,0.0,74
2,chr6,CYP21A2,102,101,0,0.0,0/0,0.0,0,0.0,0,0.0,101,0,0.0,0/0,0.0,0,0.0,118
3,chr6,PEX7,570,569,0,0.0,0/0,0.0,0,0.0,0,0.0,569,0,0.0,0/0,0.0,0,0.0,731
4,chr7,CFTR,401,400,0,0.0,0/0,0.0,0,0.0,0,0.0,400,0,0.0,0/0,0.0,0,0.0,514
5,chr9,MUSK,647,646,0,0.0,0/0,0.0,0,0.0,0,0.0,646,0,0.0,0/0,0.0,0,0.0,1005
6,chr13,BRCA2,84,83,0,0.0,0/0,0.0,0,0.0,0,0.0,83,0,0.0,0/0,0.0,0,0.0,119
7,chr15,POLG,471,470,0,0.0,0/0,0.0,0,0.0,0,0.0,470,0,0.0,0/0,0.0,0,0.0,666
8,chr16,HBA1,320,319,0,0.0,0/0,0.0,0,0.0,0,0.0,319,0,0.0,0/0,0.0,0,0.0,544
9,chr17,BRCA1,88,87,0,0.0,0/0,0.0,0,0.0,0,0.0,87,0,0.0,0/0,0.0,0,0.0,119


In [1]:
### Interesting Code Lines

# find how many phased variant per selected region
bcftools view -r 'chr3:32934214-33148829' phased_ROI.vcf.gz | grep -E '0\|1|1\|1|1\|0|0\|0' | wc

bcftools view -H  -i 'FORMAT/PS="."' -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc
bcftools view -H  -i 'FORMAT/GT!~"/"'  -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc
bcftools view -H  -i 'FORMAT/PS!="."' -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc
bcftools view -H  -i 'FORMAT/GT~"/"'  -r 'chr17:75022811-75870128' phased_ROI.vcf.gz | wc

SyntaxError: invalid syntax (2934525902.py, line 4)