In [11]:
#!/usr/bin/env python3

### Imports

import pandas as pd
import numpy as np
import subprocess
import pyranges as pr
import os
import string
import pysam
#import matplotlib.pyplot as plt

In [12]:
# def hb_maker(phased_vars_file):
#     ### Read the filtered vcf file (filtered on ROI of the OMIM genes & QUAL >= 40) and get the desired columns
#     df_vcf = pd.read_csv(phased_vars_file, sep="\t",comment='#', names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT', "sample"])
    
#     df_vcf["GENE"] = df_vcf['INFO'].str.split("|").str[4]
#     df_splitted = df_vcf['sample'].str.split(":")
#     df_vcf["GT"] = df_splitted.str[0]
#     df_vcf["DP"] = df_splitted.str[2]
#     df_vcf["PS"] = df_splitted.str[5]

#     ### Only keep the phased variants (the variants with a phase tag)
#     phased = df_vcf.loc[df_vcf['PS'].notna(), ['CHROM','POS','QUAL','GENE','GT','DP','PS','REF','ALT']]

#     ### Group by phase tag and get the min, max, and amount of variants per haploblock
#     grouped = phased.groupby(['PS']).agg({'POS': ['min','max', 'count'],
#                                                     'CHROM': 'first',
#                                                     'QUAL': 'mean',
#                                                     'GENE': 'unique'}).reset_index()

#     ### Change the column names, type, and add desired columns
#     grouped.columns = ['_'.join(col).rstrip('_') if isinstance(col, tuple) else col
#                    for col in grouped.columns]
#     grouped.rename(columns={"PS":"PS_tag", "POS_min":"START_HB", "POS_max": "END_HB", "POS_count":"informative_variants", "CHROM_first":"chromosome", "GENE_unique":"GENES_in_HB"}, inplace=True)
#     grouped["PS_tag"] = grouped["PS_tag"].astype(int)
#     grouped["HB_length"]=grouped["END_HB"]-grouped["START_HB"]
#     grouped["total_VARS"] = None
#     return grouped


def hb_maker(phased_vars_file, bed_f):
    ### Read the filtered vcf file (filtered on ROI of the OMIM genes & QUAL >= 40) and get the desired columns
    df_vcf = pd.read_csv(phased_vars_file, sep="\t", comment='#', names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT', "sample"])
    df_splitted = df_vcf['sample'].str.split(":")
    df_vcf["GT"] = df_splitted.str[0]
    df_vcf["GQ"] = df_splitted.str[1]
    df_vcf["DP"] = df_splitted.str[2]
    df_vcf["PS"] = df_splitted.str[5]

    ### Only keep the phased variants (the variants with a phase tag)
    phased = df_vcf.loc[df_vcf['PS'].notna(), ['CHROM','POS','QUAL','GT','GQ','DP','PS','REF','ALT']]
    ### Group by phase tag and get the min, max, and amount of variants per haploblock
    phased['POS'] = pd.to_numeric(phased['POS'], errors='coerce')
    phased['GQ'] = pd.to_numeric(phased['GQ'], errors='coerce')

    phased = create_sub_ps(phased, bed_f)
    return phased
    # grouped = phased.groupby(['PS_R']).agg({'POS': ['min','max', 'count'],
    #                                                 'Chromosome': 'first',
    #                                                 'GQ': 'mean'}).reset_index()

    # ### Change the column names, type, and add desired columns
    # grouped.columns = ['_'.join(col).rstrip('_') if isinstance(col, tuple) else col for col in grouped.columns]
    # grouped.rename(columns={"POS_min":"START_HB", "POS_max": "END_HB", "POS_count":"phased_variants", "Chromosome_first":"chromosome"}, inplace=True)
    # #grouped["PS_tag"] = grouped["PS_tag"].astype(int)
    # grouped["HB_length"]=grouped["END_HB"]-grouped["START_HB"]
    # grouped["total_VARS"] = None
    # grouped["PS_tag"] = grouped["chromosome"].astype(str)+"_"+grouped["PS_R"].astype(str)
    # return grouped


def run_switch(haploblock_table, all_vars, sample, version, ph_vars):
    ### A counter to check how many haploblocks are already compared to BM
    cnt = 0 

    ### Variables and Files for the output
    folder = "/hpc/umc_laat/gvandersluis/data/"    
    BM_vars = f"{folder}Ont_data_nhung/HG00{sample}/HG00{sample}_BM_SSANDT_rn.vcf"
    BM_ROI = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/ROI_eval.tsv"
    open(BM_ROI, "w").close()
    switches_ROI = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/switches_ROI.bed"
    open(switches_ROI, "w").close()

    ### Loop through every haploblock
    for idx, item in haploblock_table.iterrows():
        ### Get the haploblock region
        reg = item["chromosome"]+":"+str(item["START_HB"])+"-"+str(item["END_HB"])
        print(reg)
        ### ADD TOTAL VARIANTS in the region from the unfiltered vcf file
        cmd = f"apptainer exec -B /hpc/:/hpc/ /hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif bcftools view -H -r {reg} {all_vars} | wc -l"
        ### Run the command
        result = subprocess.run(cmd, shell=True, check=True, capture_output=True, text=True)
        ### Get the count as integer and add the value to the column
        count = int(result.stdout.strip())
        grouped.loc[idx, "total_VARS"] = count
        print(count, "HII")
        
        ### WHATSHAP COMPARE
        PStag = str(item["PS_tag"])
        if "NaN" not in reg:
            ### Prepare the benchmark file for whatshap compare
            ### make an output file for the filtered Benchmark file (filtered on region of interest) and run that code + index the vcf
            out_vcf = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{PStag}_BM.vcf"
            cmd = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "view", "-r", reg, BM_vars, "-Oz", "-o", out_vcf]
            subprocess.run(cmd, check=True)
            indexer = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "index", "-t", out_vcf]
            subprocess.run(indexer, check=True)

            ### Make specific region version of the phased vcf file (Prepare file for whatshap compare)
            out_ph_vars_vcf = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{PStag}_phased_vars.vcf"
            cmd_1 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "view", "-r", reg, ph_vars, "-Oz", "-o", out_ph_vars_vcf]
            subprocess.run(cmd_1, check=True)
            indexer_1 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "index", "-t", out_ph_vars_vcf]
            subprocess.run(indexer_1, check=True)

            ### Prepare output files for WHATSHAP compare
            indexed = out_vcf+".tbi"
            indexed_1 = out_ph_vars_vcf+".tbi"
            tsv_out = f"{PStag}_eval.tsv"
            open(tsv_out, "a+").write("")

            ### If the output vcf file has variants (If there are variants found in the benchmark file of this particular region, then:)
            if sum(1 for _ in pysam.VariantFile(out_vcf)) != 0:
                ### Make file for the switch error bed file
                bed_out = f"{folder}Ont_data_nhung/HG00{sample}/{version}_ROI/{PStag}_switch.bed"
                ### RUN WHATSHAP COMPARE
                cmd2 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/whatshap_v1.sif", "whatshap", "compare","--switch-error-bed",bed_out,"--tsv-pairwise", tsv_out, "--names", f"BENCHMARK,{PStag}", out_vcf, out_ph_vars_vcf]
                subprocess.run(cmd2, check=True, capture_output=True)

                ### If the whatsap compare output file is empty (first run), then write output
                if os.path.getsize(BM_ROI) == 0:
                    with open(BM_ROI, "w") as out:
                        with open(tsv_out, "r") as inp:
                            out.write(inp.read())
                ### If whatshap compare output file is not empty anymore (every run after the first), then append output
                else:
                    with open(BM_ROI, "a") as out:
                        with open(tsv_out, "r") as inp:
                            next(inp) # Skip header
                            out.write(inp.read())
                ### Write or append switch error locations to the bed file
                with open(switches_ROI, "a+") as out_b:
                    with open(bed_out, "r") as inp_b:
                        out_b.write(inp_b.read())
            ### REMOVE temporary needed files
                os.remove(bed_out)
            os.remove(out_vcf)
            os.remove(indexed)
            os.remove(out_ph_vars_vcf)
            os.remove(indexed_1)
            os.remove(tsv_out)
        print(PStag, "\t Done")
        
        cnt += 1
        print(cnt)
    return BM_ROI, grouped

In [13]:


def create_sub_ps(variants, bed_file):
    ### Convert to BED-like intervals
    variants["start"] = variants["POS"] - 1
    variants["end"] = variants["POS"]
    ### Load Bed file 
    bed = pr.read_bed(bed_file)
    ### Convert variants to Pyranges
    var_pr = pr.PyRanges(variants.rename(columns={
            "CHROM": "Chromosome",
            "start": "Start",
            "end": "End"}))
    ### Intersect with BED
    overlap = var_pr.join(bed, how="left")
    df = overlap.df
    print("HIHIHIHIHHI\n",df)
    ### Define ROI by coordinates
    df["ROI"] = (df["Chromosome"].astype(str) + ":" + df["Start_b"].astype(str) + "-" + df["End_b"].astype(str))
    print("AGAIN\n", df)
    ### assign the suffixes
    df["PS_R"] = split_ps_tags(df)
    return df

def split_ps_tags(df):
    ### This function adds A B C etc to sub-haploblocks (haploblocks that are defined by multiple ROIs)
    new_ps = [""] * len(df)
    print(df)
    for ps, group in df.groupby("PS", dropna=False):
        print(ps)
        print(group)
        rois = group["ROI"].dropna().unique()
        print(rois)
        ### Single or no ROI → PS unchanged
        if len(rois) <= 1:
            print(group.index)
            for idx in group.index:
                print(idx)
                print(ps)
                new_ps[idx] = ps
            continue
        ### Multiple ROIs → assign suffixes A, B, C...
        roi_to_suffix = {
            roi: string.ascii_uppercase[i]
            for i, roi in enumerate(sorted(rois))}
        for idx, row in group.iterrows():
            if pd.isna(row["ROI"]):
                new_ps[idx] = ps
            else:
                new_ps[idx] = f"{ps}.{roi_to_suffix[row['ROI']]}"
    return new_ps

In [14]:
def excel_style_suffix(n):
    print(n)
    result = ""
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        result = string.ascii_uppercase[remainder] + result
    return result


def split_ps_tags(df):
    df["PS_R"]= "None"
    for ps, group in df.groupby("PS", dropna=False):
        chrm = group["CHROM"].unique()[0]
        rois = group["ROI"].dropna().unique()
        if len(rois) <=1:
            df.loc[df["PS"]==ps, "PS_R"] = chrm+"_"+ps
        else:
            roi_to_suffix = {
                roi: excel_style_suffix(i)
                for i, roi in enumerate(sorted(rois))}
            for idx, row in group.iterrows():
                if row["ROI"] == "None":
                    df.loc[idx, "PS_R"] = ps
                else:
                    df.loc[idx, "PS_R"] = f"{chrm}_{ps}.{roi_to_suffix[row['ROI']]}"
    return df


def create_sub_ps(variants, bed_file):
    variants["ROI"]="None"
    bdf = open(bed_file, "r").readlines()
    for loc in bdf:
        chrom = loc.strip("\n").split("\t")[0]
        start = int(loc.strip("\n").split("\t")[1])
        stop = int(loc.strip("\n").split("\t")[2])
        variants.loc[(variants["CHROM"].astype(str) == chrom) & (variants["POS"].astype(int) >= start) & (variants["POS"].astype(int) <= stop), "ROI"] = chrom+":"+str(start)+"-"+str(stop)
    variants["PS_R"] = split_ps_tags(variants)
    return variants

In [15]:
### RUN scripts

# OMIM test set of 50000 variants
variants="/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/test_nh.vcf"
all_variants="/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SAMPLE_renamed.vcf.gz"
phased_variants="/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/phased_ROI_HG2.vcf.gz"
bed="/hpc/umc_laat/gvandersluis/data/Ref_HG/HG_OMIM_ROI_merged.bed"
samp="2"
vers="OMIM"
grouped = hb_maker(variants,bed) # maar van all_variants de header version van de variants
grouped
# grouped.to_csv(f"/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG00{samp}/{vers}_ROI/final_df.csv", index=False)

# grouped.sort_values(["chromosome", "PS_tag"]).head(20)
# grouped

        CHROM        POS   QUAL   GT  GQ  DP         PS REF ALT  \
3       chr12      10183   8.77  1|0   8  52      10183   T   A   
4       chr12      10185   7.52  1|0   7  52      10183   C   A   
5       chr12      10194  10.42  1|0  10  52      10183   A   T   
17      chr12      10491   5.05  1|0   5  55      10183   A   T   
18      chr12      10649  14.34  1|0  14  71      10183   G   A   
...       ...        ...    ...  ...  ..  ..        ...  ..  ..   
268495  chr12  133241504  54.35  0|1  54  44  133158779   A   G   
268496  chr12  133244280  58.20  0|1  58  47  133158779   G   A   
268498  chr12  133249425  53.87  0|1  53  49  133158779   G   T   
268499  chr12  133253066  55.29  0|1  55  49  133158779   G   T   
268505  chr12  133264934  11.47  1|0  11  44  133158779   T   C   

                              ROI  
3                chr12:1-11895386  
4                chr12:1-11895386  
5                chr12:1-11895386  
17               chr12:1-11895386  
18             

IndexError: list assignment index out of range

In [None]:
chrom = "chr12"
start = "1"
stop = 

variants.loc[(variants["CHROM"].astype(str) == chrom) & (variants["POS"].astype(str) >= start) & (variants["POS"].astype(str) <= stop), "ROI"] = str(chrom+":"+start+"-"+stop)

In [14]:
grouped[(grouped["PS"] == '10183') & (grouped["ROI"] =="None")]

Unnamed: 0,CHROM,POS,QUAL,GT,GQ,DP,PS,REF,ALT,ROI


In [18]:
def excel_style_suffix(n):
    print(n)
    result = ""
    while n > 0:
        n, remainder = divmod(n - 1, 26)
        result = string.ascii_uppercase[remainder] + result
    return result


def split_ps_tags(df):
    df["PS_R"]= "None"
    for ps, group in df.groupby("PS", dropna=False):
        chrm = group["CHROM"].unique()[0]
        rois = group["ROI"].dropna().unique()
        if len(rois) <=1:
            df.loc[df["PS"]==ps, "PS_R"] = chrm+"_"+ps
        else:
            roi_to_suffix = {
                roi: excel_style_suffix(i)
                for i, roi in enumerate(sorted(rois))}
            for idx, row in group.iterrows():
                if row["ROI"] == "None":
                    df.loc[idx, "PS_R"] = ps
                else:
                    df.loc[idx, "PS_R"] = f"{chrm}_{ps}.{roi_to_suffix[row['ROI']]}"
    return df


0
1
2
0
1
2
0
1
0
1
0
1
0
1
2
3
4
0
1
0
1
2
0
1
2
0
1
2
0
1
0
1
2
0
1
2
3
4
0
1
0
1
0
1
0
1
2
0
1
2
3
0
1
0
1
0
1
0
1
0
1
0
1
2
3
0
1
0
1
0
1
2
3
0
1
0
1
0
1
0
1
0
1
0
1
2
0
1
2
0
1
2


Unnamed: 0,CHROM,POS,QUAL,GT,GQ,DP,PS,REF,ALT,ROI,PS_R
3,chr12,10183,8.77,1|0,8,52,10183,T,A,chr12:1-11895386,chr12_10183
4,chr12,10185,7.52,1|0,7,52,10183,C,A,chr12:1-11895386,chr12_10183
5,chr12,10194,10.42,1|0,10,52,10183,A,T,chr12:1-11895386,chr12_10183
17,chr12,10491,5.05,1|0,5,55,10183,A,T,chr12:1-11895386,chr12_10183
18,chr12,10649,14.34,1|0,14,71,10183,G,A,chr12:1-11895386,chr12_10183
...,...,...,...,...,...,...,...,...,...,...,...
268495,chr12,133241504,54.35,0|1,54,44,133158779,A,G,chr12:128308614-133275309,chr12_133158779
268496,chr12,133244280,58.20,0|1,58,47,133158779,G,A,chr12:128308614-133275309,chr12_133158779
268498,chr12,133249425,53.87,0|1,53,49,133158779,G,T,chr12:128308614-133275309,chr12_133158779
268499,chr12,133253066,55.29,0|1,55,49,133158779,G,T,chr12:128308614-133275309,chr12_133158779


In [19]:
df[df["PS"] == "97397768"]

Unnamed: 0,CHROM,POS,QUAL,GT,GQ,DP,PS,REF,ALT,ROI,PS_R
205158,chr12,97397768,62.16,0|1,62,64,97397768,G,A,chr12:94108839-97430502,chr12_97397768.A
205159,chr12,97397787,66.45,0|1,66,64,97397768,A,G,chr12:94108839-97430502,chr12_97397768.A
205160,chr12,97397885,55.36,0|1,55,65,97397768,G,A,chr12:94108839-97430502,chr12_97397768.A
205161,chr12,97397979,61.68,1|0,61,65,97397768,T,C,chr12:94108839-97430502,chr12_97397768.A
205162,chr12,97398043,58.55,0|1,58,65,97397768,A,G,chr12:94108839-97430502,chr12_97397768.A
...,...,...,...,...,...,...,...,...,...,...,...
210749,chr12,101499537,60.54,1|0,60,53,97397768,A,G,chr12:98032956-115177215,chr12_97397768.B
210750,chr12,101499948,58.30,1|0,58,52,97397768,A,G,chr12:98032956-115177215,chr12_97397768.B
210753,chr12,101500504,56.29,1|0,56,54,97397768,C,T,chr12:98032956-115177215,chr12_97397768.B
210755,chr12,101504306,70.21,0|1,70,60,97397768,G,A,chr12:98032956-115177215,chr12_97397768.B


In [None]:
def split_ps_tags(df):
    ### This function adds A B C etc to sub-haploblocks (haploblocks that are defined by multiple ROIs)
    new_ps = [""] * len(df)
    print(df)
    for ps, group in df.groupby("PS", dropna=False):
        print(ps)
        print(group)
        rois = group["ROI"].dropna().unique()
        print(rois)
        ### Single or no ROI → PS unchanged
        if len(rois) <= 1:
            print(group.index)
            for idx in group.index:
                print(idx)
                print(ps)
                new_ps[idx] = ps
            continue
        ### Multiple ROIs → assign suffixes A, B, C...
        roi_to_suffix = {
            roi: string.ascii_uppercase[i]
            for i, roi in enumerate(sorted(rois))}
        for idx, row in group.iterrows():
            if pd.isna(row["ROI"]):
                new_ps[idx] = ps
            else:
                new_ps[idx] = f"{ps}.{roi_to_suffix[row['ROI']]}"
    return new_ps

In [6]:
### RUN scripts

# OMIM test set of 50000 variants
variants="/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/test_nh.vcf"
all_variants="/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SAMPLE_renamed.vcf.gz"
phased_variants="/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/phased_ROI_HG2.vcf.gz"
bed="/hpc/umc_laat/gvandersluis/data/Ref_HG/HG_OMIM_ROI_merged.bed"
samp="2"
vers="OMIM"
grouped = hb_maker(variants,bed) # maar van all_variants de header version van de variants
grouped
# grouped.to_csv(f"/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG00{samp}/{vers}_ROI/final_df.csv", index=False)

# grouped.sort_values(["chromosome", "PS_tag"]).head(20)
# grouped

Unnamed: 0,CHROM,POS,QUAL,GT,GQ,DP,PS,REF,ALT,ROI
3,chr12,10183,8.77,1|0,8,52,10183,T,A,chr12:1-11895386
4,chr12,10185,7.52,1|0,7,52,10183,C,A,chr12:1-11895386
5,chr12,10194,10.42,1|0,10,52,10183,A,T,chr12:1-11895386
17,chr12,10491,5.05,1|0,5,55,10183,A,T,chr12:1-11895386
18,chr12,10649,14.34,1|0,14,71,10183,G,A,chr12:1-11895386
...,...,...,...,...,...,...,...,...,...,...
268495,chr12,133241504,54.35,0|1,54,44,133158779,A,G,chr12:128308614-133275309
268496,chr12,133244280,58.20,0|1,58,47,133158779,G,A,chr12:128308614-133275309
268498,chr12,133249425,53.87,0|1,53,49,133158779,G,T,chr12:128308614-133275309
268499,chr12,133253066,55.29,0|1,55,49,133158779,G,T,chr12:128308614-133275309


In [None]:
grouped

In [16]:
df_vcf

<pysam.libcbcf.VariantFile at 0x7fb777ed2950>

In [23]:
grouped[grouped["PS_tag"].astype(int) == 16514442]

Unnamed: 0,PS_tag,START_HB,END_HB,informative_variants,chromosome,QUAL_mean,GENES_in_HB,HB_length,total_VARS
47,16514442,16514442,21189725,5755,chr12,53.974622,"[GOT2P4, GOT2P4-LMO3, LMO3, LMO3-LOC105369677,...",4675283,


In [27]:

out_vcf = f"/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/16514442_BM.vcf"
cmd = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "view", "-r", "chr12:16514442-21189725", "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/HG002_BM_SSANDT_rn.vcf", "-Oz", "-o", out_vcf]
subprocess.run(cmd, check=True)
indexer = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "index", "-t", out_vcf]
subprocess.run(indexer, check=True)

### Make specific region version of the phased vcf file (Prepare file for whatshap compare)
out_ph_vars_vcf = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/16514442_phased_vars.vcf"
cmd_1 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "view", "-r", "chr12:16514442-21189725", "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/phased_ROI_HG2.vcf.gz", "-Oz", "-o", out_ph_vars_vcf]
subprocess.run(cmd_1, check=True)
indexer_1 = ["apptainer", "exec", "-B", "/hpc/:/hpc/", "/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif", "bcftools", "index", "-t", out_ph_vars_vcf]
subprocess.run(indexer_1, check=True)


CompletedProcess(args=['apptainer', 'exec', '-B', '/hpc/:/hpc/', '/hpc/umc_laat/gvandersluis/software/bcftools_v1.9-1-deb_cv1.sif', 'bcftools', 'index', '-t', '/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/16514442_phased_vars.vcf'], returncode=0)

In [28]:
pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/16514442_phased_vars.vcf")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

In [26]:
bm_roi, grouped = run_switch(grouped, all_variants, samp, vers, phased_variants)
bm_roi

chr1:1000552-1079306
1000552 	 Done
1
chr1:10117019-10283590
10117019 	 Done
2
chr1:10326684-10370590
10326684 	 Done
3
chr1:10424359-10512026
10424359 	 Done
4
chr1:10590287-11036365
10590287 	 Done
5
chr1:11083631-11206194
11083631 	 Done
6
chr1:1118005-1174690
1118005 	 Done
7
chr1:11264462-11356791
11264462 	 Done
8
chr1:11407560-12218875
11407560 	 Done
9
chr1:1216550-1295621
1216550 	 Done
10
chr1:12269238-12902628
12269238 	 Done
11
chr1:13021831-13050669
13021831 	 Done
12
chr1:13104329-13136798
13104329 	 Done
13
chr1:13183818-13265675
13183818 	 Done
14
chr1:13330460-13884604
13330460 	 Done
15
chr1:13942278-16569139
13942278 	 Done
16
chr1:1406214-2432709
1406214 	 Done
17
chr1:16011572-16683109
16011572 	 Done
18
chr1:16755040-16773626
16755040 	 Done
19
chr1:16859997-18039368
16859997 	 Done
20
chr1:18096829-18283908
18096829 	 Done
21
chr1:18349782-19648300
18349782 	 Done
22
chr1:19700469-19999168
19700469 	 Done
23
chr1:20049791-20176268
20049791 	 Done
24
chr1:20279975

'/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/ROI_eval.tsv'

In [9]:
sw_e = pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/ROI_eval.tsv", sep="\t")
sw_e.sort_values("all_switches")

Unnamed: 0,#sample,chromosome,dataset_name0,dataset_name1,file_name0,file_name1,intersection_blocks,covered_variants,all_assessed_pairs,all_switches,...,largestblock_switches,largestblock_switch_rate,largestblock_switchflips,largestblock_switchflip_rate,largestblock_hamming,largestblock_hamming_rate,largestblock_diff_genotypes,largestblock_diff_genotypes_rate,het_variants0,only_snvs
0,hg002,chr1,BENCHMARK,1000552,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,3,2,0,...,0,0.000000,0/0,0.000000,0,0.000000,0,0.0,5,0
1,hg002,chr1,BENCHMARK,10117019,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,13,12,0,...,0,0.000000,0/0,0.000000,0,0.000000,0,0.0,19,0
2,hg002,chr1,BENCHMARK,10326684,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,2,1,0,...,0,0.000000,0/0,0.000000,0,0.000000,0,0.0,4,0
3,hg002,chr1,BENCHMARK,10424359,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,7,6,0,...,0,0.000000,0/0,0.000000,0,0.000000,0,0.0,12,0
4,hg002,chr1,BENCHMARK,10590287,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,178,177,0,...,0,0.000000,0/0,0.000000,0,0.000000,0,0.0,276,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,hg002,chr1,BENCHMARK,4655077,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,101,100,2,...,2,0.020000,0/1,0.010000,1,0.009901,0,0.0,151,0
37,hg002,chr1,BENCHMARK,30225136,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,864,863,2,...,2,0.002317,0/1,0.001159,1,0.001157,0,0.0,1313,0
97,hg002,chr1,BENCHMARK,56185733,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,2363,2362,2,...,2,0.000847,2/0,0.000847,2,0.000846,0,0.0,4014,0
98,hg002,chr1,BENCHMARK,5641831,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,/hpc/umc_laat/gvandersluis/data/Ont_data_nhung...,1,1774,1773,2,...,2,0.001128,0/1,0.000564,1,0.000564,0,0.0,2877,0


In [19]:
sw_e = pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/OMIM_ROI/ROI_eval.tsv", sep="\t")
sw_e.sort_values("all_switches")

### Format the switch error file
sw_e = sw_e[["dataset_name1","het_variants0","all_switches","all_switch_rate","all_switchflips","all_switchflip_rate","blockwise_hamming_rate"]]
sw_e["Accuracy"] = (100-sw_e["blockwise_hamming_rate"]).astype(str)+"%"
sw_e.drop(columns="blockwise_hamming_rate")
sw_e.rename(columns={"dataset_name1": "PS_tag"}, inplace=True)
sw_e = pd.merge(grouped, sw_e, on="PS_tag", how="left")
sw_e.sort_values("all_switches").tail(6)
# ### Merge Hapoblocks with their switch errors
# basis_df["GENE_l"] = basis_df["GENE"]+"_"+basis_df["HB_START"].astype(str)    
# switch_df = pd.merge(basis_df, sw_e, on="GENE_l", how="left")
# switch_df["difference_het-vars_BM_vs_data"] = abs(switch_df["PHASED_VARIANTS"]-switch_df["het_variants0"])
# switch_df = switch_df.drop(columns="GENE_l")

#     ### Write merged dataframe to file
#     switch_df.to_csv(f"/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG00{samp}/{version}_ROI/final_df.csv", index=False)
    


Unnamed: 0,PS_tag,START_HB,END_HB,informative_variants,chromosome,QUAL_mean,GENES_in_HB,HB_length,total_VARS,het_variants0,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,blockwise_hamming_rate,Accuracy
38,30225136,30225136,31231795,1134,chr1,57.033616,"[LOC105378617-LOC105378618, LOC105378618, LOC1...",1006659,,1313.0,2.0,0.002317,0/1,0.001159,0.001157,99.9988425925926%
98,56185733,56185733,59816040,3314,chr1,56.832764,"[LOC105378741-RPSAP20, RPSAP20-LINC01767, LINC...",3630307,,4014.0,2.0,0.000847,2/0,0.000847,0.000846,99.99915361828185%
99,5641831,5641831,8828783,2394,chr1,56.887686,"[LOC105376686-MIR4689, MIR4689, NPHP4, KCNAB2,...",3186952,,2877.0,2.0,0.001128,0/1,0.000564,0.000564,99.99943630214206%
135,78409421,78409421,78990830,479,chr1,57.100125,"[RNA5SP23-PTGFR, PTGFR, PTGFR-IFI44L, IFI44L, ...",581409,,556.0,2.0,0.005988,0/1,0.002994,0.002985,99.99701492537314%
18,16755040,16755040,16773626,22,chr1,47.836818,[MST1L],18586,,,,,,,,
110,655624,655624,693461,3,chr1,48.973333,"[MTCO3P12-WBP1LP6, OR4F16]",37837,,,,,,,,
