In [24]:
from cyvcf2 import VCF, Writer
import pandas as pd
import numpy as np

In [48]:
### Get the Benchmark file
truth = pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/WF_H_VAR_ROI/HG002_BM_SSANDT_rn.vcf", compression="gzip", comment="#", sep = "\t", names=["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample"]).drop("INFO", axis=1)

In [26]:
### Get and generate the necessary columns
truth["GT_BM"] = truth["sample"].str.split(":").str[0]
truth["tag"] = truth["CHROM"].astype(str)+"_"+truth["POS"].astype(str)
short_truth = truth[["tag", "GT_BM"]]
short_truth

Unnamed: 0,tag,GT_BM
0,chr1_602113,1/1
1,chr1_604358,1/1
2,chr1_604360,1/1
3,chr1_611317,1/1
4,chr1_631859,1/1
...,...,...
4048337,chr22_50791171,1|0
4048338,chr22_50791276,0/1
4048339,chr22_50791289,0/1
4048340,chr22_50792075,1|0


In [27]:
### Remove the variants that contain more variation possibilities '2' and / (unphased variants)
short_truth = short_truth[(short_truth["GT_BM"].str.contains("2") == False) & (short_truth["GT_BM"].str.contains("/") == False)]
short_truth

Unnamed: 0,tag,GT_BM
138,chr1_885657,1|0
161,chr1_908025,1|0
170,chr1_916010,0|1
204,chr1_928128,0|1
208,chr1_928253,0|1
...,...,...
4048329,chr22_50787086,1|0
4048331,chr22_50787916,1|0
4048333,chr22_50789463,1|0
4048337,chr22_50791171,1|0


In [28]:
### Load the variant files, collect the necessary columns and remove unphased variants
vcf = pd.read_csv('/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/roi_SUP_v5.2.wf_snp.vcf.gz', comment="#", sep = "\t", names=["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "sample"])
vcf["GT"] = vcf["sample"].str.split(":").str[0]
vcf = vcf[vcf["GT"].str.contains("/") == False]
vcf["Cor_phased"]="unknown"
vcf["PS_tag"] = vcf["CHROM"].astype(str)+"_"+vcf["sample"].str.split(":").str[5]
vcf["tag"] = vcf["CHROM"].astype(str)+"_"+vcf["POS"].astype(str)

vcf


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,GT,Cor_phased,PS_tag,tag
0,chr1,434272,.,A,C,10.63,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:10:31:19,11:0.3548:413640",0|1,unknown,chr1_413640,chr1_434272
2,chr1,434287,.,G,A,13.04,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:13:31:17,12:0.3871:413640",1|0,unknown,chr1_413640,chr1_434287
3,chr1,434292,.,T,C,15.32,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:15:31:17,13:0.4194:413640",0|1,unknown,chr1_413640,chr1_434292
4,chr1,434451,.,G,A,17.59,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:17:31:18,13:0.4194:413640",0|1,unknown,chr1_413640,chr1_434451
5,chr1,434504,.,G,A,15.38,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:15:31:19,12:0.3871:413640",0|1,unknown,chr1_413640,chr1_434504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3909342,chrY,25064188,.,G,A,9.68,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:9:23:16,6:0.2609:25064188",0|1,unknown,chrY_25064188,chrY_25064188
3909343,chrY,25065902,.,G,T,5.01,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:5:29:16,13:0.4483:25064188",0|1,unknown,chrY_25064188,chrY_25065902
3909524,chrY,26088251,.,TTATA,T,19.39,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:19:21:0,18:0.8571:26116800",1|0,unknown,chrY_26116800,chrY_26088251
3909532,chrY,26116800,.,C,G,9.34,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:9:7:2,4:0.5714:26116800",1|0,unknown,chrY_26116800,chrY_26116800


In [29]:
### Merge the Benchmark file with the VCF file on the generated location tag 'tag'
### Turn all variants of the vcf that are in the benchmark to True. 
### For haploblocks without switches, this will mean that all variants are correctly phased that are found in the benchmark file
complete = pd.merge(vcf, short_truth, on="tag", how="left")
complete.loc[complete["GT_BM"].notna(), "Cor_phased"] = "True"
complete

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,sample,GT,Cor_phased,PS_tag,tag,GT_BM
0,chr1,434272,.,A,C,10.63,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:10:31:19,11:0.3548:413640",0|1,unknown,chr1_413640,chr1_434272,
1,chr1,434287,.,G,A,13.04,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:13:31:17,12:0.3871:413640",1|0,unknown,chr1_413640,chr1_434287,
2,chr1,434292,.,T,C,15.32,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:15:31:17,13:0.4194:413640",0|1,unknown,chr1_413640,chr1_434292,
3,chr1,434451,.,G,A,17.59,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:17:31:18,13:0.4194:413640",0|1,unknown,chr1_413640,chr1_434451,
4,chr1,434504,.,G,A,15.38,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:15:31:19,12:0.3871:413640",0|1,unknown,chr1_413640,chr1_434504,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292089,chrY,25064188,.,G,A,9.68,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:9:23:16,6:0.2609:25064188",0|1,unknown,chrY_25064188,chrY_25064188,
2292090,chrY,25065902,.,G,T,5.01,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:5:29:16,13:0.4483:25064188",0|1,unknown,chrY_25064188,chrY_25065902,
2292091,chrY,26088251,.,TTATA,T,19.39,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:19:21:0,18:0.8571:26116800",1|0,unknown,chrY_26116800,chrY_26088251,
2292092,chrY,26116800,.,C,G,9.34,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:9:7:2,4:0.5714:26116800",1|0,unknown,chrY_26116800,chrY_26116800,


In [45]:
### From the haploblock file, identify the PS_tags that contain switches. 
### For these haploblocks the variants need to be determined if they are correctly phased
v5_2_new = pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/Haploblock_switches.csv")
ps_tags = v5_2_new[(v5_2_new["all_switches"] > 0) & (v5_2_new["all_switches"].notna())].sort_values("phased_variants")["PS_tag"].unique()

ps_tags[5:10]
v5_2_new[v5_2_new["PS_tag"].isin(ps_tags[5:10])]


Unnamed: 0,PS,START_HB,END_HB,phased_variants,chromosome,GQ_mean,GENES_in_HB,HB_length,total_VARS,PS_tag,het_variants0,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,blockwise_hamming_rate,Accuracy
296,113958227,113958227,114102696,39,chr7,30.615385,[nan],144469,269,chr7_113958227,5.0,1.0,0.333333,1/0,0.333333,0.25,0.75%
643,13708195,13708195,14025356,27,chr20,45.333333,[nan],317161,401,chr20_13708195,22.0,1.0,0.058824,1/0,0.058824,0.166667,0.8333333333333334%
1359,219155409,219152789,219329415,27,chr2,33.851852,[nan],176626,147,chr2_219155409,15.0,1.0,0.083333,1/0,0.083333,0.230769,0.7692307692307693%
2935,76634488,76615492,76851237,41,chr15,43.268293,[nan],235745,247,chr15_76634488,28.0,1.0,0.041667,1/0,0.041667,0.08,0.92%
3143,86305043,86305043,86482067,21,chr12,41.52381,[nan],177024,187,chr12_86305043,18.0,2.0,0.133333,0/1,0.066667,0.0625,0.9375%


In [31]:

count = 0
### Run through all phases tags (haploblocks) in which switches are present
for i in ps_tags[5:10]:
    count += 1
    print(count)
    ### Filter rows of the dataframe that contain the PS tag and are not NaN
    subset = complete[(complete["PS_tag"] == i) & (complete["GT_BM"].notna())]
    ### For variant in this cluster if it is not empty:
    if not subset.empty:
        for index, item in subset.iterrows():
            ### To check if the variants are phased correctly according to the benchmark an on which allele they are,
            ### an X or O is connected to the variant. And - as a check to see if something else is up
            if item["GT"] == item["GT_BM"]:
                complete.loc[index, "Cor_phased"] = "X"
            elif item["GT"] == item["GT_BM"][::-1]:
                complete.loc[index, "Cor_phased"] = "O"
            else:
                complete.loc[index, "Cor_phased"] = "-"
    ### Per haploblock we can now calculate the Hemming rate.
    tot = complete[complete["PS_tag"] == i].groupby("Cor_phased").agg("count")
    tot_x = tot.loc[["X"],"GT"]
    tot_o = tot.loc[["O"],"GT"]
    total_vars = v5_2_new[v5_2_new["PS_tag"] == i]["het_variants0"]
    off_hemm_rate = v5_2_new[v5_2_new["PS_tag"] == i]["blockwise_hamming_rate"]
    hemm_rate_x = tot_x.loc["X"]/(tot_x.loc["X"] + tot_o.loc["O"])
    hemm_rate_o = tot_o.loc["O"]/(tot_x.loc["X"] + tot_o.loc["O"])

    ### If the hemming rate is higher with the X, then O variants are the switches and the other way around. 
    ### If the rate is 0.50, it is not possible to know which are phased correctly
    if round(hemm_rate_x,2) == 0.50:
        complete.loc[(complete["PS_tag"] == i) & (complete["GT_BM"].notna()), "Cor_phased"] = "UNKNOWN"
    elif round(hemm_rate_x,4) > round(hemm_rate_o,4):
        complete.loc[(complete["PS_tag"] == i) & (complete["Cor_phased"] == "O" ), "Cor_phased"] = "False"
        complete.loc[(complete["PS_tag"] == i) & (complete["Cor_phased"] == "X" ), "Cor_phased"] = "True"
    else:
        complete.loc[(complete["PS_tag"] == i) & (complete["Cor_phased"] == "X" ), "Cor_phased"] = "False"
        complete.loc[(complete["PS_tag"] == i) & (complete["Cor_phased"] == "O" ), "Cor_phased"] = "True"



1
2
3
4
5


In [32]:
complete.to_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.csv", sep='\t', index=False)

In [12]:
comp_tab[comp_tab["Cor_phased"] == "-"]

Unnamed: 0.1,Unnamed: 0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,Cor_phased,GT,PS_tag,tag,GT_BM,ref_y,alt_y
93463,93463,chr1,114647173,.,G,A,13.79,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:13:57:11,26:0.4561:114486666",-,1|0,chr1_114486666,chr1_114647173,1/1,G,A
172380,172380,chr1,234278517,.,G,A,5.66,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:5:63:0,38:0.6032:233144051",-,0|1,chr1_233144051,chr1_234278517,1/1,G,A
1693914,1693914,chr13,71553444,.,C,A,3.33,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:3:41:0,26:0.6341:67041383",-,1|0,chr13_67041383,chr13_71553444,1/1,C,A
1948137,1948137,chr17,2493718,.,C,CA,12.61,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:12:40:5,13:0.325:506523",-,1|0,chr17_506523,chr17_2493718,1/1,C,CA
2071674,2071674,chr18,70688336,.,C,T,4.52,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:4:36:8,20:0.5556:69194727",-,1|0,chr18_69194727,chr18_70688336,1/1,C,T


In [36]:
i = "chr16_80524674"
v5_2_new[v5_2_new["PS_tag"] == i]

Unnamed: 0,PS,START_HB,END_HB,phased_variants,chromosome,GQ_mean,GENES_in_HB,HB_length,total_VARS,PS_tag,het_variants0,all_switches,all_switch_rate,all_switchflips,all_switchflip_rate,blockwise_hamming_rate,Accuracy
3036,80524674,80524674,87365010,11576,chr16,52.91612,[nan],6840336,19260,chr16_80524674,10381.0,2.0,0.000271,0/1,0.000136,0.000136,0.9998644619137979%


In [37]:
complete[complete["PS_tag"] == i].groupby("Cor_phased").agg("count")

Unnamed: 0_level_0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,GT,PS_tag,tag,GT_BM,ref_y,alt_y
Cor_phased,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
False,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
True,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377,7377
unknown,4198,4198,4198,4198,4198,4198,4198,4198,4198,4198,4198,4198,4198,0,0,0


In [38]:
complete[(complete["PS_tag"] == i)].drop("tag", axis=1)#["GT_BM"].unique()

Unnamed: 0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,Cor_phased,GT,PS_tag,GT_BM,ref_y,alt_y
1929589,chr16,80524674,.,C,A,33.77,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:33:50:33,15:0.3:80524674",True,0|1,chr16_80524674,0|1,C,A
1929590,chr16,80525096,.,A,G,13.37,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:13:52:35,16:0.3077:80524674",True,0|1,chr16_80524674,0|1,A,G
1929591,chr16,80525762,.,T,A,46.81,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:46:49:33,16:0.3265:80524674",True,0|1,chr16_80524674,0|1,T,A
1929592,chr16,80525816,.,C,T,57.22,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:57:49:15,32:0.6531:80524674",True,1|0,chr16_80524674,1|0,C,T
1929593,chr16,80526026,.,C,G,47.61,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:47:49:15,33:0.6735:80524674",True,1|0,chr16_80524674,1|0,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941160,chr16,87347460,.,T,C,62.08,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:62:59:28,30:0.5085:80524674",True,0|1,chr16_80524674,0|1,T,C
1941161,chr16,87348238,.,G,A,60.79,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:60:58:28,30:0.5172:80524674",True,0|1,chr16_80524674,0|1,G,A
1941162,chr16,87349381,.,C,T,61.83,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:61:63:31,32:0.5079:80524674",True,0|1,chr16_80524674,0|1,C,T
1941163,chr16,87350052,.,T,C,69.69,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:69:64:32,32:0.5:80524674",unknown,0|1,chr16_80524674,,,


In [39]:
complete[(complete["PS_tag"] == i) & (complete["Cor_phased"] == "False")].drop("tag", axis=1).sort_values("pos")

Unnamed: 0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,Cor_phased,GT,PS_tag,GT_BM,ref_y,alt_y
1934003,chr16,83518437,.,G,A,66.93,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:66:42:16,26:0.619:80524674",False,0|1,chr16_80524674,1|0,G,A


In [40]:
complete[(complete["PS_tag"] == i) & (complete["GT_BM"].notna())]


Unnamed: 0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,Cor_phased,GT,PS_tag,tag,GT_BM,ref_y,alt_y
1929589,chr16,80524674,.,C,A,33.77,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:33:50:33,15:0.3:80524674",True,0|1,chr16_80524674,chr16_80524674,0|1,C,A
1929590,chr16,80525096,.,A,G,13.37,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:13:52:35,16:0.3077:80524674",True,0|1,chr16_80524674,chr16_80525096,0|1,A,G
1929591,chr16,80525762,.,T,A,46.81,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:46:49:33,16:0.3265:80524674",True,0|1,chr16_80524674,chr16_80525762,0|1,T,A
1929592,chr16,80525816,.,C,T,57.22,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:57:49:15,32:0.6531:80524674",True,1|0,chr16_80524674,chr16_80525816,1|0,C,T
1929593,chr16,80526026,.,C,G,47.61,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:47:49:15,33:0.6735:80524674",True,1|0,chr16_80524674,chr16_80526026,1|0,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941157,chr16,87344608,.,T,G,63.80,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:63:59:28,28:0.4746:80524674",True,0|1,chr16_80524674,chr16_87344608,0|1,T,G
1941158,chr16,87344782,.,G,C,62.61,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:62:59:31,28:0.4746:80524674",True,0|1,chr16_80524674,chr16_87344782,0|1,G,C
1941160,chr16,87347460,.,T,C,62.08,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:62:59:28,30:0.5085:80524674",True,0|1,chr16_80524674,chr16_87347460,0|1,T,C
1941161,chr16,87348238,.,G,A,60.79,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:60:58:28,30:0.5172:80524674",True,0|1,chr16_80524674,chr16_87348238,0|1,G,A


In [41]:
complete[complete["PS_tag"] == i]

Unnamed: 0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,Cor_phased,GT,PS_tag,tag,GT_BM,ref_y,alt_y
1929589,chr16,80524674,.,C,A,33.77,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:33:50:33,15:0.3:80524674",True,0|1,chr16_80524674,chr16_80524674,0|1,C,A
1929590,chr16,80525096,.,A,G,13.37,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:13:52:35,16:0.3077:80524674",True,0|1,chr16_80524674,chr16_80525096,0|1,A,G
1929591,chr16,80525762,.,T,A,46.81,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:46:49:33,16:0.3265:80524674",True,0|1,chr16_80524674,chr16_80525762,0|1,T,A
1929592,chr16,80525816,.,C,T,57.22,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:57:49:15,32:0.6531:80524674",True,1|0,chr16_80524674,chr16_80525816,1|0,C,T
1929593,chr16,80526026,.,C,G,47.61,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:47:49:15,33:0.6735:80524674",True,1|0,chr16_80524674,chr16_80526026,1|0,C,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1941160,chr16,87347460,.,T,C,62.08,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:62:59:28,30:0.5085:80524674",True,0|1,chr16_80524674,chr16_87347460,0|1,T,C
1941161,chr16,87348238,.,G,A,60.79,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:60:58:28,30:0.5172:80524674",True,0|1,chr16_80524674,chr16_87348238,0|1,G,A
1941162,chr16,87349381,.,C,T,61.83,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:61:63:31,32:0.5079:80524674",True,0|1,chr16_80524674,chr16_87349381,0|1,C,T
1941163,chr16,87350052,.,T,C,69.69,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:69:64:32,32:0.5:80524674",unknown,0|1,chr16_80524674,chr16_87350052,,,


In [112]:
tel=0
for index, i in complete.iterrows():
    if i["GT_BM"].notna():
        if i["GT"] == i["GT_BM"]:
            vcf.loc[index, "Cor_phased"] = "X"
        elif i["GT"] == truth.loc[truth["ID"] == i["tag"],"GT"].iloc[0][::-1]:
            vcf.loc[index, "Cor_phased"] = "O"
        else:
            vcf.loc[index, "Cor_phased"] = "-"

        tel += 1
        print(tel)
vcf

AttributeError: 'float' object has no attribute 'notna'

In [42]:
vcf[vcf["Cor_phased"] == "O"]


Unnamed: 0,chrom,pos,id,ref,alt,qual,filter,info,format,sample,Cor_phased,GT,PS,tag
127,chr1,885657,.,C,T,68.1,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:68:38:17,21:0.5526:789481",O,0|1,789481,chr1_885657
129,chr1,908025,.,A,G,50.8,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:50:40:20,18:0.45:789481",O,0|1,789481,chr1_908025
130,chr1,916010,.,G,T,61.18,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:61:38:18,18:0.4737:789481",O,1|0,789481,chr1_916010
131,chr1,928128,.,G,T,47.14,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:47:43:29,14:0.3256:789481",O,1|0,789481,chr1_928128
132,chr1,928253,.,G,A,50.02,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:50:43:29,14:0.3256:789481",O,1|0,789481,chr1_928253
133,chr1,933038,.,C,T,63.44,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:63:49:19,30:0.6122:789481",O,0|1,789481,chr1_933038
138,chr1,940387,.,T,G,49.05,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:49:56:33,23:0.4107:789481",O,1|0,789481,chr1_940387
139,chr1,940820,.,G,C,67.2,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:67:55:32,23:0.4182:789481",O,1|0,789481,chr1_940820
140,chr1,942934,.,G,C,65.09,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:65:50:21,26:0.52:789481",O,0|1,789481,chr1_942934
141,chr1,946870,.,C,A,51.09,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:51:44:22,22:0.5:789481",O,0|1,789481,chr1_946870


In [4]:
comp_tab = pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.csv", sep='\t')

In [7]:
ps = 0

comp_tab = pd.read_csv("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.csv", sep='\t')

cor_ph = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.vcf.gz"
vcf_f = VCF("/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/roi_SUP_v5.2.wf_snp.vcf.gz")
vcf_f.add_format_to_header({
    'ID': 'Cor_phasing',
    'Description': 'indicates if the variant is switch error causing',
    'Type': 'String',
    'Number': '1'
})

w = Writer(comp_tab, vcf_f)
for var in comp_tab:
    w.write_record(var)
w.close()
vcf_f.close()

# num_samples = len(vcf_f.samples)
# cor_ph = []
# for record in vcf_f:
#     print(record.samples)
#     record.set_format("Cor_phasing", np.array(["UNKNWN"] * num_samples, dtype="S"))
#     if truth[truth['ID'] == f"{record.CHROM}_{record.POS}"].empty == False:
#         print(record.FORMAT['GT'].astype(str))
#         if record.FORMAT['GT'].astype(str) == truth.loc[truth["ID"] == f"{record.CHROM}_{record.POS}","GT"].iloc[0]:
#             cor_ph.append("X")
#         elif record.FORMAT['GT'].astype(str) == truth.loc[truth["ID"] == f"{record.CHROM}_{record.POS}","GT"].iloc[0][::-1]:
#             cor_ph.append("O")
#         else:
#             cor_ph.append("-")
#         print(record)
#     #print(f"{record.CHROM}:{record.POS} {record.REF}>{record.ALT}")


TypeError: Argument 'var' has incorrect type (expected cyvcf2.cyvcf2.Variant, got str)

In [None]:
np.array(["UNKNWN"] * num_samples, dtype="S")

In [13]:
comp_tab.drop(("ref_y", "alt_y"), axis=1)

Unnamed: 0.1,Unnamed: 0,chrom,pos,id,ref_x,alt_x,qual,filter,info,format,sample,Cor_phased,GT,PS_tag,tag,GT_BM,ref_y,alt_y
0,0,chr1,434272,.,A,C,10.63,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:10:31:19,11:0.3548:413640",unknown,0|1,chr1_413640,chr1_434272,,,
1,1,chr1,434287,.,G,A,13.04,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:13:31:17,12:0.3871:413640",unknown,1|0,chr1_413640,chr1_434287,,,
2,2,chr1,434292,.,T,C,15.32,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:15:31:17,13:0.4194:413640",unknown,0|1,chr1_413640,chr1_434292,,,
3,3,chr1,434451,.,G,A,17.59,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:17:31:18,13:0.4194:413640",unknown,0|1,chr1_413640,chr1_434451,,,
4,4,chr1,434504,.,G,A,15.38,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:15:31:19,12:0.3871:413640",unknown,0|1,chr1_413640,chr1_434504,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2292089,2292089,chrY,25064188,.,G,A,9.68,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:9:23:16,6:0.2609:25064188",unknown,0|1,chrY_25064188,chrY_25064188,,,
2292090,2292090,chrY,25065902,.,G,T,5.01,PASS,F,GT:GQ:DP:AD:AF:PS,"0|1:5:29:16,13:0.4483:25064188",unknown,0|1,chrY_25064188,chrY_25065902,,,
2292091,2292091,chrY,26088251,.,TTATA,T,19.39,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:19:21:0,18:0.8571:26116800",unknown,1|0,chrY_26116800,chrY_26088251,,,
2292092,2292092,chrY,26116800,.,C,G,9.34,PASS,F,GT:GQ:DP:AD:AF:PS,"1|0:9:7:2,4:0.5714:26116800",unknown,1|0,chrY_26116800,chrY_26116800,,,


In [33]:
comp_tab = pd.read_csv(
    "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.csv",
    sep="\t"
)

In [36]:
comp_tab = comp_tab[comp_tab["GT_BM"].notna()]

In [41]:

# # Load modified table
# comp_tab = pd.read_csv(
#     "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.csv",
#     sep="\t"
# )
comp_tab = comp_tab.copy()

# Index for fast lookup
comp_tab["key"] = list(
    zip(comp_tab.CHROM, comp_tab.POS, comp_tab.REF, comp_tab.ALT)
)
lookup = dict(zip(comp_tab.key, comp_tab.Cor_phased))

# Open original VCF
vcf_f = VCF(
    "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/roi_SUP_v5.2.wf_snp.vcf.gz"
)

vcf_f.add_format_to_header({
    "ID": "Cor_phased",
    "Description": "indicates if the variant is switch error causing",
    "Type": "String",
    "Number": "1"
})

writer = Writer(
    "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.vcf.gz",
    vcf_f
)

# Transfer DataFrame data → VCF
for var in vcf_f:
    key = (var.CHROM, var.POS, var.REF, var.ALT[0])

    value = lookup.get(key, "NA")

    var.set_format(
        "Cor_phased",
        np.array([value.encode("ascii")], dtype="S")
    )

    writer.write_record(var)

writer.close()
vcf_f.close()


In [None]:

def save_vcf(vcff, outf, comp_tab):
    comp_tab = comp_tab.copy()

    ### Index for fast lookup
    comp_tab["key"] = list(
        zip(comp_tab.CHROM, comp_tab.POS, comp_tab.REF, comp_tab.ALT)
    )
    lookup = dict(zip(comp_tab.key, comp_tab.Cor_phased))

    ### Open original VCF
    vcf_f = VCF(vcff)

    ### Edit the header to include Cor_phased
    vcf_f.add_format_to_header({
        "ID": "Cor_phased",
        "Description": "indicates if the variant is switch error causing",
        "Type": "String",
        "Number": "1"
    })

    ### Write vcf to output file
    writer = Writer(outf, vcf_f)

    ### For every variant, check dataframe and add Cor_phased Column
    for var in vcf_f:
        key = (var.CHROM, var.POS, var.REF, var.ALT[0])
        value = lookup.get(key, "NA")
        var.set_format("Cor_phased", np.array([value.encode("ascii")], dtype="S"))
        writer.write_record(var)
    writer.close()
    vcf_f.close()

def main():
    bm_file = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/HG002_BM_SSANDT_rn.vcf.gz"
    vcf_file = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/roi_SUP_v5.2.wf_snp.vcf.gz"
    haploblock_file = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/Haploblock_switches.csv"
    output_file = "/hpc/umc_laat/gvandersluis/data/Ont_data_nhung/HG002/SUP_v5.2_ROI/cor_phased.vcf.gz"
    bench_t = mod_bm(bm_file)
    comp_vcf = mod_vcf(vcf_file, bench_t)
    inf_tags = informative_tags(haploblock_file)
    corr_phased = corr_phased_vars(comp_vcf, inf_tags)
    save_vcf(vcf_file, output_file, corr_phased)


()