In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import polars as pl
import seaborn as sns
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import importlib


pedigree_file_loc = 'resources/pedigrees/gatk_1kgp.ped'
summary_statistics_folder='intermediate_data'

if os.getcwd()!='phasing_T2T_project':
    os.chdir('..') # Change to the parent directory

In [2]:
import glob; glob.glob(f"{summary_statistics_folder}/*")

['intermediate_data/rolling_stats_500k_window.parquet',
 'intermediate_data/flips_and_switches.parquet',
 'intermediate_data/samples.parquet',
 'intermediate_data/compressed_ideogram_100000_window.parquet',
 'intermediate_data/variants.parquet',
 'intermediate_data/compressed_ideogram_1000000_window.parquet',
 'intermediate_data/per_genome_cnv_regions_with_slop.parquet',
 'intermediate_data/chroms.parquet',
 'intermediate_data/variant_frequency_stats',
 'intermediate_data/switch_errors.parquet',
 'intermediate_data/MAF_performance_variants.parquet',
 'intermediate_data/sample_genotype_concordance.parquet',
 'intermediate_data/per_sample_imputation_performance.parquet',
 'intermediate_data/rolling_stats_250k_window.parquet',
 'intermediate_data/bcftools_query_variant_data.parquet',
 'intermediate_data/per_genome_cnv_regions.parquet',
 'intermediate_data/per_variant_category_imputation_performance.parquet',
 'intermediate_data/per_MAF_bin.parquet',
 'intermediate_data/methods.parquet',
 

In [3]:
q = pd.read_parquet('intermediate_data/per_MAF_bin.parquet')
q = q[(q.rounded_MAF=='singleton')&(q.method_of_phasing=='phased_with_parents_and_pedigree')&(q.syntenic=='All')&(q.type=='SNPs + Indels')]
# q.type=='SNPs + Indels'
q

Unnamed: 0,type,rounded_MAF,genome,method_of_phasing,ground_truth_data_source,n_switch_errors,n_checked,n_gt_errors,n_gt_checked,MAC,AN,switch_error_rate,gt_error_rate,MAF,syntenic
862,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_samples,389.0,1505.0,3071,142515,2312,14806048,25.847176,2.154861,0.015615,All
1032,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HPRC_HGSVC_probands,4.0,109.0,3581,52836,777,4975908,3.669725,6.777576,0.015615,All
978,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_samples_nontrios_only,369.0,1123.0,2261,31863,1677,10739508,32.858415,7.096005,0.015615,All
901,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_probands,4.0,109.0,1607,20646,666,4265064,3.669725,7.78359,0.015615,All
1101,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_parents,20.0,274.0,1176,10192,784,5020736,7.29927,11.538462,0.015615,All
893,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HPRC_samples,,,1046,32658,851,5449804,,3.202891,0.015615,All
748,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HPRC_HGSVC_all_samples,393.0,1506.0,9450,267100,2671,17105084,26.095618,3.538001,0.015615,All


In [4]:
MAF_performance_variant_df = pl.read_parquet(f'{summary_statistics_folder}/MAF_performance_variants.parquet')
all_chroms = pd.read_parquet(f'{summary_statistics_folder}/chroms.parquet')
all_samples = pd.read_parquet(f'{summary_statistics_folder}/samples.parquet')
all_ancestries = pd.read_parquet(f'{summary_statistics_folder}/ancestries.parquet')
all_methods = pd.read_parquet(f'{summary_statistics_folder}/methods.parquet')
all_variants = pl.read_parquet(f'{summary_statistics_folder}/variants.parquet')

per_variant_category_imputation_performance = pd.read_parquet(f'{summary_statistics_folder}/per_variant_category_imputation_performance.parquet')
per_sample_imputation_performance = pd.read_parquet(f'{summary_statistics_folder}/per_sample_imputation_performance.parquet')

### Variant Filtering Stats 
Stats in paragraph after "Full length, telomere to telomere maps of human genetic recombination rates are largely consistent with deCODE recombination rates"

In [5]:
variant_filter_stats = pd.read_parquet(f"{summary_statistics_folder}/filter_summary_stats.parquet")
variant_filter_stats['gatk_filter'] = variant_filter_stats.VQSLOD_filter #| variant_filter_stats.neg_train_site_filter
chm13_variant_stats = variant_filter_stats.loc[(variant_filter_stats.genome=='CHM13v2.0')]
singletons = chm13_variant_stats.loc[chm13_variant_stats.singleton]
not_singletons = chm13_variant_stats.loc[~chm13_variant_stats.singleton]

num_affected_gatk = chm13_variant_stats.loc[~(chm13_variant_stats.GRCh38_criteria_fail) &
                                            # chm13_variant_stats.CHM13_criteria_fail &
                                            chm13_variant_stats.gatk_filter].len.sum()


singletons_gained = singletons.loc[chm13_variant_stats.GRCh38_criteria_fail & 
                                   ~(chm13_variant_stats.CHM13_criteria_fail | 
                                   variant_filter_stats.CHM13_filtered)].len.sum()
print(f"Our VQSLOD filter resulted in the loss of an additional \n\
{num_affected_gatk} \
variants compared to the NYGC’s GRCh38 1KGP variant filtering methods, but the inclusion of singleton alleles added \n\
{singletons_gained} \
unique variants to our dataset (Supplemental Figure 1).")
singletons

Our VQSLOD filter resulted in the loss of an additional 
14494016 variants compared to the NYGC’s GRCh38 1KGP variant filtering methods, but the inclusion of singleton alleles added 
111805 unique variants to our dataset (Supplemental Figure 1).


Unnamed: 0,genome,Syntenic,singleton,VQSLOD_filter,MERR_filter,HWE_pop_filter,MAC_filter,AC_filter,f_missing_filter,var_len_filter,alt_star_filter,pass_filter,CHM13_filtered,GRCh38_filtered,GRCh38_criteria_fail,CHM13_criteria_fail,len,gatk_filter
7,CHM13v2.0,False,True,False,False,False,False,True,False,False,False,False,True,True,True,False,106488,False
12,CHM13v2.0,True,True,False,False,False,False,True,True,True,False,False,True,True,True,True,1360,False
40,CHM13v2.0,False,True,True,False,False,False,True,False,False,False,False,True,True,True,True,589443,True
55,CHM13v2.0,True,True,False,False,False,False,False,True,False,False,False,True,True,True,True,3,False
90,CHM13v2.0,True,True,False,False,False,False,False,False,False,True,False,True,True,True,True,1,False
95,CHM13v2.0,True,True,True,False,False,False,False,False,False,True,False,True,True,True,True,12,True
104,CHM13v2.0,True,True,True,True,False,False,True,True,False,True,False,True,True,True,True,2,True
109,CHM13v2.0,True,True,False,False,False,False,True,False,False,True,False,True,True,True,True,441729,False
113,CHM13v2.0,True,True,False,False,False,False,True,True,False,False,False,True,True,True,True,31253,False
122,CHM13v2.0,True,True,True,False,False,False,True,True,False,True,False,True,True,True,True,10378,True


In [6]:
not_singletons.loc[(not_singletons.Syntenic)].groupby(['CHM13_filtered', 'CHM13_criteria_fail','gatk_filter']).len.sum()

CHM13_filtered  CHM13_criteria_fail  gatk_filter
False           False                False            213333
True            False                False          60511946
                True                 False           2032062
                                     True           17508360
Name: len, dtype: uint64

In [7]:
variant_filter_stats.groupby(['genome','gatk_filter','GRCh38_criteria_fail'])['len'].sum().reset_index()

  variant_filter_stats.groupby(['genome','gatk_filter','GRCh38_criteria_fail'])['len'].sum().reset_index()


Unnamed: 0,genome,gatk_filter,GRCh38_criteria_fail,len
0,GRCh38,False,False,61243112
1,GRCh38,False,True,42784742
2,GRCh38,True,False,13365695
3,GRCh38,True,True,25729879
4,CHM13v2.0,False,False,60934254
5,CHM13v2.0,False,True,42867140
6,CHM13v2.0,True,False,14494016
7,CHM13v2.0,True,True,16177227


In [8]:
variant_filter_stats.loc[(~variant_filter_stats.CHM13_criteria_fail)&(variant_filter_stats.GRCh38_criteria_fail)].groupby(['GRCh38_criteria_fail','gatk_filter','genome','singleton'])['len'].sum().reset_index()    

  variant_filter_stats.loc[(~variant_filter_stats.CHM13_criteria_fail)&(variant_filter_stats.GRCh38_criteria_fail)].groupby(['GRCh38_criteria_fail','gatk_filter','genome','singleton'])['len'].sum().reset_index()


Unnamed: 0,GRCh38_criteria_fail,gatk_filter,genome,singleton,len
0,True,False,GRCh38,True,39599697
1,True,False,CHM13v2.0,True,40226623


Table 1 and supplemental table XX

In [9]:
best_phased_variants = MAF_performance_variant_df.filter((pl.col('ground_truth_data_source')!='trios')&(pl.col('method_of_phasing')=='phased_with_parents_and_pedigree'))
catVarType = pl.Enum(['SNP','Indel','SNPs + Indels'])
best_phased_variants = best_phased_variants.with_columns(type=pl.col('type').cast(catVarType))
best_phased_variants = pl.concat([best_phased_variants, best_phased_variants.with_columns(type=pl.lit('SNPs + Indels').cast(catVarType))])
best_phased_variants_by_syntenic = best_phased_variants.group_by(['type','Syntenic','method_of_phasing','ground_truth_data_source','genome']
                                                                 ).sum(
                                                                 ).with_columns(gt_error_rate = pl.col('n_gt_errors')/pl.col('n_gt_checked')*100,
                                                                                switch_error_rate = pl.col('n_switch_errors')/pl.col('n_checked')*100
                                                                 ).to_pandas()
best_phased_variants_overall = best_phased_variants.group_by(['type','method_of_phasing','ground_truth_data_source','genome']
                                                                 ).sum(
                                                                 ).with_columns(gt_error_rate = pl.col('n_gt_errors')/pl.col('n_gt_checked')*100,
                                                                                switch_error_rate = pl.col('n_switch_errors')/pl.col('n_checked')*100
                                                                 ).to_pandas()


In [10]:
best_phased_variants_overall.sort_values(['ground_truth_data_source','genome','type'])

Unnamed: 0,type,method_of_phasing,ground_truth_data_source,genome,variant_id,n_switch_errors,n_checked,n_gt_errors,n_gt_checked,MAC,AN,Syntenic,multiallelic,in_platinum_STRs,in_segdups,in_STRs,MAF,rounded_MAF,gt_error_rate,switch_error_rate
19,SNP,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,3664,368771,55269,2690863,41946142,444066168,69221,5601,7803,22995,3083,654836.5,,2.053951,0.993571
16,Indel,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,1466,70966,33755,531857,9134280,87709184,13683,9941,9679,3451,2011,142638.359375,,6.346631,2.065778
8,SNPs + Indels,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,5130,439737,89024,3222720,51080422,531775352,82904,15542,17482,26446,5094,797406.0625,,2.762387,1.166606
38,SNP,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,2170,354018,12172,2499336,39847161,412936324,64480,4688,6145,7788,2267,622073.75,,0.487009,0.612963
33,Indel,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,1193,79594,23808,563401,9924572,93120564,14534,10155,9844,2301,2584,154979.96875,,4.225765,1.498857
9,SNPs + Indels,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,3363,433612,35980,3062737,49771733,506056888,79014,14843,15989,10089,4851,776970.9375,,1.174766,0.775578
27,SNP,phased_with_parents_and_pedigree,HGSVC_samples,GRCh38,,6682,532570,75542,4308072,38507201,445878500,69489,5524,8023,22003,3179,601137.375,,1.753499,1.254671
13,Indel,phased_with_parents_and_pedigree,HGSVC_samples,GRCh38,,2465,104443,51501,855247,8512412,88432836,13792,10062,9783,3216,2018,132927.28125,,6.021769,2.360139
31,SNPs + Indels,phased_with_parents_and_pedigree,HGSVC_samples,GRCh38,,9147,637013,127043,5163319,47019613,534311336,83281,15586,17806,25219,5197,733991.125,,2.460491,1.43592
0,SNP,phased_with_parents_and_pedigree,HGSVC_samples,CHM13v2.0,,4688,529875,46334,4278024,37923130,442337088,69069,5047,6718,8337,2512,592002.5625,,1.08307,0.884737


In [11]:
best_phased_variants_by_syntenic.loc[best_phased_variants_by_syntenic.ground_truth_data_source=='HPRC_samples'].sort_values(['ground_truth_data_source','Syntenic','genome','type'])

Unnamed: 0,type,Syntenic,method_of_phasing,ground_truth_data_source,genome,variant_id,n_switch_errors,n_checked,n_gt_errors,n_gt_checked,MAC,AN,multiallelic,in_platinum_STRs,in_segdups,in_STRs,MAF,rounded_MAF,gt_error_rate,switch_error_rate
4,SNP,False,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,0,882,145,4716,97434,774884,9,7,2,3,1521.455322,,3.07464,0.0
30,Indel,False,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,0,47,22,507,7212,83252,8,7,2,1,112.617111,,4.33925,0.0
28,SNPs + Indels,False,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,0,929,167,5223,104646,858136,17,14,4,4,1634.072754,,3.197396,0.0
53,SNP,False,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,0,0,3,39,109,6404,1,0,1,0,1.702061,,7.692308,
73,Indel,False,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,3,22,60,273,6347,44828,5,5,7,3,99.109924,,21.978022,13.636364
82,SNPs + Indels,False,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,3,22,63,312,6456,51232,6,5,8,3,100.811989,,20.192308,13.636364
79,SNP,True,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,3664,367889,55124,2686147,41848708,443291284,5592,7796,22993,3080,653316.625,,2.052159,0.995953
26,Indel,True,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,1466,70919,33733,531350,9127068,87625932,9933,9672,3449,2010,142525.71875,,6.348546,2.067147
75,SNPs + Indels,True,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,5130,438808,88857,3217497,50975776,530917216,15525,17468,26442,5090,795773.625,,2.761681,1.169076
59,SNP,True,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,2170,354018,12169,2499297,39847052,412929920,4687,6145,7787,2267,622072.0625,,0.486897,0.612963


### Percent concordance in different panels

In [12]:
overall_subsetted = MAF_performance_variant_df.filter((pl.col('ground_truth_data_source')!='trios')&(pl.col('method_of_phasing')=='phased_with_parents_and_pedigree')).group_by(['type','Syntenic','method_of_phasing','ground_truth_data_source','genome']).sum().with_columns(gt_error_rate = pl.col('n_gt_errors')/pl.col('n_gt_checked'))
overall           = MAF_performance_variant_df.filter((pl.col('ground_truth_data_source')!='trios')&(pl.col('method_of_phasing')=='phased_with_parents_and_pedigree')).group_by(['method_of_phasing','ground_truth_data_source','genome']).sum().with_columns(gt_error_rate = pl.col('n_gt_errors')/pl.col('n_gt_checked'))
overall_syntenic  = MAF_performance_variant_df.filter((pl.col('ground_truth_data_source')!='trios')&(pl.col('Syntenic'))&(pl.col('method_of_phasing')=='phased_with_parents_and_pedigree')).group_by(['method_of_phasing','ground_truth_data_source','genome']).sum().with_columns(gt_error_rate = pl.col('n_gt_errors')/pl.col('n_gt_checked'))

In [13]:
overall_syntenic = overall_syntenic.to_pandas()[['Syntenic','genome','ground_truth_data_source','gt_error_rate']].sort_values(['genome','Syntenic','gt_error_rate'])
overall_syntenic['Non-reference true positive rate'] = (1-overall_syntenic.gt_error_rate) * 100
overall_syntenic['gt_error_rate'] *= 100
overall_syntenic.loc[overall_syntenic.ground_truth_data_source.isin(('HPRC_samples', 'HGSVC_samples','HPRC_HGSVC_all_samples'))]

Unnamed: 0,Syntenic,genome,ground_truth_data_source,gt_error_rate,Non-reference true positive rate
4,82904,GRCh38,HPRC_samples,2.761681,97.238319
5,83281,GRCh38,HGSVC_samples,2.463194,97.536806
1,98672,GRCh38,HPRC_HGSVC_all_samples,3.522661,96.477339
2,79014,CHM13v2.0,HPRC_samples,1.172829,98.827171
13,84344,CHM13v2.0,HGSVC_samples,1.806058,98.193942
3,99265,CHM13v2.0,HPRC_HGSVC_all_samples,2.740583,97.259417


In [14]:
overall = overall.to_pandas()[['Syntenic','genome','ground_truth_data_source','gt_error_rate']].sort_values(['genome','Syntenic','gt_error_rate'])
overall['Non-reference true positive rate'] = (1-overall.gt_error_rate) * 100
overall['gt_error_rate'] *= 100
overall.loc[overall.ground_truth_data_source.isin(('HPRC_samples', 'HGSVC_samples','HPRC_HGSVC_all_samples'))]

Unnamed: 0,Syntenic,genome,ground_truth_data_source,gt_error_rate,Non-reference true positive rate
2,82904,GRCh38,HPRC_samples,2.762387,97.237613
6,83281,GRCh38,HGSVC_samples,2.460491,97.539509
1,98672,GRCh38,HPRC_HGSVC_all_samples,3.525144,96.474856
4,79014,CHM13v2.0,HPRC_samples,1.174766,98.825234
7,84344,CHM13v2.0,HGSVC_samples,1.811786,98.188214
8,99265,CHM13v2.0,HPRC_HGSVC_all_samples,2.746979,97.253021


In [15]:
overall_subsetted = overall_subsetted.to_pandas()[['type','Syntenic','genome','ground_truth_data_source','gt_error_rate']].sort_values(['genome','type','Syntenic','gt_error_rate'])
overall_subsetted['Non-reference true positive rate'] = (1-overall_subsetted.gt_error_rate) * 100
overall_subsetted['gt_error_rate'] *= 100
overall_subsetted.loc[overall_subsetted.ground_truth_data_source.isin(('HPRC_samples', 'HGSVC_samples','HPRC_HGSVC_all_samples'))]

Unnamed: 0,type,Syntenic,genome,ground_truth_data_source,gt_error_rate,Non-reference true positive rate
17,SNP,False,GRCh38,HGSVC_samples,0.771056,99.228944
14,SNP,False,GRCh38,HPRC_samples,3.07464,96.92536
54,SNP,False,GRCh38,HPRC_HGSVC_all_samples,5.0,95.0
55,SNP,True,GRCh38,HGSVC_samples,1.755425,98.244575
24,SNP,True,GRCh38,HPRC_samples,2.052159,97.947841
33,SNP,True,GRCh38,HPRC_HGSVC_all_samples,2.875292,97.124708
7,Indel,False,GRCh38,HGSVC_samples,2.754036,97.245964
1,Indel,False,GRCh38,HPRC_HGSVC_all_samples,4.05,95.95
39,Indel,False,GRCh38,HPRC_samples,4.33925,95.66075
46,Indel,True,GRCh38,HGSVC_samples,6.025797,93.974203


In [16]:
# wide_genome_ideogram['SER_diff'] = wide_genome_ideogram['SER_t2t'] - wide_genome_ideogram['SER_grch38']
# region_table = (wide_genome_ideogram.loc[(wide_genome_ideogram.ground_truth_data_source=='HPRC_samples') & 
#                          (wide_genome_ideogram.method_of_phasing=='HPRC_variation_phased_with_reference_panel') #& 
#                         & (wide_genome_ideogram.contextual_MAC_grch38 > 1e6)
#                         & (wide_genome_ideogram.contextual_MAC_t2t > 1e6)
#                         ])
# decipher_regions = ['1p36.33','1p36.32','1p36.31','1p36.23','1p36.22','1p36.21','1q21.1','1q21.2','2p21','2p16.1','2p15','2q32.3','2q33.1','2q33.2','2q33.3','2q37.3','3q29','4p16.3','5p15.33','5p15.32','5p15.31','5p15.2','5q22.3','5q23.2','5q35.2','5q35.3','7q11.22','7q21.3','8p23.1','8q21.13','9q34.3','11p13','11p11.2','12q14.2','12q14.3','12q15','15q11.2','15q12','15q13.1','15q13.2','15q13.3','15q24.1','15q24.2','15q26.3','16p13.3','16p13.11','16p12.2','16p12.1','16p11.2','17p13.3','17p12','17p11.2','17q11.2','17q12','17q21.31','21q21.3','22p13','22p12','22p11.2','22p11.1','22q11.1','22q11.21','22q11.22','22q11.23','22q13.33','Xp22.33','Xp22.31','Xp11.23','Xp11.22','Xq22.2','Xq28']
# region_table.interval_names = region_table.chrom.str.replace('chr','') + region_table.interval_names
# region_table['decipher_region'] = region_table.interval_names.isin(decipher_regions)

# region_table = (region_table.sort_values('SER_diff').iloc[:].reset_index(drop=True)
#                         [['chrom','interval_names','decipher_region','n_switch_errors_grch38','n_checked_grch38','n_switch_errors_t2t','n_checked_t2t','SER_grch38','SER_t2t','SER_diff']])

# region_table = region_table.drop(columns='chrom').rename(columns={'interval_names':'Genetic Region',
#                                                    'n_switch_errors_grch38': 'Switch Errors (GRCh38 panel)',
#                                                    'n_switch_errors_t2t': 'Switch Errors (CHM13 panel)',
#                                                    'n_checked_grch38': 'Heterozygous sites (GRCh38 panel)',
#                                                    'n_checked_t2t': 'Heterozygous sites (CHM13 panel)',
#                                                    'SER_grch38': 'SER % (GRCh38 panel)',
#                                                    'SER_t2t': 'SER % (CHM13 panel)',
#                                                    'SER_diff': 'Absolute reduction in SER (% points)'})

# pd.set_option('display.float_format', lambda x: f'{x:,.2f}%')
# region_table.set_index('Genetic Region')

In [17]:
rephase=all_variants.filter((pl.col('method_of_phasing') == '1kgp_variation_phased_with_reference_panel') & 
                         (pl.col('ground_truth_data_source') == 'HPRC_samples'))

singletons_rephase_sum = rephase.filter(pl.col('contextual_MAC') == 1).group_by('genome').sum().to_pandas().reset_index()
singletons_rephase_sum['SER'] = singletons_rephase_sum.n_switch_errors/singletons_rephase_sum.n_checked*100
for idx, (genome, ser) in singletons_rephase_sum[['genome','SER']].iterrows():
    print(f'Singletons in {genome}: {ser}')

rare_rephase = rephase.filter(pl.col('contextual_MAC')/pl.col('contextual_AN') < 0.01)
common_rephase = rephase.filter(pl.col('contextual_MAC')/pl.col('contextual_AN') >= 0.01)

rare_rephase_sum = rare_rephase.group_by('genome').sum().to_pandas().reset_index()
common_rephase_sum = common_rephase.group_by('genome').sum().to_pandas().reset_index()
for idx, (genome, n_switch_errors, n_checked) in rare_rephase_sum[['genome','n_switch_errors', 'n_checked']].iterrows():
    print (f"\nRare in {genome}: {n_switch_errors/n_checked*100}")
    print (f"Common in {genome}: {common_rephase_sum.iloc[idx].n_switch_errors/common_rephase_sum.iloc[idx].n_checked*100}")
syntenic = common_rephase.filter(pl.col('Syntenic') == True)
nonsyntenic = common_rephase.filter(pl.col('Syntenic') == False)

syntenic_sum = syntenic.group_by('genome').sum().to_pandas().set_index('genome')
nonsyntenic_sum = nonsyntenic.group_by('genome').sum().to_pandas().set_index('genome')

for genome, row in syntenic_sum.iterrows():
    print (f"\nCommon Syntenic in {genome}: {row.n_switch_errors/row.n_checked*100}")
    print (f"Common Nonsyntenic {genome}: {nonsyntenic_sum.loc[genome].n_switch_errors/nonsyntenic_sum.loc[genome].n_checked*100}")

Singletons in GRCh38: 14.39873417721519
Singletons in CHM13v2.0: 9.825145711906744

Rare in CHM13v2.0: 6.018179918503814
Common in CHM13v2.0: 1.1376264478260545

Rare in GRCh38: 7.419862621637092
Common in GRCh38: 1.6314417692446646

Common Syntenic in GRCh38: 1.635230560250521
Common Nonsyntenic GRCh38: 0.798175598631699

Common Syntenic in CHM13v2.0: 1.1374146006742445
Common Nonsyntenic CHM13v2.0: 7.6923076923076925


In [18]:
synEnum = pl.Enum(categories=['All_regions', 'Syntenic', 'Nonsyntenic'])
concordance = all_variants.filter((pl.col('method_of_phasing') == 'phased_with_parents_and_pedigree') &
                                  (pl.col('ground_truth_data_source') == 'HPRC_samples')).with_columns(Syntenic=pl.col('Syntenic').cast(str).replace('true','Syntenic').replace('false','Nonsyntenic').cast(synEnum))

In [19]:
concordance_summary = concordance.clone().with_columns(Syntenic = pl.lit('All_regions').cast(synEnum))
concordance_summary = pl.concat((concordance, concordance_summary)).group_by(['Syntenic','genome','type']).sum().to_pandas().reset_index()
concordance_summary = pd.concat((concordance_summary, concordance_summary.copy().assign(type = 'SNPs + Indels').groupby(['Syntenic','genome','type'], observed=True).sum(numeric_only=True).reset_index()))
concordance_summary['gt_error_rate'] = concordance_summary.n_gt_errors/concordance_summary.n_gt_checked*100
concordance_summary['gt_accuracy_rate'] = 100-concordance_summary['gt_error_rate'] 
concordance_summary['switch_error_rate'] = concordance_summary.n_switch_errors/concordance_summary.n_checked*100
concordance_summary['Syntenic'] = concordance_summary.Syntenic.replace('false', 'Nonsyntenic').replace('true','Syntenic')

In [20]:
pd.set_option("display.precision", 3)
concordance_summary.sort_values(['Syntenic','genome','type']).set_index(['Syntenic','genome','type'])[['switch_error_rate','gt_error_rate','n_gt_errors','n_gt_checked', 'n_switch_errors', 'n_checked']].astype(float)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,switch_error_rate,gt_error_rate,n_gt_errors,n_gt_checked,n_switch_errors,n_checked
Syntenic,genome,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
All_regions,GRCh38,Indel,2.066,6.347,33755.0,531900.0,1466.0,70966.0
All_regions,GRCh38,SNP,0.994,2.054,55269.0,2691000.0,3664.0,368771.0
All_regions,GRCh38,SNPs + Indels,1.167,2.762,89024.0,3223000.0,5130.0,439737.0
All_regions,CHM13v2.0,Indel,1.499,4.226,23808.0,563400.0,1193.0,79594.0
All_regions,CHM13v2.0,SNP,0.613,0.487,12172.0,2499000.0,2170.0,354018.0
All_regions,CHM13v2.0,SNPs + Indels,0.776,1.175,35980.0,3063000.0,3363.0,433612.0
Syntenic,GRCh38,Indel,2.067,6.349,33733.0,531400.0,1466.0,70919.0
Syntenic,GRCh38,SNP,0.996,2.052,55124.0,2686000.0,3664.0,367889.0
Syntenic,GRCh38,SNPs + Indels,1.169,2.762,88857.0,3217000.0,5130.0,438808.0
Syntenic,CHM13v2.0,Indel,1.496,4.217,23748.0,563100.0,1190.0,79572.0
