In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import polars as pl
import seaborn as sns
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import importlib


# Relative paths to the pedigree file and to the folder of precomputed summary statistics.
pedigree_file_loc = 'resources/pedigrees/gatk_1kgp.ped'
summary_statistics_folder='intermediate_data_fullgenome'

# os.getcwd() returns an absolute path, so comparing the whole path against the bare
# directory name was always unequal and the cell climbed one directory on every run.
# Compare only the last path component so that re-running the cell is a no-op once the
# working directory is already the project directory.
if os.path.basename(os.getcwd()) != 'phasing_T2T_project':
    os.chdir('..') # Change to the parent directory

In [2]:
# List every entry directly inside the summary-statistics folder.
# `glob` is already imported in the notebook's import cell, so it is not re-imported here.
glob.glob(f"{summary_statistics_folder}/*")

['intermediate_data_fullgenome/rolling_stats_500k_window.parquet',
 'intermediate_data_fullgenome/flips_and_switches.parquet',
 'intermediate_data_fullgenome/samples.parquet',
 'intermediate_data_fullgenome/compressed_ideogram_100000_window.parquet',
 'intermediate_data_fullgenome/variants.parquet',
 'intermediate_data_fullgenome/compressed_ideogram_1000000_window.parquet',
 'intermediate_data_fullgenome/per_genome_cnv_regions_with_slop.parquet',
 'intermediate_data_fullgenome/chroms.parquet',
 'intermediate_data_fullgenome/variant_frequency_stats',
 'intermediate_data_fullgenome/switch_errors.parquet',
 'intermediate_data_fullgenome/MAF_performance_variants.parquet',
 'intermediate_data_fullgenome/sample_genotype_concordance.parquet',
 'intermediate_data_fullgenome/per_sample_imputation_performance.parquet',
 'intermediate_data_fullgenome/rolling_stats_250k_window.parquet',
 'intermediate_data_fullgenome/bcftools_query_variant_data.parquet',
 'intermediate_data_fullgenome/per_genome_c

In [3]:
# Load the per-MAF-bin table and keep only the singleton bin for the
# parents-and-pedigree phasing method, across all regions, SNPs and indels combined.
# NOTE(review): this reads from 'intermediate_data/' rather than
# `summary_statistics_folder` -- confirm that the different folder is intentional.
q = (
    pd.read_parquet('intermediate_data/per_MAF_bin.parquet')
    .loc[
        lambda df: (df['rounded_MAF'] == 'singleton')
        & (df['method_of_phasing'] == 'phased_with_parents_and_pedigree')
        & (df['syntenic'] == 'All')
        & (df['type'] == 'SNPs + Indels')
    ]
)
q

Unnamed: 0,type,rounded_MAF,genome,method_of_phasing,ground_truth_data_source,n_switch_errors,n_checked,n_gt_errors,n_gt_checked,MAC,AN,switch_error_rate,gt_error_rate,MAF,syntenic
1105,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HPRC_HGSVC_probands,4.0,109.0,3581,52836,777,4975908,3.669725,6.777576,0.015615,All
964,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_samples,389.0,1505.0,3071,142515,2312,14806048,25.847176,2.154861,0.015615,All
1058,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HPRC_samples,,,1046,32658,851,5449804,,3.202891,0.015615,All
914,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_probands,4.0,109.0,1607,20646,666,4265064,3.669725,7.78359,0.015615,All
836,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_parents,20.0,274.0,1176,10192,784,5020736,7.29927,11.538462,0.015615,All
983,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HPRC_HGSVC_all_samples,393.0,1506.0,9450,267100,2671,17105084,26.095618,3.538001,0.015615,All
931,SNPs + Indels,singleton,CHM13v2.0,phased_with_parents_and_pedigree,HGSVC_samples_nontrios_only,369.0,1123.0,2261,31863,1677,10739508,32.858415,7.096005,0.015615,All


In [4]:
def _summary_path(table_name):
    """Return the path of `<table_name>.parquet` inside the summary-statistics folder."""
    return f'{summary_statistics_folder}/{table_name}.parquet'


# Tables loaded as polars frames.
MAF_performance_variant_df = pl.read_parquet(_summary_path('MAF_performance_variants'))
all_variants = pl.read_parquet(_summary_path('variants'))

# Tables loaded as pandas frames.
all_chroms = pd.read_parquet(_summary_path('chroms'))
all_samples = pd.read_parquet(_summary_path('samples'))
all_ancestries = pd.read_parquet(_summary_path('ancestries'))
all_methods = pd.read_parquet(_summary_path('methods'))

per_variant_category_imputation_performance = pd.read_parquet(_summary_path('per_variant_category_imputation_performance'))
per_sample_imputation_performance = pd.read_parquet(_summary_path('per_sample_imputation_performance'))

### Variant Filtering Stats 
These statistics are quoted in the manuscript paragraph that follows "Full length, telomere to telomere maps of human genetic recombination rates are largely consistent with deCODE recombination rates".

In [5]:
# One row per combination of filter flags; `len` is summed below and reported as a
# number of variants (presumably the variant count of each combination -- confirm).
variant_filter_stats = pd.read_parquet(f"{summary_statistics_folder}/filter_summary_stats.parquet")
# The GATK filter currently equals the VQSLOD filter alone; neg_train_site_filter is not OR-ed in.
variant_filter_stats['gatk_filter'] = variant_filter_stats['VQSLOD_filter']

chm13_variant_stats = variant_filter_stats.loc[variant_filter_stats['genome'] == 'CHM13v2.0']
singletons = chm13_variant_stats.loc[chm13_variant_stats['singleton']]
not_singletons = chm13_variant_stats.loc[~chm13_variant_stats['singleton']]

# CHM13 rows that pass the GRCh38 criteria but are removed by the GATK filter.
lost_to_gatk = ~chm13_variant_stats['GRCh38_criteria_fail'] & chm13_variant_stats['gatk_filter']
num_affected_gatk = chm13_variant_stats.loc[lost_to_gatk, 'len'].sum()

# CHM13 singleton rows that fail the GRCh38 criteria yet neither fail the CHM13 criteria
# nor are CHM13-filtered. The mask is built from `singletons` itself: the previous version
# combined columns of `chm13_variant_stats` and `variant_filter_stats`, which only worked
# through implicit index alignment between differently indexed frames.
gained_rows = singletons['GRCh38_criteria_fail'] & ~(
    singletons['CHM13_criteria_fail'] | singletons['CHM13_filtered']
)
singletons_gained = singletons.loc[gained_rows, 'len'].sum()

print(
    "Our VQSLOD filter resulted in the loss of an additional \n"
    f"{num_affected_gatk} "
    "variants compared to the NYGC’s GRCh38 1KGP variant filtering methods, "
    "but the inclusion of singleton alleles added \n"
    f"{singletons_gained} "
    "unique variants to our dataset (Supplemental Figure 1)."
)
singletons

Our VQSLOD filter resulted in the loss of an additional 
14494016 variants compared to the NYGC’s GRCh38 1KGP variant filtering methods, but the inclusion of singleton alleles added 
26014731 unique variants to our dataset (Supplemental Figure 1).


Unnamed: 0,genome,Syntenic,singleton,VQSLOD_filter,MERR_filter,HWE_pop_filter,MAC_filter,AC_filter,f_missing_filter,var_len_filter,alt_star_filter,pass_filter,CHM13_filtered,GRCh38_filtered,GRCh38_criteria_fail,CHM13_criteria_fail,len,gatk_filter
4,CHM13v2.0,True,True,False,False,False,False,False,True,False,False,False,True,True,True,True,3,False
24,CHM13v2.0,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,25,False
33,CHM13v2.0,False,True,False,False,False,False,True,False,True,False,False,True,True,True,True,1097,False
38,CHM13v2.0,False,True,True,False,False,False,True,True,True,True,False,True,True,True,True,2418,True
40,CHM13v2.0,False,True,True,False,False,False,True,False,True,True,False,True,True,True,True,2747,True
44,CHM13v2.0,True,True,False,False,False,False,True,True,False,False,False,True,False,True,True,25,False
46,CHM13v2.0,False,True,False,False,False,False,True,True,False,True,False,True,True,True,True,1245,False
68,CHM13v2.0,False,True,False,False,False,False,True,True,True,False,False,True,True,True,True,354,False
69,CHM13v2.0,False,True,True,False,False,False,False,False,False,False,False,True,True,False,True,11,True
82,CHM13v2.0,True,True,False,False,False,False,True,False,False,False,False,True,True,True,False,14164284,False


In [6]:
# Sum of `len` over the syntenic, non-singleton CHM13 rows, broken down by filter outcome.
(
    not_singletons
    .loc[lambda df: df['Syntenic']]
    .groupby(['CHM13_filtered', 'CHM13_criteria_fail', 'gatk_filter'])['len']
    .sum()
)

CHM13_filtered  CHM13_criteria_fail  gatk_filter
False           False                False          48948192
True            False                False          11777087
                True                 False           2032062
                                     True           17508360
Name: len, dtype: uint64

In [7]:
# Sum of `len` per genome, split by GATK-filter and GRCh38-criteria outcome.
# The saved output of this cell carries a warning -- presumably the pandas FutureWarning
# about the changing default of `observed` for categorical groupers. Spelling out
# `observed=False` keeps the current result and makes the choice explicit.
(
    variant_filter_stats
    .groupby(['genome', 'gatk_filter', 'GRCh38_criteria_fail'], observed=False)['len']
    .sum()
    .reset_index()
)

  variant_filter_stats.groupby(['genome','gatk_filter','GRCh38_criteria_fail'])['len'].sum().reset_index()


Unnamed: 0,genome,gatk_filter,GRCh38_criteria_fail,len
0,GRCh38,False,False,61243112
1,GRCh38,False,True,42784742
2,GRCh38,True,False,13365695
3,GRCh38,True,True,25729879
4,CHM13v2.0,False,False,60934254
5,CHM13v2.0,False,True,42867140
6,CHM13v2.0,True,False,14494016
7,CHM13v2.0,True,True,16177227


In [8]:
# Rows that pass the CHM13 criteria but fail the GRCh38 criteria, summed per
# GATK-filter outcome, genome and singleton status.
passes_chm13_fails_grch38 = (
    ~variant_filter_stats['CHM13_criteria_fail'] & variant_filter_stats['GRCh38_criteria_fail']
)
# `observed=False` is passed explicitly: the saved output of this cell carries a warning,
# presumably the pandas FutureWarning about the changing `observed` default for
# categorical groupers, and this keeps the current result.
(
    variant_filter_stats
    .loc[passes_chm13_fails_grch38]
    .groupby(['GRCh38_criteria_fail', 'gatk_filter', 'genome', 'singleton'], observed=False)['len']
    .sum()
    .reset_index()
)

  variant_filter_stats.loc[(~variant_filter_stats.CHM13_criteria_fail)&(variant_filter_stats.GRCh38_criteria_fail)].groupby(['GRCh38_criteria_fail','gatk_filter','genome','singleton'])['len'].sum().reset_index()


Unnamed: 0,GRCh38_criteria_fail,gatk_filter,genome,singleton,len
0,True,False,GRCh38,True,39599697
1,True,False,CHM13v2.0,True,40226623


Statistics for Table 1 and Supplemental Table XX (TODO: fill in the supplemental table number).

In [9]:
# Variant-type labels, including the pooled 'SNPs + Indels' class added below.
catVarType = pl.Enum(['SNP','Indel','SNPs + Indels'])

# Keep the parents-and-pedigree phasing run and drop the 'trios' ground-truth source.
best_phased_variants = MAF_performance_variant_df.filter(
    (pl.col('ground_truth_data_source') != 'trios')
    & (pl.col('method_of_phasing') == 'phased_with_parents_and_pedigree')
).with_columns(type=pl.col('type').cast(catVarType))

# Append a copy of every row relabelled 'SNPs + Indels' so the pooled class is
# aggregated alongside the individual variant types.
best_phased_variants = pl.concat([
    best_phased_variants,
    best_phased_variants.with_columns(type=pl.lit('SNPs + Indels').cast(catVarType)),
])

# Error rates in percent, computed from the summed counts of each group.
error_rate_columns = {
    'gt_error_rate': pl.col('n_gt_errors') / pl.col('n_gt_checked') * 100,
    'switch_error_rate': pl.col('n_switch_errors') / pl.col('n_checked') * 100,
}

best_phased_variants_by_syntenic = (
    best_phased_variants
    .group_by(['type','Syntenic','method_of_phasing','ground_truth_data_source','genome'])
    .sum()
    .with_columns(**error_rate_columns)
    .to_pandas()
)
best_phased_variants_overall = (
    best_phased_variants
    .group_by(['type','method_of_phasing','ground_truth_data_source','genome'])
    .sum()
    .with_columns(**error_rate_columns)
    .to_pandas()
)


In [10]:
# Overall table ordered by ground-truth source, then genome, then variant type.
best_phased_variants_overall.sort_values(
    by=['ground_truth_data_source', 'genome', 'type']
)

Unnamed: 0,type,method_of_phasing,ground_truth_data_source,genome,variant_id,n_switch_errors,n_checked,n_gt_errors,n_gt_checked,MAC,AN,Syntenic,multiallelic,in_STRs,in_segdups,in_platinum_STRs,MAF,rounded_MAF,gt_error_rate,switch_error_rate
19,SNP,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,203833,96830151,3025975,711358610,10944096358,116867178764,18221880,1265576,766061,729897,1743056,163978912.0,,0.42538,0.210506
3,Indel,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,270831,19555688,6481975,145991067,2456453580,23980917912,3740768,2691149,755934,125781,2640091,38157448.0,,4.439981,1.384922
22,SNPs + Indels,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,474664,116385839,9507950,857349677,13400549938,140848096676,21962648,3956725,1521995,855678,4383147,199150912.0,,1.108993,0.407837
27,SNP,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,125429,83970616,1581451,615986668,9469281970,101191249828,15792611,987434,517335,278683,1313540,142845632.0,,0.256735,0.149372
36,Indel,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,236515,18027890,5285347,131789145,2235467907,21649362400,3376142,2364140,779837,113103,2303815,34772896.0,,4.010457,1.311939
28,SNPs + Indels,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,361944,101998506,6866798,747775813,11704749877,122840612228,19168753,3351574,1297172,391786,3617355,174973280.0,,0.918296,0.354852
41,SNP,phased_with_parents_and_pedigree,HGSVC_samples,GRCh38,,754134,151010447,8002821,1262828279,10955497233,128888389728,20094909,1367442,838425,803320,1895313,163417504.0,,0.633722,0.499392
18,Indel,phased_with_parents_and_pedigree,HGSVC_samples,GRCh38,,645574,29972914,13099186,257525124,2469918872,26283412072,4099769,2940460,813114,138324,2887930,38311516.0,,5.086566,2.153858
9,SNPs + Indels,phased_with_parents_and_pedigree,HGSVC_samples,GRCh38,,1399708,180983361,21102007,1520353403,13425416105,155171801800,24194678,4307902,1651539,941644,4783243,198561632.0,,1.387967,0.77339
4,SNP,phased_with_parents_and_pedigree,HGSVC_samples,CHM13v2.0,,485678,131554603,5512226,1118869739,9491939443,114194366900,17820975,1097102,587425,302935,1471419,142577184.0,,0.49266,0.369184


In [11]:
# By-synteny table restricted to the HPRC_samples ground-truth source.
(
    best_phased_variants_by_syntenic
    .loc[lambda df: df['ground_truth_data_source'] == 'HPRC_samples']
    .sort_values(by=['ground_truth_data_source', 'Syntenic', 'genome', 'type'])
)

Unnamed: 0,type,Syntenic,method_of_phasing,ground_truth_data_source,genome,variant_id,n_switch_errors,n_checked,n_gt_errors,n_gt_checked,MAC,AN,multiallelic,in_STRs,in_segdups,in_platinum_STRs,MAF,rounded_MAF,gt_error_rate,switch_error_rate
3,SNP,False,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,450,122930,25015,1054999,14013151,174259244,3736,16442,983,2220,218830.1,,2.371092,0.366062
58,Indel,False,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,174,15490,10527,152017,2255971,25039640,2623,1673,191,1852,35227.83,,6.924883,1.123305
42,SNPs + Indels,False,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,624,138420,35542,1207016,16269122,199298884,6359,18115,1174,4072,254063.6,,2.944617,0.450802
41,SNP,False,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,133,26737,15469,336505,3625205,55368984,1475,3542,2255,849,56609.54,,4.59696,0.497438
80,Indel,False,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,410,15019,16586,173023,2662187,28549032,3520,3259,1868,1150,41571.11,,9.586009,2.729875
76,SNPs + Indels,False,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,543,41756,32055,509528,6287392,83918016,4995,6801,4123,1999,98182.16,,6.291116,1.300412
43,SNP,True,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,203383,96707221,3000960,710303611,10930083207,116692919520,1261840,749619,728914,1740836,163788600.0,,0.42249,0.210308
68,Indel,True,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,270657,19540198,6471448,145839050,2454197609,23955878272,2688526,754261,125590,2638239,38122740.0,,4.43739,1.385129
33,SNPs + Indels,True,phased_with_parents_and_pedigree,HPRC_samples,GRCh38,,474040,116247419,9472408,856142661,13384280816,140648797792,3950366,1503880,854504,4379075,198922800.0,,1.106405,0.407785
63,SNP,True,phased_with_parents_and_pedigree,HPRC_samples,CHM13v2.0,,125296,83943879,1565982,615650163,9465656765,101135880844,985959,513793,276428,1312691,142793300.0,,0.254362,0.149262


### Percent concordance in different panels

In [12]:
# Rows of the parents-and-pedigree phasing run, excluding the 'trios' ground-truth source.
best_phasing_rows = (
    (pl.col('ground_truth_data_source') != 'trios')
    & (pl.col('method_of_phasing') == 'phased_with_parents_and_pedigree')
)
# Genotype error rate as a fraction (not yet multiplied by 100).
gt_error_fraction = pl.col('n_gt_errors') / pl.col('n_gt_checked')

# Split by variant type and synteny.
overall_subsetted = (
    MAF_performance_variant_df
    .filter(best_phasing_rows)
    .group_by(['type','Syntenic','method_of_phasing','ground_truth_data_source','genome'])
    .sum()
    .with_columns(gt_error_rate=gt_error_fraction)
)
# All variants pooled. `Syntenic` is not a grouping key here, so .sum() turns it into a sum.
overall = (
    MAF_performance_variant_df
    .filter(best_phasing_rows)
    .group_by(['method_of_phasing','ground_truth_data_source','genome'])
    .sum()
    .with_columns(gt_error_rate=gt_error_fraction)
)
# Syntenic variants only, pooled over variant types.
overall_syntenic = (
    MAF_performance_variant_df
    .filter(best_phasing_rows & pl.col('Syntenic'))
    .group_by(['method_of_phasing','ground_truth_data_source','genome'])
    .sum()
    .with_columns(gt_error_rate=gt_error_fraction)
)

In [13]:
# Genotype error rate and its complement, both in percent, for syntenic variants.
# NOTE: this rebinds `overall_syntenic` from a polars frame to a pandas frame, so the cell
# cannot be re-run without first re-running the cell that builds the polars frame.
# NOTE: `Syntenic` is the summed boolean column left by the upstream group_by().sum(),
# not a per-row flag.
overall_syntenic = (
    overall_syntenic.to_pandas()[['Syntenic','genome','ground_truth_data_source','gt_error_rate']]
    .sort_values(['genome','Syntenic','gt_error_rate'])
    .assign(**{
        # Computed first, while gt_error_rate is still a fraction.
        'Non-reference true positive rate': lambda df: (1 - df['gt_error_rate']) * 100,
        'gt_error_rate': lambda df: df['gt_error_rate'] * 100,
    })
)
overall_syntenic.loc[
    overall_syntenic['ground_truth_data_source'].isin(
        ['HPRC_samples', 'HGSVC_samples', 'HPRC_HGSVC_all_samples']
    )
]

Unnamed: 0,Syntenic,genome,ground_truth_data_source,gt_error_rate,Non-reference true positive rate
2,21962648,GRCh38,HPRC_samples,1.106405,98.893595
4,24194678,GRCh38,HGSVC_samples,1.386977,98.613023
0,28325609,GRCh38,HPRC_HGSVC_all_samples,1.289689,98.710311
12,19168753,CHM13v2.0,HPRC_samples,0.914633,99.085367
11,21523790,CHM13v2.0,HGSVC_samples,1.203934,98.796066
13,25190775,CHM13v2.0,HPRC_HGSVC_all_samples,1.138373,98.861627


In [14]:
# Genotype error rate and its complement, both in percent, over all variants.
# NOTE: this rebinds `overall` from a polars frame to a pandas frame, so the cell
# cannot be re-run without first re-running the cell that builds the polars frame.
# NOTE: `Syntenic` is the summed boolean column left by the upstream group_by().sum(),
# not a per-row flag.
overall = (
    overall.to_pandas()[['Syntenic','genome','ground_truth_data_source','gt_error_rate']]
    .sort_values(['genome','Syntenic','gt_error_rate'])
    .assign(**{
        # Computed first, while gt_error_rate is still a fraction.
        'Non-reference true positive rate': lambda df: (1 - df['gt_error_rate']) * 100,
        'gt_error_rate': lambda df: df['gt_error_rate'] * 100,
    })
)
overall.loc[
    overall['ground_truth_data_source'].isin(
        ['HPRC_samples', 'HGSVC_samples', 'HPRC_HGSVC_all_samples']
    )
]

Unnamed: 0,Syntenic,genome,ground_truth_data_source,gt_error_rate,Non-reference true positive rate
11,21962648,GRCh38,HPRC_samples,1.108993,98.891007
2,24194678,GRCh38,HGSVC_samples,1.387967,98.612033
5,28325609,GRCh38,HPRC_HGSVC_all_samples,1.291473,98.708527
6,19168753,CHM13v2.0,HPRC_samples,0.918296,99.081704
10,21523790,CHM13v2.0,HGSVC_samples,1.208646,98.791354
4,25190775,CHM13v2.0,HPRC_HGSVC_all_samples,1.141592,98.858408


In [15]:
# Genotype error rate and its complement, both in percent, per variant type and synteny.
# NOTE: this rebinds `overall_subsetted` from a polars frame to a pandas frame, so the cell
# cannot be re-run without first re-running the cell that builds the polars frame.
overall_subsetted = (
    overall_subsetted.to_pandas()[['type','Syntenic','genome','ground_truth_data_source','gt_error_rate']]
    .sort_values(['genome','type','Syntenic','gt_error_rate'])
    .assign(**{
        # Computed first, while gt_error_rate is still a fraction.
        'Non-reference true positive rate': lambda df: (1 - df['gt_error_rate']) * 100,
        'gt_error_rate': lambda df: df['gt_error_rate'] * 100,
    })
)
overall_subsetted.loc[
    overall_subsetted['ground_truth_data_source'].isin(
        ['HPRC_samples', 'HGSVC_samples', 'HPRC_HGSVC_all_samples']
    )
]

Unnamed: 0,type,Syntenic,genome,ground_truth_data_source,gt_error_rate,Non-reference true positive rate
38,SNP,False,GRCh38,HGSVC_samples,1.442616,98.557384
44,SNP,False,GRCh38,HPRC_HGSVC_all_samples,2.01019,97.98981
16,SNP,False,GRCh38,HPRC_samples,2.371092,97.628908
18,SNP,True,GRCh38,HPRC_samples,0.42249,99.57751
6,SNP,True,GRCh38,HPRC_HGSVC_all_samples,0.6242,99.3758
36,SNP,True,GRCh38,HGSVC_samples,0.632463,99.367537
51,Indel,False,GRCh38,HGSVC_samples,6.394128,93.605872
3,Indel,False,GRCh38,HPRC_HGSVC_all_samples,6.490134,93.509866
9,Indel,False,GRCh38,HPRC_samples,6.924883,93.075117
53,Indel,True,GRCh38,HPRC_samples,4.43739,95.56261


In [16]:
# Variants from the reference-panel rephasing run, evaluated against the HPRC samples.
rephase = all_variants.filter(
    (pl.col('method_of_phasing') == '1kgp_variation_phased_with_reference_panel')
    & (pl.col('ground_truth_data_source') == 'HPRC_samples')
)

# Switch error rate (percent) per genome for variants with contextual_MAC == 1.
singletons_rephase_sum = rephase.filter(pl.col('contextual_MAC') == 1).group_by('genome').sum().to_pandas().reset_index()
singletons_rephase_sum['SER'] = singletons_rephase_sum.n_switch_errors/singletons_rephase_sum.n_checked*100
for idx, (genome, ser) in singletons_rephase_sum[['genome','SER']].iterrows():
    print(f'Singletons in {genome}: {ser}')

# Split at a contextual allele frequency (contextual_MAC / contextual_AN) of 1%.
contextual_frequency = pl.col('contextual_MAC')/pl.col('contextual_AN')
rare_rephase = rephase.filter(contextual_frequency < 0.01)
common_rephase = rephase.filter(contextual_frequency >= 0.01)

rare_rephase_sum = rare_rephase.group_by('genome').sum().to_pandas().reset_index()
common_rephase_sum = common_rephase.group_by('genome').sum().to_pandas().reset_index()
# polars group_by does not guarantee the order of the groups (the singleton and rare
# summaries above already come out in different genome orders), so the common-variant row
# is looked up by genome label instead of by row position.
common_by_genome = common_rephase_sum.set_index('genome')
for idx, (genome, n_switch_errors, n_checked) in rare_rephase_sum[['genome','n_switch_errors', 'n_checked']].iterrows():
    common_row = common_by_genome.loc[genome]
    print (f"\nRare in {genome}: {n_switch_errors/n_checked*100}")
    print (f"Common in {genome}: {common_row.n_switch_errors/common_row.n_checked*100}")

# Common variants split by synteny; both summaries are keyed by genome for the lookup below.
syntenic = common_rephase.filter(pl.col('Syntenic'))
nonsyntenic = common_rephase.filter(pl.col('Syntenic').not_())

syntenic_sum = syntenic.group_by('genome').sum().to_pandas().set_index('genome')
nonsyntenic_sum = nonsyntenic.group_by('genome').sum().to_pandas().set_index('genome')

for genome, row in syntenic_sum.iterrows():
    nonsyntenic_row = nonsyntenic_sum.loc[genome]
    print (f"\nCommon Syntenic in {genome}: {row.n_switch_errors/row.n_checked*100}")
    print (f"Common Nonsyntenic {genome}: {nonsyntenic_row.n_switch_errors/nonsyntenic_row.n_checked*100}")

Singletons in CHM13v2.0: 8.27393547908082
Singletons in GRCh38: 11.690793831015235

Rare in GRCh38: 6.616868852558108
Common in GRCh38: 0.9045780019233195

Rare in CHM13v2.0: 6.225743033345612
Common in CHM13v2.0: 0.8103833501207229

Common Syntenic in CHM13v2.0: 0.8099508170456401
Common Nonsyntenic CHM13v2.0: 1.8914473684210527

Common Syntenic in GRCh38: 0.9045591759081486
Common Nonsyntenic GRCh38: 0.9198633549213578


In [17]:
# Region labels used for the concordance tables.
synEnum = pl.Enum(categories=['All_regions', 'Syntenic', 'Nonsyntenic'])

# Turn the `Syntenic` column into a region label: cast to string, map 'true'/'false'
# to 'Syntenic'/'Nonsyntenic', then cast to the enum above.
syntenic_label = (
    pl.col('Syntenic')
    .cast(str)
    .replace({'true': 'Syntenic', 'false': 'Nonsyntenic'})
    .cast(synEnum)
)

# Parents-and-pedigree phasing run evaluated against the HPRC samples.
concordance = all_variants.filter(
    (pl.col('method_of_phasing') == 'phased_with_parents_and_pedigree')
    & (pl.col('ground_truth_data_source') == 'HPRC_samples')
).with_columns(Syntenic=syntenic_label)

In [18]:
# Duplicate every row under the 'All_regions' label so that totals over all regions are
# aggregated alongside the 'Syntenic' / 'Nonsyntenic' splits. with_columns already returns
# a new frame, so no explicit clone is needed.
all_regions = concordance.with_columns(Syntenic=pl.lit('All_regions').cast(synEnum))
summary_keys = ['Syntenic', 'genome', 'type']

# Sums per region, genome and variant type. to_pandas() already yields a fresh default
# index; the former reset_index() only added a meaningless 'index' column that was then
# summed into the pooled rows below.
per_type_summary = pl.concat((concordance, all_regions)).group_by(summary_keys).sum().to_pandas()

# Pool the variant types into one 'SNPs + Indels' row per region and genome.
pooled_types_summary = (
    per_type_summary
    .assign(type='SNPs + Indels')
    .groupby(summary_keys, observed=True)
    .sum(numeric_only=True)
    .reset_index()
)

# ignore_index avoids duplicate row labels in the combined frame.
concordance_summary = pd.concat((per_type_summary, pooled_types_summary), ignore_index=True)
concordance_summary['gt_error_rate'] = concordance_summary.n_gt_errors/concordance_summary.n_gt_checked*100
concordance_summary['gt_accuracy_rate'] = 100-concordance_summary['gt_error_rate']
concordance_summary['switch_error_rate'] = concordance_summary.n_switch_errors/concordance_summary.n_checked*100
# No 'true'/'false' -> label replacement is needed here: `concordance.Syntenic` was already
# cast to `synEnum`, whose only values are 'All_regions', 'Syntenic' and 'Nonsyntenic'.

In [19]:
# Show three decimal places; this setting applies to every later display as well.
pd.set_option("display.precision", 3)

# Rates and their underlying counts, indexed by region, genome and variant type.
(
    concordance_summary
    .sort_values(['Syntenic','genome','type'])
    .set_index(['Syntenic','genome','type'])
    [['switch_error_rate','gt_error_rate','n_gt_errors','n_gt_checked', 'n_switch_errors', 'n_checked']]
    .astype(float)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,switch_error_rate,gt_error_rate,n_gt_errors,n_gt_checked,n_switch_errors,n_checked
Syntenic,genome,type,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
All_regions,GRCh38,Indel,1.385,4.44,6482000.0,146000000.0,270831.0,19560000.0
All_regions,GRCh38,SNP,0.211,0.425,3026000.0,711400000.0,203833.0,96830000.0
All_regions,GRCh38,SNPs + Indels,0.408,1.109,9508000.0,857300000.0,474664.0,116400000.0
All_regions,CHM13v2.0,Indel,1.312,4.01,5285000.0,131800000.0,236515.0,18030000.0
All_regions,CHM13v2.0,SNP,0.149,0.257,1581000.0,616000000.0,125429.0,83970000.0
All_regions,CHM13v2.0,SNPs + Indels,0.355,0.918,6867000.0,747800000.0,361944.0,102000000.0
Syntenic,GRCh38,Indel,1.385,4.437,6471000.0,145800000.0,270657.0,19540000.0
Syntenic,GRCh38,SNP,0.21,0.422,3001000.0,710300000.0,203383.0,96710000.0
Syntenic,GRCh38,SNPs + Indels,0.408,1.106,9472000.0,856100000.0,474040.0,116200000.0
Syntenic,CHM13v2.0,Indel,1.311,4.003,5269000.0,131600000.0,236105.0,18010000.0
