In [1]:
import hail as hl

# hl.init()

In [2]:
# hl.utils.get_1kg('data/1kg/')
# hl.utils.get_hgdp('data/hgdp/')

In [3]:
data_path = 'data/hgdp'

In [4]:
mt = hl.read_matrix_table(f'{data_path}/HGDP.mt')

Initializing Hail with default parameters...
Running on Apache Spark version 3.5.1
SparkUI available at http://c2843211b84f:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.131-11d9b2ff89da
LOGGING: writing to /home/jovyan/hail-20240625-1647-0.2.131-11d9b2ff89da.log


In [5]:
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        QUALapprox: int32, 
        SB: array<int32>, 
        MQ: float64, 
        MQRankSum: float64, 
        VarDP: int32, 
        AS_ReadPosRankSum: float64, 
        AS_pab_max: float64, 
        AS_QD: float64, 
        AS_MQ: float64, 
        QD: float64, 
        AS_MQRankSum: float64, 
        FS: float64, 
        AS_FS: float64, 
        ReadPosRankSum: float64, 
        AS_QUALapprox: int32, 
        AS_SB_TABLE: array<int32>, 
        AS_VarDP: int32, 
        AS_SOR: float64, 
        SOR: float64, 
        transmitted_singleton: bool, 
        omni: bool, 
        mills: bool, 
        monoallelic: bool, 
        AS_VQSLOD: float64, 
        Inbreeding

In [6]:
populations_table = hl.import_table(f'{data_path}/HGDP_annotations.txt', key='s')
populations_table.show()

2024-06-25 16:48:02.361 Hail: INFO: Reading table without type imputation
  Loading field 's' as type str (not specified)
  Loading field 'pop' as type str (not specified)
  Loading field 'continental_pop' as type str (not specified)
  Loading field 'sex_karyotype' as type str (not specified)
2024-06-25 16:48:21.857 Hail: WARN: aggregate_cols(): Aggregates over cols ordered by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2024-06-25 16:48:23.074 Hail: WARN: entries(): Resulting entries table is sorted by '(row_key, col_key)'.
    To preserve row-major matrix table order, first unkey columns with 'key_cols_by()'
2024-06-25 16:48:26.542 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-06-25 16:48:26.916 Hail: INFO: Coerced sorted dataset
2024-06-25 16:48:27.959 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-06-25 16:48:36.940 Hail: INFO: Ordering unsorted dataset with network shuffle
2024-06-25 16:48:37.185 

s,pop,continental_pop,sex_karyotype
str,str,str,str
"""HG00096""","""gbr""","""nfe""","""XY"""
"""HG00097""","""gbr""","""nfe""","""XX"""
"""HG00099""","""gbr""","""nfe""","""XX"""
"""HG00100""","""gbr""","""nfe""","""XX"""
"""HG00101""","""gbr""","""nfe""","""XY"""
"""HG00102""","""gbr""","""nfe""","""XX"""
"""HG00103""","""gbr""","""nfe""","""XY"""
"""HG00105""","""gbr""","""nfe""","""XY"""
"""HG00106""","""gbr""","""nfe""","""XX"""
"""HG00107""","""gbr""","""nfe""","""XY"""


In [7]:
mt = mt.annotate_cols(**populations_table[mt.s])

In [8]:
hl.summarize_variants(mt)

Number of alleles,Count
2,10441

Allele type,Count
SNP,10441

Metric,Value
Transitions,6602.0
Transversions,3839.0
Ratio,1.72

Contig,Count
chr1,881
chr2,799
chr3,728
chr4,659
chr5,618
chr6,572
chr7,576
chr8,525
chr9,476
chr10,516


In [9]:
import random
import pandas as pd

rsid_list = mt.aggregate_rows(hl.agg.collect_as_set(mt.rsid))
selected_snps = random.choices(list(rsid_list), k=100)
mt_filtered = mt.filter_rows(hl.literal(selected_snps).contains(mt.rsid))

In [10]:
mt_filtered.count_rows()

99

In [11]:
def gt_freq_estimates(mt, group):
    p_ML = ((2 * hl.agg.sum(mt.GT.is_hom_ref())) + hl.agg.sum(mt.GT.is_het())) / (2 * hl.agg.count_where(hl.is_defined(mt.GT)))
    return mt.aggregate_entries(
        hl.agg.group_by(
            mt.rsid,
            hl.agg.group_by(
                group,
                hl.struct(
                    p_ML = p_ML,
                    p_AA = p_ML ** 2,
                    p_AB = 2 * p_ML * (1 - p_ML),
                    p_BB = (1 - p_ML) ** 2,
                )
            )
        )
    )

def create_freqs_dataframe(snp_freqs):
    rows = [
        {
            'rsid': rsid,
            'pop_freqs': hl.struct(**pop_freqs)
        }
        for rsid, pop_freqs in snp_freqs.items()
    ]
    df = pd.DataFrame(rows)
    return df

snp_freqs = gt_freq_estimates(mt_filtered, mt_filtered.pop)
freqs_ht = hl.Table.from_pandas(create_freqs_dataframe(snp_freqs), key='rsid')
freqs_ht.show()

Unnamed: 0_level_0,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs,pop_freqs
Unnamed: 0_level_1,acb,acb,acb,acb,adygei,adygei,adygei,adygei,asw,asw,asw,asw,balochi,balochi,balochi,balochi,basque,basque,basque,basque,beb,beb,beb,beb,bedouin,bedouin,bedouin,bedouin,brahui,brahui,brahui,brahui,burusho,burusho,burusho,burusho,cambodian,cambodian,cambodian,cambodian,cdx,cdx,cdx,cdx,ceu,ceu,ceu,ceu,chb,chb,chb,chb,chs,chs,chs,chs,clm,clm,clm,clm,colombian,colombian,colombian,colombian,dai,dai,dai,dai,daur,daur,daur,daur,druze,druze,druze,druze,esn,esn,esn,esn,fin,fin,fin,fin,french,french,french,french,gbr,gbr,gbr,gbr,gih,gih,gih,gih,gwd,gwd,gwd,gwd,han,han,han,han,hazara,hazara,hazara,hazara,hezhen,hezhen,hezhen,hezhen,ibs,ibs,ibs,ibs,itu,itu,itu,itu,japanese,japanese,japanese,japanese,jpt,jpt,jpt,jpt,kalash,kalash,kalash,kalash,karitiana,karitiana,karitiana,karitiana,khv,khv,khv,khv,lwk,lwk,lwk,lwk,makrani,makrani,makrani,makrani,mandenka,mandenka,mandenka,mandenka,maya,maya,maya,maya,mongola,mongola,mongola,mongola,mozabite,mozabite,mozabite,mozabite,msl,msl,msl,msl,mxl,mxl,mxl,mxl,naxi,naxi,naxi,naxi,orcadian,orcadian,orcadian,orcadian,oroqen,oroqen,oroqen,oroqen,palestinian,palestinian,palestinian,palestinian,pathan,pathan,pathan,pathan,pel,pel,pel,pel,pjl,pjl,pjl,pjl,pur,pur,pur,pur,russian,russian,russian,russian,she,she,she,she,sindhi,sindhi,sindhi,sindhi,stu,stu,stu,stu,surui,surui,surui,surui,tsi,tsi,tsi,tsi,tu,tu,tu,tu,tujia,tujia,tujia,tujia,tuscan,tuscan,tuscan,tuscan,uygur,uygur,uygur,uygur,yakut,yakut,yakut,yakut,yizu,yizu,yizu,yizu,yoruba,yoruba,yoruba,yoruba,yri,yri,yri,yri
rsid,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB
str,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
"""rs1036894983""",1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,,,,,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
"""rs10422355""",0.55,0.303,0.495,0.202,0.25,0.0625,0.375,0.563,0.563,0.316,0.492,0.191,0.5,0.25,0.5,0.25,0.25,0.0625,0.375,0.563,0.389,0.151,0.475,0.373,0.286,0.0816,0.408,0.51,0.5,0.25,0.5,0.25,0.625,0.391,0.469,0.141,0.667,0.444,0.444,0.111,0.909,0.826,0.165,0.00826,0.438,0.191,0.492,0.316,1.0,1.0,0.0,0.0,0.833,0.694,0.278,0.0278,0.444,0.198,0.494,0.309,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.5,0.25,0.5,0.25,0.375,0.141,0.469,0.391,0.471,0.221,0.498,0.28,0.556,0.309,0.494,0.198,0.0,0.0,0.0,1.0,0.444,0.198,0.494,0.309,0.545,0.298,0.496,0.207,0.594,0.353,0.482,0.165,0.8,0.64,0.32,0.04,0.667,0.444,0.444,0.111,1.0,1.0,0.0,0.0,0.364,0.132,0.463,0.405,0.429,0.184,0.49,0.327,1.0,1.0,0.0,0.0,0.875,0.766,0.219,0.0156,0.25,0.0625,0.375,0.563,0.5,0.25,0.5,0.25,0.818,0.669,0.298,0.0331,0.8,0.64,0.32,0.04,0.0,0.0,0.0,1.0,0.5,0.25,0.5,0.25,1.0,1.0,0.0,0.0,0.5,0.25,0.5,0.25,0.25,0.0625,0.375,0.563,0.625,0.391,0.469,0.141,0.714,0.51,0.408,0.0816,0.5,0.25,0.5,0.25,0.75,0.563,0.375,0.0625,0.5,0.25,0.5,0.25,0.313,0.0977,0.43,0.473,0.333,0.111,0.444,0.444,0.417,0.174,0.486,0.34,0.481,0.231,0.499,0.27,0.4,0.16,0.48,0.36,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.455,0.207,0.496,0.298,0.5,0.25,0.5,0.25,0.154,0.0237,0.26,0.716,0.5,0.25,0.5,0.25,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.25,0.5,0.25,0.667,0.444,0.444,0.111,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.6,0.36,0.48,0.16
"""rs1109855""",0.4,0.16,0.48,0.36,0.75,0.563,0.375,0.0625,0.688,0.473,0.43,0.0977,1.0,1.0,0.0,0.0,0.5,0.25,0.5,0.25,0.778,0.605,0.346,0.0494,0.714,0.51,0.408,0.0816,0.5,0.25,0.5,0.25,0.625,0.391,0.469,0.141,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.563,0.316,0.492,0.191,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.889,0.79,0.198,0.0123,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.25,0.0625,0.375,0.563,0.559,0.312,0.493,0.195,0.611,0.373,0.475,0.151,0.25,0.0625,0.375,0.563,0.722,0.522,0.401,0.0772,0.773,0.597,0.351,0.0517,0.688,0.473,0.43,0.0977,1.0,1.0,0.0,0.0,0.833,0.694,0.278,0.0278,1.0,1.0,0.0,0.0,0.636,0.405,0.463,0.132,0.857,0.735,0.245,0.0204,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.875,0.766,0.219,0.0156,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.7,0.49,0.42,0.09,0.5,0.25,0.5,0.25,0.5,0.25,0.5,0.25,0.75,0.563,0.375,0.0625,1.0,1.0,0.0,0.0,0.25,0.0625,0.375,0.563,0.708,0.502,0.413,0.0851,0.929,0.862,0.133,0.0051,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,1.0,1.0,0.0,0.0,0.813,0.66,0.305,0.0352,0.5,0.25,0.5,0.25,1.0,1.0,0.0,0.0,0.846,0.716,0.26,0.0237,0.625,0.391,0.469,0.141,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.773,0.597,0.351,0.0517,1.0,1.0,0.0,0.0,0.731,0.534,0.393,0.0725,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.25,0.5,0.25,1.0,1.0,0.0,0.0,0.833,0.694,0.278,0.0278,1.0,1.0,0.0,0.0,0.5,0.25,0.5,0.25,0.5,0.25,0.5,0.25
"""rs111930918""",1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.889,0.79,0.198,0.0123,0.857,0.735,0.245,0.0204,0.667,0.444,0.444,0.111,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.969,0.938,0.0605,0.000977,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.778,0.605,0.346,0.0494,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.833,0.694,0.278,0.0278,0.9,0.81,0.18,0.01,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.864,0.746,0.236,0.0186,0.857,0.735,0.245,0.0204,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.667,0.444,0.444,0.111,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.688,0.473,0.43,0.0977,0.667,0.444,0.444,0.111,1.0,1.0,0.0,0.0,0.885,0.783,0.204,0.0133,0.9,0.81,0.18,0.01,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.818,0.669,0.298,0.0331,1.0,1.0,0.0,0.0,0.917,0.84,0.153,0.00694,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
"""rs112395787""",0.95,0.903,0.095,0.0025,1.0,1.0,0.0,0.0,0.875,0.766,0.219,0.0156,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.912,0.831,0.161,0.00779,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.875,0.766,0.219,0.0156,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.958,0.918,0.0799,0.00174,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.25,0.5,0.25,0.925,0.856,0.139,0.00562
"""rs112488762""",0.95,0.903,0.095,0.0025,1.0,1.0,0.0,0.0,0.813,0.66,0.305,0.0352,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.706,0.498,0.415,0.0865,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.906,0.821,0.17,0.00879,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.875,0.766,0.219,0.0156,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.9,0.81,0.18,0.01
"""rs112533403""",0.2,0.04,0.32,0.64,1.0,1.0,0.0,0.0,0.25,0.0625,0.375,0.563,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.889,0.79,0.198,0.0123,0.75,0.563,0.375,0.0625,0.667,0.444,0.444,0.111,0.875,0.766,0.219,0.0156,1.0,1.0,0.0,0.0,0.95,0.903,0.095,0.0025,0.719,0.517,0.404,0.0791,0.9,0.81,0.18,0.01,1.0,1.0,0.0,0.0,0.611,0.373,0.475,0.151,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.667,0.444,0.444,0.111,0.167,0.0278,0.278,0.694,0.667,0.444,0.444,0.111,0.75,0.563,0.375,0.0625,0.556,0.309,0.494,0.198,0.7,0.49,0.42,0.09,0.321,0.103,0.436,0.46,0.9,0.81,0.18,0.01,0.667,0.444,0.444,0.111,1.0,1.0,0.0,0.0,0.667,0.444,0.444,0.111,0.929,0.862,0.133,0.0051,1.0,1.0,0.0,0.0,0.813,0.66,0.305,0.0352,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.864,0.746,0.236,0.0186,0.3,0.09,0.42,0.49,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,1.0,1.0,0.0,0.0,0.625,0.391,0.469,0.141,0.409,0.167,0.483,0.349,0.857,0.735,0.245,0.0204,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,1.0,1.0,0.0,0.0,0.714,0.51,0.408,0.0816,0.667,0.444,0.444,0.111,0.917,0.84,0.153,0.00694,0.717,0.515,0.405,0.0799,0.611,0.373,0.475,0.151,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.5,0.25,0.5,0.25,0.727,0.529,0.397,0.0744,1.0,1.0,0.0,0.0,0.846,0.716,0.26,0.0237,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.25,0.0625,0.375,0.563,0.211,0.0443,0.332,0.623
"""rs112730181""",0.85,0.722,0.255,0.0225,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.929,0.862,0.133,0.0051,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.912,0.831,0.161,0.00779,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.969,0.938,0.0605,0.000977,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.9,0.81,0.18,0.01,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.917,0.84,0.153,0.00694,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.9,0.81,0.18,0.01
"""rs112832077""",0.95,0.903,0.095,0.0025,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.833,0.694,0.278,0.0278,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.944,0.892,0.105,0.00309,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,,,,,0.875,0.766,0.219,0.0156,0.941,0.886,0.111,0.00346,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.833,0.694,0.278,0.0278,0.955,0.911,0.0868,0.00207,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.864,0.746,0.236,0.0186,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.9,0.81,0.18,0.01,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.875,0.766,0.219,0.0156,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.95,0.903,0.095,0.0025,0.5,0.25,0.5,0.25,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,,,,,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.75,0.563,0.375,0.0625,0.9,0.81,0.18,0.01
"""rs113145131""",1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.958,0.918,0.0799,0.00174,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


In [12]:
total_subjects = mt.count_cols()
ethnicity_counts = mt.aggregate_cols(hl.agg.counter(mt.pop))
ethnicity_proportions = {k: v / total_subjects for k, v in ethnicity_counts.items()}

ethnicity_proportions, sum(ethnicity_proportions.values())

({'acb': 0.025510204081632654,
  'adygei': 0.00510204081632653,
  'asw': 0.02040816326530612,
  'balochi': 0.002551020408163265,
  'basque': 0.00510204081632653,
  'beb': 0.02295918367346939,
  'bedouin': 0.017857142857142856,
  'brahui': 0.007653061224489796,
  'burusho': 0.01020408163265306,
  'cambodian': 0.007653061224489796,
  'cdx': 0.02806122448979592,
  'ceu': 0.04081632653061224,
  'chb': 0.012755102040816327,
  'chs': 0.02295918367346939,
  'clm': 0.02295918367346939,
  'colombian': 0.002551020408163265,
  'dai': 0.00510204081632653,
  'daur': 0.002551020408163265,
  'druze': 0.01020408163265306,
  'esn': 0.04336734693877551,
  'fin': 0.02295918367346939,
  'french': 0.00510204081632653,
  'gbr': 0.02295918367346939,
  'gih': 0.02806122448979592,
  'gwd': 0.04081632653061224,
  'han': 0.012755102040816327,
  'hazara': 0.007653061224489796,
  'hezhen': 0.002551020408163265,
  'ibs': 0.02806122448979592,
  'itu': 0.017857142857142856,
  'japanese': 0.01020408163265306,
  'jpt':

In [13]:
def annotate_rows_with_freqs(mt, freqs_ht):
    mt = mt.annotate_rows(
        freqs=freqs_ht[mt_filtered.rsid].pop_freqs
    )
    return mt

mt_annotated = annotate_rows_with_freqs(mt_filtered, freqs_ht)
mt_annotated.entries().show(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,Unnamed: 21_level_0,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,freqs,Unnamed: 290_level_0,Unnamed: 291_level_0,Unnamed: 292_level_0,Unnamed: 293_level_0,Unnamed: 294_level_0,Unnamed: 295_level_0,Unnamed: 296_level_0,Unnamed: 297_level_0,Unnamed: 298_level_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,info,acb,acb,acb,acb,adygei,adygei,adygei,adygei,asw,asw,asw,asw,balochi,balochi,balochi,balochi,basque,basque,basque,basque,beb,beb,beb,beb,bedouin,bedouin,bedouin,bedouin,brahui,brahui,brahui,brahui,burusho,burusho,burusho,burusho,cambodian,cambodian,cambodian,cambodian,cdx,cdx,cdx,cdx,ceu,ceu,ceu,ceu,chb,chb,chb,chb,chs,chs,chs,chs,clm,clm,clm,clm,colombian,colombian,colombian,colombian,dai,dai,dai,dai,daur,daur,daur,daur,druze,druze,druze,druze,esn,esn,esn,esn,fin,fin,fin,fin,french,french,french,french,gbr,gbr,gbr,gbr,gih,gih,gih,gih,gwd,gwd,gwd,gwd,han,han,han,han,hazara,hazara,hazara,hazara,hezhen,hezhen,hezhen,hezhen,ibs,ibs,ibs,ibs,itu,itu,itu,itu,japanese,japanese,japanese,japanese,jpt,jpt,jpt,jpt,kalash,kalash,kalash,kalash,karitiana,karitiana,karitiana,karitiana,khv,khv,khv,khv,lwk,lwk,lwk,lwk,makrani,makrani,makrani,makrani,mandenka,mandenka,mandenka,mandenka,maya,maya,maya,maya,mongola,mongola,mongola,mongola,mozabite,mozabite,mozabite,mozabite,msl,msl,msl,msl,mxl,mxl,mxl,mxl,naxi,naxi,naxi,naxi,orcadian,orcadian,orcadian,orcadian,oroqen,oroqen,oroqen,oroqen,palestinian,palestinian,palestinian,palestinian,pathan,pathan,pathan,pathan,pel,pel,pel,pel,pjl,pjl,pjl,pjl,pur,pur,pur,pur,russian,russian,russian,russian,she,she,she,she,sindhi,sindhi,sindhi,sindhi,stu,stu,stu,stu,surui,surui,surui,surui,tsi,tsi,tsi,tsi,tu,tu,tu,tu,tujia,tujia,tujia,tujia,tuscan,tuscan,tuscan,tuscan,uygur,uygur,uygur,uygur,yakut,yakut,yakut,yakut,yizu,yizu,yizu,yizu,yoruba,yoruba,yoruba,yoruba,yri,yri,yri,yri,Unnamed: 290_level_1,Unnamed: 291_level_1,Unnamed: 292_level_1,Unnamed: 293_level_1,Unnamed: 294_level_1,Unnamed: 295_level_1,Unnamed: 296_level_1,Unnamed: 297_level_1,Unnamed: 298_level_1
locus,alleles,rsid,qual,filters,QUALapprox,SB,MQ,MQRankSum,VarDP,AS_ReadPosRankSum,AS_pab_max,AS_QD,AS_MQ,QD,AS_MQRankSum,FS,AS_FS,ReadPosRankSum,AS_QUALapprox,AS_SB_TABLE,AS_VarDP,AS_SOR,SOR,transmitted_singleton,omni,mills,monoallelic,AS_VQSLOD,InbreedingCoeff,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,p_ML,p_AA,p_AB,p_BB,s,pop,continental_pop,sex_karyotype,GT,DP,GQ,AD,PL
locus<GRCh38>,array<str>,str,float64,set<str>,int32,array<int32>,float64,float64,int32,float64,float64,float64,float64,float64,float64,float64,float64,float64,int32,array<int32>,int32,float64,float64,bool,bool,bool,bool,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,str,str,str,str,call,int32,int32,array<int32>,array<int32>
chr1:13436332,"[""C"",""G""]","""rs974190053""",-10.0,{},2424012,"[42774,42087,45925,46682]",60.0,-0.031,177468,0.306,1.0,13.7,60.0,13.7,-0.026,0.0,0.0,0.284,2424012,"[42774,42087,45925,46682]",177468,0.694,0.694,False,False,False,False,20.9,0.0629,0.95,0.903,0.095,0.0025,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.971,0.942,0.0571,0.000865,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.906,0.821,0.17,0.00879,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.958,0.918,0.0799,0.00174,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.95,0.903,0.095,0.0025,"""HG00107""","""gbr""","""nfe""","""XY""",0/0,32,32,,
chr1:13436332,"[""C"",""G""]","""rs974190053""",-10.0,{},2424012,"[42774,42087,45925,46682]",60.0,-0.031,177468,0.306,1.0,13.7,60.0,13.7,-0.026,0.0,0.0,0.284,2424012,"[42774,42087,45925,46682]",177468,0.694,0.694,False,False,False,False,20.9,0.0629,0.95,0.903,0.095,0.0025,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.971,0.942,0.0571,0.000865,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.906,0.821,0.17,0.00879,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.958,0.918,0.0799,0.00174,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.95,0.903,0.095,0.0025,"""HG00114""","""gbr""","""nfe""","""XY""",0/0,31,30,,
chr1:13436332,"[""C"",""G""]","""rs974190053""",-10.0,{},2424012,"[42774,42087,45925,46682]",60.0,-0.031,177468,0.306,1.0,13.7,60.0,13.7,-0.026,0.0,0.0,0.284,2424012,"[42774,42087,45925,46682]",177468,0.694,0.694,False,False,False,False,20.9,0.0629,0.95,0.903,0.095,0.0025,1.0,1.0,0.0,0.0,0.938,0.879,0.117,0.00391,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.971,0.942,0.0571,0.000865,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.906,0.821,0.17,0.00879,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.958,0.918,0.0799,0.00174,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.95,0.903,0.095,0.0025,"""HG00121""","""gbr""","""nfe""","""XX""",0/0,34,55,,


In [14]:
def non_nan(x, defailt=0.0):
    return hl.if_else(hl.is_finite(x), x, defailt)

In [15]:
def log_likelihood(genotype, freqs, pop):
    return (
        hl.case()
        .when(genotype.is_hom_ref(), non_nan(hl.log(freqs[pop].p_AA)))
        .when(genotype.is_het(), non_nan(hl.log(freqs[pop].p_AB)))
        .when(genotype.is_hom_var(), non_nan(hl.log(freqs[pop].p_BB)))
        .default(0.0)
    )

def get_log_likelihoods(mt, proportions):
    mt = mt.annotate_cols(
        log_likelihoods=hl.struct(**{
            pop: hl.agg.sum(log_likelihood(mt.GT, mt.freqs, pop)) + hl.log(proportions[pop])
            for pop in proportions
        })
    )
    return mt


mt_likelihood = get_log_likelihoods(mt_annotated, ethnicity_proportions)

In [16]:
def log_sum_exp(log_values):
    max_log = hl.max(log_values)
    return max_log + hl.log(hl.sum([hl.exp(log_value - max_log) for log_value in log_values]))


def calculate_posteriors(mt, proportions):
    log_likelihoods = [mt.log_likelihoods[pop] for pop in proportions.keys()]

    mt = mt.annotate_cols(log_likelihood_sum=log_sum_exp(log_likelihoods))
    return mt.annotate_cols(
        posteriors=hl.dict({
            pop: mt.log_likelihoods[pop] - mt.log_likelihood_sum
               for pop in proportions.keys()
        })
    )

mt_probs = calculate_posteriors(mt_likelihood, ethnicity_proportions)
mt_probs.cols().show(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,log_likelihoods,Unnamed: 69_level_0,Unnamed: 70_level_0
s,pop,continental_pop,sex_karyotype,acb,adygei,asw,balochi,basque,beb,bedouin,brahui,burusho,cambodian,cdx,ceu,chb,chs,clm,colombian,dai,daur,druze,esn,fin,french,gbr,gih,gwd,han,hazara,hezhen,ibs,itu,japanese,jpt,kalash,karitiana,khv,lwk,makrani,mandenka,maya,mongola,mozabite,msl,mxl,naxi,orcadian,oroqen,palestinian,pathan,pel,pjl,pur,russian,she,sindhi,stu,surui,tsi,tu,tujia,tuscan,uygur,yakut,yizu,yoruba,yri,log_likelihood_sum,posteriors
str,str,str,str,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,"dict<str, float64>"
"""HG00107""","""gbr""","""nfe""","""XY""",-41.0,-41.5,-37.9,-15.7,-34.4,-43.1,-43.6,-36.7,-40.8,-31.5,-47.3,-39.9,-36.5,-42.5,-44.1,-12.2,-27.3,-20.5,-46.3,-43.1,-41.8,-36.3,-40.6,-38.0,-47.4,-39.0,-33.3,-14.3,-42.5,-43.5,-34.3,-41.4,-29.4,-22.9,-49.6,-38.2,-28.5,-19.1,-26.7,-24.7,-46.0,-41.2,-48.3,-18.4,-39.7,-17.8,-44.6,-31.2,-43.8,-43.2,-39.9,-15.7,-31.5,-37.3,-40.4,-23.0,-40.6,-16.4,-15.7,-21.2,-30.8,-31.7,-24.0,-30.2,-43.7,-12.0,"{""cdx"":-3.53e+01,""gih"":-2.60e+01,""adygei"":-2.95e+01,""msl"":-2.92e+01,""burusho"":-2.88e+01,""palestinian"":-3.26e+01,""han"":-2.70e+01,""ibs"":-3.05e+01,""beb"":-3.11e+01,""french"":-2.43e+01,""basque"":-2.24e+01,""tu"":-4.38e+00,""itu"":-3.15e+01,""dai"":-1.53e+01,""ceu"":-2.79e+01,""makrani"":-1.65e+01,""tsi"":-2.86e+01,""fin"":-2.98e+01,""tujia"":-3.68e+00,""mozabite"":-3.40e+01,""pjl"":-3.12e+01,""gwd"":-3.54e+01,""mandenka"":-7.15e+00,""surui"":-1.10e+01,""pel"":-3.18e+01,""pur"":-2.80e+01,""orcadian"":-2.77e+01,""acb"":-2.90e+01,""bedouin"":-3.16e+01,""she"":-1.95e+01,""chs"":-3.05e+01,""brahui"":-2.47e+01,""naxi"":-6.45e+00,""mxl"":-3.63e+01,""daur"":-8.53e+00,""russian"":-3.68e+00,""balochi"":-3.68e+00,""asw"":-2.59e+01,""clm"":-3.21e+01,""maya"":-1.47e+01,""yoruba"":-1.82e+01,""colombian"":-2.16e-01,""karitiana"":-1.09e+01,""yizu"":-1.20e+01,""pathan"":-1.92e+01,""hazara"":-2.13e+01,""cambodian"":-1.95e+01,""kalash"":-1.75e+01,""yakut"":-1.97e+01,""yri"":-3.17e+01,""hezhen"":-2.30e+00,""mongola"":-1.27e+01,""tuscan"":-9.23e+00,""lwk"":-2.62e+01,""gbr"":-2.86e+01,""chb"":-2.45e+01,""uygur"":-1.88e+01,""esn"":-3.11e+01,""sindhi"":-2.53e+01,""druze"":-3.43e+01,""japanese"":-2.23e+01,""jpt"":-2.94e+01,""khv"":-3.76e+01,""stu"":-2.84e+01,""oroqen"":-5.76e+00}"
"""HG00114""","""gbr""","""nfe""","""XY""",-57.2,-39.4,-51.6,-19.8,-28.9,-42.5,-35.7,-34.0,-39.4,-36.3,-49.7,-38.5,-30.9,-47.2,-45.4,-12.9,-26.2,-24.7,-43.6,-51.9,-41.5,-37.9,-36.1,-43.3,-59.7,-35.4,-34.7,-16.4,-38.4,-45.5,-36.4,-44.5,-33.1,-24.3,-46.6,-51.9,-26.7,-20.5,-25.9,-28.4,-42.7,-48.8,-41.9,-20.5,-35.9,-20.5,-42.4,-37.5,-35.2,-43.2,-39.0,-16.4,-34.3,-41.9,-39.6,-20.3,-36.5,-17.8,-16.4,-21.2,-29.9,-33.8,-21.6,-32.9,-51.6,-12.8,"{""cdx"":-3.69e+01,""gih"":-3.05e+01,""adygei"":-2.66e+01,""msl"":-3.60e+01,""burusho"":-2.66e+01,""palestinian"":-2.96e+01,""han"":-2.26e+01,""ibs"":-2.56e+01,""beb"":-2.97e+01,""french"":-2.51e+01,""basque"":-1.61e+01,""tu"":-4.95e+00,""itu"":-3.27e+01,""dai"":-1.34e+01,""ceu"":-2.57e+01,""makrani"":-1.39e+01,""tsi"":-2.37e+01,""fin"":-2.87e+01,""tujia"":-3.57e+00,""mozabite"":-2.99e+01,""pjl"":-3.04e+01,""gwd"":-4.69e+01,""mandenka"":-7.72e+00,""surui"":-7.48e+00,""pel"":-2.24e+01,""pur"":-2.62e+01,""orcadian"":-2.31e+01,""acb"":-4.44e+01,""bedouin"":-2.29e+01,""she"":-2.15e+01,""chs"":-3.44e+01,""brahui"":-2.12e+01,""naxi"":-7.72e+00,""mxl"":-2.91e+01,""daur"":-1.19e+01,""russian"":-3.57e+00,""balochi"":-7.03e+00,""asw"":-3.88e+01,""clm"":-3.26e+01,""maya"":-1.31e+01,""yoruba"":-2.01e+01,""colombian"":-9.99e-02,""karitiana"":-1.15e+01,""yizu"":-8.80e+00,""pathan"":-2.47e+01,""hazara"":-2.18e+01,""cambodian"":-2.35e+01,""kalash"":-2.03e+01,""yakut"":-2.10e+01,""yri"":-3.88e+01,""hezhen"":-3.57e+00,""mongola"":-1.56e+01,""tuscan"":-8.42e+00,""lwk"":-3.91e+01,""gbr"":-2.33e+01,""chb"":-1.81e+01,""uygur"":-1.71e+01,""esn"":-3.91e+01,""sindhi"":-2.91e+01,""druze"":-3.08e+01,""japanese"":-2.35e+01,""jpt"":-3.17e+01,""khv"":-3.38e+01,""stu"":-2.68e+01,""oroqen"":-7.72e+00}"
"""HG00121""","""gbr""","""nfe""","""XX""",-62.2,-44.0,-53.0,-20.5,-32.8,-47.7,-42.0,-37.2,-41.7,-32.7,-49.9,-42.9,-35.8,-52.4,-50.3,-12.2,-35.5,-24.0,-52.0,-57.6,-43.0,-39.6,-41.3,-46.3,-62.3,-37.2,-31.0,-17.8,-48.2,-43.7,-38.3,-41.5,-40.8,-20.7,-48.7,-56.3,-28.4,-21.9,-27.3,-29.8,-48.2,-56.9,-36.1,-17.8,-37.1,-19.8,-50.1,-38.9,-43.6,-44.7,-43.6,-16.4,-34.5,-40.7,-41.7,-19.3,-42.7,-19.1,-17.8,-23.3,-34.5,-31.9,-27.4,-35.5,-59.7,-12.2,"{""cdx"":-3.78e+01,""gih"":-3.41e+01,""adygei"":-3.18e+01,""msl"":-4.47e+01,""burusho"":-2.95e+01,""palestinian"":-3.79e+01,""han"":-2.50e+01,""ibs"":-3.61e+01,""beb"":-3.55e+01,""french"":-2.74e+01,""basque"":-2.06e+01,""tu"":-6.96e+00,""itu"":-3.16e+01,""dai"":-2.33e+01,""ceu"":-3.07e+01,""makrani"":-1.62e+01,""tsi"":-3.05e+01,""fin"":-3.09e+01,""tujia"":-5.57e+00,""mozabite"":-3.60e+01,""pjl"":-3.25e+01,""gwd"":-5.01e+01,""mandenka"":-9.73e+00,""surui"":-7.12e+00,""pel"":-3.14e+01,""pur"":-3.15e+01,""orcadian"":-2.49e+01,""acb"":-5.00e+01,""bedouin"":-2.98e+01,""she"":-2.23e+01,""chs"":-4.03e+01,""brahui"":-2.51e+01,""naxi"":-5.57e+00,""mxl"":-2.39e+01,""daur"":-1.18e+01,""russian"":-4.19e+00,""balochi"":-8.35e+00,""asw"":-4.08e+01,""clm"":-3.81e+01,""maya"":-1.51e+01,""yoruba"":-2.33e+01,""colombian"":-2.97e-02,""karitiana"":-8.50e+00,""yizu"":-1.52e+01,""pathan"":-2.68e+01,""hazara"":-1.88e+01,""cambodian"":-2.05e+01,""kalash"":-2.86e+01,""yakut"":-1.97e+01,""yri"":-4.75e+01,""hezhen"":-5.57e+00,""mongola"":-1.76e+01,""tuscan"":-1.11e+01,""lwk"":-4.41e+01,""gbr"":-2.91e+01,""chb"":-2.36e+01,""uygur"":-2.24e+01,""esn"":-4.54e+01,""sindhi"":-2.85e+01,""druze"":-3.98e+01,""japanese"":-2.62e+01,""jpt"":-2.93e+01,""khv"":-3.65e+01,""stu"":-2.95e+01,""oroqen"":-7.65e+00}"
"""HG00127""","""gbr""","""nfe""","""XX""",-51.4,-43.5,-46.3,-21.2,-34.8,-49.7,-47.8,-41.2,-40.3,-34.3,-50.1,-44.7,-39.0,-48.5,-49.8,-12.9,-31.5,-24.0,-52.1,-54.9,-39.0,-34.6,-41.2,-49.2,-59.6,-41.0,-31.9,-16.4,-46.4,-49.9,-34.0,-39.1,-39.4,-28.0,-46.6,-52.6,-31.0,-19.8,-29.9,-26.9,-51.1,-54.4,-42.2,-21.9,-35.2,-19.1,-46.0,-37.5,-40.9,-44.4,-44.9,-18.4,-33.2,-39.2,-50.6,-19.4,-49.0,-20.5,-15.7,-22.6,-26.9,-33.9,-25.2,-28.6,-56.2,-12.8,"{""cdx"":-3.73e+01,""gih"":-3.64e+01,""adygei"":-3.07e+01,""msl"":-4.16e+01,""burusho"":-2.75e+01,""palestinian"":-3.31e+01,""han"":-2.82e+01,""ibs"":-3.36e+01,""beb"":-3.69e+01,""french"":-2.18e+01,""basque"":-2.20e+01,""tu"":-7.72e+00,""itu"":-3.71e+01,""dai"":-1.87e+01,""ceu"":-3.19e+01,""makrani"":-1.82e+01,""tsi"":-3.62e+01,""fin"":-2.62e+01,""tujia"":-2.87e+00,""mozabite"":-3.82e+01,""pjl"":-3.16e+01,""gwd"":-4.68e+01,""mandenka"":-7.03e+00,""surui"":-6.61e+00,""pel"":-2.81e+01,""pur"":-3.21e+01,""orcadian"":-2.24e+01,""acb"":-3.86e+01,""bedouin"":-3.49e+01,""she"":-2.04e+01,""chs"":-3.57e+01,""brahui"":-2.84e+01,""naxi"":-9.11e+00,""mxl"":-2.94e+01,""daur"":-1.12e+01,""russian"":-5.64e+00,""balochi"":-8.42e+00,""asw"":-3.35e+01,""clm"":-3.70e+01,""maya"":-1.71e+01,""yoruba"":-1.58e+01,""colombian"":-9.80e-02,""karitiana"":-1.52e+01,""yizu"":-1.24e+01,""pathan"":-2.47e+01,""hazara"":-1.91e+01,""cambodian"":-2.15e+01,""kalash"":-2.66e+01,""yakut"":-2.11e+01,""yri"":-4.33e+01,""hezhen"":-3.56e+00,""mongola"":-1.41e+01,""tuscan"":-9.80e+00,""lwk"":-3.98e+01,""gbr"":-2.84e+01,""chb"":-2.62e+01,""uygur"":-1.41e+01,""esn"":-4.21e+01,""sindhi"":-2.64e+01,""druze"":-3.93e+01,""japanese"":-2.12e+01,""jpt"":-2.63e+01,""khv"":-3.38e+01,""stu"":-3.78e+01,""oroqen"":-6.34e+00}"


In [17]:
def predict_ancestry(posteriors):
    return hl.bind(
        lambda x: hl.sorted(x.items(), key=lambda item: item[1], reverse=True)[0][0],
        posteriors
    )

mt_probs = mt_probs.annotate_cols(predicted_ancestry=predict_ancestry(mt_probs.posteriors))
mt_probs.cols().select('predicted_ancestry', 'pop').show(5)

s,predicted_ancestry,pop
str,str,str
"""HG00107""","""colombian""","""gbr"""
"""HG00114""","""colombian""","""gbr"""
"""HG00121""","""colombian""","""gbr"""
"""HG00127""","""colombian""","""gbr"""
"""HG00132""","""colombian""","""gbr"""


In [None]:
mt_probs.aggregate_cols(predicted_ancestry=predict_ancestry(mt_probs.posteriors))

In [18]:
tp, total = mt_probs.filter_cols(mt_probs.predicted_ancestry == mt_probs.pop).count_cols(), mt_probs.count_cols()

tp / total, tp, total

(0.00510204081632653, 2, 392)

In [19]:
def calculate_error_rate(mt):
    correct_predictions = mt.filter_cols(mt.predicted_ancestry == mt.pop)
    total_samples = mt.count_cols()
    error_rate = correct_predictions.aggregate_cols(
        hl.agg.sum(1 - hl.exp(correct_predictions.posteriors[correct_predictions.predicted_ancestry]))
    ) / total_samples
    return error_rate


calculate_error_rate(mt_probs)

0.0013290751226235143