In [1]:
import random
import hail as hl
import pandas as pd

### Sample annotations

In [2]:
# Copy the HGDP + 1kG sample-info TSV from the public gnomAD release bucket into the current working directory.
! gsutil cp gs://gcp-public-data--gnomad/release/3.1/secondary_analyses/hgdp_1kg/data_intersection/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv .


Copying gs://gcp-public-data--gnomad/release/3.1/secondary_analyses/hgdp_1kg/data_intersection/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv...
- [1 files][ 56.2 KiB/ 56.2 KiB]                                                
Operation completed over 1 objects/56.2 KiB.                                     


In [3]:
# Load the downloaded sample metadata (tab-separated; the first row holds the column names).
sample_info_tsv = 'hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv'
sample_annotations = pd.read_csv(sample_info_tsv, sep='\t', header=0)

In [4]:
# Get sample IDs and randomly (90/10) split them into a truth set and an unknown set.
random.seed(1234)  # fixed seed so the same samples are drawn on every run
samples = list(sample_annotations['Sample'].values)
N = len(samples)

# The length/disjointness checks below are only meaningful if every ID occurs once.
assert len(set(samples)) == N, 'duplicate sample IDs in sample_annotations'

# Draw 90% of the positions without replacement; the remainder becomes "unknown".
truth_sample_ridx = random.sample(range(N), int(N * 0.9))
truth_samples = [samples[i] for i in truth_sample_ridx]

# Keep the unknown samples in file order. Building this list from a set
# difference made its order depend on string hashing, which changes between
# kernel sessions even with a fixed random seed.
truth_sample_lookup = set(truth_samples)
unknown_samples = [s for s in samples if s not in truth_sample_lookup]

assert (len(truth_samples + unknown_samples)) == len(samples)
assert len(list(set(truth_samples) & set(unknown_samples))) == 0

print(f'Total samples: {N}\nTruth samples: {len(truth_samples)}\nUnknown samples:  {len(unknown_samples)}')

Total samples: 3380
Truth samples: 3042
Unknown samples:  338


In [5]:
# Write one tab-separated (Sample, SuperPop) label file per side of the split.
label_columns = ['Sample', 'SuperPop']

# truth
is_truth_row = sample_annotations['Sample'].isin(truth_samples)
truth_annotations = sample_annotations.loc[is_truth_row, label_columns]
truth_annotations.to_csv('hgdp_1kg_truth_labels.txt', sep='\t', index=False)

# unknown
is_unknown_row = sample_annotations['Sample'].isin(unknown_samples)
unknown_annotations = sample_annotations.loc[is_unknown_row, label_columns]
unknown_annotations.to_csv('hgdp_1kg_unknown_labels.txt', sep='\t', index=False)

### Load the genotype data to be split into truth (gold standard) and unknown sets

In [6]:
# Open the genotype matrix table from its on-disk location.
source_mt_path = 'unrelateds_without_outliers.mt'
mt = hl.read_matrix_table(source_mt_path)

Initializing Hail with default parameters...


23/03/20 20:23:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/20 20:23:07 WARN Hail: This Hail JAR was compiled for Spark 3.3.0, running with Spark 3.3.2.
  Compatibility is not guaranteed.


Running on Apache Spark version 3.3.2
SparkUI available at http://10.0.0.207:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.112-31ceff2fb5fd
LOGGING: writing to /Volumes/ExternalDrive/SPCAncestry/data/hgdp_1kg/hail-20230320-2023-0.2.112-31ceff2fb5fd.log


In [7]:
# Dimensions of the matrix table as (number of rows, number of columns).
mt.count()

(199974, 3378)

In [8]:
# data has very few variants but lots of partitions
# (repartitioning was tried and left disabled: mt_rep = mt.repartition(2))
#
# Persist the matrix table once and continue from the on-disk copy.
# checkpoint() will not write to an existing path unless overwrite=True, so a
# bare checkpoint call fails on every re-run of this cell. Hail drops a
# `_SUCCESS` marker into the output directory when a write completes, so only
# write when that marker is missing.
# NOTE: delete the checkpoint directory by hand if the upstream data changes;
# an existing checkpoint is reused as-is.
checkpoint_path = 'hgdp_1kg_checkpoint.mt'
if not hl.hadoop_exists(f'{checkpoint_path}/_SUCCESS'):
    # overwrite=True clears any partial output left by an interrupted write.
    mt.write(checkpoint_path, overwrite=True)
mt = hl.read_matrix_table(checkpoint_path)



In [8]:
# Continue from the checkpointed copy of the matrix table.
checkpointed_mt_path = 'hgdp_1kg_checkpoint.mt'
mt = hl.read_matrix_table(checkpointed_mt_path)

### Apply the 90/10 split: the 90% (truth) set is used for training, and population (POP) labels are then inferred for the remaining 10% (unknown) set

In [9]:
# Split the matrix table by sample (column key `s`) into the truth and unknown
# subsets and export each one as a PLINK fileset.
#
# Membership is tested against typed Hail *set* literals rather than array
# literals: contains() on an array scans every element for every column,
# whereas a set supports a proper membership lookup. The explicit dtype also
# keeps the literal valid if one of the lists happens to be empty.
sample_id_set_type = hl.tset(hl.tstr)
truth_sample_ids = hl.literal(set(truth_samples), dtype=sample_id_set_type)
unknown_sample_ids = hl.literal(set(unknown_samples), dtype=sample_id_set_type)

# truth
truth_mt = mt.filter_cols(truth_sample_ids.contains(mt['s']), keep=True)

# unknown
unknown_mt = mt.filter_cols(unknown_sample_ids.contains(mt['s']), keep=True)

# NOTE: the annotations file has more samples than the matrix table, so
# truth_mt.count_cols() / unknown_mt.count_cols() are not expected to equal
# len(truth_samples) / len(unknown_samples); no such assertion is made here.

hl.export_plink(truth_mt, 'hgdp_1kg_truth', ind_id=truth_mt.s)
hl.export_plink(unknown_mt, 'hgdp_1kg_unknown', ind_id=unknown_mt.s)

