In [1]:
from pathlib import Path
import subprocess
from datetime import datetime

import dxdata
import dxpy
import hail as hl
import pyspark
import tomli

from utils import fields_for_id

In [2]:
LOG_FILE = (
    Path("../hail_logs", f"GIPR_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
hl.init(sc=sc, default_reference="GRCh38", log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-131-24.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.78-b17627756568
LOGGING: writing to /opt/notebooks/gogoGPCR/notebooks/hail_logs/GIPR_1154.log


In [3]:
dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [4]:
fields = ["22027", "22019", "22021", "21000"]
field_names = [
    fields_for_id(i, participant) for i in fields
]  # fields_for_id("22027") + fields_for_id("22019") + fields_for_id("22006") + fields_for_id("22021")
field_names = ["eid"] + [field.name for fields in field_names for field in fields]

In [5]:
df = participant.retrieve_fields(
    names=field_names, engine=dxdata.connect(), coding_values="replace"
)

# Don't show eid
# df.show(5, truncate=False)

# Use hard filters

df = df.filter(
    (~df.p22027.isNull())
    | (~df.p22019.isNull())
    | (df.p22021 == "Participant excluded from kinship inference process")
    | (df.p22021 == "Ten or more third-degree relatives identified")
    | (df.p21000_i0 == "White and Black Caribbean")
    | (df.p21000_i0 == "White and Black African")
    | (df.p21000_i0 == "White and Asian")
    | (df.p21000_i0 == "Any other mixed background")
)
filtered_samples_to_remove = hl.Table.from_spark(df.select("eid")).key_by("eid")
print(f"Samples to be filtered: {filtered_samples_to_remove.count()}")

Samples to be filtered: 4714


In [6]:
filtered_samples_to_remove.export("/tmp/samples_to_remove.tsv")

2023-01-31 11:55:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-01-31 11:55:31 Hail: INFO: merging 9 files totalling 36.8K...
2023-01-31 11:55:31 Hail: INFO: while writing:
    /tmp/samples_to_remove.tsv
  merge time: 95.368ms


In [9]:
UKB_REL_DAT_FILE = "Bulk/Genotype Results/Genotype calls/ukb_rel.dat"

rel = hl.import_table(
    "file:" + "/mnt/project/" + UKB_REL_DAT_FILE,
    delimiter=" ",
    impute=True,
    types={"ID1": "str", "ID2": "str"},
)

2023-01-30 10:14:01 Hail: INFO: Reading table to impute column types
2023-01-30 10:14:10 Hail: INFO: Finished type imputation
  Loading field 'ID1' as type str (user-supplied type)
  Loading field 'ID2' as type str (user-supplied type)
  Loading field 'HetHet' as type float64 (imputed)
  Loading field 'IBS0' as type float64 (imputed)
  Loading field 'Kinship' as type float64 (imputed)


In [46]:
anc = hl.import_table('/tmp/ancestry.csv', delimiter = ',', quote = '"')
anc = anc.aggregate(hl.agg.filter(anc.group == "United Kingdom", hl.agg.collect_as_set(anc['PC_UKBB.eid'])))
n_uk = len(anc)
print(n_uk)

2023-01-30 10:29:42 Hail: INFO: Reading table without type imputation
  Loading field '' as type str (not specified)
  Loading field 'PC_UKBB.eid' as type str (not specified)
  Loading field 'group' as type str (not specified)


446606


In [56]:
n_rel = rel.filter(hl.set(anc).contains(rel.ID1) & hl.set(anc).contains(rel.ID2) & (rel.Kinship > 0.044)).count()
n_2nd = rel.filter(hl.set(anc).contains(rel.ID1) & hl.set(anc).contains(rel.ID2) & (rel.Kinship > 0.0884)).count()
print(f"Proportion of related: {100 * n_rel / n_uk}")
print(f"Number of samples with 2nd degree relatives {n_2nd}")

Proportion of related: 22.922441704768858
Number of samples with 2nd degree relatives 37936


In [6]:
#####
# We opt for not filtering relatedness based on Regnie stated performance. See regenie supplementary tables 10-14

# rel = (
#     rel.key_by("ID2")
#     .anti_join(filtered_samples_to_remove)
#     .key_by("ID1")
#     .anti_join(filtered_samples_to_remove)
# )

# rel = rel.filter(rel.Kinship > 0.125, keep=True)

# print(
#     f"Related samples not already in filter and low kinship coefficient: {rel.count()}"
# )

2022-08-20 22:50:05 Hail: INFO: Reading table to impute column types
2022-08-20 22:50:09 Hail: INFO: Finished type imputation
  Loading field 'ID1' as type str (user-supplied type)
  Loading field 'ID2' as type str (user-supplied type)
  Loading field 'HetHet' as type float64 (imputed)
  Loading field 'IBS0' as type float64 (imputed)
  Loading field 'Kinship' as type float64 (imputed)
2022-08-20 22:50:12 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:15 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:16 Hail: INFO: Ordering unsorted dataset with network shuffle


Related samples not already in filter and low kinship coefficient: 33869


In [7]:
# Carriers of variants of interest

# samples = hl.import_table("/opt/notebooks/gogoGPCR_private/data/GIPR/interesting_samples.tsv", impute = True, types={"s": "str"}).key_by("s")

2022-08-20 22:50:18 Hail: INFO: Reading table to impute column types
2022-08-20 22:50:19 Hail: INFO: Finished type imputation
  Loading field 's' as type str (user-supplied type)
  Loading field 'is_interesting_sample' as type bool (imputed)


In [8]:
# Define structures for interesting variants

# rel_interesting = rel.key_by(
#     i=hl.struct(id=rel.ID1,
#                 is_int=samples[rel.ID1].is_interesting_sample,
#                 is_def=hl.is_defined(samples[rel.ID1].is_interesting_sample)),
#     j=hl.struct(id=rel.ID2,
#                 is_int=samples[rel.ID2].is_interesting_sample,
#                 is_def=hl.is_defined(samples[rel.ID2].is_interesting_sample)))

In [9]:
# Define a tie breaker preferring carriers of interesting variants

# def tie_breaker(l, r):
#     is_def_and_int = (hl.case()
#               .when(l.is_def & ~r.is_def, -1)
#               .when(~l.is_def & r.is_def, 1)
#               .when(~l.is_def & ~r.is_def, 0)
#               .when(l.is_def & r.is_def,
#                    (hl.case()
#                     .when(l.is_int & ~r.is_int, -1)
#                     .when(~l.is_int & r.is_int, 1)
#                     .when(~l.is_int & ~r.is_int, 0)
#                     .when(l.is_int & r.is_int, 0).default(-9)
#                )).default(-9)
#              )
#     return is_def_and_int

                 

In [10]:
# Create an independent set by removing related nodes in the graph

# related_samples_to_remove = (
#     hl.maximal_independent_set(
#         i=rel_interesting.i,
#         j=rel_interesting.j,
#         keep=False,
#         tie_breaker=tie_breaker
#     )

# print(
#     f"Samples to remove to create independent set: {related_samples_to_remove.count()}"
# )

2022-08-20 22:50:20 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:21 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:22 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:23 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:23 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:24 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:32 Hail: INFO: Coerced sorted dataset
2022-08-20 22:50:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:36 Hail: INFO: wrote table with 33869 rows in 1 partition to /tmp/c8GwwJq7o8Rb5Spb7PfwsW
2022-08-20 22:50:38 Hail: INFO: Ordering unsorted dataset with network shuffle


Samples to remove to create independent set: 29877


In [11]:
# related_samples_to_remove = related_samples_to_remove.key_by(eid = related_samples_to_remove.node.id).drop("node")

In [12]:
# Before removal

# samples.aggregate(hl.agg.sum(samples.is_interesting_sample))

166259

In [13]:
# result = samples.filter(hl.is_defined(related_samples_to_remove[samples.s]), keep=False)

In [14]:
# With tie break

# result.aggregate(hl.agg.sum(result.is_interesting_sample))

2022-08-20 22:50:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:40 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:40 Hail: INFO: Coerced sorted dataset


159590

In [15]:
# We opt for not removing related individuals

# final = related_samples_to_remove.join(filtered_samples_to_remove, how="outer")
# print(f"Final number of samples to remove: {final.count()}")

2022-08-20 22:50:43 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'eid' -> 'eid_1'
2022-08-20 22:50:43 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:43 Hail: INFO: Coerced sorted dataset
2022-08-20 22:50:44 Hail: INFO: Ordering unsorted dataset with network shuffle


Final number of samples to remove: 31692


In [16]:
# result = samples.filter(hl.is_defined(final[samples.s]), keep=False)

In [17]:
# result.aggregate(hl.agg.sum(result.is_interesting_sample))

2022-08-20 22:50:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:46 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:46 Hail: INFO: Coerced sorted dataset
2022-08-20 22:50:47 Hail: INFO: Ordering unsorted dataset with network shuffle


158989

In [18]:
# final.export("/tmp/samples_to_remove2.tsv")

2022-08-20 22:50:54 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:54 Hail: INFO: Coerced sorted dataset
2022-08-20 22:50:55 Hail: INFO: Ordering unsorted dataset with network shuffle
2022-08-20 22:50:57 Hail: INFO: merging 49 files totalling 247.6K...
2022-08-20 22:50:57 Hail: INFO: while writing:
    /tmp/samples_to_remove2.tsv
  merge time: 301.608ms
