In [1]:
!pip install toml



In [2]:
import pyspark
import dxpy
import dxdata
import toml

from distutils.version import LooseVersion
from subprocess import run
from pathlib import Path

import hail as hl

In [3]:
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)

In [181]:
with open("../config.toml") as f:
    conf = toml.load(f)

RAW_REL_FILE = conf["SAMPLE_QC"]["UKB_REL_DAT_FILE"]
FINAL_FILTER_FILE = conf["SAMPLE_QC"]["SAMPLE_FILTER_FILE"]

MAX_KINSHIP = conf["SAMPLE_QC"]["MAX_KINSHIP"]

LOG_FILE = Path(conf["IMPORT"]["LOG_DIR"], f"sample_filters.log").resolve().__str__()
TMP_DIR = Path(conf["EXPORT"]["TMP_DIR"])
DATA_DIR = Path(conf["SAMPLE_QC"]["DATA_DIR"])


In [5]:
hl.init(sc=sc, default_reference='GRCh38', log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-0-47.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/sample_filters.log


In [6]:
dispensed_database_name = dxpy.find_one_data_object(classname="database", name="app*", folder="/", name_mode="glob", describe=True)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(typename="Dataset", name="app*.dataset", folder="/", name_mode="glob")["id"]

In [7]:
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [8]:
participant = dataset["participant"]

In [9]:
def fields_for_id(field_id):

    field_id = str(field_id)
    fields = participant.find_fields(name_regex=r'^p{}(_i\d+)?(_a\d+)?$'.format(field_id))
    
    return sorted(fields, key=lambda f: LooseVersion(f.name))

In [73]:
fields = ["22027", "22019", "22006", "22021"]
field_names = [fields_for_id(id) for id in fields] #fields_for_id("22027") + fields_for_id("22019") + fields_for_id("22006") + fields_for_id("22021")
field_names = ["eid"] + [field.name for fields in field_names for field in fields]

In [162]:
df = participant.retrieve_fields(names=field_names, engine=dxdata.connect(), coding_values="replace")

In [163]:
df.show(5, truncate=False)

+-------+------+------+---------+--------------------------------+
|eid    |p22027|p22019|p22006   |p22021                          |
+-------+------+------+---------+--------------------------------+
|3888244|null  |null  |Caucasian|No kinship found                |
|1795659|null  |null  |Caucasian|No kinship found                |
|2084720|null  |null  |Caucasian|At least one relative identified|
|3742232|null  |null  |Caucasian|At least one relative identified|
|1094442|null  |null  |Caucasian|At least one relative identified|
+-------+------+------+---------+--------------------------------+
only showing top 5 rows



In [164]:
df = df.filter(df.p22006.isNull() |
          (~df.p22027.isNull()) | 
          (~df.p22019.isNull()) |
          (df.p22021 == "Participant excluded from kinship inference process") | 
          (df.p22021 == "Ten or more third-degree relatives identified")
              )
filtered_samples_to_remove = hl.Table.from_spark(df.select("eid")).key_by("eid")
print(f"Samples to be filtered: {filtered_samples_to_remove.count()}")

Samples to be filtered: 94317


In [167]:
rel = hl.import_table(
        "file:" + "/mnt/project/" + RAW_REL_FILE,
        delimiter=" ",
        impute=True,
        types={"ID1": "str", "ID2": "str"},
    )

rel = rel.key_by("ID2").anti_join(filtered_samples_to_remove).key_by("ID1").anti_join(filtered_samples_to_remove)

rel = rel.filter(rel.Kinship > MAX_KINSHIP, keep=True)

print(f"Related samples not already in filter and low kinship coefficient: {rel.count()}")

2021-11-08 10:45:23 Hail: INFO: Reading table to impute column types
2021-11-08 10:45:24 Hail: INFO: Finished type imputation
  Loading field 'ID1' as type str (user-supplied type)
  Loading field 'ID2' as type str (user-supplied type)
  Loading field 'HetHet' as type float64 (imputed)
  Loading field 'IBS0' as type float64 (imputed)
  Loading field 'Kinship' as type float64 (imputed)
2021-11-08 10:45:24 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:45:25 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:45:26 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:45:26 Hail: INFO: Ordering unsorted dataset with network shuffle


Related samples not already in filter and low kinship coefficient: 34111


In [168]:
#Find maximal independent set

related_samples_to_remove = hl.maximal_independent_set(
    i=rel.ID1,
    j=rel.ID2,
    keep=False,
).rename({"node": "eid"}).key_by("eid")

print(f"Samples to remove to create independent set: {related_samples_to_remove.count()}")

2021-11-08 10:46:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:46:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:46:12 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:46:12 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:46:14 Hail: INFO: wrote table with 34111 rows in 1 partition to /tmp/BR3e9VXPApCLzmLnkXrL7m
    Total size: 497.10 KiB
    * Rows: 497.09 KiB
    * Globals: 11.00 B
    * Smallest partition: 34111 rows (497.09 KiB)
    * Largest partition:  34111 rows (497.09 KiB)
2021-11-08 10:46:15 Hail: INFO: Ordering unsorted dataset with network shuffle


Samples to remove to create independent set: 29667


In [169]:
final = related_samples_to_remove.join(filtered_samples_to_remove, how="outer" )
print(f"Final number of samples to remove: {final.count()}")

2021-11-08 10:58:58 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:
    'eid' -> 'eid_1'
2021-11-08 10:58:58 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 10:58:58 Hail: INFO: Ordering unsorted dataset with network shuffle


Final number of samples to remove: 123984


In [216]:
FILTER_PATH = (TMP_DIR / FINAL_FILTER_FILE).resolve().__str__()
PROCESSED_DIR = (DATA_DIR.parents[0].stem / Path(DATA_DIR.stem)).__str__()  + "/"

related_samples_to_remove.export("file:" + FILTER_PATH)

2021-11-08 11:14:41 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-11-08 11:14:41 Hail: INFO: merging 1 files totalling 231.8K...
2021-11-08 11:14:41 Hail: INFO: while writing:
    file:/opt/notebooks/gogoGPCR/tmp/samples_to_remove.tsv
  merge time: 10.514ms


In [223]:
run(["dx", "upload", FILTER_PATH, "--path", PROCESSED_DIR], check = True, shell = False)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/tmp/samples_to_remove.tsv', '--path', 'data/processed/'], returncode=0)