In [1]:
import pyspark
import dxpy
import dxdata
import toml

from distutils.version import LooseVersion
from subprocess import run
from pathlib import Path

import hail as hl

In [2]:
# Reuse the SparkContext that is already running when this cell is re-executed;
# constructing a second SparkContext in the same kernel raises a ValueError.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

In [3]:
# Read the project configuration; the path is relative to the working directory.
with open("../config.toml") as config_file:
    conf = toml.load(config_file)

# Absolute path of this notebook's Hail log file, as a plain string.
LOG_FILE = str(Path(conf["IMPORT"]["LOG_DIR"], "sample_filters.log").resolve())

In [4]:
# Start Hail on the existing SparkContext and write its log to LOG_FILE.
# (The lowercase name `log_file` is never assigned in this notebook, so using it
# here fails with NameError on a fresh kernel.)
hl.init(sc=sc, default_reference='GRCh38', log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar
Running on Apache Spark version 2.4.4
SparkUI available at http://ip-10-60-12-7.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /opt/notebooks/gogoGPCR/hail_logs/sample_filters.log


In [5]:
# Find the single database object in folder "/" whose name matches the glob
# "app*", and keep its name from the returned description.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)["describe"]["name"]

# Find the single object of type "Dataset" in folder "/" whose name matches
# "app*.dataset", and keep its id.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)["id"]

In [6]:
# Load the dataset identified by the id looked up with dxpy.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

In [7]:
# The "participant" entry of the dataset; field lookup (find_fields) and data
# retrieval (retrieve_fields) in this notebook go through this object.
participant = dataset["participant"]

In [8]:
def fields_for_id(field_id):
    """Return the `participant` fields belonging to one field id, sorted by name.

    A field matches when its name is ``p<field_id>``, optionally followed by
    ``_i<digits>`` and/or ``_a<digits>``.

    Args:
        field_id: Field id; converted with ``str()`` before being placed in
            the name pattern.

    Returns:
        A list of the matching field objects, ordered by ``LooseVersion`` of
        their ``name`` so that numeric parts compare as numbers.
    """
    # NOTE(review): distutils is deprecated in recent Python releases; a
    # replacement sort key will be needed if the kernel is upgraded.
    name_pattern = r'^p{}(_i\d+)?(_a\d+)?$'.format(str(field_id))
    matches = list(participant.find_fields(name_regex=name_pattern))
    matches.sort(key=lambda match: LooseVersion(match.name))
    return matches

In [9]:
# Field ids whose columns are retrieved for sample filtering.
fields = ["22027", "22019", "22006", "22021"]

# "eid" first, then every matching field name for each id, in the order the ids
# are listed. The loop variables are named so they shadow neither the builtin
# `id` nor the `fields` list above.
field_names = ["eid"] + [
    field.name
    for field_id in fields
    for field in fields_for_id(field_id)
]

In [10]:
# Retrieve the selected columns through a dxdata connection.
# NOTE(review): coding_values="replace" appears to return the decoded labels
# (e.g. "Caucasian") rather than raw codes — the filters on df compare against
# those label strings.
df = participant.retrieve_fields(names=field_names, engine=dxdata.connect(), coding_values="replace")

In [11]:
# Preview five rows with full (untruncated) column values.
df.show(5, truncate=False)

+-------+------+------+---------+--------------------------------+
|eid    |p22027|p22019|p22006   |p22021                          |
+-------+------+------+---------+--------------------------------+
|3888244|null  |null  |Caucasian|No kinship found                |
|1795659|null  |null  |Caucasian|No kinship found                |
|2084720|null  |null  |Caucasian|At least one relative identified|
|3742232|null  |null  |Caucasian|At least one relative identified|
|1094442|null  |null  |Caucasian|At least one relative identified|
+-------+------+------+---------+--------------------------------+
only showing top 5 rows



In [12]:
# Keep a participant only when all of the following hold:
#   - p22006 equals "Caucasian"
#   - p22027 and p22019 are both null
#   - p22021 is neither of the two excluded kinship labels
# NOTE(review): under SQL null semantics a null p22006 or p22021 does not pass
# these comparisons, so such rows are dropped as well — confirm that is intended.
df = df.where(
    (df["p22006"] == "Caucasian")
    & df["p22027"].isNull()
    & df["p22019"].isNull()
    & (df["p22021"] != "Participant excluded from kinship inference process")
    & (df["p22021"] != "Ten or more third-degree relatives identified")
)


In [13]:
# Row count of df at this point in the notebook.
df.count()

408135

In [15]:
# Local destination of the keep-list; uploaded to the project in the next cell.
samp_path = "/opt/notebooks/gogoGPCR/data/samples_to_keep.tsv"

# Only the participant id column is written out.
df = df.select("eid")

# Collect to pandas on the driver and write a tab-separated file without the index.
df.toPandas().to_csv(path_or_buf=samp_path, sep="\t", index=False)

In [16]:
# Upload the keep-list with the dx CLI; check=True raises CalledProcessError
# if the command exits with a non-zero status.
run(
    ["dx", "upload", samp_path, "--path", "/data/processed/"],
    shell=False,
    check=True,
)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/data/samples_to_keep.tsv', '--path', '/data/processed/'], returncode=0)

In [18]:
# Relatedness file location (as a file:// URI) and kinship cutoff, both from the config.
rel_file = "file://" + conf["SAMPLE_QC"]["REL_FILE"]
max_kinship = conf["SAMPLE_QC"]["MAX_KINSHIP"]

# Space-delimited table; the two ID columns are forced to strings and the
# remaining column types are imputed.
rel = hl.import_table(
    rel_file,
    delimiter=" ",
    impute=True,
    types={"ID1": "str", "ID2": "str"},
)

# Keep only the pairs whose kinship is above the cutoff.
rel = rel.filter(rel.Kinship > max_kinship)

# With keep=False the result holds the nodes to remove rather than the
# independent set itself; the "node" field is renamed to "eid" and used as key.
related_samples_to_remove = hl.maximal_independent_set(rel.ID1, rel.ID2, keep=False)
related_samples_to_remove = related_samples_to_remove.rename({"node": "eid"}).key_by("eid")

2021-09-14 12:26:20 Hail: INFO: Reading table to impute column types
2021-09-14 12:26:22 Hail: INFO: Finished type imputation
  Loading field 'ID1' as type str (user-supplied type)
  Loading field 'ID2' as type str (user-supplied type)
  Loading field 'HetHet' as type float64 (imputed)
  Loading field 'IBS0' as type float64 (imputed)
  Loading field 'Kinship' as type float64 (imputed)
2021-09-14 12:26:23 Hail: INFO: wrote table with 12597 rows in 1 partition to /tmp/emVS1y8XvzSZJjUBhDfiB2
    Total size: 195.46 KiB
    * Rows: 195.45 KiB
    * Globals: 11.00 B
    * Smallest partition: 12597 rows (195.45 KiB)
    * Largest partition:  12597 rows (195.45 KiB)


In [19]:
# Number of rows in the table of samples selected for removal.
related_samples_to_remove.count()

2021-09-14 12:26:26 Hail: INFO: Ordering unsorted dataset with network shuffle


11827

In [21]:
# Local destination of the removal list; uploaded to the project in the next cell.
kin_path = "/opt/notebooks/gogoGPCR/data/samples_to_remove.tsv"

# Hail is given the path as a file:// URI.
related_samples_to_remove.export(f"file://{kin_path}")

2021-09-14 12:26:47 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-09-14 12:26:48 Hail: INFO: merging 1 files totalling 92.4K...
2021-09-14 12:26:48 Hail: INFO: while writing:
    file:///opt/notebooks/gogoGPCR/data/samples_to_remove.tsv
  merge time: 10.528ms


In [22]:
# Upload the removal list with the dx CLI; check=True raises CalledProcessError
# if the command exits with a non-zero status.
run(
    ["dx", "upload", kin_path, "--path", "/data/processed/"],
    shell=False,
    check=True,
)

CompletedProcess(args=['dx', 'upload', '/opt/notebooks/gogoGPCR/data/samples_to_remove.tsv', '--path', '/data/processed/'], returncode=0)