In [1]:
from pathlib import Path
from datetime import datetime
import subprocess

import dxdata
import dxpy
import hail as hl
import pyspark
from pyspark.sql.functions import when, col


from packaging import version

In [2]:
# Initialise Spark.
# getOrCreate() returns the already-running SparkContext when one exists, so
# this cell can be re-run in a live kernel; a bare SparkContext() raises
# ValueError ("Cannot run multiple SparkContexts at once") on a second run.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/cluster/dnax/jars/dnanexus-api-0.1.0-SNAPSHOT-jar-with-dependencies.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/cluster/spark/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2023-09-13 08:36:00.385 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2023-09-13 08:36:01.462 WARN  Utils:69 - Service 'org.apache.spark.network.netty.NettyBlockTransferService' could not bind on port 43000. Attempting port 43001.
2023-09-13 08:36:01.720 WARN  MetricsReporter:84 - No metrics configured for reporting
2023-09-13 08:36:01.722 WARN  LineProtoUsageReporter:48 - Telegraf configurations: url [metrics.push.telegraf.hostport], user [metrics.push.telegraf.user] or password [metrics.push.telegraf.password] missing.
2023-09-13 08:36:01.722 WARN  MetricsReporter:117 - metrics.scraping.httpserver.port


In [3]:
# Project-wide constants
DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"
PROJ_NAME = "DRD2"

# Absolute path of the Hail log for this session. The file name is the project
# name followed by the current wall-clock time formatted as HHMM.
LOG_FILE = str(
    (
        Path("../hail_logs")
        / f"{PROJ_NAME}_{datetime.now().strftime('%H%M')}.log"
    ).resolve()
)


In [4]:
# Initialise Hail on the existing SparkContext `sc`, with REFERENCE_GENOME as
# the default reference genome and the Hail log written to LOG_FILE.
hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jar

log4j: Parsing for [root] with value=[INFO, logfile].
log4j: Level token is [INFO].
log4j: Category root set to INFO
log4j: Parsing appender named "logfile".
log4j: Parsing layout options for "logfile".
log4j: Setting property [conversionPattern] to [%d{yyyy-MM-dd HH:mm:ss.SSS} %c{1}: %p: %m%n].
log4j: End of parsing for "logfile".
log4j: Setting property [append] to [false].
log4j: Setting property [threshold] to [INFO].
log4j: Setting property [file] to [/opt/notebooks/gogoGPCR2/hail_logs/DRD2_0836.log].
log4j: setFile called: /opt/notebooks/gogoGPCR2/hail_logs/DRD2_0836.log, false
log4j: setFile ended
log4j: Parsed "logfile" options.
log4j: Parsing for [Hail] with value=[INFO, HailSocketAppender].
log4j: Level token is [INFO].
log4j: Category Hail set to INFO
log4j: Parsing appender named "HailSocketAppender".
log4j: Parsed "HailSocketAppender" options.
log4j: Handling log4j.additivity.Hail=[null]
log4j: Finished configuring.


Running on Apache Spark version 3.2.3
SparkUI available at http://ip-10-60-22-64.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.116-cd64e0876c94
LOGGING: writing to /opt/notebooks/gogoGPCR2/hail_logs/DRD2_0836.log


In [5]:
# Look up the dispensed database and dataset in the project root folder ("/").
# Both lookups use glob patterns that match object names starting with "app".
_database_hit = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)
dispensed_database_name = _database_hit["describe"]["name"]

_dataset_hit = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)
dispensed_dataset_id = _dataset_hit["id"]

# Load the dataset by ID and keep a handle on its "participant" entity, which
# the cells below query for phenotype fields.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [6]:
def fields_for_id(field_id, participant):
    """Find every field of ``participant`` that belongs to one field ID.

    A field matches when its name is ``p<field_id>``, optionally followed by
    an ``_i<digits>`` suffix and/or an ``_a<digits>`` suffix (for example
    ``p21000``, ``p21000_i0`` or ``p21000_i0_a1``).

    Parameters
    ----------
    field_id : int or str
        Field ID; it is converted with ``str()`` before being placed in the
        name pattern.
    participant : object
        Entity exposing ``find_fields(name_regex=...)``, such as the
        ``participant`` entity of the loaded dataset.

    Returns
    -------
    The value returned by ``participant.find_fields`` for the name pattern.
    The fields are returned in the order ``find_fields`` yields them; no
    sorting is applied.
    """
    field_id = str(field_id)
    return participant.find_fields(
        name_regex=rf"^p{field_id}(_i\d+)?(_a\d+)?$"
    )

# Field IDs needed by the hard sample filters.
fields = ["22027", "22019", "22021", "21000"]

# "eid" followed by the name of every matching field, flattened across all
# requested IDs in the order given above.
field_names = ["eid"] + [
    field.name
    for field_id in fields
    for field in fields_for_id(field_id, participant)
]

In [11]:
# Hard sample filters.
# A participant is flagged for removal when ANY of the following holds:
#   * p22027 or p22019 is non-null (presumably the UK Biobank heterozygosity /
#     missingness outlier and sex-chromosome aneuploidy flags; confirm against
#     the data dictionary),
#   * p22021 reports exclusion from kinship inference or ten or more
#     third-degree relatives,
#   * p21000_i0 is one of the listed mixed-background categories.
df = participant.retrieve_fields(
    names=field_names, engine=dxdata.connect(), coding_values="replace"
)

df = df.filter(
    df.p22027.isNotNull()
    | df.p22019.isNotNull()
    | df.p22021.isin(
        [
            "Participant excluded from kinship inference process",
            "Ten or more third-degree relatives identified",
        ]
    )
    | df.p21000_i0.isin(
        [
            "White and Black Caribbean",
            "White and Black African",
            "White and Asian",
            "Any other mixed background",
        ]
    )
)

# Keep only the sample IDs, as a Hail table keyed by eid.
filtered_samples_to_remove = hl.Table.from_spark(df.select("eid")).key_by("eid")

print(f"Samples to be filtered: {filtered_samples_to_remove.count()}")



Samples to be filtered: 4713


                                                                                

In [8]:
# Ancestry filter based on a precomputed ancestry assignment file
# (original note credits "Privet et al."; presumably Privé et al., confirm
# the citation).
ANCESTRY_FILE = 'file:///mnt/project/gogoGPCR2/ancestry.csv'

# Read the comma-separated file and key it by sample ID, exposing the
# 'PC_UKBB.eid' column under the key name `eid`.
anc = hl.import_table(
    ANCESTRY_FILE,
    delimiter=',',
    quote='"',
)
anc = anc.key_by(eid=anc['PC_UKBB.eid'])

# Every sample whose assigned group is not "United Kingdom" is removed.
ancestry_to_remove = anc.filter(
    anc.group != "United Kingdom"
)
print(f"Ancestry to remove: {ancestry_to_remove.count()}")

2023-09-13 08:36:39.239 Hail: INFO: Reading table without type imputation       
  Loading field '' as type str (not specified)
  Loading field 'PC_UKBB.eid' as type str (not specified)
  Loading field 'group' as type str (not specified)
[Stage 2:>                                                          (0 + 1) / 1]

Ancestry to remove: 30171


                                                                                

In [9]:
# Withdrawn samples to remove.
WITHDRAWN_FILE = "file:///mnt/project/gogoGPCR2/withdrawn_2023-04-25.csv"

# The file has no header row, so Hail names its single column "f0"; that
# column holds the sample IDs and becomes the table key.
_withdrawn_raw = hl.import_table(WITHDRAWN_FILE, delimiter=",", no_header=True)
withdrawn_to_remove = _withdrawn_raw.key_by("f0")
print(f"Withdrawn samples to remove: {withdrawn_to_remove.count()}")

2023-09-13 08:42:50.839 Hail: INFO: Reading table without type imputation       
  Loading field 'f0' as type str (not specified)


Withdrawn samples to remove: 148


In [12]:
# Remove related individuals
RAW_REL_FILE = Path("/mnt/project/Bulk/Genotype Results/Genotype calls/ukb_rel.dat")
MAX_KINSHIP = 0.176  # 2nd degree relatives

# Pairwise relatedness table: one row per pair (ID1, ID2). The ID columns are
# forced to str; the remaining columns are type-imputed.
rel = hl.import_table(
    f"file://{RAW_REL_FILE}",
    delimiter=" ",
    impute=True,
    types={"ID1": "str", "ID2": "str"},
)


def _already_excluded(sample_id):
    """Return a boolean expression: is `sample_id` in any exclusion table?"""
    return (
        hl.is_defined(filtered_samples_to_remove[sample_id])
        | hl.is_defined(ancestry_to_remove[sample_id])
        | hl.is_defined(withdrawn_to_remove[sample_id])
    )


# Drop pairs in which either member is already scheduled for removal, so they
# do not influence the independent-set pruning below.
rel = rel.filter(
    _already_excluded(rel.ID1) | _already_excluded(rel.ID2),
    keep=False,
)

# Of the remaining pairs, keep only those above the kinship threshold.
rel = rel.filter(rel.Kinship > MAX_KINSHIP)

print(
    f"Related samples not already in filter and high kinship coefficient: {rel.count()}"
)

# Flag affective-disorder cases (field 20126) for use in the relatedness tie
# breaker. TODO: remove this step.
# NOTE: this reassigns the notebook-level `fields` and `field_names`, so the
# hard-filter cell must not be re-run after this one without rebuilding them.
fields = ["20126"]

field_names = ["eid"] + [
    field.name
    for field_id in fields
    for field in fields_for_id(field_id, participant)
]

df2 = participant.retrieve_fields(
    names=field_names, engine=dxdata.connect(), coding_values="replace"
)

# is_affective is True only when p20126_i0 is present and is not the
# "No Bipolar or Depression" category; a null value yields False.
_p20126 = col("p20126_i0")
df2 = df2.withColumn(
    "is_affective",
    _p20126.isNotNull() & (_p20126 != "No Bipolar or Depression"),
)

# Hail table keyed by eid for lookups from the relatedness table.
int_samples = hl.Table.from_spark(df2).key_by("eid")

print(f"Affective cases: {int_samples.filter(int_samples.is_affective).count()}")

def _pair_member(sample_id):
    """Build the node struct for one member of a related pair.

    The struct carries the sample ID and its is_affective flag; samples that
    are absent from int_samples are treated as not affective (False).
    """
    return hl.struct(
        id=sample_id,
        is_affective=hl.or_else(int_samples[sample_id].is_affective, False),
    )


# Re-key each related pair by the two node structs (i for ID1, j for ID2) so
# that the tie breaker can read is_affective on either node.
rel_interesting = rel.key_by(
    i=_pair_member(rel.ID1),
    j=_pair_member(rel.ID2),
)

def tie_breaker(l, r):
    """Order two nodes by affective status for hl.maximal_independent_set.

    Returns -1 when only `l` is affective, 1 when only `r` is affective, and 0
    when both or neither are. Per the Hail documentation a negative value
    orders `l` before `r` and the larger node is removed first, so this
    favours keeping affective cases (confirm against the Hail docs).
    """
    only_left_affective = l.is_affective & ~r.is_affective
    only_right_affective = ~l.is_affective & r.is_affective
    return hl.if_else(
        only_left_affective,
        -1,
        hl.if_else(only_right_affective, 1, 0),
    )
    
# With keep=False, maximal_independent_set returns the nodes that must be
# REMOVED to leave a set of mutually unrelated samples; ties between nodes are
# resolved by tie_breaker.
_nodes_to_drop = hl.maximal_independent_set(
    rel_interesting.i,
    rel_interesting.j,
    keep=False,
    tie_breaker=tie_breaker,
)

# Re-key by the plain sample ID (node.id) under the name `eid` and discard the
# node struct, so the table has the same key shape as the other removal tables.
related_samples_to_remove = _nodes_to_drop.key_by(
    eid=_nodes_to_drop.node.id
).drop("node")

print(
    f"Samples to remove to create independent set: {related_samples_to_remove.count()}"
)


2023-09-13 08:50:08.110 Hail: INFO: Reading table to impute column types        
2023-09-13 08:50:09.220 Hail: INFO: Finished type imputation
  Loading field 'ID1' as type str (user-supplied type)
  Loading field 'ID2' as type str (user-supplied type)
  Loading field 'HetHet' as type float64 (imputed)
  Loading field 'IBS0' as type float64 (imputed)
  Loading field 'Kinship' as type float64 (imputed)
2023-09-13 08:50:14.794 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:17.096 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:20.314 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:22.316 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:24.999 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:26.448 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:33.030 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:5

Related samples not already in filter and high kinship coefficient: 27649


                                                                                

Affective cases: 33439


2023-09-13 08:50:49.931 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:51.825 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:54.413 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:56.368 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:50:58.835 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:51:00.477 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:51:02.559 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:51:03.707 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:51:05.405 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:51:05.953 Hail: INFO: Coerced sorted dataset
2023-09-13 08:51:07.469 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:51:08.107 Hail: INFO: Coerced sorted dataset
2023-09-13 08:51:09.856 Hail: INFO: Ordering unsorte

Samples to remove to create independent set: 24848


                                                                                

In [13]:
# Combine all removal lists. Each table is outer-joined on its key in turn
# (related, ancestry, withdrawn), then distinct() leaves one row per key.
final_to_remove = filtered_samples_to_remove
for _removal_table in (
    related_samples_to_remove,
    ancestry_to_remove,
    withdrawn_to_remove,
):
    final_to_remove = final_to_remove.join(_removal_table, how="outer")
final_to_remove = final_to_remove.distinct()

print(f"Final number of samples to remove: {final_to_remove.count()}")

2023-09-13 08:55:41.864 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:55:48.920 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:55:49.544 Hail: INFO: Coerced sorted dataset
2023-09-13 08:55:50.925 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 08:55:51.504 Hail: INFO: Coerced sorted dataset
                                                                                

Final number of samples to remove: 59423


In [14]:
# Destination path for the exported list of sample IDs to remove.
SAMPLES_TO_REMOVE_FILE = "/tmp/samples_to_remove.tsv"

# Write the `eid` field of final_to_remove to SAMPLES_TO_REMOVE_FILE.
final_to_remove.eid.export(SAMPLES_TO_REMOVE_FILE)

2023-09-13 09:22:30.867 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 09:22:37.555 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 09:22:38.145 Hail: INFO: Coerced sorted dataset
2023-09-13 09:22:39.246 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-13 09:22:39.824 Hail: INFO: Coerced sorted dataset
2023-09-13 09:22:44.635 Hail: INFO: merging 13 files totalling 464.2K...        
2023-09-13 09:22:44.699 Hail: INFO: while writing:
    /tmp/samples_to_remove.tsv
  merge time: 63.099ms
