In [None]:
import subprocess
import re
import dxdata  # type: ignore
import dxpy

import pyspark

import hail as hl

from pyspark.sql import SparkSession
from pathlib import Path
from datetime import datetime


# --- HELPER FUNCTIONS ---
# Should factor these out but installing on RAP is a pain
def fields_for_id(field_id, participant):
    """Look up every field of ``participant`` for one field ID, ordered by index.

    Field names are matched against ``p<field_id>`` optionally followed by an
    ``_i<N>`` suffix and/or an ``_a<N>`` suffix. The result is sorted by the
    numeric value of the ``_i`` suffix first and the ``_a`` suffix second; a
    missing suffix counts as 0.

    Args:
        field_id: Field identifier; converted with ``str()`` before matching.
        participant: Object exposing ``find_fields(name_regex=...)`` whose
            results carry a ``name`` attribute.

    Returns:
        A new list of the matching field objects in sorted order.
    """
    pattern = rf"^p{str(field_id)}(_i\d+)?(_a\d+)?$"
    matches = participant.find_fields(name_regex=pattern)

    def _suffix_number(tag, name):
        # Numeric part of an "_<tag><digits>" suffix, or 0 when it is absent.
        found = re.search(rf"_{tag}(\d+)", name)
        return int(found.group(1)) if found else 0

    ordered = list(matches)
    ordered.sort(
        key=lambda f: (_suffix_number("i", f.name), _suffix_number("a", f.name))
    )
    return ordered


def get_primary_column(df, field_id):
    """Return the primary column name for a field in ``df``.

    A column belongs to the field only when its name is exactly
    ``p<field_id>``, optionally followed by ``_i<N>`` and/or ``_a<N>``.
    Matching the whole name (rather than a prefix) keeps, for example,
    field 30 from picking up ``p3007`` or ``p30079``.

    Among the matching columns the shortest name wins, with ties broken
    alphabetically, so ``p123_i0`` is preferred over ``p123_i1`` and
    ``p123_i10``.

    Args:
        df: Object exposing a ``columns`` iterable of column-name strings.
        field_id: Field identifier (int or str).

    Returns:
        The selected column name.

    Raises:
        ValueError: If no column of ``df`` belongs to ``field_id``.
    """
    pattern = re.compile(rf"p{re.escape(str(field_id))}(_i\d+)?(_a\d+)?")
    candidates = [c for c in df.columns if pattern.fullmatch(c)]
    if not candidates:
        raise ValueError(f"Field {field_id} not found in dataframe")
    return min(candidates, key=lambda name: (len(name), name))

In [3]:
# --- CONSTANTS ---
DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"
# Absolute path of the Hail log file; the name carries the current HHMM time.
LOG_FILE = str(
    Path(f"../hail_logs/hail_{datetime.now().strftime('%H%M')}.log").resolve()
)

# 1. INIT SPARK
# getOrCreate() reuses an already-running context, so re-running this cell in a
# live kernel does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession(sc)

# 2. INIT HAIL
# idempotent=True turns a repeated call into a no-op instead of an error.
hl.init(sc=sc, log=LOG_FILE, idempotent=True)
# Passing default_reference to hl.init is deprecated; set it after initialisation.
hl.default_reference(REFERENCE_GENOME)

# 3. CONNECT TO UKB DATASET (DNANexus)
# Locate the dataset object in the project root by its "app*.dataset" name.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.2
SparkUI available at http://ip-10-60-122-101.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.132-678e1f52b999
LOGGING: writing to /opt/hail_logs/hail_1617.log
2025-12-19 16:19:34.920 Hail: INFO: Reading table to impute column types
2025-12-19 16:19:36.223 Hail: INFO: Finished type imputation
  Loading field '

In [8]:
# --- CONFIGURATION ---
# When True, field 30079 is retrieved and non-EUR participants are flagged too.
use_ancestry = True

fields = [
    "22027",  # Outliers for heterozygosity or missing rate
    "22019",  # Sex chromosome aneuploidy
    "22021",  # Genetic kinship info (exclusions)
]

if use_ancestry:
    fields = fields + ["30079"]  # Genomic ancestry (Pan-UKB)

# Retrieve Field Names
# Flatten the per-field lookups into one list of column names, led by "eid".
field_names = ["eid"] + [
    field.name for field_id in fields for field in fields_for_id(field_id, participant)
]

# Load Data into Spark DataFrame
df = participant.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
    coding_values="replace",
)

# Apply Hard Filters
# A participant is flagged for removal when ANY of the clauses below holds.
removal_condition = (
    (~df.p22027.isNull())  # Outliers
    | (~df.p22019.isNull())  # Sex Aneuploidy
    | (df.p22021 == "Participant excluded from kinship inference process")
    | (df.p22021 == "Ten or more third-degree relatives identified")
)

if use_ancestry:
    # Only reference p30079 when it was actually retrieved; otherwise the
    # column does not exist and the filter would fail.
    # NOTE(review): a NULL p30079 makes `!=` evaluate to NULL, which the filter
    # treats as false, so participants without an ancestry label are NOT
    # flagged by this clause. Confirm that is intended.
    removal_condition = removal_condition | (
        df.p30079 != "European ancestry (EUR)"  # Ancestry check
    )

df_filtered = df.filter(removal_condition)

# Convert to Hail Table (keyed by eid)
filtered_samples_to_remove = hl.Table.from_spark(df_filtered.select("eid")).key_by(
    "eid"
)

print(f"Samples to be filtered (QC/Ancestry): {filtered_samples_to_remove.count()}")

Samples to be filtered (QC/Ancestry): 22896


In [5]:
# --- CONFIGURATION ---
RAW_REL_FILE = Path("/mnt/project/Bulk/Genotype Results/Genotype calls/ukb_rel.dat")
# Pairs whose Kinship is strictly greater than this value count as related.
# NOTE(review): under the usual KING cut-offs (1st degree > 0.177, 2nd degree
# > 0.0884) this threshold captures duplicates and 1st-degree pairs only, not
# 2nd-degree ones. Lower it to ~0.0884 if 2nd-degree relatives must also be
# removed. Confirm the intended threshold.
MAX_KINSHIP = 0.176

# 1. IMPORT RELATEDNESS TABLE
# Space-delimited file; ID columns are forced to strings, other column types
# are imputed by Hail.
rel_uri = f"file://{RAW_REL_FILE}"
id_column_types = {"ID1": "str", "ID2": "str"}
rel = hl.import_table(
    rel_uri,
    delimiter=" ",
    impute=True,
    types=id_column_types,
)

# 2. FILTER PAIRS
# Keep only the pairs above the kinship threshold.
pairs = rel.filter(rel.Kinship > MAX_KINSHIP)
print(f"Total related pairs (Kinship > {MAX_KINSHIP}): {pairs.count()}")

# 3. APPLY "QC FIRST" LOGIC
# A pair is dropped when either member is already in filtered_samples_to_remove:
# that member leaves the cohort anyway, so the pair no longer forces a second
# removal.
id1_already_flagged = hl.is_defined(filtered_samples_to_remove[pairs.ID1])
id2_already_flagged = hl.is_defined(filtered_samples_to_remove[pairs.ID2])
pairs_clean = pairs.filter(~(id1_already_flagged | id2_already_flagged))
print(f"Pairs remaining after excluding QC failures: {pairs_clean.count()}")

# 4. SOLVE MAXIMAL INDEPENDENT SET
# keep=False returns the nodes to drop so that no remaining pair is connected.
related_samples_to_remove = hl.maximal_independent_set(
    pairs_clean.ID1, pairs_clean.ID2, keep=False
)

print(
    f"Additional samples to remove for relatedness: {related_samples_to_remove.count()}"
)

Total related pairs (Kinship > 0.176): 29044
Pairs remaining after excluding QC failures: 28197
Additional samples to remove for relatedness: 25328


In [None]:
# Path handed to Hail's export (resolved by Hail's filesystem layer).
SAMPLES_TO_REMOVE_FILE = "/tmp/samples_to_remove.tsv"
# Local copy that is uploaded to the project.
LOCAL_SAMPLES_FILE = Path("../tmp/samples_to_remove.tsv")
UPLOAD_PATH = "TASR/Phenotypes/QC/"

# 1. STANDARDIZE SCHEMAS
# The relatedness table is keyed by 'node'; rename it to 'eid' so both tables
# share the same key name.
related_compatible = related_samples_to_remove.rename({"node": "eid"})

# 2. COMBINE
# Union the QC failures with the relatedness removals, then drop duplicate keys.
final_to_remove = filtered_samples_to_remove.union(related_compatible).distinct()

count = final_to_remove.count()
print(f"Final total samples to remove: {count}")

# 3. EXPORT LOCALLY
final_to_remove.export(SAMPLES_TO_REMOVE_FILE)

# getmerge fails if the local destination directory is missing, so create it.
LOCAL_SAMPLES_FILE.parent.mkdir(parents=True, exist_ok=True)

# getmerge copies the exported output into a single local file.
# Commands are passed as argument lists (no shell) so that paths are never
# re-interpreted by a shell.
subprocess.run(
    ["hadoop", "fs", "-getmerge", SAMPLES_TO_REMOVE_FILE, str(LOCAL_SAMPLES_FILE)],
    check=True,
)
subprocess.run(
    ["dx", "upload", str(LOCAL_SAMPLES_FILE), "--path", UPLOAD_PATH],
    check=True,
)

print(f"Done! Uploaded to {UPLOAD_PATH}")

Final total samples to remove: 48224
Done! Uploaded to TASR/Phenotypes/QC/
