# SCRIPT TO PERFORM QUALITY CONTROL ON UK BIOBANK SAMPLES
Ancestry and kinship and such
Could use an upgrade

## This script should only be run once

#### Initialization
##### Load packages

In [None]:
from pathlib import Path
from datetime import datetime
import re

import dxdata  # type: ignore
import dxpy
import networkx as nx
import hail as hl

import pyspark
from pyspark.sql import SparkSession


# --- HELPER FUNCTIONS ---


def fields_for_id(field_id, participant):
    """Retrieve and sort fields, robustly handling _i and _a indices."""
    fid_str = str(field_id)
    fields = participant.find_fields(name_regex=rf"^p{fid_str}(_i\d+)?(_a\d+)?$")

    def get_sort_key(field):
        i_match = re.search(r"_i(\d+)", field.name)
        a_match = re.search(r"_a(\d+)", field.name)
        return (
            int(i_match.group(1)) if i_match else 0,
            int(a_match.group(1)) if a_match else 0,
        )

    return sorted(fields, key=get_sort_key)


def get_primary_column(df, field_id):
    """Finds the Instance 0 column (pXXXX_i0) for a field."""
    candidates = [c for c in df.columns if c.startswith(f"p{field_id}")]
    # Sort by length and name to prioritize 'p54' or 'p54_i0' over 'p54_i1'
    candidates.sort(key=lambda x: (len(x), x))
    if not candidates:
        raise ValueError(f"Field {field_id} not found in dataframe")
    return candidates[0]

##### Spark, Hail and dataset configuration 

In [3]:
# Constants
# Run once only
DATABASE = "matrix_tables"
REFERENCE_GENOME = "GRCh38"

LOG_FILE = (
    Path("../hail_logs", f"hail_{datetime.now().strftime('%H%M')}.log")
    .resolve()
    .__str__()
)

sc = pyspark.SparkContext()
spark = SparkSession(sc)

hl.init(sc=sc, default_reference=REFERENCE_GENOME, log=LOG_FILE)

dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

dataset = dxdata.load_dataset(id=dispensed_dataset_id)  # type: ignore
participant = dataset["participant"]


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.2
SparkUI available at http://ip-10-60-122-101.eu-west-2.compute.internal:8081
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.132-678e1f52b999
LOGGING: writing to /opt/hail_logs/hail_1543.log
2025-12-19 15:45:49.257 Hail: INFO: Reading table to impute column types
2025-12-19 15:45:50.827 Hail: INFO: Finished type imputation
  Loading field '

### Filtering
#### Hard filtering

In [4]:
use_ancestry = True  # You should argue this

fields = [
    "22027",  # Outliers for heterozygosity or missing rate
    "22019",  # Sex chromosome aneuploidy
    "22021",  # Genetic kinship to other participants
]

if use_ancestry:
    fields = fields + ["30079"]  #  Genomic ancestry (Pan-UKB)

field_names = [fields_for_id(i, participant) for i in fields]
field_names = ["eid"] + [field.name for fields in field_names for field in fields]

In [9]:
df = participant.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
    coding_values="replace",  # type: ignore
)

df_filtered = df.filter(
    (~df.p22027.isNull())
    | (~df.p22019.isNull())
    | (df.p22021 == "Participant excluded from kinship inference process")
    | (df.p22021 == "Ten or more third-degree relatives identified")
    | (
        df.p30079 != "European ancestry (EUR)"  # proooblematic
    )
)

filtered_samples_to_remove = hl.Table.from_spark(df_filtered.select("eid")).key_by(
    "eid"
)

print(f"Samples to be filtered: {filtered_samples_to_remove.count()}")

Samples to be filtered: 22896


#### Ancestry filtering
Now as field 30079 from Pan-UKB

#### Withdrawn
Now handled automatically by UKB

#### Related individuals

In [19]:
# Remove related individuals
RAW_REL_FILE = Path("/mnt/project/Bulk/Genotype Results/Genotype calls/ukb_rel.dat")
MAX_KINSHIP = 0.176  # 2nd degree relatives, based on: https://kenhanscombe.github.io/ukbkings/reference/bio_gen_related_remove.html

rel = hl.import_table(
    f"file://{RAW_REL_FILE}",
    delimiter=" ",
    impute=True,
    types={"ID1": "str", "ID2": "str"},
)

rel = rel.filter(
    hl.is_defined(filtered_samples_to_remove[rel.ID1])
    | hl.is_defined(filtered_samples_to_remove[rel.ID2]),
    keep=False,
)

rel = rel.filter(rel.Kinship > MAX_KINSHIP, keep=True)

In [20]:
# Hail maximal independent set is not working so we use networkx
# related_samples_to_remove = (hl.maximal_independent_set(i=rel.ID1, j=rel.ID2,keep=False).rename({"node": "eid"}).key_by("eid"))

# Collect the ID pairs into a list for processing
rel_data = rel.select("ID1", "ID2").collect()

# Create a graph using networkx from the relationships
G = nx.Graph()
for row in rel_data:
    G.add_edge(row.ID1, row.ID2)

# Compute the maximal independent set using networkx
independent_set = set(nx.maximal_independent_set(G))

# Extract all unique IDs from the original relationship data
all_ids = set([row.ID1 for row in rel_data] + [row.ID2 for row in rel_data])

# Calculate the related samples to remove (those not in the independent set)
related_samples_to_remove_ids = all_ids - independent_set

# Convert the related samples to remove into a Hail Table and key the table by "eid"
related_samples_to_remove = hl.Table.parallelize(
    [hl.struct(eid=sample) for sample in related_samples_to_remove_ids]
).key_by("eid")

print(
    f"Related samples not already in filter and high kinship coefficient: {related_samples_to_remove.count()}"
)

Related samples not already in filter and high kinship coefficient: 25746


#### Combine all samples to remove

In [21]:
final_to_remove = (
    filtered_samples_to_remove.join(related_samples_to_remove, how="outer")
    # .join(withdrawn_to_remove, how="outer")
).distinct()

print(f"Final number of samples to remove: {final_to_remove.count()}")

Final number of samples to remove: 48642


### Save and export

In [22]:
SAMPLES_TO_REMOVE_FILE = "/tmp/samples_to_remove.tsv"

final_to_remove.eid.export(SAMPLES_TO_REMOVE_FILE)

In [23]:
!hadoop fs -getmerge /tmp/samples_to_remove.tsv ../tmp/samples_to_remove.tsv
!dx upload ../tmp/samples_to_remove.tsv --path TASR/Phenotypes/QC/

2025-12-19 15:54:06,884 WARN metrics.MetricsReporter: Unable to initialize metrics scraping configurations from hive-site.xml. Message:InputStream cannot be null
2025-12-19 15:54:06,986 WARN service.DNAxApiSvc: Using default configurations. Unable to find dnanexus.conf.location=null
2025-12-19 15:54:06,986 INFO service.DNAxApiSvc: apiserver connection-pool config. MaxPoolSize=10, MaxPoolPerRoute=10,MaxWaitTimeout=60000
2025-12-19 15:54:06,986 INFO service.DNAxApiSvc: initializing http connection manager pools
2025-12-19 15:54:07,534 INFO service.DNAxApiSvc: Worker process - IdleConnectionMonitorThread disabled
2025-12-19 15:54:07,534 INFO service.DNAxApiSvc: Worker process - IdleConnectionMonitorThread disabled
2025-12-19 15:54:07,534 INFO service.DNAxApiSvc: initializing DNAxApiSvc
2025-12-19 15:54:08,077 WARN service.DNAxApiSvc: Shutting down Runtime service for Connection Pools
2025-12-19 15:54:08,078 INFO service.DNAxApiSvc: shutting down httpClientConnManager
2025-12-19 15:54:08,0