In [1]:
# Python
import re
import subprocess
from distutils.version import LooseVersion

import dxdata
import dxpy
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType

In [2]:
# Reuse the running SparkContext when this cell is executed again: calling
# pyspark.SparkContext() a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)

# Look up the database name and the Dataset record ID by glob-matching object
# names in the root folder ("/").
dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

# Load the dataset and keep a handle on its "participant" entity, which the
# cells below query for fields.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]


def fields_for_id(field_id):
    """Return the participant fields belonging to one field ID, naturally sorted.

    Field names are matched against ``p<field_id>`` followed by an optional
    ``_i<N>`` suffix and an optional ``_a<N>`` suffix. The result is ordered so
    that numeric parts compare as numbers (``p22009_a2`` sorts before
    ``p22009_a10``).

    Args:
        field_id: Field identifier (int or str). It is converted with ``str``
            and interpolated into the name regex as-is.

    Returns:
        list: The objects returned by ``participant.find_fields``, sorted by
        their ``name`` attribute.
    """

    def natural_key(field):
        # re.split with a capturing group always alternates text runs (even
        # positions) and digit runs (odd positions), so two keys are compared
        # str-to-str and int-to-int and never raise TypeError.
        tokens = re.split(r"(\d+)", field.name)
        return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]

    field_id = str(field_id)
    fields = participant.find_fields(
        name_regex=r"^p{}(_i\d+)?(_a\d+)?$".format(field_id)
    )

    # The sort key is built with `re` rather than distutils' LooseVersion:
    # distutils is deprecated (PEP 632) and no longer ships with Python 3.12+.
    return sorted(fields, key=natural_key)

In [17]:
# Columns to pull: the participant ID plus every field found for ID 22009.
pcs_fields = ["eid", *(field.name for field in fields_for_id("22009"))]

# Fetch the raw-coded values and drop every row that has a null in any of the
# selected columns.
df = participant.retrieve_fields(
    names=pcs_fields,
    engine=dxdata.connect(),
    coding_values="raw",
).na.drop(how="any")

In [19]:
# coalesce(1) reduces the frame to a single partition before writing.
# mode="overwrite" makes the cell re-runnable: with the default save mode
# Spark raises an error when /tmp/pcs.csv already exists from a previous run.
df.coalesce(1).write.csv(
    "/tmp/pcs.csv",
    sep=",",
    header=True,
    mode="overwrite",
)

In [21]:
# Merge the Spark output at /tmp/pcs.csv into one local file, pcs.csv.
# check=True raises CalledProcessError if hadoop exits with a non-zero status.
getmerge_cmd = ["hadoop", "fs", "-getmerge", "/tmp/pcs.csv", "pcs.csv"]
subprocess.run(getmerge_cmd, check=True, shell=False)

CompletedProcess(args=['hadoop', 'fs', '-getmerge', '/tmp/pcs.tsv', 'pcs.tsv'], returncode=0)

In [1]:
# R
# Upstream repository (pop_centers.csv is read from it further down):
# https://github.com/privefl/UKBB-PGS
# bigreadr supplies fread2(), which is used to load pcs.csv.
install.packages(pkgs = "bigreadr")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

also installing the dependencies ‘bigassertr’, ‘parallelly’




In [23]:
# Also R
# Load the participant ID and the 16 p22009_a* columns from the exported CSV.
PC_UKBB <- bigreadr::fread2(
  "pcs.csv",
  select = c("eid", paste0("p22009_a", 1:16))
)

# Reference centers: first column is the ancestry label, the rest are coordinates.
all_centers <- read.csv(
  "https://raw.githubusercontent.com/privefl/UKBB-PGS/main/pop_centers.csv",
  stringsAsFactors = FALSE
)

# Matrix with one row per participant and one column per center, holding the
# squared Euclidean distance between the two.
all_sq_dist <- apply(all_centers[-1], 1, function(center) {
  shifted <- sweep(PC_UKBB[-1], 2, center, '-')
  rowSums(shifted^2)
})

# Distance threshold, scaled from the largest squared distance between centers.
thr_sq_dist <- max(dist(all_centers[-1])^2) * 0.002 / 0.16

# Label each participant with the nearest center, or NA when that center is
# not close enough.
group <- apply(all_sq_dist, 1, function(sq_dists) {
  nearest <- which.min(sq_dists)
  # isTRUE() also covers the case where the comparison is NA or empty.
  if (!isTRUE(sq_dists[nearest] < thr_sq_dist)) {
    return(NA)
  }
  label <- all_centers$Ancestry[nearest]
  # We used a more stringent cutoff for the Ashkenazi group
  if (label == "Ashkenazi" && sq_dists[nearest] > 12.5^2) {
    return(NA)
  }
  label
})
table(group, exclude = NULL)

group
     Ashkenazi      Caribbean          China          India           Iran 
          2500           2653           1852           6718           1234 
         Italy        Nigeria         Poland United Kingdom           <NA> 
          6818           4085           4311         446606          11394 

In [25]:
# Still R
# Name both columns explicitly: data.frame(PC_UKBB$eid, group) would label the
# first column "PC_UKBB.eid". row.names = FALSE stops write.csv from adding an
# unnamed leading column of row numbers, so the file has exactly: eid, group.
df <- data.frame(eid = PC_UKBB$eid, group = group)
write.csv(df, "ancestry.csv", row.names = FALSE)