In [1]:
import re
import shutil
import subprocess
from pathlib import Path

import dxdata
import dxpy
import pyspark

from src.phenotypes import get_age_sex, get_pheno_fields, concatenate, new_names

# Make sure the local scratch directory exists before later cells write into it.
tmp_dir = Path("../tmp").resolve()
tmp_dir.mkdir(parents=True, exist_ok=True)

# Label of the trait group handled by this notebook.
trait = "metabolic"

In [2]:
# Get (or create) the Spark session with the automatic broadcast-join
# threshold set to -1.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.config(
    "spark.sql.autoBroadcastJoinThreshold", -1
).getOrCreate()

# Look up the project's database (by "app*" glob) and keep its name.
database_record = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = database_record["describe"]["name"]

# Look up the project's dataset object (by "app*.dataset" glob) and keep its id.
dataset_record = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)
dispensed_dataset_id = dataset_record["id"]

# Make the located database the current one for subsequent Spark SQL.
spark.sql("USE " + dispensed_database_name)

# Load the dataset and grab its "participant" entity for field lookups.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [3]:
# Title pattern selecting the "Date <code> first reported" fields for the
# codes listed in the alternation.
first_reported_title = re.compile(
    "^Date (E10|E11|E66|K90|K91|K80|K81) first reported"
)


def _is_first_reported(field):
    """Return True when the field's title matches the first-reported pattern."""
    return bool(first_reported_title.match(field.title))


first_occurence_fields = list(participant.find_fields(_is_first_reported))

# Field ids "31" and "21022" are resolved by the project helper.
age_sex_fields = get_age_sex(participant, fields=["31", "21022"])

# Field ids handed to the project helper for this trait group.
pheno_field_ids = [
    "131688",
    "131690",
    "130706",
    "130708",
    "130792",
    "2443",
    "131674",
    "131676",
    "2463",
    "20511",
    "1687",
    "1697",
]
metabolic_fields = get_pheno_fields(participant, fields=pheno_field_ids)

# Assemble the full list of names to retrieve, with the participant id first.
first_occurence_names = [f.name for f in first_occurence_fields]
field_names = concatenate(
    ["eid"], age_sex_fields, metabolic_fields, first_occurence_names
)

# Retrieve the selected fields through a dxdata engine connection.
engine = dxdata.connect()
df = participant.retrieve_fields(names=field_names, engine=engine)

In [4]:
# Build the output column names from the retrieved DataFrame's columns.
# Step 1: remove every "_a<digit>" substring. The pattern is a raw string so
# the "\d" escape reaches the regex engine as written instead of relying on
# Python's deprecated handling of unknown string escapes.
# NOTE(review): only a single digit after "_a" is removed — confirm that is
# sufficient for the fields in use.
stripped_names = [re.sub(r"_a\d", "", x) for x in df.columns]

# Step 2: the first column is renamed to "xeid" directly; every other name is
# mapped through the project's new_names() helper.
colnames = ["xeid"] + [new_names(s) for s in stripped_names[1:]]
print(colnames[:10])

['xeid', 'x31_0_0', 'x21022_0_0', 'x1687_0_0', 'x1687_1_0', 'x1687_2_0', 'x1697_0_0', 'x1697_1_0', 'x1697_2_0', 'x2443_0_0']


In [5]:
# Show the column names and types of the retrieved DataFrame as a sanity check
# on what was actually returned for the requested fields.
df.printSchema()

root
 |-- eid: string (nullable = true)
 |-- p31: long (nullable = true)
 |-- p21022: long (nullable = true)
 |-- p1687_i0: long (nullable = true)
 |-- p1687_i1: long (nullable = true)
 |-- p1687_i2: long (nullable = true)
 |-- p1697_i0: long (nullable = true)
 |-- p1697_i1: long (nullable = true)
 |-- p1697_i2: long (nullable = true)
 |-- p2443_i0: long (nullable = true)
 |-- p2443_i1: long (nullable = true)
 |-- p2443_i2: long (nullable = true)
 |-- p2443_i3: long (nullable = true)
 |-- p2463_i0: long (nullable = true)
 |-- p2463_i1: long (nullable = true)
 |-- p2463_i2: long (nullable = true)
 |-- p2463_i3: long (nullable = true)
 |-- p130706: date (nullable = true)
 |-- p130708: date (nullable = true)
 |-- p130792: date (nullable = true)
 |-- p131674: date (nullable = true)
 |-- p131676: date (nullable = true)
 |-- p131688: date (nullable = true)
 |-- p131690: date (nullable = true)



In [6]:
# Apply the new column names positionally to the DataFrame.
df = df.toDF(*colnames)

# Write the table as tab-separated text with a header row.
# mode="overwrite" lets this cell be re-run: with the default save mode the
# write raises once /tmp/phenos.tsv already exists from an earlier run.
# NOTE(review): emptyValue controls how empty strings are written; nulls are
# governed by nullValue. If missing values are meant to appear as "NA",
# confirm whether nullValue="NA" is what is actually wanted here.
df.write.csv(
    "/tmp/phenos.tsv",
    mode="overwrite",
    sep="\t",
    header=True,
    emptyValue="NA",
)

In [7]:
# Remove the _SUCCESS marker file from the written output directory so that
# only data files remain. "-f" keeps the exit status at zero when the marker
# is already gone, so re-running this cell does not trip check=True.
subprocess.run(
    ["hadoop", "fs", "-rm", "-f", "/tmp/phenos.tsv/_SUCCESS"],
    check=True,
    shell=False,
)

# Clear any local copy left by a previous run so the download below starts
# from a clean destination and cannot collide with stale part files.
shutil.rmtree("../tmp/phenos.tsv", ignore_errors=True)

# Copy the output directory from the cluster file system to the local
# scratch directory.
subprocess.run(
    ["hadoop", "fs", "-get", "/tmp/phenos.tsv", "../tmp/phenos.tsv"],
    check=True,
    shell=False,
)

CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/phenos.tsv', '../tmp/phenos.tsv'], returncode=0)

In [8]:
# Concatenate all downloaded part files into a single TSV. sed reads the
# files as one stream and, from line 3 to the end, deletes every line that
# starts with "xeid" (the header's first column name), so repeated header
# rows are dropped from the merged file.
# NOTE(review): the range starts at line 3, so a header sitting on line 2
# (e.g. if the first part file held only a header) would be kept — confirm
# whether "2,$" was intended.
!sed -e '3,${/^xeid/d' -e '}' ../tmp/phenos.tsv/part* > ../tmp/metabolic.BT.raw.tsv

In [9]:
# Upload the merged phenotype table to the project under Data/phenotypes/.
# check=True makes a failed upload raise instead of passing silently.
upload_cmd = [
    "dx",
    "upload",
    "../tmp/metabolic.BT.raw.tsv",
    "--path",
    "Data/phenotypes/",
]
subprocess.run(upload_cmd, check=True, shell=False)

CompletedProcess(args=['dx', 'upload', '../tmp/metabolic.BT.raw.tsv', '--path', 'Data/phenotypes/'], returncode=0)