In [1]:
# %%
import re
import shutil
import subprocess
from pathlib import Path

import dxdata
import dxpy
import pandas as pd
import pyspark

from pyspark.sql import functions as F
from pyspark.conf import SparkConf
from pyspark.sql.types import StringType

from phenotypes import get_age_sex, get_pheno_fields, concatenate, new_names

# Local scratch directory used by later cells; created here if it is missing.
tmp_dir = Path("../tmp").resolve()
tmp_dir.mkdir(parents=True, exist_ok=True)

# Label of the phenotype group handled by this notebook.
trait = "metabolic"

In [None]:
# Spark session and dataset handles.
#
# Broadcast joins are disabled by setting the threshold to -1. For the setting
# to take effect it must use the fully qualified key and the SparkConf must be
# handed to the SparkContext that is created from it.
conf = SparkConf()
conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

sc = pyspark.SparkContext(conf=conf)
spark = pyspark.sql.SparkSession(sc)

# Look up the database in the project root whose name matches "app*";
# describe=True makes the result carry the "describe" dict holding the name.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database", name="app*", folder="/", name_mode="glob", describe=True
)["describe"]["name"]
# Look up the Dataset object in the project root matching "app*.dataset".
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]

# Make the located database the default for subsequent Spark SQL statements.
spark.sql("USE " + dispensed_database_name)

# Load the dataset and pick out its "participant" entity for field retrieval.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [None]:
# Resolve the field names to retrieve: the age/sex fields plus the phenotype
# fields for this trait (IDs are passed as strings to the project helpers).
age_sex_fields = get_age_sex(participant, fields=["31", "21022"])

metabolic_field_ids = [
    "48", "49", "21001", "21002", "23099", "23127",
    "78", "30750", "30880", "26019", "4079", "4080",
]
metabolic_fields = get_pheno_fields(participant, fields=metabolic_field_ids)

# Presumably yields one flat list starting with "eid" -- verify against
# phenotypes.concatenate.
field_names = concatenate(["eid"], age_sex_fields, metabolic_fields)

# Pull the selected fields through a dxdata connection.
df = participant.retrieve_fields(names=field_names, engine=dxdata.connect())

In [None]:
# Drop every column that carries a non-zero array index, then tidy the names.
# Matching on the "_a<index>" suffix (instead of the bare substring "a1")
# avoids hitting unrelated names and also catches indices 2-9, which would
# otherwise collide with the "_a0" column once the suffix is stripped.
to_drop = [x for x in df.columns if re.search(r"_a[1-9]\d*", x)]
df = df.drop(*to_drop)

# Raw string keeps "\d" a valid regex escape; "\d+" strips multi-digit
# indices in one go rather than leaving trailing digits behind.
colnames = [re.sub(r"_a\d+", "", x) for x in df.columns]

# The first column is renamed to "xeid" (the later sed step matches on that
# header); every other column name is passed through new_names.
colnames = ["xeid"] + [new_names(s) for s in colnames[1:]]
print(colnames[:10])

In [None]:
# %%
# Apply the cleaned names to the frame; toDF assigns them to the columns
# positionally, so colnames must be in the same order as df.columns.
df = df.toDF(*colnames)

In [None]:
# %%
# Write the table as tab-separated text with a header row. Spark writes a
# directory of part files at this path, which a later cell fetches with
# `hadoop fs -get`.
#
# mode="overwrite" lets this cell be re-run without failing on the existing
# output path.
# nullValue is set together with emptyValue so that both nulls and empty
# strings are written as "NA".
# NOTE(review): emptyValue alone only covers nulls on Spark versions with the
# older CSV null handling -- confirm against the cluster's Spark version.
df.write.csv(
    "/tmp/phenos.tsv",
    sep="\t",
    header=True,
    nullValue="NA",
    emptyValue="NA",
    mode="overwrite",
)

In [14]:
# %%
# Copy the Spark output directory from HDFS into the local scratch area.

# A local copy left over from an earlier run would make `-get` place the new
# output inside the existing directory, leaving stale part files for the next
# step, so remove it first (no error if it does not exist).
shutil.rmtree("../tmp/phenos.tsv", ignore_errors=True)

# Remove the _SUCCESS marker so it is not downloaded with the part files;
# -f keeps the exit status clean if the marker is already gone.
subprocess.run(
    ["hadoop", "fs", "-rm", "-f", "/tmp/phenos.tsv/_SUCCESS"],
    check=True,
    shell=False,
)
subprocess.run(
    ["hadoop", "fs", "-get", "/tmp/phenos.tsv", "../tmp/phenos.tsv"],
    check=True,
    shell=False,
)

CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/phenos.tsv', '../tmp/phenos.tsv'], returncode=0)

In [15]:
# Merge the downloaded part files into one TSV. The sed script deletes every
# line starting with "xeid" from line 3 onward -- presumably the header rows
# repeated in each part file -- so the header on line 1 is kept.
!sed -e '3,${/^xeid/d' -e '}' ../tmp/phenos.tsv/part* > ../tmp/metabolic.QT.raw.tsv

In [16]:
# %%
# Send the merged phenotype table to the project folder Data/phenotypes/
# using the dx command-line client; a non-zero exit raises CalledProcessError.
upload_cmd = [
    "dx",
    "upload",
    "../tmp/metabolic.QT.raw.tsv",
    "--path",
    "Data/phenotypes/",
]
subprocess.run(upload_cmd, shell=False, check=True)

CompletedProcess(args=['dx', 'upload', '../tmp/metabolic.QT.raw.tsv', '--path', 'Data/phenotypes/'], returncode=0)