In [4]:
import re
import shutil
import subprocess
from pathlib import Path

import dxdata
import dxpy
import pandas as pd
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

from phenotypes import get_age_sex, get_pheno_fields, concatenate, new_names

# Local scratch directory that later cells download into and write merged
# tables to; created up front (with parents) and left alone if it exists.
tmp_dir = Path("../tmp").resolve()
tmp_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Start (or reuse) the Spark session. The threshold is set to -1, which per
# Spark's SQL configuration docs turns off automatic broadcast joins.
session_builder = pyspark.sql.SparkSession.builder.config(
    "spark.sql.autoBroadcastJoinThreshold", -1
)
spark = session_builder.getOrCreate()

# Locate the project's database (name matching "app*") and its Dataset
# object (name matching "app*.dataset") in the project root folder.
database_record = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = database_record["describe"]["name"]

dataset_record = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)
dispensed_dataset_id = dataset_record["id"]

# Make the discovered database the default for subsequent Spark SQL.
spark.sql("USE " + dispensed_database_name)

# Load the dataset and keep a handle on its "participant" entity, which
# every later cell queries for fields.
dataset = dxdata.load_dataset(id=dispensed_dataset_id)
participant = dataset["participant"]

In [79]:
# Every participant field whose title begins with "Liking for ".
liking_title = re.compile("^Liking for ")
liking_field_objects = participant.find_fields(
    lambda field: liking_title.match(field.title) is not None
)

# Field-name prefixes to leave out because they are not foods. Original
# author's note on what these cover: bicycling, exercising, going out,
# stairs, television.
not_food = (
    "p20614",
    "p20656",
    "p20657",
    "p20668",
    "p20669",
    "p20670",
    "p20733",
    "p20741",
)

# Keep only the names of the fields that do not start with an excluded prefix.
liking_fields = [
    field.name
    for field in liking_field_objects
    if not field.name.startswith(not_food)
]

liking_fields[:5]

['p20600', 'p20601', 'p20602', 'p20603', 'p20604']

In [80]:
# Fields 31 and 21022 are requested through the project helper; going by the
# helper's name these are the sex and age fields (TODO confirm in phenotypes.py).
age_sex_fields = get_age_sex(participant, fields=["31", "21022"])

# Column order of the export: participant id, then age/sex, then the likings.
field_names = concatenate(["eid"], age_sex_fields, liking_fields)

In [81]:
# Open a dxdata connection and pull the selected fields into a DataFrame.
query_engine = dxdata.connect()
df = participant.retrieve_fields(names=field_names, engine=query_engine)

In [83]:
# Drop the "_a<digit>" suffix from every retrieved column name.
# The pattern is a raw string: "\d" inside a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
# NOTE(review): only a single digit after "_a" is removed, so a two-digit
# index such as "_a10" would leave a stray digit behind -- confirm that no
# selected field has one.
colnames = [re.sub(r"_a\d", "", x) for x in df.columns]

# The first column (the participant id) is named "xeid" directly; the rest go
# through the project helper `new_names` (judging by the printed output it
# yields names like "x31_0_0" -- see phenotypes.py for the exact mapping).
colnames = ["xeid"] + [new_names(s) for s in colnames[1:]]
print(colnames[:10])

['xeid', 'x31_0_0', 'x21022_0_0', 'x20600_0_0', 'x20601_0_0', 'x20602_0_0', 'x20603_0_0', 'x20604_0_0', 'x20605_0_0', 'x20606_0_0']


In [85]:
# Apply the cleaned column names to the DataFrame.
df = df.toDF(*colnames)

# Write a tab-separated export with a header row. The output lands at
# /tmp/phenos.tsv on the cluster filesystem and is fetched with `hadoop fs`
# in the next step.
# mode="overwrite" replaces any export left by a previous run, so this cell
# can be re-executed without first removing the path by hand (the default
# mode raises an error when the path already exists).
df.write.csv(
    "/tmp/phenos.tsv",
    sep="\t",
    header=True,
    emptyValue="NA",
    mode="overwrite",
)

In [86]:
# Clear any local copy left by an earlier download. If the destination
# already exists as a directory, `hadoop fs -get` does not replace it: it
# places the source inside it or fails on existing files, so the merge step's
# `../tmp/phenos.tsv/part*` glob would read stale part files.
shutil.rmtree("../tmp/phenos.tsv", ignore_errors=True)

# Copy the Spark output directory from the cluster filesystem to local disk;
# check=True raises CalledProcessError if the copy fails.
subprocess.run(
    ["hadoop", "fs", "-get", "/tmp/phenos.tsv", "../tmp/phenos.tsv"],
    check=True,
    shell=False,
)

CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/phenos.tsv', '../tmp/phenos.tsv'], returncode=0)

In [87]:
# Merge the downloaded part files into a single TSV of likings.
# Each part file is expected to carry its own header row (the export was
# written with header=True and the first column is named "xeid"). sed deletes
# every line starting with "xeid" from line 3 of the combined stream onward,
# which leaves the first header in place.
# NOTE(review): the range starts at line 3, so a repeated header on line 2
# (first part file containing no data rows) would be kept -- confirm intended.
!sed -e '3,${/^xeid/d' -e '}' ../tmp/phenos.tsv/part* > ../tmp/likings.raw.tsv

In [88]:
# Upload the merged likings table with the `dx` command-line client,
# targeting the Data/phenotypes/ path; check=True raises if the upload fails.
upload_cmd = ["dx", "upload", "../tmp/likings.raw.tsv", "--path", "Data/phenotypes/"]
subprocess.run(upload_cmd, shell=False, check=True)

CompletedProcess(args=['dx', 'upload', '../tmp/likings.raw.tsv', '--path', 'Data/phenotypes/'], returncode=0)

In [89]:
# Field-name prefixes for IDs 26000-26061 plus 26155 (the "nutrient" set,
# going by the variable name).
nutrient_ids = [*range(26000, 26062), 26155]
nutrients = tuple(f"p{field_id}" for field_id in nutrient_ids)

# Select every participant field whose name starts with one of those
# prefixes, then keep just the field names.
matching_nutrient_fields = participant.find_fields(
    lambda field: field.name.startswith(nutrients)
)
nutrient_fields = [field.name for field in matching_nutrient_fields]

nutrient_fields[:6]

['p26000_i0', 'p26000_i1', 'p26000_i2', 'p26000_i3', 'p26000_i4', 'p26001_i0']

In [90]:
# Field-name prefixes for IDs 26062-26154 (the "food weight" set, going by
# the variable name).
food_weights = tuple(f"p{field_id}" for field_id in range(26062, 26155))

# Select every participant field whose name starts with one of those
# prefixes, then keep just the field names.
matching_weight_fields = participant.find_fields(
    lambda field: field.name.startswith(food_weights)
)
weight_fields = [field.name for field in matching_weight_fields]

weight_fields[:6]

['p26062_i0', 'p26062_i1', 'p26062_i2', 'p26062_i3', 'p26062_i4', 'p26063_i0']

In [91]:
# NOTE(review): age_sex_fields is recomputed here but is not part of the
# field list below, so this export carries no age/sex columns -- confirm
# that this is intended.
age_sex_fields = get_age_sex(participant, fields=["31", "21022"])

# Column order of the export: participant id, then the nutrient fields, then
# the food-weight fields.
field_names = concatenate(["eid"], nutrient_fields, weight_fields)

In [92]:
# Open a dxdata connection and pull the nutrient / food-weight fields into a
# DataFrame (this rebinds `df`, replacing the likings frame).
query_engine = dxdata.connect()
df = participant.retrieve_fields(names=field_names, engine=query_engine)

In [94]:
# Drop the "_a<digit>" suffix from every retrieved column name.
# The pattern is a raw string: "\d" inside a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
# NOTE(review): only a single digit after "_a" is removed, so a two-digit
# index such as "_a10" would leave a stray digit behind -- confirm that no
# selected field has one.
colnames = [re.sub(r"_a\d", "", x) for x in df.columns]

# The first column (the participant id) is named "xeid" directly; the rest go
# through the project helper `new_names` (judging by the printed output it
# yields names like "x26000_0_0" -- see phenotypes.py for the exact mapping).
colnames = ["xeid"] + [new_names(s) for s in colnames[1:]]
print(colnames[:10])

['xeid', 'x26000_0_0', 'x26000_1_0', 'x26000_2_0', 'x26000_3_0', 'x26000_4_0', 'x26001_0_0', 'x26001_1_0', 'x26001_2_0', 'x26001_3_0']


In [95]:
# Apply the cleaned column names to the DataFrame.
df = df.toDF(*colnames)

# Write a tab-separated export with a header row to /tmp/phenos.tsv on the
# cluster filesystem, the same path the likings export used.
# mode="overwrite" replaces whatever is already there, so no separate
# `hadoop fs -rm` is needed. That command, run with check=True, raised when
# the path did not exist, which made this cell depend on an earlier export
# having been written.
df.write.csv(
    "/tmp/phenos.tsv",
    sep="\t",
    header=True,
    emptyValue="NA",
    mode="overwrite",
)

In [96]:
# Clear the local copy left by the earlier likings download. If the
# destination already exists as a directory, `hadoop fs -get` does not
# replace it: it places the source inside it or fails on existing files, so
# the merge step's `../tmp/phenos.tsv/part*` glob would read the old part
# files instead of the nutrient export.
shutil.rmtree("../tmp/phenos.tsv", ignore_errors=True)

# Copy the Spark output directory from the cluster filesystem to local disk;
# check=True raises CalledProcessError if the copy fails.
subprocess.run(
    ["hadoop", "fs", "-get", "/tmp/phenos.tsv", "../tmp/phenos.tsv"],
    check=True,
    shell=False,
)

CompletedProcess(args=['hadoop', 'fs', '-get', '/tmp/phenos.tsv', '../tmp/phenos.tsv'], returncode=0)

In [97]:
# Merge the downloaded part files into a single TSV of nutrient / food-weight
# values. Each part file is expected to carry its own header row (the export
# was written with header=True and the first column is named "xeid"). sed
# deletes every line starting with "xeid" from line 3 of the combined stream
# onward, which leaves the first header in place.
# NOTE(review): the range starts at line 3, so a repeated header on line 2
# (first part file containing no data rows) would be kept -- confirm intended.
!sed -e '3,${/^xeid/d' -e '}' ../tmp/phenos.tsv/part* > ../tmp/nutrients.raw.tsv

In [98]:
# Upload the merged nutrient table with the `dx` command-line client,
# targeting the Data/phenotypes/ path; check=True raises if the upload fails.
upload_cmd = ["dx", "upload", "../tmp/nutrients.raw.tsv", "--path", "Data/phenotypes/"]
subprocess.run(upload_cmd, shell=False, check=True)

CompletedProcess(args=['dx', 'upload', '../tmp/nutrients.raw.tsv', '--path', 'Data/phenotypes/'], returncode=0)