# SCRIPT TO OBTAIN PHENOTYPIC QUANTITATIVE TRAITS

## This script should only be run once

#### Initialization
##### Load packages

In [40]:
import re
import subprocess

import dxdata
import dxpy
import pyspark

from pyspark.sql import functions as F
from pyspark.conf import SparkConf
from pyspark.sql.types import StringType

from pathlib import Path
from phenotypes2 import get_pheno_fields, concatenate, new_names, get_age_sex

# Create the local "../tmp" scratch directory (and any missing parents) that the
# export cells write into; exist_ok=True makes this a no-op if it already exists.
Path("../tmp").resolve().mkdir(parents=True, exist_ok=True)

##### Spark and dataset configuration 

In [2]:
# Reuse the active SparkContext when one exists: a bare `pyspark.SparkContext()`
# raises if this cell is executed a second time in the same kernel.
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)
# A threshold of -1 turns off automatic broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Look up the project's database object (name matching "app*" in the root
# folder) and make it the current database for Spark SQL.
dispensed_database_name = dxpy.find_one_data_object(
    classname="database",
    name="app*", folder="/", name_mode="glob",
    describe=True
)["describe"]["name"]
spark.sql("USE " + dispensed_database_name)

# Look up the Dataset object ("app*.dataset" in the root folder) and load it.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset", name="app*.dataset", folder="/", name_mode="glob"
)["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

# Entity that every field lookup and retrieval below goes through.
participant = dataset["participant"]

#### Data
##### Retrieve quantitative values of given fields

In [51]:
# Covariate fields, looked up through the helper imported from phenotypes2
# (presumably returns the dataset's field names for these IDs - verify).
age_sex_fields = get_age_sex(participant, fields=["31", "21022"])

# Quantitative phenotype fields; the numeric field IDs are passed as strings.
quantitative_fields = get_pheno_fields(
    participant,
    fields=[
        "48", "49", "21001", "21002", "23099", "23127",
        "78", "30750", "30880", "26019", "4079", "4080",
    ],
)

# "eid" is listed first; the renaming cell relies on it being the first column
# (assumes `concatenate` preserves argument order - TODO confirm).
field_names = concatenate(["eid"], age_sex_fields, quantitative_fields)

# Fetch the selected fields as a Spark DataFrame over a dxdata connection.
df = participant.retrieve_fields(engine=dxdata.connect(), names=field_names)

##### DataFrame formatting

In [59]:
def new_names(s: str) -> str:
    """
    Return the updated column name for PEACOK/PHESANT use.

    Every "p" and "i" character is removed from the name. If the result ends
    in ``_<digits>``, that suffix is rewritten as ``-0.<digits>``; otherwise
    ``-0.0`` is appended.

    Note: this definition shadows the ``new_names`` imported from
    ``phenotypes2`` in the imports cell.

    Input(s):
    - s: field name with any array suffix already stripped,
      e.g. "p48_i0" or "p31"

    Output(s):
    - Updated field name, e.g. "48-0.0" or "31-0.0"
    """
    # Presumably "p" / "i" are the field and instance markers of names such as
    # "p48_i0"; note that *every* "p" and "i" in the string is removed.
    s = s.replace("p", "").replace("i", "")

    # `\d+` rather than `\d`: a suffix of 10 or more must be converted as well
    # instead of being left in place with "-0.0" tacked on behind it.
    renamed, n_subs = re.subn(r"_(\d+)$", r"-0.\1", s)

    return renamed if n_subs else f"{s}-0.0"

In [60]:
# Drop arrays: keep only array element 0 of every field by dropping each column
# whose name ends in a non-zero array index (_a1, _a2, _a10, ...). A plain
# `"a1" in x` test would leave _a2.. columns behind, and those would collide
# with the _a0 column once the suffix is stripped below.
to_drop = [x for x in df.columns if re.search(r"_a[1-9]\d*$", x)]
df = df.drop(*to_drop)

# Rename columns: strip the trailing array suffix (raw string, so `\d` is a
# regex class and not an invalid string escape), then convert every column but
# the first, which is assumed to be "eid" and becomes "userId".
# NOTE: not idempotent - re-running this cell renames the renamed columns again.
colnames = [re.sub(r"_a\d+$", "", x) for x in df.columns]
colnames = ["userId"] + [new_names(s) for s in colnames[1:]]

df = df.toDF(*colnames)

In [None]:
# Print schema (column names and types) of the renamed DataFrame as a check
# before it is exported
df.printSchema()

##### Export and upload DataFrame

In [68]:
# Collapse to a single partition before writing: with header=True Spark writes
# the header line into every part file, and the `hadoop fs -getmerge` step that
# follows concatenates all part files, so a multi-partition write would leave
# header rows scattered through the merged table.
# NOTE(review): emptyValue only controls how empty strings are written; nulls
# follow nullValue (default: empty field) - confirm "NA" is not expected for
# missing values.
df.coalesce(1).write.csv("/tmp/phenos_QT.tsv", sep="\t", header=True, emptyValue="NA")

In [None]:
# Merge the part files written under /tmp/phenos_QT.tsv into one local TSV, then
# upload that TSV (the .csv name used here before does not exist at this point).
!hadoop fs -getmerge /tmp/phenos_QT.tsv ../tmp/phenos_PEACOK.QT.raw.tsv
!dx upload ../tmp/phenos_PEACOK.QT.raw.tsv --path /WGS_Javier/Data/phenotypes/ --brief

In [72]:
# Convert the merged local TSV to CSV by replacing tabs with commas. The CSV is
# written into ../tmp so that the upload below finds it (it used to be written
# to the working directory), and the input is the local merge result rather
# than the project mount, so this cell does not depend on the uploaded copy.
!sed 's/\t/,/g' ../tmp/phenos_PEACOK.QT.raw.tsv > ../tmp/phenos_PEACOK.QT.raw.csv
!dx upload ../tmp/phenos_PEACOK.QT.raw.csv --path /WGS_Javier/Data/phenotypes/ --brief