# SCRIPT TO OBTAIN PHENOTYPIC BINARY TRAITS

## This script should only be run once

#### Initialization
##### Load packages

In [None]:
import re
import subprocess

import dxdata
import dxpy
import pyspark

from pyspark.sql import functions as F
from pyspark.conf import SparkConf
from pyspark.sql.types import StringType

from pathlib import Path
from src.phenotypes import get_pheno_fields, concatenate, new_names, get_age_sex

# Make sure the local scratch directory (../tmp, resolved to an absolute
# path) exists; it is the destination of the merged export further down.
scratch_dir = Path("../tmp").resolve()
scratch_dir.mkdir(parents=True, exist_ok=True)

##### Spark and dataset configuration 

In [None]:
# Start Spark and build a SQL session on top of the context.
sc = pyspark.SparkContext()
spark = pyspark.sql.SparkSession(sc)
# A threshold of -1 switches off automatic broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Find the database object whose name matches "app*" in the project root
# and make it the active database for Spark SQL.
database_record = dxpy.find_one_data_object(
    classname="database",
    name="app*",
    folder="/",
    name_mode="glob",
    describe=True,
)
dispensed_database_name = database_record["describe"]["name"]
spark.sql(f"USE {dispensed_database_name}")

# Find the Dataset object matching "app*.dataset" in the project root and
# load it through dxdata.
dataset_record = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)
dispensed_dataset_id = dataset_record["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

# Entity that the phenotype fields are retrieved from below.
participant = dataset["participant"]

#### Data
##### Retrieve binary values of given fields

In [None]:
# Field IDs handed to the project helpers.
# NOTE(review): 31 / 21022 are presumably sex and age at recruitment - confirm.
age_sex_ids = ["31", "21022"]
binary_ids = [
    "131688",
    "131690",
    "130706",
    "130708",
    "130792",
    "2443",
    "131674",
    "131676",
    "2463",
    "20511",
    "1687",
    "1697",
]

age_sex_fields = get_age_sex(participant, fields=age_sex_ids)
binary_fields = get_pheno_fields(participant, fields=binary_ids)

# "eid" goes first; the renaming cell relies on it being the first column.
field_names = concatenate(["eid"], age_sex_fields, binary_fields)

# Pull the selected fields into a DataFrame through a dxdata connection.
engine = dxdata.connect()
df = participant.retrieve_fields(names=field_names, engine=engine)

##### DataFrame formatting

In [None]:
def new_names(s: str) -> str:
    """
    Return the column name rewritten as ``<field>-0.<index>`` for PEACOK
    (PHESANT-style) use.

    Every "p" and "i" character is removed from the name. If what remains
    ends in ``_<digits>``, that suffix is replaced by ``-0.<digits>``;
    otherwise ``-0.0`` is appended.

    Input(s):
    - s: field (column) name, e.g. "p2443_i1"

    Output(s):
    - Updated field name, e.g. "2443-0.1"
    """
    # Drops ALL "p"/"i" characters, not only the prefixes; this is fine for
    # purely numeric field IDs such as "p2443_i0".
    s = s.replace("p", "").replace("i", "")

    # One or more trailing digits: a single-digit pattern would leave a name
    # such as "2443_10" unconverted and produce "2443_10-0.0".
    match = re.search(r"_(\d+)$", s)
    if match is None:
        return s + "-0.0"

    # Replace the "_<digits>" suffix with "-0.<digits>".
    return f"{s[:match.start()]}-0.{match.group(1)}"

In [None]:
# Rename columns: drop any "_a<digit>" marker from each name, then convert
# every column except the first with new_names(); the first column is
# renamed to "xeid".
# The pattern is a raw string so that "\d" is not an invalid string escape.
colnames = [re.sub(r"_a\d", "", x) for x in df.columns]
colnames = ["xeid"] + [new_names(s) for s in colnames[1:]]

df = df.toDF(*colnames)

In [None]:
# Print the DataFrame schema (column names and types) for a visual check
df.printSchema()

##### Export and upload DataFrame

In [None]:
# Export the DataFrame as tab-separated text with a header row.
# NOTE(review): emptyValue controls how empty strings are written; nulls are
# governed by nullValue - confirm missing values come out as intended.
csv_options = {"sep": "\t", "header": True, "emptyValue": "NA"}
df.write.csv("/tmp/phenos_BT.tsv", **csv_options)

In [None]:
# Concatenate the files written under /tmp/phenos_BT.tsv into one local file
# in ../tmp.
# NOTE(review): the export was written with header=True; if it produced
# several part files, each presumably carries its own header row and the
# merged file would repeat it - verify.
!hadoop fs -getmerge /tmp/phenos_BT.tsv ../tmp/phenos_PEACOK.BT.raw.tsv
# Upload the merged TSV to the project folder /WGS_Javier/Data/phenotypes/.
!dx upload ../tmp/phenos_PEACOK.BT.raw.tsv --path  /WGS_Javier/Data/phenotypes/ --brief

In [None]:
# Replace every tab with a comma to get a CSV copy in the working directory.
# NOTE(review): this reads the uploaded file through /mnt/project rather than
# the local ../tmp copy - confirm the mount already shows the fresh upload.
# NOTE(review): a plain tab-to-comma swap does not quote values that contain
# commas - confirm none of the exported fields do.
!sed 's/\t/,/g' /mnt/project/WGS_Javier/Data/phenotypes/phenos_PEACOK.BT.raw.tsv > phenos_PEACOK.BT.raw.csv
# Upload the CSV copy to the same project folder.
!dx upload phenos_PEACOK.BT.raw.csv --path  /WGS_Javier/Data/phenotypes/ --brief