# SCRIPT TO GENERATE COVARIATES

## This script should only be run once

#### Initialization 
##### Load packages

In [None]:
import subprocess
from packaging import version

import dxdata
import dxpy
import pyspark
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

from src.fields import fields_for_id

##### Spark and dataset configuration

In [None]:
# Reuse an already-running SparkContext when there is one, so re-executing
# this cell does not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession(sc)
# A threshold of -1 turns off automatic broadcast joins.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

# Look up the single Dataset object in the project root whose name matches
# the glob "app*.dataset" and keep its object ID.
dispensed_dataset_id = dxpy.find_one_data_object(
    typename="Dataset",
    name="app*.dataset",
    folder="/",
    name_mode="glob",
)["id"]
dataset = dxdata.load_dataset(id=dispensed_dataset_id)

# Entity from which the covariate fields are retrieved further down.
participant = dataset["participant"]

#### Data
##### Retrieve covariates and the first 20 PCs

In [None]:
# Field IDs to pull from the participant entity.
fields = [
    "21022",  # Age at recruitment
    "22001",  # Genetic sex
    "22009",  # Genetic principal components
]

# Resolve every field ID to its matching field objects, then flatten to a list
# of column names with the participant ID ("eid") first.
# NOTE(review): fields_for_id is assumed to return an iterable of objects
# exposing `.name` (one per array index for multi-valued fields) - confirm
# against src.fields.
fields_per_id = [fields_for_id(field_id, participant) for field_id in fields]
field_names = ["eid"] + [field.name for group in fields_per_id for field in group]

# Map the first 20 principal-component columns (p22009_a1 .. p22009_a20) to
# PC1 .. PC20.
pcs = {f"p22009_a{i}": f"PC{i}" for i in range(1, 21)}
# Columns of the final covariate table, in output order.
covs = ["FID", "IID", "SEX", "AGE", "AGE2", "AGESEX", "AGE2SEX"] + list(pcs.values())

In [None]:
# Fetch the requested columns, keeping raw coded values rather than labels.
raw_df = participant.retrieve_fields(
    names=field_names,
    engine=dxdata.connect(),
    coding_values="raw",
)

# Complete cases only: discard any row with a null in any retrieved column.
complete_df = raw_df.dropna(how="any")

# Reusable column expressions for the derived covariates.
int_type = IntegerType()
age_col = F.col("p21022")
sex_col = F.col("p22001")
age_sq_col = age_col ** 2

# Rename the principal-component columns listed in `pcs`; every other column
# keeps its original name.
renamed_cols = [
    F.col(name).alias(pcs.get(name, name)) for name in complete_df.columns
]

# Build the covariate table: both ID columns are copies of eid, followed by
# sex, age, age squared and the age/sex interaction terms (all cast to
# integer), and finally only the columns named in `covs`, in that order.
df = (
    complete_df.select(renamed_cols)
    .withColumn("FID", F.col("eid"))
    .withColumn("IID", F.col("eid"))
    .withColumn("SEX", sex_col.cast(int_type))
    .withColumn("AGE", age_col.cast(int_type))
    .withColumn("AGE2", age_sq_col.cast(int_type))
    .withColumn("AGESEX", (age_col * sex_col).cast(int_type))
    .withColumn("AGE2SEX", (age_sq_col * sex_col).cast(int_type))
    .select(*covs)
)

##### Save and Export

In [None]:
# Write the covariate table as tab-separated text with a header row.
# Collapsing to a single partition first means only one part file (and thus
# only one header line) is produced under the output path.
single_partition_df = df.coalesce(1)
single_partition_df.write.csv(
    path="/tmp/covariates.tsv",
    header=True,
    sep="\t",
)

In [None]:
# Upload DataFrame
!hadoop fs -getmerge /tmp/covariates.tsv ../tmp/covariates.tsv
!dx upload ../tmp/covariates.tsv /Data/Input_regenie/