### 1st part: include modification of code to have drugId and studyId

In [1]:
#### 08.01.2025
#### ALL PHASES 
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
    
)
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence")

credible = spark.read.parquet(f"{path}credibleSet")

### index with new fix" "gs://ot-team/irene/gentropy/study_index_2412_fixed"
index = spark.read.parquet(f"gs://ot-team/irene/gentropy/study_index_2412_fixed")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")


#### Fixing scXQTL as XQTLs:
## code provided by @ireneisdoomed
pd.DataFrame.iteritems = pd.DataFrame.items

raw_studies_metadata_schema: StructType = StructType(
        [
            StructField("study_id", StringType(), True),
            StructField("dataset_id", StringType(), True),
            StructField("study_label", StringType(), True),
            StructField("sample_group", StringType(), True),
            StructField("tissue_id", StringType(), True),
            StructField("tissue_label", StringType(), True),
            StructField("condition_label", StringType(), True),
            StructField("sample_size", IntegerType(), True),
            StructField("quant_method", StringType(), True),
            StructField("pmid", StringType(), True),
            StructField("study_type", StringType(), True),
        ]
    )
raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
            pd.read_csv(raw_studies_metadata_path, sep="\t"),
            schema=raw_studies_metadata_schema,
        )

#index = spark.read.parquet("gs://open-targets-pre-data-releases/24.12-uo_test-3/output/genetics/parquet/study_index")

study_index_w_correct_type = (
    study_table.select(
        F.concat_ws(
            "_",
            F.col("study_label"),
            F.col("quant_method"),
            F.col("sample_group"),
        ).alias("extracted_column"),
        "study_type",
    )
    .join(
        index
        # Get eQTL Catalogue studies
        .filter(F.col("studyType") != "gwas")
        .filter(~F.col("studyId").startswith("UKB_PPP"))
        # Remove measured trait
        .withColumn(
            "extracted_column",
            F.regexp_replace(F.col("studyId"), r"(_ENS.*|_ILMN.*|_X.*|_[0-9]+:.*)", ""),
        )
        .withColumn(
            "extracted_column",
            # After the previous cleanup, there are multiple traits from the same publication starting with the gene symbol that need to be removed (e.g. `Sun_2018_aptamer_plasma_ANXA2.4961.17.1..1`)
            F.when(
                F.col("extracted_column").startswith("Sun_2018_aptamer_plasma"),
                F.lit("Sun_2018_aptamer_plasma"),
            ).otherwise(F.col("extracted_column")),
        ),
        on="extracted_column",
        how="right",
    )
    .persist()
)

fixed = (
    study_index_w_correct_type.withColumn(
        "toFix",
        F.when(
            (F.col("study_type") != "single-cell")
            & (F.col("studyType").startswith("sc")),
            F.lit(True),
        ).otherwise(F.lit(False)),
    )
    # Remove the substring "sc" from the studyType column
    .withColumn(
        "newStudyType",
        F.when(
            F.col("toFix"), F.regexp_replace(F.col("studyType"), r"sc", "")
        ).otherwise(F.col("studyType")),
    )
    .drop("toFix", "extracted_column", "study_type")
).persist()
all_studies = index.join(
    fixed.selectExpr("studyId", "newStudyType"), on="studyId", how="left"
).persist()
fixedIndex = all_studies.withColumn(
    "studyType",
    F.when(F.col("newStudyType").isNotNull(), F.col("newStudyType")).otherwise(
        F.col("studyType")
    ),
).drop("newStudyType")
#### fixed  

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        fixedIndex.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId", "geneId", "projectId", "studyType as indexStudyType", "condition", "biosampleId"
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
).persist()

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

datasource_filter = [
    "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path, datasource_filter
)

drugApproved=spark.read.parquet("gs://ot-team/irene/l2g/validation/chembl_w_flags").drop("clinicalTrialId","isComplex"
).withColumn("isApproved", F.when(F.col("isApproved")=="true", F.lit(1)).otherwise(F.lit(0))).distinct()


spark session created at 2025-01-17 11:20:40.774677
Analysis started on 2025-01-17 at  2025-01-17 11:20:40.774677


25/01/17 11:20:46 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

#### 2nd part

In [2]:
### include drugId and studyId becuase there are some studies where there are multiple drugs. 
# Everywthing will become coherent in chembl assoc since this
### 1nd version of 

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .join(drugApproved.filter(F.col("isApproved")==1), on=["targetId","diseaseId","drugId"], how="left")
        .withColumn(
            "maxClinPhase", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId","drugId", "studyId","clinicalPhase")), ### this is key to understand what we do
        )
        .withColumn(
        "approvedDrug",
        F.max(F.col("isApproved")).over(Window.partitionBy("targetId", "diseaseId")),
        )
        ## do not consider drugId so when there are two drugs with diferent MoA at the same clinical phase we will not include it (coherency==coherent)
        .groupBy("targetId", "diseaseId", "studyId","drugId","clinicalPhase","maxClinPhase","approvedDrug")  ## we can remove drugID
        .pivot("homogenized")
        .count()
    )
    .withColumnRenamed("studyId","clinicalStudyId")
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", 
        "coherencyOneCell", 
        "noEvaluable", 
        "GoF_risk", 
        "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

benchmark = (
    (
        resolvedColoc.filter(F.col("betaGwas") < 0)
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="inner"
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )
    .filter(F.col("name") != "COVID-19")  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

                                                                                

### Bring examples for the benchmark: how the numbers changed the statistics between pqtl, for instance. 

In [3]:
def comparisons_df_iterative(disdic,projectId):
    toAnalysis=[(key, value) for key, value in disdic.items() if value == projectId]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            #("nPhase4", "clinical"),
            #("nPhase>=3", "clinical"),
            #("nPhase>=2", "clinical"),
            #("nPhase>=1", "clinical"),
            ("approved", "clinical"),

            # ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)

In [39]:
list(set(group1) - set("maxClinPhase_TD"))

['maxClinPhase_TD', 'targetId', 'diseaseId', 'approvedDrug']

In [None]:
### check if the tiem where we take the maxclinphase is right or not. 

In [23]:
from pyspark.sql.functions import monotonically_increasing_id


In [4]:
chembl_indication=discrepancifier((assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .join(drugApproved.filter(F.col("isApproved")==1), on=["targetId","diseaseId","drugId"], how="left")
        ).withColumn(
        "approvedDrug",
        F.max(F.col("isApproved")).over(Window.partitionBy("targetId", "diseaseId","drugId")),
        ).groupBy("targetId", "diseaseId", "studyId","drugId","clinicalPhase","approvedDrug"
        ).pivot("homogenized").count()
    ).withColumnRenamed("studyId","clinicalStudyId"
        ).filter(F.col("coherencyDiagonal") == "coherent"
        ).drop(
        "coherencyDiagonal", 
        "coherencyOneCell", 
        "noEvaluable", 
        "GoF_risk", 
        "LoF_risk"
    ).withColumnRenamed("GoF_protect", "drugGoF_protect"
    ).withColumnRenamed("LoF_protect", "drugLoF_protect")

new_benchmark = (
    (
        resolvedColoc.filter(F.col("betaGwas") < 0)
        .join(  ### select just GWAS giving protection
            chembl_indication, on=["targetId", "diseaseId"], how="inner"
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )
    .filter(F.col("name") != "COVID-19")  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

In [9]:
# List of columns to aggregate
numeric_columns = ["0.5", "1.0", "2.0", "3.0", "4.0"]

# Escape column names
escaped_columns = [f"`{c}`" for c in numeric_columns]

# Create aggregation expressions
agg_expressions = [F.sum(F.col(c)).alias(c.strip("`")) for c in escaped_columns]



new_benchmark.groupBy("targetId","diseaseId"
).pivot("clinicalPhase").count().withColumn("id", F.monotonically_increasing_id()
).agg(*agg_expressions).show()



+---+-----+-----+-----+-----+
|0.5|  1.0|  2.0|  3.0|  4.0|
+---+-----+-----+-----+-----+
|771|27407|29184|76940|63229|
+---+-----+-----+-----+-----+



                                                                                

In [None]:
new_benchmark.select("clinicalPhase","clinicalStudyId").distinct().groupBy("clinicalStudyId").pivot("clinicalPhase").count().agg(*agg_expressions).show()



+---+---+---+----+---+
|0.5|1.0|2.0| 3.0|4.0|
+---+---+---+----+---+
| 13|623|590|1308|863|
+---+---+---+----+---+



                                                                                

In [None]:
new_benchmark.select("clinicalPhase","clinicalStudyId","rightStudyType"
).distinct().groupBy("rightStudyType"
).pivot("clinicalPhase"
).count().show()



+--------------+----+---+---+----+---+
|rightStudyType| 0.5|1.0|2.0| 3.0|4.0|
+--------------+----+---+---+----+---+
|          sqtl|   2|169|198| 516|309|
|          pqtl|null| 66| 89| 293| 94|
|         tuqtl|   6|199|264| 561|341|
|          eqtl|  13|592|528|1198|825|
|       sctuqtl|   4|101|161| 332|222|
|        sceqtl|   3|129|205| 442|336|
|        scsqtl|null| 42| 54| 141| 72|
+--------------+----+---+---+----+---+



                                                                                

In [None]:
new_benchmark.filter(F.col("clinicalStudyId").isNotNull()).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
).select(
    "targetId",
    "diseaseId",
    "clinicalPhase",
    "clinicalStudyId",
    "rightStudyType",
    "agree_lowestPval").distinct(

    ).groupBy("rightStudyType","agree_lowestPVal"
    ).pivot("clinicalPhase"
    ).count().sort(F.col("rightStudyType").desc()).show()



+--------------+----------------+----+---+---+---+---+
|rightStudyType|agree_lowestPVal| 0.5|1.0|2.0|3.0|4.0|
+--------------+----------------+----+---+---+---+---+
|         tuqtl|             yes|   2|197|207|637|448|
|         tuqtl|              no|   4| 27| 74| 88| 41|
|          sqtl|              no|   2|114|163|442|353|
|          sqtl|             yes|null| 79| 51|230|104|
|       sctuqtl|              no|   2| 48| 53|147|121|
|       sctuqtl|             yes|   2| 53|110|194|101|
|        scsqtl|              no|null|  9|  4|  2|  8|
|        scsqtl|             yes|null| 33| 50|141| 63|
|        sceqtl|             yes|   2| 79|123|321|275|
|        sceqtl|              no|   1| 51| 84|162| 84|
|          pqtl|             yes|null| 39| 64|161| 48|
|          pqtl|              no|null| 27| 27|133| 46|
|          eqtl|             yes|   8|320|324|748|392|
|          eqtl|              no|   5|350|256|853|683|
+--------------+----------------+----+---+---+---+---+



                                                                                

In [15]:
window_spec = Window.partitionBy("targetId", "diseaseId","rightStudyType").orderBy(F.col("pValueExponent").asc())

new_benchmark.filter(F.col("clinicalStudyId").isNotNull()).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
).select(
    "targetId",
    "diseaseId",
    "clinicalPhase",
    "clinicalStudyId",
    #"drugId",
    "rightStudyType",
    "agree_lowestPval").distinct(
    ).withColumn(
        "clinicalPhase", ### no longer maxclinphase for T-D
        F.max("clinicalPhase").over(Window.partitionBy("targetId","diseaseId","rightStudyType"))).withColumn(
        "Phase4",
        F.when(F.col("clinicalPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
        ).withColumn(
        "Phase>=3",
        F.when(F.col("clinicalPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
        "Phase>=2",
        F.when(F.col("clinicalPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
        "Phase>=1",
        F.when(F.col("clinicalPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy("rightStudyType","agree_lowestPVal"
        ).pivot("Phase4").count().sort(F.col("rightStudyType").desc()
        ).select("rightStudyType","agree_lowestPVal","yes","no").show()



+--------------+----------------+----+---+
|rightStudyType|agree_lowestPVal| yes| no|
+--------------+----------------+----+---+
|         tuqtl|             yes|1114| 99|
|         tuqtl|              no| 456| 56|
|          sqtl|              no| 718| 76|
|          sqtl|             yes| 718| 26|
|       sctuqtl|              no| 353| 19|
|       sctuqtl|             yes| 420| 39|
|        scsqtl|              no|  19|  4|
|        scsqtl|             yes| 286|  1|
|        sceqtl|             yes| 725| 75|
|        sceqtl|              no| 355| 27|
|          pqtl|             yes| 290| 22|
|          pqtl|              no| 212| 21|
|          eqtl|             yes|1609|126|
|          eqtl|              no|2064|140|
+--------------+----------------+----+---+



                                                                                

In [None]:
window_spec = Window.partitionBy("targetId", "diseaseId","rightStudyType").orderBy(F.col("pValueExponent").asc())

new_benchmark.filter(F.col("clinicalStudyId").isNotNull()).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
).select(
    "targetId",
    "diseaseId",
    "clinicalPhase",
    "clinicalStudyId",
    "drugId",
    "rightStudyType",
    "agree_lowestPval").distinct(
    ).withColumn(
        "clinicalPhase", ### no longer maxclinphase for T-D
        F.max("clinicalPhase").over(Window.partitionBy("targetId","diseaseId","rightStudyType"))).withColumn(
        "Phase4",
        F.when(F.col("clinicalPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
        ).withColumn(
        "Phase>=3",
        F.when(F.col("clinicalPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
        "Phase>=2",
        F.when(F.col("clinicalPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
        "Phase>=1",
        F.when(F.col("clinicalPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy("rightStudyType","agree_lowestPVal"
        ).pivot("Phase4").count().sort(F.col("rightStudyType").desc()
        ).select("rightStudyType","agree_lowestPVal","yes","no").show()

In [51]:
new_benchmark.select("targetId","diseaseId","clinicalPhase","clinicalStudyId","rightStudyType"
).distinct().groupBy("rightStudyType"
).pivot("clinicalPhase"
).count().show()



+--------------+----+---+---+----+----+
|rightStudyType| 0.5|1.0|2.0| 3.0| 4.0|
+--------------+----+---+---+----+----+
|          sqtl|   2|193|214| 675| 478|
|          pqtl|null| 66| 91| 296| 115|
|         tuqtl|   6|224|282| 730| 517|
|          eqtl|  13|670|582|1610|1126|
|       sctuqtl|   4|101|163| 343| 235|
|        sceqtl|   3|130|207| 486| 378|
|        scsqtl|null| 42| 54| 143|  76|
+--------------+----+---+---+----+----+



                                                                                

In [47]:
new_benchmark.select("clinicalStudyId").distinct().count()

                                                                                

3395

In [24]:
new_benchmark.withColumn("approved_l", F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no"))).groupBy("approvedDrug","approved_l").count().show()



+------------+----------+------+
|approvedDrug|approved_l| count|
+------------+----------+------+
|        null|        no| 40787|
|           1|       yes|156744|
+------------+----------+------+



                                                                                

In [39]:
new_benchmark.withColumn(
"approved", ### no longer maxclinphase for T-D
F.max("approvedDrug").over(Window.partitionBy("targetId","diseaseId")
)).withColumn("approved2",F.when(F.col("approved")==1, F.lit("yes")).otherwise(F.lit("no"))
).groupBy("targetId","diseaseId","approved","approved2").agg(F.size(F.collect_set("approved")).alias("size")).sort(F.col("approved").desc()).show()



+---------------+-------------+--------+---------+----+
|       targetId|    diseaseId|approved|approved2|size|
+---------------+-------------+--------+---------+----+
|ENSG00000105397|  EFO_0000540|       1|      yes|   1|
|ENSG00000132170|  EFO_0000400|       1|      yes|   1|
|ENSG00000105397|  EFO_0000676|       1|      yes|   1|
|ENSG00000110944|  EFO_0000676|       1|      yes|   1|
|ENSG00000110944|  EFO_1001494|       1|      yes|   1|
|ENSG00000010310|  EFO_0000400|       1|      yes|   1|
|ENSG00000112164|  EFO_0000400|       1|      yes|   1|
|ENSG00000043591|  EFO_0000319|       1|      yes|   1|
|ENSG00000112164|MONDO_0005148|       1|      yes|   1|
|ENSG00000065989|  EFO_0000676|       1|      yes|   1|
|ENSG00000113161|  EFO_0000400|       1|      yes|   1|
|ENSG00000006071|MONDO_0005148|       1|      yes|   1|
|ENSG00000113161|MONDO_0005148|       1|      yes|   1|
|ENSG00000105397|  EFO_0003778|       1|      yes|   1|
|ENSG00000113302|  EFO_0000540|       1|      ye

                                                                                

In [31]:
new_benchmark.withColumn("approved_l", F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no"))).groupBy("targetId","diseaseId").agg(F.size(F.collect_set("approved_l")).alias("size")).sort(F.col("size").desc()).show()



+---------------+-------------+----+
|       targetId|    diseaseId|size|
+---------------+-------------+----+
|ENSG00000105397|  EFO_0000540|   2|
|ENSG00000157388|  EFO_0000319|   2|
|ENSG00000105397|  EFO_0000676|   2|
|ENSG00000113161|MONDO_0005148|   2|
|ENSG00000145777|MONDO_0004979|   2|
|ENSG00000113302|  EFO_0000540|   2|
|ENSG00000065989|  EFO_0000676|   2|
|ENSG00000113302|  EFO_0000676|   2|
|ENSG00000006071|MONDO_0005148|   2|
|ENSG00000113302|  EFO_1001494|   2|
|ENSG00000105397|  EFO_0003778|   2|
|ENSG00000115232|  EFO_0000384|   2|
|ENSG00000110944|  EFO_1001494|   2|
|ENSG00000126218|  EFO_0003827|   2|
|ENSG00000112164|MONDO_0005148|   2|
|ENSG00000126218|  EFO_0004286|   2|
|ENSG00000043591|  EFO_0000319|   2|
|ENSG00000110944|  EFO_0000676|   2|
|ENSG00000112164|  EFO_0000400|   2|
|ENSG00000113161|  EFO_0000400|   2|
+---------------+-------------+----+
only showing top 20 rows



                                                                                

In [43]:
df.to_csv("gs://ot-team/jroldan/analysis/globalqtlResults.csv")

In [47]:
import numpy as np
def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *

#### columsn to groupBy - introduce a dictionary for trying different list 
partitionByPValue=["targetId", "diseaseId", "rightStudyType"]

#### description of groups to make groupBy
## partitionForPhase
groupPhase1 = ["targetId","diseaseId"]
groupPhase2 = ["targetId","diseaseId","clinicalPhase"]
groupPhase3 = ["targetId","diseaseId","clinicalStudyId"] 
groupPhase4 = ["targetId","diseaseId","clinicalPhase","clinicalStudyId"] ## will take each clinical phase and study
groupPhase5 = ["targetId","diseaseId","clinicalPhase","clinicalStudyId","drugId"] ## will take max clin phase for DrugId, clinical phase and clnical study
groupPhase6 = ["targetId","diseaseId","clinicalPhase","drugId"] ## will take each max clinical phase per drugId
groupPhase7 = ["targetId","diseaseId","drugId"]

group_phases = {
        "targetIdDiseaseID": groupPhase1,
    "clinicalPhase": groupPhase2,
    "clinicalStudy": groupPhase3,
    "clinicalPhase&ClinicalStudy": groupPhase4,
    "clinicalPhaseClinicalStudyIdDrugId": groupPhase5,
    "clinicalPhaseDrugId": groupPhase6,
    "drugId": groupPhase7,

}

## partitionForRows
groupPhase1 = ["targetId","diseaseId","clinicalPhase"]
groupPhase2 = ["targetId","diseaseId","clinicalPhase","clinicalStudyId"] ## will take each clinical phase and study
groupPhase3 = ["targetId","diseaseId","clinicalPhase","clinicalStudyId","drugId"] ## will take max clin phase for DrugId, clinical phase and clnical study
groupPhase4 = ["targetId","diseaseId","clinicalPhase","drugId"] ## will take each max clinical phase per drugId
# same groups but containing approved 
groupPhase5 = ["targetId","diseaseId","clinicalPhase","approved_l"]
groupPhase6 = ["targetId","diseaseId","clinicalPhase","clinicalStudyId","approved_l"] ## will take each clinical phase and study
groupPhase7 = ["targetId","diseaseId","clinicalPhase","clinicalStudyId","drugId","approved_l"] ## will take max clin phase for DrugId, clinical phase and clnical study
groupPhase8 = ["targetId","diseaseId","clinicalPhase","drugId","approved_l"] ## will take each max clinical phase per drugId

group_rows = {
    "clinicalPhase": groupPhase1,
    "clinicalPhaseClinicalStudyId": groupPhase2,
    "clinicalPhaseClinicalStudyIdDrugId": groupPhase3,
    "clinicalPhaseDrugId": groupPhase4,
    "clinicalPhaseApproved_l": groupPhase5,
    "clinicalPhaseClinicalStudyIdApproved_l": groupPhase6,
    "clinicalPhaseClinicalStudyIdDrugIdApproved_l": groupPhase7,
    "clinicalPhaseDrugIdApproved_l": groupPhase8,

}

value_analysis=["pqtl","eqtl","sqtl"]
### define the window to order for taking pValue 
window_spec = Window.partitionBy(*partitionByPValue).orderBy(F.col("pValueExponent").asc())
results=[]

for value in value_analysis:
    # Iterate over group mapping
    for group_phase, group_phase_columns in group_phases.items():
        print("making group phases:", group_phase_columns)
        for group_row, group_rows_columns in group_rows.items():
                print("making group rows:", group_rows_columns)
                x = value
                print(value,x)

                if "approved_l" in group_rows_columns: 
                        
                        pre=new_benchmark.withColumn("approved_l", F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no"))
                                ).filter(F.col("clinicalStudyId").isNotNull()
                                ).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
                                ).withColumn(
                                "clinicalPhase", ### no longer maxclinphase for T-D
                                F.max("clinicalPhase").over(Window.partitionBy(*group_phase_columns))
                                ).withColumn(
                                "approved", ### no longer maxclinphase for T-D
                                F.max("approvedDrug").over(Window.partitionBy(*group_phase_columns))
                                ).groupBy(*group_rows_columns).pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
                                ).withColumn(
                                        "Phase4",
                                        F.when(F.col("clinicalPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
                                ).withColumn(
                                        "Phase>=3",
                                        F.when(F.col("clinicalPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
                                ).withColumn(
                                        "Phase>=2",
                                        F.when(F.col("clinicalPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
                                ).withColumn(
                                        "Phase>=1",
                                        F.when(F.col("clinicalPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
                                ).withColumn(
                                        "approved",
                                                F.when(F.col("approved")==1, F.lit("yes")).otherwise(F.lit("no")),
                                ).withColumn(
                                        x, 
                                        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
                                ).persist()
                        for phase in ["Phase4","approved"]:
                                
                                print("groupby for", phase)
                                #pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc()).show()

                                array1 = np.delete(pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc())
                                .fillna(0)
                                .toPandas()
                                .to_numpy(),
                                [0],
                                1,
                                )
                                total = np.sum(array1)
                                res_npPhaseX = np.array(array1, dtype=int)
                                resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
                                resx_CI = convertTuple(
                                odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
                                )
                                print(round(float(resX.split(",")[0]), 2),float(resX.split(",")[1]),
                                round(float(resx_CI.split(",")[0]), 2),
                                round(float(resx_CI.split(",")[1]), 2))
                                print("\n")
                                results.append([
                                        partitionByPValue,
                                        group_phase,
                                        group_row,
                                        phase,
                                        x,
                                        round(float(resX.split(",")[0]), 2), ## OR
                                        float(resX.split(",")[1]), ## pValue
                                        round(float(resx_CI.split(",")[0]), 2), ## Low CI
                                        round(float(resx_CI.split(",")[1]), 2), ## High CI
                                        total,
                                        array1])
                        pre.unpersist()
                else:
                        pre=new_benchmark.withColumn("approved_l", F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no"))
                        ).filter(F.col("clinicalStudyId").isNotNull()
                        ).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
                        ).withColumn(
                        "clinicalPhase", ### no longer maxclinphase for T-D
                        F.max("clinicalPhase").over(Window.partitionBy(*group_phase_columns))
                        ).groupBy(*group_rows_columns).pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
                        ).withColumn(
                                "Phase4",
                                F.when(F.col("clinicalPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
                        ).withColumn(
                                "Phase>=3",
                                F.when(F.col("clinicalPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
                        ).withColumn(
                                "Phase>=2",
                                F.when(F.col("clinicalPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
                        ).withColumn(
                                "Phase>=1",
                                F.when(F.col("clinicalPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
                        ).withColumn(
                                x, 
                                F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
                        ).persist()
                        for phase in ["Phase4"]:
                                
                                print("groupby for", phase)
                                #pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc()).show()

                                array1 = np.delete(pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc())
                                .fillna(0)
                                .toPandas()
                                .to_numpy(),
                                [0],
                                1,
                                )
                                total = np.sum(array1)
                                res_npPhaseX = np.array(array1, dtype=int)
                                resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
                                resx_CI = convertTuple(
                                odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
                                )
                                print(round(float(resX.split(",")[0]), 2),float(resX.split(",")[1]),
                                round(float(resx_CI.split(",")[0]), 2),
                                round(float(resx_CI.split(",")[1]), 2))
                                print("\n")
                                results.append([
                                        partitionByPValue,
                                        group_phase,
                                        group_row,
                                        phase,
                                        x,
                                        round(float(resX.split(",")[0]), 2), ## OR
                                        float(resX.split(",")[1]), ## pValue
                                        round(float(resx_CI.split(",")[0]), 2), ## Low CI
                                        round(float(resx_CI.split(",")[1]), 2), ## High CI
                                        total,
                                        array1])
                        pre.unpersist()
df = pd.DataFrame(
results,
columns=["partitionByPValue",
        "partitionForPhase",
        "groupByForRows",
        "phase",
        "x",
        "OR",
        "pValue",
        "LowCI",
        "HighCI",
        "total",
        "array"

],
)

making group phases: ['targetId', 'diseaseId']
making group rows: ['targetId', 'diseaseId', 'clinicalPhase']
pqtl pqtl


                                                                                

groupby for Phase4


                                                                                

2.57 0.04795664956148016 0.99 7.05


making group rows: ['targetId', 'diseaseId', 'clinicalPhase', 'clinicalStudyId']
pqtl pqtl


                                                                                

groupby for Phase4


                                                                                

1.1 0.7410886427563347 0.7 1.81


making group rows: ['targetId', 'diseaseId', 'clinicalPhase', 'clinicalStudyId', 'drugId']
pqtl pqtl


                                                                                

groupby for Phase4


                                                                                

0.97 0.9099031384690608 0.63 1.58


making group rows: ['targetId', 'diseaseId', 'clinicalPhase', 'drugId']
pqtl pqtl


25/01/17 12:59:43 WARN CacheManager: Asked to cache already cached data.        


groupby for Phase4
2.33 0.008770489571684605 1.21 4.78


making group rows: ['targetId', 'diseaseId', 'clinicalPhase', 'approved_l']
pqtl pqtl


                                                                                

AnalysisException: Column 'approved' does not exist. Did you mean one of the following? [approved_l, Phase4, diseaseId, pqtl, targetId, Phase>=1, Phase>=2, Phase>=3, eqtl, sceqtl, scsqtl, sctuqtl, sqtl, tuqtl, clinicalPhase];
'Project [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196, eqtl#1521713, pqtl#1521715, sceqtl#1521717, scsqtl#1521719, sctuqtl#1521721, sqtl#1521723, tuqtl#1521725, Phase4#1521737, Phase>=3#1521750, Phase>=2#1521764, Phase>=1#1521779, CASE WHEN ('approved = 1) THEN yes ELSE no END AS approved#1521795]
+- Project [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196, eqtl#1521713, pqtl#1521715, sceqtl#1521717, scsqtl#1521719, sctuqtl#1521721, sqtl#1521723, tuqtl#1521725, Phase4#1521737, Phase>=3#1521750, Phase>=2#1521764, CASE WHEN (clinicalPhase#1520297 >= cast(1 as double)) THEN yes ELSE no END AS Phase>=1#1521779]
   +- Project [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196, eqtl#1521713, pqtl#1521715, sceqtl#1521717, scsqtl#1521719, sctuqtl#1521721, sqtl#1521723, tuqtl#1521725, Phase4#1521737, Phase>=3#1521750, CASE WHEN (clinicalPhase#1520297 >= cast(2 as double)) THEN yes ELSE no END AS Phase>=2#1521764]
      +- Project [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196, eqtl#1521713, pqtl#1521715, sceqtl#1521717, scsqtl#1521719, sctuqtl#1521721, sqtl#1521723, tuqtl#1521725, Phase4#1521737, CASE WHEN (clinicalPhase#1520297 >= cast(3 as double)) THEN yes ELSE no END AS Phase>=3#1521750]
         +- Project [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196, eqtl#1521713, pqtl#1521715, sceqtl#1521717, scsqtl#1521719, sctuqtl#1521721, sqtl#1521723, tuqtl#1521725, CASE WHEN (clinicalPhase#1520297 = cast(4 as double)) THEN yes ELSE no END AS Phase4#1521737]
            +- Aggregate [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196], [targetId#4743, diseaseId#4892, clinicalPhase#1520297, approved_l#1520196, collect_set(if ((rightStudyType#1046 <=> cast(eqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS eqtl#1521713, collect_set(if ((rightStudyType#1046 <=> cast(pqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS pqtl#1521715, collect_set(if ((rightStudyType#1046 <=> cast(sceqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS sceqtl#1521717, collect_set(if ((rightStudyType#1046 <=> cast(scsqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS scsqtl#1521719, collect_set(if ((rightStudyType#1046 <=> cast(sctuqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS sctuqtl#1521721, collect_set(if ((rightStudyType#1046 <=> cast(sqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS sqtl#1521723, collect_set(if ((rightStudyType#1046 <=> cast(tuqtl as string))) agree_lowestPVal#1520246 else cast(null as string), 0, 0) AS tuqtl#1521725]
               +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 26 more fields]
                  +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 27 more fields]
                     +- Window [max(approvedDrug#18011) windowspecdefinition(targetId#4743, diseaseId#4892, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS approved#1520348], [targetId#4743, diseaseId#4892]
                        +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 25 more fields]
                           +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 25 more fields]
                              +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 27 more fields]
                                 +- Window [max(clinicalPhase#5985) windowspecdefinition(targetId#4743, diseaseId#4892, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS clinicalPhase#1520297], [targetId#4743, diseaseId#4892]
                                    +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 25 more fields]
                                       +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 25 more fields]
                                          +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 26 more fields]
                                             +- Window [first(AgreeDrug#23696, false) windowspecdefinition(targetId#4743, diseaseId#4892, rightStudyType#1046, pValueExponent#4792 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS agree_lowestPval#1520246], [targetId#4743, diseaseId#4892, rightStudyType#1046], [pValueExponent#4792 ASC NULLS FIRST]
                                                +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 24 more fields]
                                                   +- Filter isnotnull(clinicalStudyId#23611)
                                                      +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 24 more fields]
                                                         +- Project [biosampleId#1012, targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, ... 23 more fields]
                                                            +- Join LeftOuter, (biosampleId#1012 = biosampleId#1093)
                                                               :- Filter NOT (name#691 = COVID-19)
                                                               :  +- Project [targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, ... 22 more fields]
                                                               :     +- Project [targetId#4743, diseaseId#4892, leftStudyLocusId#1043, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, ... 21 more fields]
                                                               :        +- Join Inner, ((targetId#4743 = targetId#5966) AND (diseaseId#4892 = diseaseId#6066))
                                                               :           :- Filter (betaGwas#4427 < cast(0 as double))
                                                               :           :  +- Project [diseaseId#4892, leftStudyLocusId#1043, targetId#4743, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, ... 15 more fields]
                                                               :           :     +- Project [diseaseId#4892, leftStudyLocusId#1043, targetId#4743, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, ... 14 more fields]
                                                               :           :        +- Project [diseaseId#4892, leftStudyLocusId#1043, targetId#4743, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, ... 15 more fields]
                                                               :           :           +- Generate explode(concat(array(diseaseId#818), parents#694)), true, [diseaseId#4892]
                                                               :           :              +- Project [diseaseId#818, leftStudyLocusId#1043, targetId#4743, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, ... 15 more fields]
                                                               :           :                 +- Join LeftOuter, (diseaseId#818 = diseaseId#4846)
                                                               :           :                    :- Project [leftStudyLocusId#1043, targetId#4743, rightStudyId#3055, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012, datasourceId#717, ... 12 more fields]
                                                               :           :                    :  +- Join Inner, ((leftStudyLocusId#1043 = leftStudyLocusId#4767) AND (targetId#4743 = targetId#718))
                                                               :           :                    :     :- Project [rightStudyId#3055, rightStudyLocusId#1044, leftStudyLocusId#1043, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, geneId#984 AS targetId#4743, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012]
                                                               :           :                    :     :  +- Project [rightStudyId#3055, rightStudyLocusId#1044, leftStudyLocusId#1043, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightVariantId#3056, credibleRightStudyType#3057, geneId#984, projectId#985, indexStudyType#3108, condition#1008, biosampleId#1012]
                                                               :           :                    :     :     +- Join LeftOuter, (rightStudyId#3055 = rightStudyId#3107)
                                                               :           :                    :     :        :- Project [rightStudyLocusId#1044, leftStudyLocusId#1043, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032, rightStudyId#3055, rightVariantId#3056, credibleRightStudyType#3057]
                                                               :           :                    :     :        :  +- Join LeftOuter, (rightStudyLocusId#1044 = rightStudyLocusId#3054)
                                                               :           :                    :     :        :     :- Project [leftStudyLocusId#1043, rightStudyLocusId#1044, chromosome#1045, rightStudyType#1046, numberColocalisingVariants#1047L, h0#1048, h1#1049, h2#1050, h3#1051, h4#1052, colocalisationMethod#1053, betaRatioSignAverage#1054, leftStudyId#3030, leftVariantId#3031, credibleLeftStudyType#3032]
                                                               :           :                    :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#1043 = leftStudyLocusId#3029)
                                                               :           :                    :     :        :     :     :- Relation [leftStudyLocusId#1043,rightStudyLocusId#1044,chromosome#1045,rightStudyType#1046,numberColocalisingVariants#1047L,h0#1048,h1#1049,h2#1050,h3#1051,h4#1052,colocalisationMethod#1053,betaRatioSignAverage#1054] parquet
                                                               :           :                    :     :        :     :     +- Project [studyLocusId#931 AS leftStudyLocusId#3029, StudyId#932 AS leftStudyId#3030, variantId#933 AS leftVariantId#3031, studyType#956 AS credibleLeftStudyType#3032]
                                                               :           :                    :     :        :     :        +- Relation [studyLocusId#931,studyId#932,variantId#933,chromosome#934,position#935,region#936,beta#937,zScore#938,pValueMantissa#939,pValueExponent#940,effectAlleleFrequencyFromSource#941,standardError#942,subStudyDescription#943,qualityControls#944,finemappingMethod#945,credibleSetIndex#946,credibleSetlog10BF#947,purityMeanR2#948,purityMinR2#949,locusStart#950,locusEnd#951,sampleSize#952,ldSet#953,locus#954,... 2 more fields] parquet
                                                               :           :                    :     :        :     +- Project [studyLocusId#3062 AS rightStudyLocusId#3054, studyId#3063 AS rightStudyId#3055, variantId#3064 AS rightVariantId#3056, studyType#3087 AS credibleRightStudyType#3057]
                                                               :           :                    :     :        :        +- Relation [studyLocusId#3062,studyId#3063,variantId#3064,chromosome#3065,position#3066,region#3067,beta#3068,zScore#3069,pValueMantissa#3070,pValueExponent#3071,effectAlleleFrequencyFromSource#3072,standardError#3073,subStudyDescription#3074,qualityControls#3075,finemappingMethod#3076,credibleSetIndex#3077,credibleSetlog10BF#3078,purityMeanR2#3079,purityMinR2#3080,locusStart#3081,locusEnd#3082,sampleSize#3083,ldSet#3084,locus#3085,... 2 more fields] parquet
                                                               :           :                    :     :        +- Project [studyId#983 AS rightStudyId#3107, geneId#984, projectId#985, studyType#2967 AS indexStudyType#3108, condition#1008, biosampleId#1012]
                                                               :           :                    :     :           +- Project [studyId#983, geneId#984, projectId#985, studyType#2967, traitFromSource#987, traitFromSourceMappedIds#988, biosampleFromSourceId#989, pubmedId#990, publicationTitle#991, publicationFirstAuthor#992, publicationDate#993, publicationJournal#994, backgroundTraitFromSourceMappedIds#995, initialSampleSize#996, nCases#997, nControls#998, nSamples#999, cohorts#1000, ldPopulationStructure#1001, discoverySamples#1002, replicationSamples#1003, qualityControls#1004, analysisFlags#1005, summarystatsLocation#1006, ... 6 more fields]
                                                               :           :                    :     :              +- Project [studyId#983, geneId#984, projectId#985, CASE WHEN isnotnull(newStudyType#1427) THEN newStudyType#1427 ELSE studyType#986 END AS studyType#2967, traitFromSource#987, traitFromSourceMappedIds#988, biosampleFromSourceId#989, pubmedId#990, publicationTitle#991, publicationFirstAuthor#992, publicationDate#993, publicationJournal#994, backgroundTraitFromSourceMappedIds#995, initialSampleSize#996, nCases#997, nControls#998, nSamples#999, cohorts#1000, ldPopulationStructure#1001, discoverySamples#1002, replicationSamples#1003, qualityControls#1004, analysisFlags#1005, summarystatsLocation#1006, ... 7 more fields]
                                                               :           :                    :     :                 +- Project [studyId#983, geneId#984, projectId#985, studyType#986, traitFromSource#987, traitFromSourceMappedIds#988, biosampleFromSourceId#989, pubmedId#990, publicationTitle#991, publicationFirstAuthor#992, publicationDate#993, publicationJournal#994, backgroundTraitFromSourceMappedIds#995, initialSampleSize#996, nCases#997, nControls#998, nSamples#999, cohorts#1000, ldPopulationStructure#1001, discoverySamples#1002, replicationSamples#1003, qualityControls#1004, analysisFlags#1005, summarystatsLocation#1006, ... 7 more fields]
                                                               :           :                    :     :                    +- Join LeftOuter, (studyId#983 = studyId#2130)
                                                               :           :                    :     :                       :- Relation [studyId#983,geneId#984,projectId#985,studyType#986,traitFromSource#987,traitFromSourceMappedIds#988,biosampleFromSourceId#989,pubmedId#990,publicationTitle#991,publicationFirstAuthor#992,publicationDate#993,publicationJournal#994,backgroundTraitFromSourceMappedIds#995,initialSampleSize#996,nCases#997,nControls#998,nSamples#999,cohorts#1000,ldPopulationStructure#1001,discoverySamples#1002,replicationSamples#1003,qualityControls#1004,analysisFlags#1005,summarystatsLocation#1006,... 6 more fields] parquet
                                                               :           :                    :     :                       +- Project [studyId#2130, newStudyType#1427]
                                                               :           :                    :     :                          +- Project [studyId#2130, geneId#2131, projectId#2132, studyType#2133, traitFromSource#2134, traitFromSourceMappedIds#2135, biosampleFromSourceId#2136, pubmedId#2137, publicationTitle#2138, publicationFirstAuthor#2139, publicationDate#2140, publicationJournal#2141, backgroundTraitFromSourceMappedIds#2142, initialSampleSize#2143, nCases#2144, nControls#2145, nSamples#2146, cohorts#2147, ldPopulationStructure#2148, discoverySamples#2149, replicationSamples#2150, qualityControls#2151, analysisFlags#2152, summarystatsLocation#2153, ... 7 more fields]
                                                               :           :                    :     :                             +- Project [extracted_column#1169, study_type#1121, studyId#2130, geneId#2131, projectId#2132, studyType#2133, traitFromSource#2134, traitFromSourceMappedIds#2135, biosampleFromSourceId#2136, pubmedId#2137, publicationTitle#2138, publicationFirstAuthor#2139, publicationDate#2140, publicationJournal#2141, backgroundTraitFromSourceMappedIds#2142, initialSampleSize#2143, nCases#2144, nControls#2145, nSamples#2146, cohorts#2147, ldPopulationStructure#2148, discoverySamples#2149, replicationSamples#2150, qualityControls#2151, ... 10 more fields]
                                                               :           :                    :     :                                +- Project [extracted_column#1169, study_type#1121, studyId#2130, geneId#2131, projectId#2132, studyType#2133, traitFromSource#2134, traitFromSourceMappedIds#2135, biosampleFromSourceId#2136, pubmedId#2137, publicationTitle#2138, publicationFirstAuthor#2139, publicationDate#2140, publicationJournal#2141, backgroundTraitFromSourceMappedIds#2142, initialSampleSize#2143, nCases#2144, nControls#2145, nSamples#2146, cohorts#2147, ldPopulationStructure#2148, discoverySamples#2149, replicationSamples#2150, qualityControls#2151, ... 9 more fields]
                                                               :           :                    :     :                                   +- Project [extracted_column#1169, study_type#1121, studyId#2130, geneId#2131, projectId#2132, studyType#2133, traitFromSource#2134, traitFromSourceMappedIds#2135, biosampleFromSourceId#2136, pubmedId#2137, publicationTitle#2138, publicationFirstAuthor#2139, publicationDate#2140, publicationJournal#2141, backgroundTraitFromSourceMappedIds#2142, initialSampleSize#2143, nCases#2144, nControls#2145, nSamples#2146, cohorts#2147, ldPopulationStructure#2148, discoverySamples#2149, replicationSamples#2150, qualityControls#2151, ... 8 more fields]
                                                               :           :                    :     :                                      +- Join RightOuter, (extracted_column#1133 = extracted_column#1169)
                                                               :           :                    :     :                                         :- Project [concat_ws(_, study_label#1113, quant_method#1119, sample_group#1114) AS extracted_column#1133, study_type#1121]
                                                               :           :                    :     :                                         :  +- LogicalRDD [study_id#1111, dataset_id#1112, study_label#1113, sample_group#1114, tissue_id#1115, tissue_label#1116, condition_label#1117, sample_size#1118, quant_method#1119, pmid#1120, study_type#1121], false
                                                               :           :                    :     :                                         +- Project [studyId#2130, geneId#2131, projectId#2132, studyType#2133, traitFromSource#2134, traitFromSourceMappedIds#2135, biosampleFromSourceId#2136, pubmedId#2137, publicationTitle#2138, publicationFirstAuthor#2139, publicationDate#2140, publicationJournal#2141, backgroundTraitFromSourceMappedIds#2142, initialSampleSize#2143, nCases#2144, nControls#2145, nSamples#2146, cohorts#2147, ldPopulationStructure#2148, discoverySamples#2149, replicationSamples#2150, qualityControls#2151, analysisFlags#2152, summarystatsLocation#2153, ... 7 more fields]
                                                               :           :                    :     :                                            +- Project [studyId#2130, geneId#2131, projectId#2132, studyType#2133, traitFromSource#2134, traitFromSourceMappedIds#2135, biosampleFromSourceId#2136, pubmedId#2137, publicationTitle#2138, publicationFirstAuthor#2139, publicationDate#2140, publicationJournal#2141, backgroundTraitFromSourceMappedIds#2142, initialSampleSize#2143, nCases#2144, nControls#2145, nSamples#2146, cohorts#2147, ldPopulationStructure#2148, discoverySamples#2149, replicationSamples#2150, qualityControls#2151, analysisFlags#2152, summarystatsLocation#2153, ... 7 more fields]
                                                               :           :                    :     :                                               +- Filter NOT StartsWith(studyId#2130, UKB_PPP)
                                                               :           :                    :     :                                                  +- Filter NOT (studyType#2133 = gwas)
                                                               :           :                    :     :                                                     +- Relation [studyId#2130,geneId#2131,projectId#2132,studyType#2133,traitFromSource#2134,traitFromSourceMappedIds#2135,biosampleFromSourceId#2136,pubmedId#2137,publicationTitle#2138,publicationFirstAuthor#2139,publicationDate#2140,publicationJournal#2141,backgroundTraitFromSourceMappedIds#2142,initialSampleSize#2143,nCases#2144,nControls#2145,nSamples#2146,cohorts#2147,ldPopulationStructure#2148,discoverySamples#2149,replicationSamples#2150,qualityControls#2151,analysisFlags#2152,summarystatsLocation#2153,... 6 more fields] parquet
                                                               :           :                    :     +- Project [studyLocusId#796 AS leftStudyLocusId#4767, datasourceId#717, targetId#718, datatypeId#747, diseaseFromSourceMappedId#751, resourceScore#787, targetFromSourceId#803, diseaseId#818, id#819, score#820, sourceId#823, studyId#4784, variantId#4785, betaGwas#4427, pValueExponent#4792]
                                                               :           :                    :        +- Project [studyLocusId#796, datasourceId#717, targetId#718, datatypeId#747, diseaseFromSourceMappedId#751, resourceScore#787, targetFromSourceId#803, diseaseId#818, id#819, score#820, sourceId#823, studyId#4784, variantId#4785, betaGwas#4427, pValueExponent#4792]
                                                               :           :                    :           +- Join LeftOuter, (studyLocusId#796 = studyLocusId#4783)
                                                               :           :                    :              :- Project [datasourceId#717, targetId#718, datatypeId#747, diseaseFromSourceMappedId#751, resourceScore#787, studyLocusId#796, targetFromSourceId#803, diseaseId#818, id#819, score#820, sourceId#823]
                                                               :           :                    :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
                                                               :           :                    :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,assays#723,assessments#724,beta#725,betaConfidenceIntervalLower#726,betaConfidenceIntervalUpper#727,biologicalModelAllelicComposition#728,biologicalModelGeneticBackground#729,biologicalModelId#730,biomarkerList#731,biomarkerName#732,biomarkers#733,biosamplesFromSource#734,cellLineBackground#735,cellType#736,clinicalPhase#737,clinicalSignificances#738,clinicalStatus#739,cohortDescription#740,... 83 more fields] parquet
                                                               :           :                    :              +- Project [studyLocusId#4783, studyId#4784, variantId#4785, beta#4789 AS betaGwas#4427, pValueExponent#4792]
                                                               :           :                    :                 +- Relation [studyLocusId#4783,studyId#4784,variantId#4785,chromosome#4786,position#4787,region#4788,beta#4789,zScore#4790,pValueMantissa#4791,pValueExponent#4792,effectAlleleFrequencyFromSource#4793,standardError#4794,subStudyDescription#4795,qualityControls#4796,finemappingMethod#4797,credibleSetIndex#4798,credibleSetlog10BF#4799,purityMeanR2#4800,purityMinR2#4801,locusStart#4802,locusEnd#4803,sampleSize#4804,ldSet#4805,locus#4806,... 2 more fields] parquet
                                                               :           :                    +- Project [id#687 AS diseaseId#4846, name#691, parents#694, therapeuticAreas#699]
                                                               :           :                       +- Relation [id#687,code#688,dbXRefs#689,description#690,name#691,directLocationIds#692,obsoleteTerms#693,parents#694,synonyms#695,ancestors#696,descendants#697,children#698,therapeuticAreas#699,indirectLocationIds#700,ontology#701] parquet
                                                               :           +- Project [targetId#5966, diseaseId#6066, clinicalStudyId#23611, drugId#6003, clinicalPhase#5985, approvedDrug#18011, drugGoF_protect#23633L, LoF_protect#21269L AS drugLoF_protect#23642L]
                                                               :              +- Project [targetId#5966, diseaseId#6066, clinicalStudyId#23611, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L AS drugGoF_protect#23633L, LoF_protect#21269L]
                                                               :                 +- Project [targetId#5966, diseaseId#6066, clinicalStudyId#23611, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L]
                                                               :                    +- Filter (coherencyDiagonal#23584 = coherent)
                                                               :                       +- Project [targetId#5966, diseaseId#6066, studyId#6043 AS clinicalStudyId#23611, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L, GoF_risk#21282, LoF_risk#23155, noEvaluable#23359, coherencyDiagonal#23584, coherencyOneCell#23597]
                                                               :                          +- Project [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L, GoF_risk#21282, LoF_risk#23155, noEvaluable#23359, coherencyDiagonal#23584, CASE WHEN ((((isnull(LoF_risk#23155) AND isnull(LoF_protect#21269L)) AND isnull(GoF_risk#21282)) AND isnull(GoF_protect#21268L)) AND isnull(noEvaluable#23359)) THEN noEvid WHEN ((((isnull(LoF_risk#23155) AND isnull(LoF_protect#21269L)) AND isnull(GoF_risk#21282)) AND isnull(GoF_protect#21268L)) AND isnotnull(noEvaluable#23359)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#23155) OR isnotnull(LoF_protect#21269L)) OR isnotnull(GoF_risk#21282)) OR isnotnull(GoF_protect#21268L)) THEN CASE WHEN (isnotnull(LoF_risk#23155) AND ((isnull(LoF_protect#21269L) AND isnull(GoF_risk#21282)) AND isnull(GoF_protect#21268L))) THEN coherent WHEN (isnotnull(GoF_risk#21282) AND ((isnull(LoF_protect#21269L) AND isnull(LoF_risk#23155)) AND isnull(GoF_protect#21268L))) THEN coherent WHEN (isnotnull(LoF_protect#21269L) AND ((isnull(LoF_risk#23155) AND isnull(GoF_risk#21282)) AND isnull(GoF_protect#21268L))) THEN coherent WHEN (isnotnull(GoF_protect#21268L) AND ((isnull(LoF_protect#21269L) AND isnull(GoF_risk#21282)) AND isnull(LoF_risk#23155))) THEN coherent ELSE dispar END END AS coherencyOneCell#23597]
                                                               :                             +- Project [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L, GoF_risk#21282, LoF_risk#23155, noEvaluable#23359, CASE WHEN ((((isnull(LoF_risk#23155) AND isnull(LoF_protect#21269L)) AND isnull(GoF_risk#21282)) AND isnull(GoF_protect#21268L)) AND isnull(noEvaluable#23359)) THEN noEvid WHEN ((((isnull(LoF_risk#23155) AND isnull(LoF_protect#21269L)) AND isnull(GoF_risk#21282)) AND isnull(GoF_protect#21268L)) AND isnotnull(noEvaluable#23359)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#23155) OR isnotnull(LoF_protect#21269L)) OR isnotnull(GoF_risk#21282)) OR isnotnull(GoF_protect#21268L)) THEN CASE WHEN (isnotnull(GoF_risk#21282) AND isnotnull(LoF_risk#23155)) THEN dispar WHEN (isnotnull(LoF_protect#21269L) AND isnotnull(LoF_risk#23155)) THEN dispar WHEN (isnotnull(GoF_protect#21268L) AND isnotnull(GoF_risk#21282)) THEN dispar WHEN (isnotnull(GoF_protect#21268L) AND isnotnull(LoF_protect#21269L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#23584]
                                                               :                                +- Project [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L, GoF_risk#21282, LoF_risk#23155, null AS noEvaluable#23359]
                                                               :                                   +- Project [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L, GoF_risk#21282, null AS LoF_risk#23155]
                                                               :                                      +- Project [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, GoF_protect#21268L, LoF_protect#21269L, null AS GoF_risk#21282]
                                                               :                                         +- Project [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, __pivot_count(1) AS count AS `count(1) AS count`#21267[0] AS GoF_protect#21268L, __pivot_count(1) AS count AS `count(1) AS count`#21267[1] AS LoF_protect#21269L]
                                                               :                                            +- Aggregate [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011], [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, pivotfirst(homogenized#8340, count(1) AS count#21261L, GoF_protect, LoF_protect, 0, 0) AS __pivot_count(1) AS count AS `count(1) AS count`#21267]
                                                               :                                               +- Aggregate [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, homogenized#8340], [targetId#5966, diseaseId#6066, studyId#6043, drugId#6003, clinicalPhase#5985, approvedDrug#18011, homogenized#8340, count(1) AS count(1) AS count#21261L]
                                                               :                                                  +- Project [targetId#5966, diseaseId#6066, drugId#6003, datasourceId#5965, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, ... 98 more fields]
                                                               :                                                     +- Project [targetId#5966, diseaseId#6066, drugId#6003, datasourceId#5965, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, ... 99 more fields]
                                                               :                                                        +- Window [max(isApproved#11257) windowspecdefinition(targetId#5966, diseaseId#6066, drugId#6003, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS approvedDrug#18011], [targetId#5966, diseaseId#6066, drugId#6003]
                                                               :                                                           +- Project [targetId#5966, diseaseId#6066, drugId#6003, datasourceId#5965, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, ... 97 more fields]
                                                               :                                                              +- Project [targetId#5966, diseaseId#6066, drugId#6003, datasourceId#5965, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, ... 97 more fields]
                                                               :                                                                 +- Join LeftOuter, (((targetId#5966 = targetId#11240) AND (diseaseId#6066 = diseaseId#11242)) AND (drugId#6003 = drugId#11241))
                                                               :                                                                    :- Filter ((datasourceId#5965 = chembl) AND NOT (homogenized#8340 = noEvaluable))
                                                               :                                                                    :  +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 96 more fields]
                                                               :                                                                    :     +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 95 more fields]
                                                               :                                                                    :        +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 95 more fields]
                                                               :                                                                    :           +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 95 more fields]
                                                               :                                                                    :              +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 96 more fields]
                                                               :                                                                    :                 +- Window [collect_set(intogen_function#7855, 0, 0) windowspecdefinition(targetId#5966, diseaseId#6066, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#7978], [targetId#5966, diseaseId#6066]
                                                               :                                                                    :                    +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 94 more fields]
                                                               :                                                                    :                       +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 94 more fields]
                                                               :                                                                    :                          +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 93 more fields]
                                                               :                                                                    :                             +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 92 more fields]
                                                               :                                                                    :                                +- Join LeftOuter, ((drugId2#6786 = drugId#6003) AND (targetId2#6793 = targetId#5966))
                                                               :                                                                    :                                   :- Join LeftOuter, (target_id#6836 = targetId#5966)
                                                               :                                                                    :                                   :  :- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, concat_ws(,, clinicalSignificances#5986) AS clinicalSignificances#7058, clinicalStatus#5987, cohortDescription#5988, ... 83 more fields]
                                                               :                                                                    :                                   :  :  +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#5986, clinicalStatus#5987, cohortDescription#5988, ... 83 more fields]
                                                               :                                                                    :                                   :  :     +- Project [datasourceId#5965, targetId#5966, alleleOrigins#5967, allelicRequirements#5968, ancestry#5969, ancestryId#5970, assays#5971, assessments#5972, cast(beta#5973 as double) AS beta#6842, betaConfidenceIntervalLower#5974, betaConfidenceIntervalUpper#5975, biologicalModelAllelicComposition#5976, biologicalModelGeneticBackground#5977, biologicalModelId#5978, biomarkerList#5979, biomarkerName#5980, biomarkers#5981, biosamplesFromSource#5982, cellLineBackground#5983, cellType#5984, clinicalPhase#5985, clinicalSignificances#5986, clinicalStatus#5987, cohortDescription#5988, ... 83 more fields]
                                                               :                                                                    :                                   :  :        +- Filter datasourceId#5965 IN (ot_genetics_portal,gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                               :                                                                    :                                   :  :           +- Relation [datasourceId#5965,targetId#5966,alleleOrigins#5967,allelicRequirements#5968,ancestry#5969,ancestryId#5970,assays#5971,assessments#5972,beta#5973,betaConfidenceIntervalLower#5974,betaConfidenceIntervalUpper#5975,biologicalModelAllelicComposition#5976,biologicalModelGeneticBackground#5977,biologicalModelId#5978,biomarkerList#5979,biomarkerName#5980,biomarkers#5981,biosamplesFromSource#5982,cellLineBackground#5983,cellType#5984,clinicalPhase#5985,clinicalSignificances#5986,clinicalStatus#5987,cohortDescription#5988,... 83 more fields] parquet
                                                               :                                                                    :                                   :  +- Project [id#6715 AS target_id#6836, approvedSymbol#6716, description#6821, description_splited#6825, TSorOncogene#6830]
                                                               :                                                                    :                                   :     +- Project [id#6715, approvedSymbol#6716, description#6821, description_splited#6825, CASE WHEN (RLIKE(description_splited#6825, ncogene) AND RLIKE(description_splited#6825, TSG)) THEN bivalent WHEN RLIKE(description_splited#6825, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#6825, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#6830]
                                                               :                                                                    :                                   :        +- Project [id#6715, approvedSymbol#6716, description#6821, concat_ws(,, description#6821) AS description_splited#6825]
                                                               :                                                                    :                                   :           +- Aggregate [id#6715, approvedSymbol#6716], [id#6715, approvedSymbol#6716, collect_set(description#6813, 0, 0) AS description#6821]
                                                               :                                                                    :                                   :              +- Filter description#6813 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                               :                                                                    :                                   :                 +- Project [id#6715, approvedSymbol#6716, col#6808.description AS description#6813]
                                                               :                                                                    :                                   :                    +- Project [id#6715, approvedSymbol#6716, col#6808]
                                                               :                                                                    :                                   :                       +- Generate explode(hallmarks#6725.attributes), true, [col#6808]
                                                               :                                                                    :                                   :                          +- Relation [id#6715,approvedSymbol#6716,biotype#6717,transcriptIds#6718,canonicalTranscript#6719,canonicalExons#6720,genomicLocation#6721,alternativeGenes#6722,approvedName#6723,go#6724,hallmarks#6725,synonyms#6726,symbolSynonyms#6727,nameSynonyms#6728,functionDescriptions#6729,subcellularLocations#6730,targetClass#6731,obsoleteSymbols#6732,obsoleteNames#6733,constraint#6734,tep#6735,proteinIds#6736,dbXrefs#6737,chemicalProbes#6738,... 4 more fields] parquet
                                                               :                                                                    :                                   +- Aggregate [targetId2#6793, drugId2#6786], [targetId2#6793, drugId2#6786, collect_set(actionType#6771, 0, 0) AS actionType#6803]
                                                               :                                                                    :                                      +- Project [targetId2#6793, drugId2#6786, actionType#6771, mechanismOfAction#6772]
                                                               :                                                                    :                                         +- Generate explode(targets#6776), true, [targetId2#6793]
                                                               :                                                                    :                                            +- Project [drugId2#6786, actionType#6771, mechanismOfAction#6772, targets#6776]
                                                               :                                                                    :                                               +- Generate explode(chemblIds#6773), true, [drugId2#6786]
                                                               :                                                                    :                                                  +- Relation [actionType#6771,mechanismOfAction#6772,chemblIds#6773,targetName#6774,targetType#6775,targets#6776,references#6777] parquet
                                                               :                                                                    +- Filter (isApproved#11257 = 1)
                                                               :                                                                       +- Deduplicate [targetId#11240, drugId#11241, diseaseId#11242, isApproved#11257]
                                                               :                                                                          +- Project [targetId#11240, drugId#11241, diseaseId#11242, CASE WHEN (isApproved#11244 = cast(true as boolean)) THEN 1 ELSE 0 END AS isApproved#11257]
                                                               :                                                                             +- Project [targetId#11240, drugId#11241, diseaseId#11242, isApproved#11244]
                                                               :                                                                                +- Relation [targetId#11240,drugId#11241,diseaseId#11242,clinicalTrialId#11243,isApproved#11244,isComplex#11245] parquet
                                                               +- Project [biosampleId#1093, biosampleName#1094]
                                                                  +- Relation [biosampleId#1093,biosampleName#1094,description#1095,xrefs#1096,synonyms#1097,parents#1098,ancestors#1099,children#1100,descendants#1101] parquet


In [10]:
df

Unnamed: 0,partitionByPValue,partitionForPhase,groupByForRows,phase,x,OR,pValue,LowCI,HighCI,total,array
0,"[targetId, diseaseId, rightStudyType]","[targetId, diseaseId]","[targetId, diseaseId, clinicalStudyId, clinica...",Phase4,pqtl,1.1,0.741089,0.7,1.81,4333,"[[290, 22], [3711, 310]]"
1,"[targetId, diseaseId, rightStudyType]","[targetId, diseaseId]","[targetId, diseaseId, clinicalStudyId, clinica...",Phase4,eqtl,1.38,0.006557,1.09,1.76,4333,"[[1718, 117], [2283, 215]]"
2,"[targetId, diseaseId, rightStudyType]","[targetId, diseaseId]","[targetId, diseaseId, clinicalStudyId, clinica...",Phase4,sqtl,1.39,0.138676,0.92,2.16,4333,"[[437, 27], [3564, 305]]"


In [22]:
import numpy as np
def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *

group1=["targetId", "diseaseId","maxClinPhase_TD"] ### canonical one
group2=["targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase_TD","approvedDrug"]
group3=["targetId", "diseaseId", "maxClinPhase_TD","approvedDrug"] ## maxClinPhase_TD
group4=["targetId", "diseaseId","drugId", "maxClinPhase_TD","approvedDrug"] ### maxClinPhase_TD
group5=["targetId", "diseaseId","drugId","clinicalStudyId", "maxClinPhase_TD","approvedDrug"] ## maxClinPhase_TD
group6=["targetId", "diseaseId","clinicalPhase"]

# Dictionary to map group names to lists
group_mapping = {
    "group1": group1,
    "group2": group2,
    "group3": group3,
    "group4": group4,
    "group5": group5,
    "group6": group6
}
clinical_phases=["Phase4","approved"]
remove=["maxClinPhase_TD", "approvedDrug"]

value_analysis=["pqtl","eqtl","sqtl"]
# Prepare aggregation depending on the variable problem
window_spec = Window.partitionBy("targetId", "diseaseId", "projectId").orderBy(F.col("pValueExponent").asc())
for value in value_analysis:
    # Iterate over group mapping
    for group_name, group_columns in group_mapping.items():

        if "approvedDrug" in group_columns:

            print(f"Processing group: {group_name}", "with variables:", group_columns)
            filtered_group1 = [item for item in group_columns if item not in remove]
            print(filtered_group1,"for", value)
            x = value
            pre=new_benchmark.filter(F.col("clinicalStudyId").isNotNull()).withColumn(
                    "maxClinPhase_TD", ### no longer maxclinphase for T-D
                    F.max("clinicalPhase").over(Window.partitionBy(filtered_group1))).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
            ).groupBy(group_columns).pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
            ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=3",
                F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=2",
                F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=1",
                F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
            "approved",
                F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                x, 
                F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
            )

            for phase in clinical_phases:
                
                print("groupby for", phase)
                pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc()).show()

                array1 = np.delete(pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc())
                .fillna(0)
                .toPandas()
                .to_numpy(),
                [0],
                1,
                )
                total = np.sum(array1)
                res_npPhaseX = np.array(array1, dtype=int)
                resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
                resx_CI = convertTuple(
                    odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
                )
                print(round(float(resX.split(",")[0]), 2),float(resX.split(",")[1]),
                    round(float(resx_CI.split(",")[0]), 2),
                    round(float(resx_CI.split(",")[1]), 2))
                print("\n")
        elif "group6" == group_name:
            print(f"Processing group: {group_name}", "with variables:", group_columns)
            filtered_group1 = [item for item in group_columns if item not in remove]
            print(filtered_group1,"for", value)
            x = value
            pre=new_benchmark.filter(F.col("clinicalStudyId").isNotNull()).withColumn(
                    "maxClinPhase_TD", ### no longer maxclinphase for T-D
                    F.max("clinicalPhase").over(Window.partitionBy(filtered_group1))).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
            ).groupBy(group_columns).pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
            ).withColumn(
            "Phase4",
            F.when(F.col("clinicalPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=3",
                F.when(F.col("clinicalPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=2",
                F.when(F.col("clinicalPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=1",
                F.when(F.col("clinicalPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                x, 
                F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
            )
            
            for phase in ["phase4"]:
                print("groupby for just phase 4 because it does not have approvedDrug columns")
                pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc()).show()

                array1 = np.delete(pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc())
                .fillna(0)
                .toPandas()
                .to_numpy(),
                [0],
                1,
                )
                total = np.sum(array1)
                res_npPhaseX = np.array(array1, dtype=int)
                resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
                resx_CI = convertTuple(
                    odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
                )
                print(round(float(resX.split(",")[0]), 2),float(resX.split(",")[1]),
                    round(float(resx_CI.split(",")[0]), 2),
                    round(float(resx_CI.split(",")[1]), 2))
                print("\n")
        else: 
            print(f"Processing group: {group_name}", "with variables:", group_columns)
            filtered_group1 = [item for item in group_columns if item not in remove]
            print(filtered_group1,"for", value)
            x = value
            pre=new_benchmark.filter(F.col("clinicalStudyId").isNotNull()).withColumn(
                    "maxClinPhase_TD", ### no longer maxclinphase for T-D
                    F.max("clinicalPhase").over(Window.partitionBy(filtered_group1))).withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
            ).groupBy(group_columns).pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
            ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=3",
                F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=2",
                F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                "Phase>=1",
                F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
            ).withColumn(
                x, 
                F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
            )
            
            for phase in ["phase4"]:
                print("groupby for just phase 4 because it does not have approvedDrug columns")
                pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc()).show()

                array1 = np.delete(pre.groupBy(x).pivot(phase).count().select(x,"yes","no").sort(F.col(x).desc())
                .fillna(0)
                .toPandas()
                .to_numpy(),
                [0],
                1,
                )
                total = np.sum(array1)
                res_npPhaseX = np.array(array1, dtype=int)
                resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
                resx_CI = convertTuple(
                    odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
                )
                print(round(float(resX.split(",")[0]), 2),float(resX.split(",")[1]),
                    round(float(resx_CI.split(",")[0]), 2),
                    round(float(resx_CI.split(",")[1]), 2))
                print("\n")

Processing group: group1 with variables: ['targetId', 'diseaseId', 'maxClinPhase_TD']
['targetId', 'diseaseId'] for pqtl


                                                                                

groupby for just phase 4 because it does not have approvedDrug columns


                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 16|  9|
|  no| 58| 84|
+----+---+---+



                                                                                

2.57 0.04795664956148016 0.99 7.05


Processing group: group2 with variables: ['targetId', 'diseaseId', 'clinicalStudyId', 'clinicalPhase', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'clinicalStudyId', 'clinicalPhase'] for pqtl


                                                                                

groupby for Phase4


                                                                                

+----+----+----+
|pqtl| yes|  no|
+----+----+----+
| yes|  48| 264|
|  no|1217|3100|
+----+----+----+



                                                                                

0.46 3.370334268822121e-07 0.33 0.64


groupby for approved


                                                                                

+----+----+----+
|pqtl| yes|  no|
+----+----+----+
| yes| 146| 166|
|  no|3160|1157|
+----+----+----+



                                                                                

0.32 3.1267369103605534e-21 0.25 0.41


Processing group: group3 with variables: ['targetId', 'diseaseId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId'] for pqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 24| 10|
|  no| 87| 87|
+----+---+---+



                                                                                

2.4 0.03783216519580829 1.03 5.95


groupby for approved


                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 10| 24|
|  no| 39|135|
+----+---+---+



                                                                                

1.44 0.382535727470141 0.56 3.45


Processing group: group4 with variables: ['targetId', 'diseaseId', 'drugId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'drugId'] for pqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 26| 43|
|  no|182|291|
+----+---+---+



                                                                                

0.97 1.0 0.55 1.67


groupby for approved


                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 20| 49|
|  no|139|334|
+----+---+---+



                                                                                

0.98 1.0 0.53 1.75


Processing group: group5 with variables: ['targetId', 'diseaseId', 'drugId', 'clinicalStudyId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'drugId', 'clinicalStudyId'] for pqtl


                                                                                

groupby for Phase4


                                                                                

+----+----+----+
|pqtl| yes|  no|
+----+----+----+
| yes|  53| 269|
|  no|1383|3455|
+----+----+----+



                                                                                

0.49 1.2493302270554325e-06 0.36 0.67


groupby for approved


                                                                                

+----+----+----+
|pqtl| yes|  no|
+----+----+----+
| yes| 153| 169|
|  no|3531|1307|
+----+----+----+



                                                                                

0.34 1.8105192263539285e-20 0.27 0.42


Processing group: group6 with variables: ['targetId', 'diseaseId', 'clinicalPhase']
['targetId', 'diseaseId', 'clinicalPhase'] for pqtl


                                                                                

groupby for just phase 4 because it does not have approvedDrug columns


                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 16| 39|
|  no| 58|261|
+----+---+---+



                                                                                

1.85 0.06800669903855017 0.9 3.65


Processing group: group1 with variables: ['targetId', 'diseaseId', 'maxClinPhase_TD']
['targetId', 'diseaseId'] for eqtl


                                                                                

groupby for just phase 4 because it does not have approvedDrug columns


                                                                                

+----+---+---+
|eqtl|yes| no|
+----+---+---+
| yes| 43| 51|
|  no| 31| 42|
+----+---+---+



                                                                                

1.14 0.7539103155787842 0.59 2.22


Processing group: group2 with variables: ['targetId', 'diseaseId', 'clinicalStudyId', 'clinicalPhase', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'clinicalStudyId', 'clinicalPhase'] for eqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+----+
|eqtl|yes|  no|
+----+---+----+
| yes|574|1823|
|  no|691|1541|
+----+---+----+



                                                                                

0.7 1.0448861999596396e-07 0.62 0.8


groupby for approved


                                                                                

+----+----+---+
|eqtl| yes| no|
+----+----+---+
| yes|1757|640|
|  no|1549|683|
+----+----+---+



                                                                                

1.21 0.00339683963323155 1.06 1.38


Processing group: group3 with variables: ['targetId', 'diseaseId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId'] for eqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+---+
|eqtl|yes| no|
+----+---+---+
| yes| 63| 53|
|  no| 48| 44|
+----+---+---+



                                                                                

1.09 0.7810178165681114 0.61 1.96


groupby for approved


                                                                                

+----+---+---+
|eqtl|yes| no|
+----+---+---+
| yes| 27| 89|
|  no| 22| 70|
+----+---+---+



                                                                                

0.97 1.0 0.48 1.94


Processing group: group4 with variables: ['targetId', 'diseaseId', 'drugId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'drugId'] for eqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+---+
|eqtl|yes| no|
+----+---+---+
| yes|123|216|
|  no| 85|118|
+----+---+---+



                                                                                

0.79 0.20251855484720355 0.55 1.15


groupby for approved


                                                                                

+----+---+---+
|eqtl|yes| no|
+----+---+---+
| yes| 99|240|
|  no| 60|143|
+----+---+---+



                                                                                

0.98 1.0 0.66 1.47


Processing group: group5 with variables: ['targetId', 'diseaseId', 'drugId', 'clinicalStudyId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'drugId', 'clinicalStudyId'] for eqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+----+
|eqtl|yes|  no|
+----+---+----+
| yes|639|1982|
|  no|797|1742|
+----+---+----+



                                                                                

0.7 2.1769523354181277e-08 0.62 0.8


groupby for approved


                                                                                

+----+----+---+
|eqtl| yes| no|
+----+----+---+
| yes|1919|702|
|  no|1765|774|
+----+----+---+



                                                                                

1.2 0.003421416232169518 1.06 1.36


Processing group: group6 with variables: ['targetId', 'diseaseId', 'clinicalPhase']
['targetId', 'diseaseId', 'clinicalPhase'] for eqtl


                                                                                

groupby for just phase 4 because it does not have approvedDrug columns


                                                                                

+----+---+---+
|eqtl|yes| no|
+----+---+---+
| yes| 43|183|
|  no| 31|117|
+----+---+---+



                                                                                

0.89 0.6912086913648727 0.51 1.54


Processing group: group1 with variables: ['targetId', 'diseaseId', 'maxClinPhase_TD']
['targetId', 'diseaseId'] for sqtl


                                                                                

groupby for just phase 4 because it does not have approvedDrug columns


                                                                                

+----+---+---+
|sqtl|yes| no|
+----+---+---+
| yes| 13| 22|
|  no| 61| 71|
+----+---+---+



                                                                                

0.69 0.4443100689028572 0.29 1.57


Processing group: group2 with variables: ['targetId', 'diseaseId', 'clinicalStudyId', 'clinicalPhase', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'clinicalStudyId', 'clinicalPhase'] for sqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+----+
|sqtl|yes|  no|
+----+---+----+
| yes|304| 771|
|  no|961|2593|
+----+---+----+



                                                                                

1.06 0.4347965651803126 0.91 1.24


groupby for approved


                                                                                

+----+----+----+
|sqtl| yes|  no|
+----+----+----+
| yes| 860| 215|
|  no|2446|1108|
+----+----+----+



                                                                                

1.81 3.7144502444442453e-13 1.53 2.15


Processing group: group3 with variables: ['targetId', 'diseaseId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId'] for sqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+---+
|sqtl|yes| no|
+----+---+---+
| yes| 20| 23|
|  no| 91| 74|
+----+---+---+



                                                                                

0.71 0.3910810335821794 0.34 1.46


groupby for approved


                                                                                

+----+---+---+
|sqtl|yes| no|
+----+---+---+
| yes| 11| 32|
|  no| 38|127|
+----+---+---+



                                                                                

1.15 0.6922022263425054 0.48 2.62


Processing group: group4 with variables: ['targetId', 'diseaseId', 'drugId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'drugId'] for sqtl


                                                                                

groupby for Phase4


                                                                                

+----+---+---+
|sqtl|yes| no|
+----+---+---+
| yes| 47| 78|
|  no|161|256|
+----+---+---+



                                                                                

0.96 0.916547682975065 0.62 1.47


groupby for approved


                                                                                

+----+---+---+
|sqtl|yes| no|
+----+---+---+
| yes| 39| 86|
|  no|120|297|
+----+---+---+



                                                                                

1.12 0.654312004089056 0.71 1.76


Processing group: group5 with variables: ['targetId', 'diseaseId', 'drugId', 'clinicalStudyId', 'maxClinPhase_TD', 'approvedDrug']
['targetId', 'diseaseId', 'drugId', 'clinicalStudyId'] for sqtl


                                                                                

groupby for Phase4


                                                                                

+----+----+----+
|sqtl| yes|  no|
+----+----+----+
| yes| 347| 844|
|  no|1089|2880|
+----+----+----+



                                                                                

1.09 0.25333226868075165 0.94 1.26


groupby for approved


                                                                                

+----+----+----+
|sqtl| yes|  no|
+----+----+----+
| yes| 953| 238|
|  no|2731|1238|
+----+----+----+



                                                                                

1.82 1.761092533228378e-14 1.55 2.13


Processing group: group6 with variables: ['targetId', 'diseaseId', 'clinicalPhase']
['targetId', 'diseaseId', 'clinicalPhase'] for sqtl


                                                                                

groupby for just phase 4 because it does not have approvedDrug columns


                                                                                

+----+---+---+
|sqtl|yes| no|
+----+---+---+
| yes| 13| 66|
|  no| 61|234|
+----+---+---+





0.76 0.5246665763693217 0.36 1.5




                                                                                

##### See how the benchmark varies regarding maxclinphase made in chembl_indication

In [None]:
#### build list of comparison and prediction columns
rows=comparisons_df_iterative(disdic,variables_study[2])[0]
#### prepare aggregation depending on the variable problem
window_spec = Window.partitionBy("targetId","diseaseId","rightStudyType").orderBy(F.col("pValueExponent").asc())
#### take directionality from lowest p value
bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable

bench3=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId", "maxClinPhase_TD","approvedDrug").pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable
bench4=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId","drugId", "maxClinPhase_TD","approvedDrug").pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )

                                                                                

In [None]:
x="pqtl"
print("")
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("approved").count().select(x,"yes","no").sort(F.col(x).desc()).show()

                                                                                

+----+----+----+
|pqtl| yes|  no|
+----+----+----+
| yes|  63| 265|
|  no|1142|2944|
+----+----+----+



                                                                                

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 21|  6|
|  no| 78| 77|
+----+---+---+



[Stage 1865:>                                                       (0 + 1) / 1]

+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 75|  7|
|  no|444|152|
+----+---+---+





+----+---+---+
|pqtl|yes| no|
+----+---+---+
| yes| 64| 18|
|  no|384|212|
+----+---+---+



                                                                                

In [None]:
x="eqtl"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()


                                                                                

+----+----+---+
|eqtl|  no|yes|
+----+----+---+
|  no|2121|886|
| yes|1088|319|
+----+----+---+



                                                                                

+----+---+---+
|eqtl| no|yes|
+----+---+---+
|  no| 49| 63|
| yes| 34| 36|
+----+---+---+



[Stage 2183:>                                                       (0 + 1) / 1]

+----+---+---+
|eqtl| no|yes|
+----+---+---+
|  no| 95|280|
| yes| 64|239|
+----+---+---+



                                                                                

In [38]:
x="sqtl"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

                                                                                

+----+----+---+
|sqtl|  no|yes|
+----+----+---+
|  no|2688|969|
| yes| 521|236|
+----+----+---+



                                                                                

+----+---+---+
|sqtl| no|yes|
+----+---+---+
|  no| 74| 88|
| yes|  9| 11|
+----+---+---+



[Stage 4196:>                                                       (0 + 1) / 1]

+----+---+---+
|sqtl| no|yes|
+----+---+---+
|  no|147|454|
| yes| 12| 65|
+----+---+---+



                                                                                

### now with studies that we know they perform well

In [30]:
disdic

{'ROSMAP': 'projectId',
 'BrainSeq': 'projectId',
 'BLUEPRINT': 'projectId',
 'GEUVADIS': 'projectId',
 'TwinsUK': 'projectId',
 'GTEx': 'projectId',
 'FUSION': 'projectId',
 'Lepik_2017': 'projectId',
 'Fairfax_2014': 'projectId',
 'CEDAR': 'projectId',
 'OneK1K': 'projectId',
 'UKB_PPP_EUR': 'projectId',
 'Schmiedel_2018': 'projectId',
 'Perez_2022': 'projectId',
 'Quach_2016': 'projectId',
 'Alasoo_2018': 'projectId',
 'Gilchrist_2021': 'projectId',
 'CommonMind': 'projectId',
 'Sun_2018': 'projectId',
 'Braineac2': 'projectId',
 'Peng_2018': 'projectId',
 'Bossini-Castillo_2019': 'projectId',
 'Nedelec_2016': 'projectId',
 'Steinberg_2020': 'projectId',
 'Jerber_2021': 'projectId',
 'HipSci': 'projectId',
 'GENCORD': 'projectId',
 'Kim-Hellmuth_2017': 'projectId',
 'CAP': 'projectId',
 'iPSCORE': 'projectId',
 'PhLiPS': 'projectId',
 'Walker_2019': 'projectId',
 'Aygun_2021': 'projectId',
 'PISA': 'projectId',
 'Cytoimmgen': 'projectId',
 'van_de_Bunt_2015': 'projectId',
 'Nathan_2

In [31]:
#### build list of comparison and prediction columns
rows=comparisons_df_iterative(disdic,variables_study[2])[0]
#### prepare aggregation depending on the variable problem
window_spec = Window.partitionBy("targetId","diseaseId","projectId").orderBy(F.col("pValueExponent").asc())
#### take directionality from lowest p value
bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable

bench3=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable
bench4=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId","drugId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )

                                                                                

In [33]:
x="UKB_PPP_EUR"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

                                                                                

+-----------+----+----+
|UKB_PPP_EUR|  no| yes|
+-----------+----+----+
|         no|2944|1142|
|        yes| 265|  63|
+-----------+----+----+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no| 77| 78|
|        yes|  6| 21|
+-----------+---+---+





+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no|152|444|
|        yes|  7| 75|
+-----------+---+---+



                                                                                

### build report to analyse different comparisons

In [None]:
### including clinicalStudyId to explode the data

#### build list of comparison and prediction columns
rows=comparisons_df_iterative(disdic,variables_study[2])[0]
#### prepare aggregation depending on the variable problem
window_spec = Window.partitionBy("targetId","diseaseId","projectId").orderBy(F.col("pValueExponent").asc())
#### take directionality from lowest p value
bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable

bench3=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable
bench4=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId","drugId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
bench5=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId","drugId","clinicalStudyId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )

                                                                                

In [None]:
## al clinical phases with drugId
x="UKB_PPP_EUR"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()

bench5.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().select(x,"yes","no").sort(F.col(x).desc()).show()

                                                                                

+-----------+----+----+
|UKB_PPP_EUR|  no| yes|
+-----------+----+----+
|         no|2944|1142|
|        yes| 265|  63|
+-----------+----+----+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no|266| 78|
|        yes| 39| 21|
+-----------+---+---+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no|723|323|
|        yes| 94| 47|
+-----------+---+---+





+-----------+----+----+
|UKB_PPP_EUR|  no| yes|
+-----------+----+----+
|         no|3471|1630|
|        yes| 270|  86|
+-----------+----+----+



                                                                                

In [43]:
### using maximum clinical phases per target-disease-drugId 


#### build list of comparison and prediction columns
rows=comparisons_df_iterative(disdic,variables_study[2])[0]
#### prepare aggregation depending on the variable problem
window_spec = Window.partitionBy("targetId","diseaseId","projectId").orderBy(F.col("pValueExponent").asc())
#### take directionality from lowest p value
bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable

bench3=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    ).groupBy("targetId", "diseaseId", "maxClinPhase","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable
bench4=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId","drugId")),
    ).groupBy("targetId", "diseaseId","drugId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
bench5=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).withColumn(
            "maxClinPhase_TD", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId","drugId")),
    ).groupBy("targetId", "diseaseId","drugId","clinicalStudyId", "maxClinPhase_TD","approvedDrug").pivot("projectId").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase_TD") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase_TD") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase_TD") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase_TD") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )

                                                                                

In [44]:
x="UKB_PPP_EUR"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench5.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

                                                                                

+-----------+----+----+
|UKB_PPP_EUR|  no| yes|
+-----------+----+----+
|         no|2944|1142|
|        yes| 265|  63|
+-----------+----+----+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no|266| 78|
|        yes| 39| 21|
+-----------+---+---+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no|273|323|
|        yes| 35| 47|
+-----------+---+---+





+-----------+---+----+
|UKB_PPP_EUR| no| yes|
+-----------+---+----+
|         no|658|4443|
|        yes| 76| 280|
+-----------+---+----+



                                                                                

In [35]:
x="UKB_PPP_EUR"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()
bench3.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench4.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

bench5.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()

                                                                                

+-----------+----+----+
|UKB_PPP_EUR|  no| yes|
+-----------+----+----+
|         no|2944|1142|
|        yes| 265|  63|
+-----------+----+----+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no| 77| 78|
|        yes|  6| 21|
+-----------+---+---+



                                                                                

+-----------+---+---+
|UKB_PPP_EUR| no|yes|
+-----------+---+---+
|         no|152|444|
|        yes|  7| 75|
+-----------+---+---+





+-----------+---+----+
|UKB_PPP_EUR| no| yes|
+-----------+---+----+
|         no|276|4825|
|        yes|  9| 347|
+-----------+---+----+



                                                                                

In [None]:
####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *

def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st

#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    '''
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    '''
    
    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results



#### 3 Loop over different datasets (as they will have different rows and columns)

def comparisons_df_iterative(disdic,projectId):
    toAnalysis=[(key, value) for key, value in disdic.items() if value == projectId]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            #("nPhase4", "clinical"),
            #("nPhase>=3", "clinical"),
            #("nPhase>=2", "clinical"),
            #("nPhase>=1", "clinical"),
            ("approved", "clinical"),

            # ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

print("looping for variables_study")

for variable in variables_study:
    print("analysing",variable)
    #### build list of comparison and prediction columns
    rows=comparisons_df_iterative(disdic,variable)
    #### prepare aggregation depending on the variable problem
    window_spec = Window.partitionBy("targetId","diseaseId",variable).orderBy(F.col("pValueExponent").asc())
    #### take directionality from lowest p value
    bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
        ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot(variable).agg(F.collect_set("agree_lowestPVal")
        ).withColumn(
        "Phase4",
        F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
        "approved",
            F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
        )
    #### build columns yes/no for each distinct value in the column variable
    for x, value in [(key, val) for key, val in disdic.items() if val == variable]:
        print("building columns: ", x,"and",value)
        bench2 = bench2.withColumn(
            x, 
            F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
        )
    #### doing aggregations per 
    for row in rows:
        print("row:",row)
        results = aggregations_original(bench2, "propagated", listado, *row, today_date)
        result_all.append(results)
    
    schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

# Convert list of lists to DataFrame
df = spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
df.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis_fixedIndex_fixedTotalNumber_CoherentThing.csv"
)

print("dataframe written \n Analysis finished")

In [6]:
#### build list of comparison and prediction columns
rows=comparisons_df_iterative(disdic,variables_study[2])[0]
#### prepare aggregation depending on the variable problem
window_spec = Window.partitionBy("targetId","diseaseId","rightStudyType").orderBy(F.col("pValueExponent").asc())
#### take directionality from lowest p value
bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
    ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot("rightStudyType").agg(F.collect_set("agree_lowestPVal")
    ).withColumn(
    "Phase4",
    F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
    "approved",
        F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
    )
#### build columns yes/no for each distinct value in the column variable


NameError: name 'comparisons_df_iterative' is not defined

In [86]:
for x, value in [(key, val) for key, val in disdic.items() if val == "rightStudyType"]:
    print(x,value)

eqtl rightStudyType
sqtl rightStudyType
tuqtl rightStudyType
sceqtl rightStudyType
pqtl rightStudyType
sctuqtl rightStudyType
scsqtl rightStudyType


In [90]:
x="eqtl"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.array(F.col(x)),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()



+----+----+---+
|eqtl|  no|yes|
+----+----+---+
|  no|1916|825|
| yes|1293|379|
+----+----+---+



                                                                                

In [94]:
x="pqtl"
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.array(F.col(x)),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).groupBy(x).pivot("phase4").count().show()



+----+----+----+
|pqtl|  no| yes|
+----+----+----+
|  no|2944|1141|
| yes| 265|  63|
+----+----+----+



                                                                                

In [93]:
bench2.withColumn(
        x, 
        F.when(F.array_contains(F.array(F.col(x)),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).sort("targetId","diseaseId",F.col("clinicalPhase").asc()).show()



+---------------+-----------+---------------+-------------+------------+------------+----+----+------+------+-------+----+-----+------+--------+--------+--------+--------+
|       targetId|  diseaseId|clinicalStudyId|clinicalPhase|maxClinPhase|approvedDrug|eqtl|pqtl|sceqtl|scsqtl|sctuqtl|sqtl|tuqtl|Phase4|Phase>=3|Phase>=2|Phase>=1|approved|
+---------------+-----------+---------------+-------------+------------+------------+----+----+------+------+-------+----+-----+------+--------+--------+--------+--------+
|ENSG00000006071|EFO_0000400|    NCT01489644|          1.0|         1.0|           1|  no|  no|    no|    no|     no|  no|   no|    no|      no|      no|     yes|     yes|
|ENSG00000006071|EFO_0000400|    NCT02954822|          1.0|         1.0|           1|  no|  no|    no|    no|     no|  no|   no|    no|      no|      no|     yes|     yes|
|ENSG00000006071|EFO_0000400|    NCT00959101|          1.0|         1.0|           1|  no|  no|    no|    no|     no|  no|   no|    no|     

                                                                                

In [85]:
for x, value in [(key, val) for key, val in disdic.items() if val == "rightStudyType"]:
    print("building columns: ", x,"and",value)
    bench2 = bench2.withColumn(
        x, 
        F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
    ).show()

building columns:  eqtl and rightStudyType


AnalysisException: cannot resolve 'array_contains(eqtl, 'yes')' due to data type mismatch: Input to function array_contains should have been array followed by a value with same element type, but it's [string, string].;
'Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, CASE WHEN array_contains(eqtl#4471932, yes) THEN yes ELSE no END AS eqtl#4472065, pqtl#4472008, sceqtl#4471989, scsqtl#4472046, sctuqtl#4472027, sqtl#4471951, tuqtl#4471970, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
+- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471932, pqtl#4472008, sceqtl#4471989, CASE WHEN array_contains(scsqtl#4471827, yes) THEN yes ELSE no END AS scsqtl#4472046, sctuqtl#4472027, sqtl#4471951, tuqtl#4471970, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
   +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471932, pqtl#4472008, sceqtl#4471989, scsqtl#4471827, CASE WHEN array_contains(sctuqtl#4471829, yes) THEN yes ELSE no END AS sctuqtl#4472027, sqtl#4471951, tuqtl#4471970, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
      +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471932, CASE WHEN array_contains(pqtl#4471823, yes) THEN yes ELSE no END AS pqtl#4472008, sceqtl#4471989, scsqtl#4471827, sctuqtl#4471829, sqtl#4471951, tuqtl#4471970, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
         +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471932, pqtl#4471823, CASE WHEN array_contains(sceqtl#4471825, yes) THEN yes ELSE no END AS sceqtl#4471989, scsqtl#4471827, sctuqtl#4471829, sqtl#4471951, tuqtl#4471970, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
            +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471932, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471951, CASE WHEN array_contains(tuqtl#4471833, yes) THEN yes ELSE no END AS tuqtl#4471970, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
               +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471932, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, CASE WHEN array_contains(sqtl#4471831, yes) THEN yes ELSE no END AS sqtl#4471951, tuqtl#4471833, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
                  +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, CASE WHEN array_contains(eqtl#4471821, yes) THEN yes ELSE no END AS eqtl#4471932, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471831, tuqtl#4471833, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, approved#4471913]
                     +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471821, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471831, tuqtl#4471833, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, Phase>=1#4471895, CASE WHEN (approvedDrug#2098689 = 1) THEN yes ELSE no END AS approved#4471913]
                        +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471821, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471831, tuqtl#4471833, Phase4#4471847, Phase>=3#4471862, Phase>=2#4471878, CASE WHEN (maxClinPhase#2098565 >= cast(1 as double)) THEN yes ELSE no END AS Phase>=1#4471895]
                           +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471821, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471831, tuqtl#4471833, Phase4#4471847, Phase>=3#4471862, CASE WHEN (maxClinPhase#2098565 >= cast(2 as double)) THEN yes ELSE no END AS Phase>=2#4471878]
                              +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471821, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471831, tuqtl#4471833, Phase4#4471847, CASE WHEN (maxClinPhase#2098565 >= cast(3 as double)) THEN yes ELSE no END AS Phase>=3#4471862]
                                 +- Project [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, eqtl#4471821, pqtl#4471823, sceqtl#4471825, scsqtl#4471827, sctuqtl#4471829, sqtl#4471831, tuqtl#4471833, CASE WHEN (maxClinPhase#2098565 = cast(4 as double)) THEN yes ELSE no END AS Phase4#4471847]
                                    +- Aggregate [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689], [targetId#2095238, diseaseId#2095387, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, collect_set(if ((rightStudyType#2091891 <=> cast(eqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS eqtl#4471821, collect_set(if ((rightStudyType#2091891 <=> cast(pqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS pqtl#4471823, collect_set(if ((rightStudyType#2091891 <=> cast(sceqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS sceqtl#4471825, collect_set(if ((rightStudyType#2091891 <=> cast(scsqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS scsqtl#4471827, collect_set(if ((rightStudyType#2091891 <=> cast(sctuqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS sctuqtl#4471829, collect_set(if ((rightStudyType#2091891 <=> cast(sqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS sqtl#4471831, collect_set(if ((rightStudyType#2091891 <=> cast(tuqtl as string))) agree_lowestPVal#4470460 else cast(null as string), 0, 0) AS tuqtl#4471833]
                                       +- Project [biosampleId#2091857, targetId#2095238, diseaseId#2095387, leftStudyLocusId#2091888, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, ... 24 more fields]
                                          +- Project [biosampleId#2091857, targetId#2095238, diseaseId#2095387, leftStudyLocusId#2091888, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, ... 25 more fields]
                                             +- Window [first(AgreeDrug#2103382, false) windowspecdefinition(targetId#2095238, diseaseId#2095387, rightStudyType#2091891, pValueExponent#2095287 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS agree_lowestPval#4470460], [targetId#2095238, diseaseId#2095387, rightStudyType#2091891], [pValueExponent#2095287 ASC NULLS FIRST]
                                                +- Project [biosampleId#2091857, targetId#2095238, diseaseId#2095387, leftStudyLocusId#2091888, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, ... 23 more fields]
                                                   +- Project [biosampleId#2091857, targetId#2095238, diseaseId#2095387, leftStudyLocusId#2091888, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, ... 23 more fields]
                                                      +- Join LeftOuter, (biosampleId#2091857 = biosampleId#2091938)
                                                         :- Filter NOT (name#2091536 = COVID-19)
                                                         :  +- Project [targetId#2095238, diseaseId#2095387, leftStudyLocusId#2091888, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, ... 22 more fields]
                                                         :     +- Project [targetId#2095238, diseaseId#2095387, leftStudyLocusId#2091888, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, ... 21 more fields]
                                                         :        +- Join Inner, ((targetId#2095238 = targetId#2096461) AND (diseaseId#2095387 = diseaseId#2096561))
                                                         :           :- Filter (betaGwas#2095217 < cast(0 as double))
                                                         :           :  +- Project [diseaseId#2095387, leftStudyLocusId#2091888, targetId#2095238, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, ... 15 more fields]
                                                         :           :     +- Project [diseaseId#2095387, leftStudyLocusId#2091888, targetId#2095238, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, ... 14 more fields]
                                                         :           :        +- Project [diseaseId#2095387, leftStudyLocusId#2091888, targetId#2095238, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, ... 15 more fields]
                                                         :           :           +- Generate explode(concat(array(diseaseId#2091663), parents#2091539)), true, [diseaseId#2095387]
                                                         :           :              +- Project [diseaseId#2091663, leftStudyLocusId#2091888, targetId#2095238, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, ... 15 more fields]
                                                         :           :                 +- Join LeftOuter, (diseaseId#2091663 = diseaseId#2095341)
                                                         :           :                    :- Project [leftStudyLocusId#2091888, targetId#2095238, rightStudyId#2093900, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857, datasourceId#2091562, ... 12 more fields]
                                                         :           :                    :  +- Join Inner, ((leftStudyLocusId#2091888 = leftStudyLocusId#2095262) AND (targetId#2095238 = targetId#2091563))
                                                         :           :                    :     :- Project [rightStudyId#2093900, rightStudyLocusId#2091889, leftStudyLocusId#2091888, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, geneId#2091829 AS targetId#2095238, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857]
                                                         :           :                    :     :  +- Project [rightStudyId#2093900, rightStudyLocusId#2091889, leftStudyLocusId#2091888, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightVariantId#2093901, credibleRightStudyType#2093902, geneId#2091829, projectId#2091830, indexStudyType#2093953, condition#2091853, biosampleId#2091857]
                                                         :           :                    :     :     +- Join LeftOuter, (rightStudyId#2093900 = rightStudyId#2093952)
                                                         :           :                    :     :        :- Project [rightStudyLocusId#2091889, leftStudyLocusId#2091888, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877, rightStudyId#2093900, rightVariantId#2093901, credibleRightStudyType#2093902]
                                                         :           :                    :     :        :  +- Join LeftOuter, (rightStudyLocusId#2091889 = rightStudyLocusId#2093899)
                                                         :           :                    :     :        :     :- Project [leftStudyLocusId#2091888, rightStudyLocusId#2091889, chromosome#2091890, rightStudyType#2091891, numberColocalisingVariants#2091892L, h0#2091893, h1#2091894, h2#2091895, h3#2091896, h4#2091897, colocalisationMethod#2091898, betaRatioSignAverage#2091899, leftStudyId#2093875, leftVariantId#2093876, credibleLeftStudyType#2093877]
                                                         :           :                    :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#2091888 = leftStudyLocusId#2093874)
                                                         :           :                    :     :        :     :     :- Relation [leftStudyLocusId#2091888,rightStudyLocusId#2091889,chromosome#2091890,rightStudyType#2091891,numberColocalisingVariants#2091892L,h0#2091893,h1#2091894,h2#2091895,h3#2091896,h4#2091897,colocalisationMethod#2091898,betaRatioSignAverage#2091899] parquet
                                                         :           :                    :     :        :     :     +- Project [studyLocusId#2091776 AS leftStudyLocusId#2093874, StudyId#2091777 AS leftStudyId#2093875, variantId#2091778 AS leftVariantId#2093876, studyType#2091801 AS credibleLeftStudyType#2093877]
                                                         :           :                    :     :        :     :        +- Relation [studyLocusId#2091776,studyId#2091777,variantId#2091778,chromosome#2091779,position#2091780,region#2091781,beta#2091782,zScore#2091783,pValueMantissa#2091784,pValueExponent#2091785,effectAlleleFrequencyFromSource#2091786,standardError#2091787,subStudyDescription#2091788,qualityControls#2091789,finemappingMethod#2091790,credibleSetIndex#2091791,credibleSetlog10BF#2091792,purityMeanR2#2091793,purityMinR2#2091794,locusStart#2091795,locusEnd#2091796,sampleSize#2091797,ldSet#2091798,locus#2091799,... 2 more fields] parquet
                                                         :           :                    :     :        :     +- Project [studyLocusId#2093907 AS rightStudyLocusId#2093899, studyId#2093908 AS rightStudyId#2093900, variantId#2093909 AS rightVariantId#2093901, studyType#2093932 AS credibleRightStudyType#2093902]
                                                         :           :                    :     :        :        +- Relation [studyLocusId#2093907,studyId#2093908,variantId#2093909,chromosome#2093910,position#2093911,region#2093912,beta#2093913,zScore#2093914,pValueMantissa#2093915,pValueExponent#2093916,effectAlleleFrequencyFromSource#2093917,standardError#2093918,subStudyDescription#2093919,qualityControls#2093920,finemappingMethod#2093921,credibleSetIndex#2093922,credibleSetlog10BF#2093923,purityMeanR2#2093924,purityMinR2#2093925,locusStart#2093926,locusEnd#2093927,sampleSize#2093928,ldSet#2093929,locus#2093930,... 2 more fields] parquet
                                                         :           :                    :     :        +- Project [studyId#2091828 AS rightStudyId#2093952, geneId#2091829, projectId#2091830, studyType#2093812 AS indexStudyType#2093953, condition#2091853, biosampleId#2091857]
                                                         :           :                    :     :           +- Project [studyId#2091828, geneId#2091829, projectId#2091830, studyType#2093812, traitFromSource#2091832, traitFromSourceMappedIds#2091833, biosampleFromSourceId#2091834, pubmedId#2091835, publicationTitle#2091836, publicationFirstAuthor#2091837, publicationDate#2091838, publicationJournal#2091839, backgroundTraitFromSourceMappedIds#2091840, initialSampleSize#2091841, nCases#2091842, nControls#2091843, nSamples#2091844, cohorts#2091845, ldPopulationStructure#2091846, discoverySamples#2091847, replicationSamples#2091848, qualityControls#2091849, analysisFlags#2091850, summarystatsLocation#2091851, ... 6 more fields]
                                                         :           :                    :     :              +- Project [studyId#2091828, geneId#2091829, projectId#2091830, CASE WHEN isnotnull(newStudyType#2092272) THEN newStudyType#2092272 ELSE studyType#2091831 END AS studyType#2093812, traitFromSource#2091832, traitFromSourceMappedIds#2091833, biosampleFromSourceId#2091834, pubmedId#2091835, publicationTitle#2091836, publicationFirstAuthor#2091837, publicationDate#2091838, publicationJournal#2091839, backgroundTraitFromSourceMappedIds#2091840, initialSampleSize#2091841, nCases#2091842, nControls#2091843, nSamples#2091844, cohorts#2091845, ldPopulationStructure#2091846, discoverySamples#2091847, replicationSamples#2091848, qualityControls#2091849, analysisFlags#2091850, summarystatsLocation#2091851, ... 7 more fields]
                                                         :           :                    :     :                 +- Project [studyId#2091828, geneId#2091829, projectId#2091830, studyType#2091831, traitFromSource#2091832, traitFromSourceMappedIds#2091833, biosampleFromSourceId#2091834, pubmedId#2091835, publicationTitle#2091836, publicationFirstAuthor#2091837, publicationDate#2091838, publicationJournal#2091839, backgroundTraitFromSourceMappedIds#2091840, initialSampleSize#2091841, nCases#2091842, nControls#2091843, nSamples#2091844, cohorts#2091845, ldPopulationStructure#2091846, discoverySamples#2091847, replicationSamples#2091848, qualityControls#2091849, analysisFlags#2091850, summarystatsLocation#2091851, ... 7 more fields]
                                                         :           :                    :     :                    +- Join LeftOuter, (studyId#2091828 = studyId#2092975)
                                                         :           :                    :     :                       :- Relation [studyId#2091828,geneId#2091829,projectId#2091830,studyType#2091831,traitFromSource#2091832,traitFromSourceMappedIds#2091833,biosampleFromSourceId#2091834,pubmedId#2091835,publicationTitle#2091836,publicationFirstAuthor#2091837,publicationDate#2091838,publicationJournal#2091839,backgroundTraitFromSourceMappedIds#2091840,initialSampleSize#2091841,nCases#2091842,nControls#2091843,nSamples#2091844,cohorts#2091845,ldPopulationStructure#2091846,discoverySamples#2091847,replicationSamples#2091848,qualityControls#2091849,analysisFlags#2091850,summarystatsLocation#2091851,... 6 more fields] parquet
                                                         :           :                    :     :                       +- Project [studyId#2092975, newStudyType#2092272]
                                                         :           :                    :     :                          +- Project [studyId#2092975, geneId#2092976, projectId#2092977, studyType#2092978, traitFromSource#2092979, traitFromSourceMappedIds#2092980, biosampleFromSourceId#2092981, pubmedId#2092982, publicationTitle#2092983, publicationFirstAuthor#2092984, publicationDate#2092985, publicationJournal#2092986, backgroundTraitFromSourceMappedIds#2092987, initialSampleSize#2092988, nCases#2092989, nControls#2092990, nSamples#2092991, cohorts#2092992, ldPopulationStructure#2092993, discoverySamples#2092994, replicationSamples#2092995, qualityControls#2092996, analysisFlags#2092997, summarystatsLocation#2092998, ... 7 more fields]
                                                         :           :                    :     :                             +- Project [extracted_column#2092014, study_type#2091966, studyId#2092975, geneId#2092976, projectId#2092977, studyType#2092978, traitFromSource#2092979, traitFromSourceMappedIds#2092980, biosampleFromSourceId#2092981, pubmedId#2092982, publicationTitle#2092983, publicationFirstAuthor#2092984, publicationDate#2092985, publicationJournal#2092986, backgroundTraitFromSourceMappedIds#2092987, initialSampleSize#2092988, nCases#2092989, nControls#2092990, nSamples#2092991, cohorts#2092992, ldPopulationStructure#2092993, discoverySamples#2092994, replicationSamples#2092995, qualityControls#2092996, ... 10 more fields]
                                                         :           :                    :     :                                +- Project [extracted_column#2092014, study_type#2091966, studyId#2092975, geneId#2092976, projectId#2092977, studyType#2092978, traitFromSource#2092979, traitFromSourceMappedIds#2092980, biosampleFromSourceId#2092981, pubmedId#2092982, publicationTitle#2092983, publicationFirstAuthor#2092984, publicationDate#2092985, publicationJournal#2092986, backgroundTraitFromSourceMappedIds#2092987, initialSampleSize#2092988, nCases#2092989, nControls#2092990, nSamples#2092991, cohorts#2092992, ldPopulationStructure#2092993, discoverySamples#2092994, replicationSamples#2092995, qualityControls#2092996, ... 9 more fields]
                                                         :           :                    :     :                                   +- Project [extracted_column#2092014, study_type#2091966, studyId#2092975, geneId#2092976, projectId#2092977, studyType#2092978, traitFromSource#2092979, traitFromSourceMappedIds#2092980, biosampleFromSourceId#2092981, pubmedId#2092982, publicationTitle#2092983, publicationFirstAuthor#2092984, publicationDate#2092985, publicationJournal#2092986, backgroundTraitFromSourceMappedIds#2092987, initialSampleSize#2092988, nCases#2092989, nControls#2092990, nSamples#2092991, cohorts#2092992, ldPopulationStructure#2092993, discoverySamples#2092994, replicationSamples#2092995, qualityControls#2092996, ... 8 more fields]
                                                         :           :                    :     :                                      +- Join RightOuter, (extracted_column#2091978 = extracted_column#2092014)
                                                         :           :                    :     :                                         :- Project [concat_ws(_, study_label#2091958, quant_method#2091964, sample_group#2091959) AS extracted_column#2091978, study_type#2091966]
                                                         :           :                    :     :                                         :  +- LogicalRDD [study_id#2091956, dataset_id#2091957, study_label#2091958, sample_group#2091959, tissue_id#2091960, tissue_label#2091961, condition_label#2091962, sample_size#2091963, quant_method#2091964, pmid#2091965, study_type#2091966], false
                                                         :           :                    :     :                                         +- Project [studyId#2092975, geneId#2092976, projectId#2092977, studyType#2092978, traitFromSource#2092979, traitFromSourceMappedIds#2092980, biosampleFromSourceId#2092981, pubmedId#2092982, publicationTitle#2092983, publicationFirstAuthor#2092984, publicationDate#2092985, publicationJournal#2092986, backgroundTraitFromSourceMappedIds#2092987, initialSampleSize#2092988, nCases#2092989, nControls#2092990, nSamples#2092991, cohorts#2092992, ldPopulationStructure#2092993, discoverySamples#2092994, replicationSamples#2092995, qualityControls#2092996, analysisFlags#2092997, summarystatsLocation#2092998, ... 7 more fields]
                                                         :           :                    :     :                                            +- Project [studyId#2092975, geneId#2092976, projectId#2092977, studyType#2092978, traitFromSource#2092979, traitFromSourceMappedIds#2092980, biosampleFromSourceId#2092981, pubmedId#2092982, publicationTitle#2092983, publicationFirstAuthor#2092984, publicationDate#2092985, publicationJournal#2092986, backgroundTraitFromSourceMappedIds#2092987, initialSampleSize#2092988, nCases#2092989, nControls#2092990, nSamples#2092991, cohorts#2092992, ldPopulationStructure#2092993, discoverySamples#2092994, replicationSamples#2092995, qualityControls#2092996, analysisFlags#2092997, summarystatsLocation#2092998, ... 7 more fields]
                                                         :           :                    :     :                                               +- Filter NOT StartsWith(studyId#2092975, UKB_PPP)
                                                         :           :                    :     :                                                  +- Filter NOT (studyType#2092978 = gwas)
                                                         :           :                    :     :                                                     +- Relation [studyId#2092975,geneId#2092976,projectId#2092977,studyType#2092978,traitFromSource#2092979,traitFromSourceMappedIds#2092980,biosampleFromSourceId#2092981,pubmedId#2092982,publicationTitle#2092983,publicationFirstAuthor#2092984,publicationDate#2092985,publicationJournal#2092986,backgroundTraitFromSourceMappedIds#2092987,initialSampleSize#2092988,nCases#2092989,nControls#2092990,nSamples#2092991,cohorts#2092992,ldPopulationStructure#2092993,discoverySamples#2092994,replicationSamples#2092995,qualityControls#2092996,analysisFlags#2092997,summarystatsLocation#2092998,... 6 more fields] parquet
                                                         :           :                    :     +- Project [studyLocusId#2091641 AS leftStudyLocusId#2095262, datasourceId#2091562, targetId#2091563, datatypeId#2091592, diseaseFromSourceMappedId#2091596, resourceScore#2091632, targetFromSourceId#2091648, diseaseId#2091663, id#2091664, score#2091665, sourceId#2091668, studyId#2095279, variantId#2095280, betaGwas#2095217, pValueExponent#2095287]
                                                         :           :                    :        +- Project [studyLocusId#2091641, datasourceId#2091562, targetId#2091563, datatypeId#2091592, diseaseFromSourceMappedId#2091596, resourceScore#2091632, targetFromSourceId#2091648, diseaseId#2091663, id#2091664, score#2091665, sourceId#2091668, studyId#2095279, variantId#2095280, betaGwas#2095217, pValueExponent#2095287]
                                                         :           :                    :           +- Join LeftOuter, (studyLocusId#2091641 = studyLocusId#2095278)
                                                         :           :                    :              :- Project [datasourceId#2091562, targetId#2091563, datatypeId#2091592, diseaseFromSourceMappedId#2091596, resourceScore#2091632, studyLocusId#2091641, targetFromSourceId#2091648, diseaseId#2091663, id#2091664, score#2091665, sourceId#2091668]
                                                         :           :                    :              :  +- Filter (datasourceId#2091562 = gwas_credible_sets)
                                                         :           :                    :              :     +- Relation [datasourceId#2091562,targetId#2091563,alleleOrigins#2091564,allelicRequirements#2091565,ancestry#2091566,ancestryId#2091567,assays#2091568,assessments#2091569,beta#2091570,betaConfidenceIntervalLower#2091571,betaConfidenceIntervalUpper#2091572,biologicalModelAllelicComposition#2091573,biologicalModelGeneticBackground#2091574,biologicalModelId#2091575,biomarkerList#2091576,biomarkerName#2091577,biomarkers#2091578,biosamplesFromSource#2091579,cellLineBackground#2091580,cellType#2091581,clinicalPhase#2091582,clinicalSignificances#2091583,clinicalStatus#2091584,cohortDescription#2091585,... 83 more fields] parquet
                                                         :           :                    :              +- Project [studyLocusId#2095278, studyId#2095279, variantId#2095280, beta#2095284 AS betaGwas#2095217, pValueExponent#2095287]
                                                         :           :                    :                 +- Relation [studyLocusId#2095278,studyId#2095279,variantId#2095280,chromosome#2095281,position#2095282,region#2095283,beta#2095284,zScore#2095285,pValueMantissa#2095286,pValueExponent#2095287,effectAlleleFrequencyFromSource#2095288,standardError#2095289,subStudyDescription#2095290,qualityControls#2095291,finemappingMethod#2095292,credibleSetIndex#2095293,credibleSetlog10BF#2095294,purityMeanR2#2095295,purityMinR2#2095296,locusStart#2095297,locusEnd#2095298,sampleSize#2095299,ldSet#2095300,locus#2095301,... 2 more fields] parquet
                                                         :           :                    +- Project [id#2091532 AS diseaseId#2095341, name#2091536, parents#2091539, therapeuticAreas#2091544]
                                                         :           :                       +- Relation [id#2091532,code#2091533,dbXRefs#2091534,description#2091535,name#2091536,directLocationIds#2091537,obsoleteTerms#2091538,parents#2091539,synonyms#2091540,ancestors#2091541,descendants#2091542,children#2091543,therapeuticAreas#2091544,indirectLocationIds#2091545,ontology#2091546] parquet
                                                         :           +- Project [targetId#2096461, diseaseId#2096561, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, drugGoF_protect#2102052L, LoF_protect#2101949L AS drugLoF_protect#2102061L]
                                                         :              +- Project [targetId#2096461, diseaseId#2096561, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L AS drugGoF_protect#2102052L, LoF_protect#2101949L]
                                                         :                 +- Project [targetId#2096461, diseaseId#2096561, clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L]
                                                         :                    +- Filter (coherencyDiagonal#2102003 = coherent)
                                                         :                       +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538 AS clinicalStudyId#2102030, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L, GoF_risk#2101962, LoF_risk#2101980, noEvaluable#2101991, coherencyDiagonal#2102003, coherencyOneCell#2102016]
                                                         :                          +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L, GoF_risk#2101962, LoF_risk#2101980, noEvaluable#2101991, coherencyDiagonal#2102003, CASE WHEN ((((isnull(LoF_risk#2101980) AND isnull(LoF_protect#2101949L)) AND isnull(GoF_risk#2101962)) AND isnull(GoF_protect#2101948L)) AND isnull(noEvaluable#2101991)) THEN noEvid WHEN ((((isnull(LoF_risk#2101980) AND isnull(LoF_protect#2101949L)) AND isnull(GoF_risk#2101962)) AND isnull(GoF_protect#2101948L)) AND isnotnull(noEvaluable#2101991)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#2101980) OR isnotnull(LoF_protect#2101949L)) OR isnotnull(GoF_risk#2101962)) OR isnotnull(GoF_protect#2101948L)) THEN CASE WHEN (isnotnull(LoF_risk#2101980) AND ((isnull(LoF_protect#2101949L) AND isnull(GoF_risk#2101962)) AND isnull(GoF_protect#2101948L))) THEN coherent WHEN (isnotnull(GoF_risk#2101962) AND ((isnull(LoF_protect#2101949L) AND isnull(LoF_risk#2101980)) AND isnull(GoF_protect#2101948L))) THEN coherent WHEN (isnotnull(LoF_protect#2101949L) AND ((isnull(LoF_risk#2101980) AND isnull(GoF_risk#2101962)) AND isnull(GoF_protect#2101948L))) THEN coherent WHEN (isnotnull(GoF_protect#2101948L) AND ((isnull(LoF_protect#2101949L) AND isnull(GoF_risk#2101962)) AND isnull(LoF_risk#2101980))) THEN coherent ELSE dispar END END AS coherencyOneCell#2102016]
                                                         :                             +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L, GoF_risk#2101962, LoF_risk#2101980, noEvaluable#2101991, CASE WHEN ((((isnull(LoF_risk#2101980) AND isnull(LoF_protect#2101949L)) AND isnull(GoF_risk#2101962)) AND isnull(GoF_protect#2101948L)) AND isnull(noEvaluable#2101991)) THEN noEvid WHEN ((((isnull(LoF_risk#2101980) AND isnull(LoF_protect#2101949L)) AND isnull(GoF_risk#2101962)) AND isnull(GoF_protect#2101948L)) AND isnotnull(noEvaluable#2101991)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#2101980) OR isnotnull(LoF_protect#2101949L)) OR isnotnull(GoF_risk#2101962)) OR isnotnull(GoF_protect#2101948L)) THEN CASE WHEN (isnotnull(GoF_risk#2101962) AND isnotnull(LoF_risk#2101980)) THEN dispar WHEN (isnotnull(LoF_protect#2101949L) AND isnotnull(LoF_risk#2101980)) THEN dispar WHEN (isnotnull(GoF_protect#2101948L) AND isnotnull(GoF_risk#2101962)) THEN dispar WHEN (isnotnull(GoF_protect#2101948L) AND isnotnull(LoF_protect#2101949L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#2102003]
                                                         :                                +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L, GoF_risk#2101962, LoF_risk#2101980, null AS noEvaluable#2101991]
                                                         :                                   +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L, GoF_risk#2101962, null AS LoF_risk#2101980]
                                                         :                                      +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, GoF_protect#2101948L, LoF_protect#2101949L, null AS GoF_risk#2101962]
                                                         :                                         +- Project [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, __pivot_count(1) AS count AS `count(1) AS count`#2101947[0] AS GoF_protect#2101948L, __pivot_count(1) AS count AS `count(1) AS count`#2101947[1] AS LoF_protect#2101949L]
                                                         :                                            +- Aggregate [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689], [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, pivotfirst(homogenized#2098300, count(1) AS count#2101941L, GoF_protect, LoF_protect, 0, 0) AS __pivot_count(1) AS count AS `count(1) AS count`#2101947]
                                                         :                                               +- Aggregate [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, homogenized#2098300], [targetId#2096461, diseaseId#2096561, studyId#2096538, clinicalPhase#2096480, maxClinPhase#2098565, approvedDrug#2098689, homogenized#2098300, count(1) AS count(1) AS count#2101941L]
                                                         :                                                  +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 99 more fields]
                                                         :                                                     +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 100 more fields]
                                                         :                                                        +- Window [max(isApproved#2098438) windowspecdefinition(targetId#2096461, diseaseId#2096561, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS approvedDrug#2098689], [targetId#2096461, diseaseId#2096561]
                                                         :                                                           +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 98 more fields]
                                                         :                                                              +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 98 more fields]
                                                         :                                                                 +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 99 more fields]
                                                         :                                                                    +- Window [max(clinicalPhase#2096480) windowspecdefinition(targetId#2096461, diseaseId#2096561, drugId#2096498, studyId#2096538, clinicalPhase#2096480, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#2098565], [targetId#2096461, diseaseId#2096561, drugId#2096498, studyId#2096538, clinicalPhase#2096480]
                                                         :                                                                       +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 97 more fields]
                                                         :                                                                          +- Project [targetId#2096461, diseaseId#2096561, drugId#2096498, datasourceId#2096460, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, ... 97 more fields]
                                                         :                                                                             +- Join LeftOuter, (((targetId#2096461 = targetId#2098421) AND (diseaseId#2096561 = diseaseId#2098423)) AND (drugId#2096498 = drugId#2098422))
                                                         :                                                                                :- Filter ((datasourceId#2096460 = chembl) AND NOT (homogenized#2098300 = noEvaluable))
                                                         :                                                                                :  +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 96 more fields]
                                                         :                                                                                :     +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 95 more fields]
                                                         :                                                                                :        +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 95 more fields]
                                                         :                                                                                :           +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 95 more fields]
                                                         :                                                                                :              +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 96 more fields]
                                                         :                                                                                :                 +- Window [collect_set(intogen_function#2097815, 0, 0) windowspecdefinition(targetId#2096461, diseaseId#2096561, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#2097938], [targetId#2096461, diseaseId#2096561]
                                                         :                                                                                :                    +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 94 more fields]
                                                         :                                                                                :                       +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 94 more fields]
                                                         :                                                                                :                          +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 93 more fields]
                                                         :                                                                                :                             +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 92 more fields]
                                                         :                                                                                :                                +- Join LeftOuter, ((drugId2#2096746 = drugId#2096498) AND (targetId2#2096753 = targetId#2096461))
                                                         :                                                                                :                                   :- Join LeftOuter, (target_id#2096796 = targetId#2096461)
                                                         :                                                                                :                                   :  :- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, concat_ws(,, clinicalSignificances#2096481) AS clinicalSignificances#2097018, clinicalStatus#2096482, cohortDescription#2096483, ... 83 more fields]
                                                         :                                                                                :                                   :  :  +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2096481, clinicalStatus#2096482, cohortDescription#2096483, ... 83 more fields]
                                                         :                                                                                :                                   :  :     +- Project [datasourceId#2096460, targetId#2096461, alleleOrigins#2096462, allelicRequirements#2096463, ancestry#2096464, ancestryId#2096465, assays#2096466, assessments#2096467, cast(beta#2096468 as double) AS beta#2096802, betaConfidenceIntervalLower#2096469, betaConfidenceIntervalUpper#2096470, biologicalModelAllelicComposition#2096471, biologicalModelGeneticBackground#2096472, biologicalModelId#2096473, biomarkerList#2096474, biomarkerName#2096475, biomarkers#2096476, biosamplesFromSource#2096477, cellLineBackground#2096478, cellType#2096479, clinicalPhase#2096480, clinicalSignificances#2096481, clinicalStatus#2096482, cohortDescription#2096483, ... 83 more fields]
                                                         :                                                                                :                                   :  :        +- Filter datasourceId#2096460 IN (ot_genetics_portal,gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                         :                                                                                :                                   :  :           +- Relation [datasourceId#2096460,targetId#2096461,alleleOrigins#2096462,allelicRequirements#2096463,ancestry#2096464,ancestryId#2096465,assays#2096466,assessments#2096467,beta#2096468,betaConfidenceIntervalLower#2096469,betaConfidenceIntervalUpper#2096470,biologicalModelAllelicComposition#2096471,biologicalModelGeneticBackground#2096472,biologicalModelId#2096473,biomarkerList#2096474,biomarkerName#2096475,biomarkers#2096476,biosamplesFromSource#2096477,cellLineBackground#2096478,cellType#2096479,clinicalPhase#2096480,clinicalSignificances#2096481,clinicalStatus#2096482,cohortDescription#2096483,... 83 more fields] parquet
                                                         :                                                                                :                                   :  +- Project [id#2096675 AS target_id#2096796, approvedSymbol#2096676, description#2096781, description_splited#2096785, TSorOncogene#2096790]
                                                         :                                                                                :                                   :     +- Project [id#2096675, approvedSymbol#2096676, description#2096781, description_splited#2096785, CASE WHEN (RLIKE(description_splited#2096785, ncogene) AND RLIKE(description_splited#2096785, TSG)) THEN bivalent WHEN RLIKE(description_splited#2096785, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2096785, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2096790]
                                                         :                                                                                :                                   :        +- Project [id#2096675, approvedSymbol#2096676, description#2096781, concat_ws(,, description#2096781) AS description_splited#2096785]
                                                         :                                                                                :                                   :           +- Aggregate [id#2096675, approvedSymbol#2096676], [id#2096675, approvedSymbol#2096676, collect_set(description#2096773, 0, 0) AS description#2096781]
                                                         :                                                                                :                                   :              +- Filter description#2096773 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                         :                                                                                :                                   :                 +- Project [id#2096675, approvedSymbol#2096676, col#2096768.description AS description#2096773]
                                                         :                                                                                :                                   :                    +- Project [id#2096675, approvedSymbol#2096676, col#2096768]
                                                         :                                                                                :                                   :                       +- Generate explode(hallmarks#2096685.attributes), true, [col#2096768]
                                                         :                                                                                :                                   :                          +- Relation [id#2096675,approvedSymbol#2096676,biotype#2096677,transcriptIds#2096678,canonicalTranscript#2096679,canonicalExons#2096680,genomicLocation#2096681,alternativeGenes#2096682,approvedName#2096683,go#2096684,hallmarks#2096685,synonyms#2096686,symbolSynonyms#2096687,nameSynonyms#2096688,functionDescriptions#2096689,subcellularLocations#2096690,targetClass#2096691,obsoleteSymbols#2096692,obsoleteNames#2096693,constraint#2096694,tep#2096695,proteinIds#2096696,dbXrefs#2096697,chemicalProbes#2096698,... 4 more fields] parquet
                                                         :                                                                                :                                   +- Aggregate [targetId2#2096753, drugId2#2096746], [targetId2#2096753, drugId2#2096746, collect_set(actionType#2096731, 0, 0) AS actionType#2096763]
                                                         :                                                                                :                                      +- Project [targetId2#2096753, drugId2#2096746, actionType#2096731, mechanismOfAction#2096732]
                                                         :                                                                                :                                         +- Generate explode(targets#2096736), true, [targetId2#2096753]
                                                         :                                                                                :                                            +- Project [drugId2#2096746, actionType#2096731, mechanismOfAction#2096732, targets#2096736]
                                                         :                                                                                :                                               +- Generate explode(chemblIds#2096733), true, [drugId2#2096746]
                                                         :                                                                                :                                                  +- Relation [actionType#2096731,mechanismOfAction#2096732,chemblIds#2096733,targetName#2096734,targetType#2096735,targets#2096736,references#2096737] parquet
                                                         :                                                                                +- Filter (isApproved#2098438 = 1)
                                                         :                                                                                   +- Deduplicate [targetId#2098421, drugId#2098422, diseaseId#2098423, isApproved#2098438]
                                                         :                                                                                      +- Project [targetId#2098421, drugId#2098422, diseaseId#2098423, CASE WHEN (isApproved#2098425 = cast(true as boolean)) THEN 1 ELSE 0 END AS isApproved#2098438]
                                                         :                                                                                         +- Project [targetId#2098421, drugId#2098422, diseaseId#2098423, isApproved#2098425]
                                                         :                                                                                            +- Relation [targetId#2098421,drugId#2098422,diseaseId#2098423,clinicalTrialId#2098424,isApproved#2098425,isComplex#2098426] parquet
                                                         +- Project [biosampleId#2091938, biosampleName#2091939]
                                                            +- Relation [biosampleId#2091938,biosampleName#2091939,description#2091940,xrefs#2091941,synonyms#2091942,parents#2091943,ancestors#2091944,children#2091945,descendants#2091946] parquet


In [None]:
benchmark.groupBy("")

### Take examples of the numbers that we observe in the statistics 

In [22]:
### include drugId and studyId becuase there are some studies where there are multiple drugs. 
# Everywthing will become coherent in chembl assoc since this
chemblAssoc = (

    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .join(drugApproved.filter(F.col("isApproved")==1), on=["targetId","diseaseId","drugId"], how="left")
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .withColumn(
        "approvedDrug",
        F.max(F.col("isApproved")).over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId","drugId", "studyId","clinicalPhase","maxClinPhase","approvedDrug")
        .pivot("homogenized")
        .count()
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

In [21]:
new_chemblAssoc.show()

+---------------+-------------+-------------+-----------+-------------+------------+---------------+---------------+
|       targetId|    diseaseId|       drugId|    studyId|clinicalPhase|maxClinPhase|drugGoF_protect|drugLoF_protect|
+---------------+-------------+-------------+-----------+-------------+------------+---------------+---------------+
|ENSG00000007314|  EFO_0000555|     CHEMBL79|NCT00108446|          2.0|         2.0|           null|              1|
|ENSG00000007314|  EFO_0000555|   CHEMBL1404|NCT02239926|          2.0|         2.0|           null|              1|
|ENSG00000007314|  EFO_0000555|    CHEMBL558|NCT01717404|          1.0|         2.0|           null|              1|
|ENSG00000007314|  EFO_0003102|    CHEMBL569|NCT02099240|          0.5|         0.5|           null|              1|
|ENSG00000007314|  EFO_0003894|     CHEMBL79|NCT01807455|          4.0|         4.0|           null|              1|
|ENSG00000007314|  EFO_0004699| CHEMBL220492|NCT00245583|       

In [16]:
discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "studyId","clinicalPhase","maxClinPhase")
        .pivot("homogenized")
        .count()
    ).groupBy("coherencyDiagonal").count().show()

25/01/08 12:01:01 WARN CacheManager: Asked to cache already cached data.
25/01/08 12:01:01 WARN CacheManager: Asked to cache already cached data.
25/01/08 12:01:01 WARN CacheManager: Asked to cache already cached data.


+-----------------+------+
|coherencyDiagonal| count|
+-----------------+------+
|           dispar|   760|
|         coherent|445772|
+-----------------+------+



In [18]:
discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "drugId","clinicalPhase","maxClinPhase")
        .pivot("homogenized")
        .count()
    ).groupBy("coherencyDiagonal").count().show()



+-----------------+------+
|coherencyDiagonal| count|
+-----------------+------+
|         coherent|197723|
+-----------------+------+



                                                                                

In [17]:
discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId","drugId", "studyId","clinicalPhase","maxClinPhase")
        .pivot("homogenized")
        .count()
    ).groupBy("coherencyDiagonal").count().show()



+-----------------+------+
|coherencyDiagonal| count|
+-----------------+------+
|         coherent|538206|
+-----------------+------+



                                                                                

In [4]:
assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        ).show()

+------------+---------------+-------------+-------------------+--------+----------+------+-----------+----+---------------------------+---------------------------+---------------------------------+--------------------------------+-----------------+-------------+-------------+----------+--------------------+------------------+--------+-------------+---------------------+--------------+-----------------+--------+----------------+---------------+----------+--------+-------------------+----------+----------------+--------------------+-------------------+-------------------------+-------------------------------------+-------------------------------------+--------------+-------------+------------+-------------------+-----------------+------------------------+-----------------------+-----------------------------+---------------------+----------+----------------------------+-------------------+--------------+---------+--------------------------------+--------------------------------+---------

### Runing analysis using drugId

In [None]:
##### Run the analysis using Drug Id


#### 08.01.2025
#### ALL PHASES 
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence")

credible = spark.read.parquet(f"{path}credibleSet")

### index with new fix" "gs://ot-team/irene/gentropy/study_index_2412_fixed"
index = spark.read.parquet(f"gs://ot-team/irene/gentropy/study_index_2412_fixed")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")


#### Fixing scXQTL as XQTLs:
## code provided by @ireneisdoomed
pd.DataFrame.iteritems = pd.DataFrame.items

raw_studies_metadata_schema: StructType = StructType(
        [
            StructField("study_id", StringType(), True),
            StructField("dataset_id", StringType(), True),
            StructField("study_label", StringType(), True),
            StructField("sample_group", StringType(), True),
            StructField("tissue_id", StringType(), True),
            StructField("tissue_label", StringType(), True),
            StructField("condition_label", StringType(), True),
            StructField("sample_size", IntegerType(), True),
            StructField("quant_method", StringType(), True),
            StructField("pmid", StringType(), True),
            StructField("study_type", StringType(), True),
        ]
    )
raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
            pd.read_csv(raw_studies_metadata_path, sep="\t"),
            schema=raw_studies_metadata_schema,
        )

#index = spark.read.parquet("gs://open-targets-pre-data-releases/24.12-uo_test-3/output/genetics/parquet/study_index")

study_index_w_correct_type = (
    study_table.select(
        F.concat_ws(
            "_",
            F.col("study_label"),
            F.col("quant_method"),
            F.col("sample_group"),
        ).alias("extracted_column"),
        "study_type",
    )
    .join(
        index
        # Get eQTL Catalogue studies
        .filter(F.col("studyType") != "gwas")
        .filter(~F.col("studyId").startswith("UKB_PPP"))
        # Remove measured trait
        .withColumn(
            "extracted_column",
            F.regexp_replace(F.col("studyId"), r"(_ENS.*|_ILMN.*|_X.*|_[0-9]+:.*)", ""),
        )
        .withColumn(
            "extracted_column",
            # After the previous cleanup, there are multiple traits from the same publication starting with the gene symbol that need to be removed (e.g. `Sun_2018_aptamer_plasma_ANXA2.4961.17.1..1`)
            F.when(
                F.col("extracted_column").startswith("Sun_2018_aptamer_plasma"),
                F.lit("Sun_2018_aptamer_plasma"),
            ).otherwise(F.col("extracted_column")),
        ),
        on="extracted_column",
        how="right",
    )
    .persist()
)

fixed = (
    study_index_w_correct_type.withColumn(
        "toFix",
        F.when(
            (F.col("study_type") != "single-cell")
            & (F.col("studyType").startswith("sc")),
            F.lit(True),
        ).otherwise(F.lit(False)),
    )
    # Remove the substring "sc" from the studyType column
    .withColumn(
        "newStudyType",
        F.when(
            F.col("toFix"), F.regexp_replace(F.col("studyType"), r"sc", "")
        ).otherwise(F.col("studyType")),
    )
    .drop("toFix", "extracted_column", "study_type")
).persist()
all_studies = index.join(
    fixed.selectExpr("studyId", "newStudyType"), on="studyId", how="left"
).persist()
fixedIndex = all_studies.withColumn(
    "studyType",
    F.when(F.col("newStudyType").isNotNull(), F.col("newStudyType")).otherwise(
        F.col("studyType")
    ),
).drop("newStudyType")
#### fixed  

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        fixedIndex.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId", "geneId", "projectId", "studyType as indexStudyType", "condition", "biosampleId"
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
).persist()

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

datasource_filter = [
    "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path, datasource_filter
)

drugApproved=spark.read.parquet("gs://ot-team/irene/l2g/validation/chembl_w_flags").drop("clinicalTrialId","isComplex"
).withColumn("isApproved", F.when(F.col("isApproved")=="true", F.lit(1)).otherwise(F.lit(0))).distinct()

### include drugId and studyId becuase there are some studies where there are multiple drugs. 
# Everywthing will become coherent in chembl assoc since this
### 1nd version of 
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .join(drugApproved.filter(F.col("isApproved")==1), on=["targetId","diseaseId","drugId"], how="left")
        .withColumn(
            "maxClinPhase", ### no longer maxclinphase for T-D
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId","drugId", "studyId","clinicalPhase")),
        )
        .withColumn(
        "approvedDrug",
        F.max(F.col("isApproved")).over(Window.partitionBy("targetId", "diseaseId")),
        )
        ## do not consider drugId so when there are two drugs with diferent MoA at the same clinical phase we will not include it (coherency==coherent)
        .groupBy("targetId", "diseaseId", "studyId","drugId","clinicalPhase","maxClinPhase","approvedDrug")  ## we can remove drugID
        .pivot("homogenized")
        .count()
    )
    .withColumnRenamed("studyId","clinicalStudyId")
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", 
        "coherencyOneCell", 
        "noEvaluable", 
        "GoF_risk", 
        "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

In [None]:
benchmark = (
    (
        resolvedColoc.filter(F.col("betaGwas") < 0)
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="inner"
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )
    .filter(F.col("name") != "COVID-19")  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


In [None]:
####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *

def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st

#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    '''
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    '''
    
    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results



#### 3 Loop over different datasets (as they will have different rows and columns)

def comparisons_df_iterative(disdic,projectId):
    toAnalysis=[(key, value) for key, value in disdic.items() if value == projectId]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            #("nPhase4", "clinical"),
            #("nPhase>=3", "clinical"),
            #("nPhase>=2", "clinical"),
            #("nPhase>=1", "clinical"),
            ("approved", "clinical"),

            # ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

print("looping for variables_study")

for variable in variables_study:
    print("analysing",variable)
    #### build list of comparison and prediction columns
    rows=comparisons_df_iterative(disdic,variable)
    #### prepare aggregation depending on the variable problem
    window_spec = Window.partitionBy("targetId","diseaseId",variable).orderBy(F.col("pValueExponent").asc())
    #### take directionality from lowest p value
    bench2=benchmark.withColumn("agree_lowestPval", F.first("AgreeDrug").over(window_spec)
        ).groupBy("targetId", "diseaseId", "clinicalStudyId","clinicalPhase","maxClinPhase","approvedDrug").pivot(variable).agg(F.collect_set("agree_lowestPVal")
        ).withColumn(
        "Phase4",
        F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
        "approved",
            F.when(F.col("approvedDrug")==1, F.lit("yes")).otherwise(F.lit("no")),
        )
    #### build columns yes/no for each distinct value in the column variable
    for x, value in [(key, val) for key, val in disdic.items() if val == variable]:
        print("building columns: ", x,"and",value)
        bench2 = bench2.withColumn(
            x, 
            F.when(F.array_contains(F.col(x),"yes"), F.lit("yes")).otherwise(F.lit("no"))
        )
    #### doing aggregations per 
    for row in rows:
        print("row:",row)
        results = aggregations_original(bench2, "propagated", listado, *row, today_date)
        result_all.append(results)
    
    schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

# Convert list of lists to DataFrame
df = spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
df.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis_fixedIndex_fixedTotalNumber_CoherentThing.csv"
)

print("dataframe written \n Analysis finished")