In [1]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
    buildColocData,
    gwasDataset,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd
from functools import reduce

from pyspark.sql import SparkSession

# --- YARN and Spark Configuration Parameters ---
# These parameters directly influence how Spark requests resources from YARN
# and how memory is managed within your Spark application.

# 1. spark.driver.memory: Memory allocated to the Spark driver program.
#    The driver is responsible for coordinating tasks, scheduling, and collecting
#    results. If you're doing operations like 'collect()' on large datasets,
#    or working with large broadcast variables, increase this.
#    General Guideline: Start with 2g-4g for interactive use, up to 8g-16g
#    for very large metadata or small result collection.
driver_memory = "4g"

# 2. spark.executor.memory: Memory allocated to each Spark executor JVM.
#    Executors are the worker processes that perform the actual data processing.
#    This is *the most critical* setting for memory-related YARN issues.
#    If your tasks are failing due to OOM errors, increase this significantly.
#    General Guideline: Depends on your node size and data. Common values are
#    4g, 8g, 16g, or even more. Ensure it doesn't exceed YARN's max container size.
executor_memory = "8g"

# 3. spark.executor.cores: Number of virtual cores (CPU) allocated to each executor.
#    More cores means an executor can run more tasks concurrently.
#    General Guideline: Typically 2-5 cores per executor. Avoid 1 core (poor parallelism)
#    and too many cores (can lead to fewer executors and memory contention).
executor_cores = "4"

# 4. spark.executor.instances: The total number of executors to launch.
#    This determines the overall parallelism of your application across the cluster.
#    General Guideline: Calculate based on your total cluster resources.
#    (Total available cores on cluster / executor_cores).
#    Start with a reasonable number, e.g., 5-20, and scale up.
num_executors = "10" # Example: 10 executors

# 5. spark.yarn.executor.memoryOverhead: Additional memory for the YARN container
#    beyond the JVM heap (spark.executor.memory). This includes off-heap memory,
#    PySpark's Python process memory, thread stacks, etc.
#    Crucial for PySpark! If this is too low, YARN can kill your containers
#    even if your Java heap (executor_memory) is fine.
#    General Guideline: 10-20% of spark.executor.memory, or a fixed amount like 1g-2g.
#    For PySpark, it's often safer to allocate more.
executor_memory_overhead = "2g" # For an 8g executor, 2g overhead is reasonable (25%)

# 6. spark.sql.shuffle.partitions: The number of partitions used for shuffling data
#    during operations like `groupBy`, `join`, `agg`, `sort`.
#    If this is too low: You can get OOM errors if partitions are too large,
#    or task failures due to data skew.
#    If this is too high: Creates many small tasks, leading to overhead.
#    General Guideline: A common heuristic is 2-4 times the total number of CPU cores
#    available in your application (executor_cores * num_executors).
#    For your current setup (4 cores * 10 executors = 40 cores), 400 is very high.
#    Consider (num_executors * executor_cores * 2) as a starting point.
#    Example: 10 executors * 4 cores/executor = 40 total cores. 40 * 2 = 80 partitions.
#    However, if you have *very* large datasets or significant data skew, 400 might be okay,
#    but it's usually better to start lower and increase if you see skew/large partition processing.
shuffle_partitions = "150" # Adjust based on data size and parallelism

# 7. spark.default.parallelism: This parameter is important for RDD operations (less so for DataFrames,
#    where spark.sql.shuffle.partitions is more relevant for shuffles). It suggests the default
#    number of partitions for RDDs created from scratch, and also influences the number of tasks.
#    It's often set to match or be a multiple of the total number of cores.
default_parallelism = str(int(executor_cores) * int(num_executors) * 2) # A common heuristic

# --- Build the SparkSession ---
# Use the .config() method to set these parameters before calling .getOrCreate()
# This ensures Spark requests the correct resources from YARN at the start.

spark = SparkSession.builder \
    .appName("MyOptimizedPySparkApp") \
    .config("spark.master", "yarn") \
    .config("spark.driver.memory", driver_memory) \
    .config("spark.executor.memory", executor_memory) \
    .config("spark.executor.cores", executor_cores) \
    .config("spark.executor.instances", num_executors) \
    .config("spark.yarn.executor.memoryOverhead", executor_memory_overhead) \
    .config("spark.sql.shuffle.partitions", shuffle_partitions) \
    .config("spark.default.parallelism", default_parallelism) \
    .getOrCreate()

print(f"SparkSession created successfully with the following configurations:")
print(f"  spark.driver.memory: {spark.conf.get('spark.driver.memory')}")
print(f"  spark.executor.memory: {spark.conf.get('spark.executor.memory')}")
print(f"  spark.executor.cores: {spark.conf.get('spark.executor.cores')}")
print(f"  spark.executor.instances: {spark.conf.get('spark.executor.instances')}")
print(f"  spark.yarn.executor.memoryOverhead: {spark.conf.get('spark.yarn.executor.memoryOverhead')}")
print(f"  spark.sql.shuffle.partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")
print(f"  spark.default.parallelism: {spark.conf.get('spark.default.parallelism')}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

# --- Your PySpark Code Here ---
# Now you can proceed with your data loading and processing.
# Example:
# df = spark.read.parquet("hdfs:///user/your_user/your_large_data.parquet")
# print(f"Number of rows in DataFrame: {df.count()}")
# df.groupBy("some_column").agg({"another_column": "sum"}).show()

# Remember to stop the SparkSession when you are done
# spark.stop()

path_n='gs://open-targets-data-releases/25.06/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

#### FIRST MODULE: BUILDING COLOC 
newColoc=buildColocData(all_coloc,credible,index)

print("loaded newColoc")

### SECOND MODULE: PROCESS EVIDENCES TO AVOID EXCESS OF COLUMNS 
gwasComplete = gwasDataset(evidences,credible)

#### THIRD MODULE: INCLUDE COLOC IN THE 
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")


#### FOURTH MODULE BUILDING CHEMBL ASSOCIATIONS - HERE TAKE CARE WITH FILTERING STEP 
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    # toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase>=4", "clinical"),
            ('Phase>=3','clinical'),
            ('Phase>=2','clinical'),
            ('Phase>=1','clinical'),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOING WITH CHEMBL 

### FIFTH MODULE: BUILDING BENCHMARK OF THE DATASET TO EXTRACT EHE ANALYSIS 

resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))
benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


spark session created at 2025-07-15 14:16:54.226009
Analysis started on 2025-07-15 at  2025-07-15 14:16:54.226009


25/07/15 14:16:59 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/15 14:16:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


SparkSession created successfully with the following configurations:
  spark.driver.memory: 4g
  spark.executor.memory: 8g
  spark.executor.cores: 4
  spark.executor.instances: 10
  spark.yarn.executor.memoryOverhead: 2g
  spark.sql.shuffle.partitions: 150
  spark.default.parallelism: 80
Spark UI available at: http://jr-temp-doe-m.c.open-targets-eu-dev.internal:40257


                                                                                

loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study


In [2]:
doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]

In [4]:
benchmark.columns

['biosampleId',
 'targetId',
 'diseaseId',
 'leftStudyLocusId',
 'rightStudyId',
 'rightStudyLocusId',
 'chromosome',
 'rightStudyType',
 'numberColocalisingVariants',
 'clpp',
 'colocalisationMethod',
 'betaRatioSignAverage',
 'h0',
 'h1',
 'h2',
 'h3',
 'h4',
 'leftStudyId',
 'leftVariantId',
 'credibleLeftStudyType',
 'rightVariantId',
 'credibleRightStudyType',
 'qtlPValueExponent',
 'isTransQtl',
 'projectId',
 'indexStudyType',
 'condition',
 'datasourceId',
 'datatypeId',
 'diseaseFromSourceMappedId',
 'resourceScore',
 'targetFromSourceId',
 'id',
 'score',
 'sourceId',
 'studyId',
 'variantId',
 'betaGwas',
 'pValueExponent',
 'name',
 'therapeuticAreas',
 'colocDoE',
 'maxClinPhase',
 'drugGoF_protect',
 'drugLoF_protect',
 'AgreeDrug',
 'biosampleName']

In [3]:
test=discrepancifier(
    benchmark.groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect", 'actionType2'
    )
    .pivot("colocDoE")
    .count()
    .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
    .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')

).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
).withColumn("maxDoE_names", F.array(*conditions)
).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
).withColumn(
    "NoneCellYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
        ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
).withColumn(
    "NdiagonalYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).withColumn(
    "drugCoherency",
    F.when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
    )
    .otherwise(F.lit("other")),
).persist()

                                                                                

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `actionType2` cannot be resolved. Did you mean one of the following? [`chromosome`, `clpp`, `colocDoE`, `name`, `score`].;
'Pivot ArrayBuffer(targetId#2088, diseaseId#2170, maxClinPhase#6197, drugLoF_protect#11720L, drugGoF_protect#11714L, 'actionType2), colocDoE#2044: string, [null, GoF_protect, GoF_risk, LoF_protect, LoF_risk], [count(1)]
+- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 23 more fields]
   +- Join LeftOuter, (biosampleId#1002 = biosampleId#1059)
      :- Project [targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 22 more fields]
      :  +- Project [targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 21 more fields]
      :     +- Join RightOuter, ((targetId#1801 = targetId#2088) AND (diseaseId#1960 = diseaseId#2170))
      :        :- Filter NOT (name#691 = COVID-19)
      :        :  +- Filter ((clpp#1082 >= 0.01) OR (h4#1098 >= 0.8))
      :        :     +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
      :        :        +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 17 more fields]
      :        :           +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
      :        :              +- Generate explode(concat(array(diseaseId#800), parents#694)), true, [diseaseId#1960]
      :        :                 +- Project [diseaseId#800, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
      :        :                    +- Join LeftOuter, (diseaseId#800 = diseaseId#1911)
      :        :                       :- Project [leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, indexStudyType#1200, ... 15 more fields]
      :        :                       :  +- Join Inner, ((leftStudyLocusId#1077 = leftStudyLocusId#1828) AND (targetId#1801 = targetId#718))
      :        :                       :     :- Project [rightStudyId#1140, rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, geneId#974 AS targetId#1801, projectId#975, indexStudyType#1200, ... 2 more fields]
      :        :                       :     :  +- Project [rightStudyId#1140, rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, geneId#974, projectId#975, indexStudyType#1200, ... 2 more fields]
      :        :                       :     :     +- Join LeftOuter, (rightStudyId#1140 = rightStudyId#1199)
      :        :                       :     :        :- Project [rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightStudyId#1140, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176]
      :        :                       :     :        :  +- Join LeftOuter, (rightStudyLocusId#1078 = rightStudyLocusId#1139)
      :        :                       :     :        :     :- Project [leftStudyLocusId#1077, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117]
      :        :                       :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#1077 = leftStudyLocusId#1114)
      :        :                       :     :        :     :     :- Union false, false
      :        :                       :     :        :     :     :  :- Project [leftStudyLocusId#1077, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, null AS h0#1094, null AS h1#1095, null AS h2#1096, null AS h3#1097, null AS h4#1098]
      :        :                       :     :        :     :     :  :  +- Relation [leftStudyLocusId#1077,rightStudyLocusId#1078,chromosome#1079,rightStudyType#1080,numberColocalisingVariants#1081L,clpp#1082,colocalisationMethod#1083,betaRatioSignAverage#1084] parquet
      :        :                       :     :        :     :     :  +- Project [leftStudyLocusId#949, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, null AS clpp#1093, colocalisationMethod#959, betaRatioSignAverage#960, h0#954, h1#955, h2#956, h3#957, h4#958]
      :        :                       :     :        :     :     :     +- Relation [leftStudyLocusId#949,rightStudyLocusId#950,chromosome#951,rightStudyType#952,numberColocalisingVariants#953L,h0#954,h1#955,h2#956,h3#957,h4#958,colocalisationMethod#959,betaRatioSignAverage#960] parquet
      :        :                       :     :        :     :     +- Project [studyLocusId#895 AS leftStudyLocusId#1114, StudyId#896 AS leftStudyId#1115, variantId#897 AS leftVariantId#1116, studyType#920 AS credibleLeftStudyType#1117]
      :        :                       :     :        :     :        +- Relation [studyLocusId#895,studyId#896,variantId#897,chromosome#898,position#899,region#900,beta#901,zScore#902,pValueMantissa#903,pValueExponent#904,effectAlleleFrequencyFromSource#905,standardError#906,subStudyDescription#907,qualityControls#908,finemappingMethod#909,credibleSetIndex#910,credibleSetlog10BF#911,purityMeanR2#912,purityMinR2#913,locusStart#914,locusEnd#915,sampleSize#916,ldSet#917,locus#918,... 3 more fields] parquet
      :        :                       :     :        :     +- Project [studyLocusId#1150 AS rightStudyLocusId#1139, studyId#1151 AS rightStudyId#1140, variantId#1152 AS rightVariantId#1141, studyType#1175 AS credibleRightStudyType#1142, pValueExponent#1159 AS qtlPValueExponent#1143, isTransQtl#1176]
      :        :                       :     :        :        +- Relation [studyLocusId#1150,studyId#1151,variantId#1152,chromosome#1153,position#1154,region#1155,beta#1156,zScore#1157,pValueMantissa#1158,pValueExponent#1159,effectAlleleFrequencyFromSource#1160,standardError#1161,subStudyDescription#1162,qualityControls#1163,finemappingMethod#1164,credibleSetIndex#1165,credibleSetlog10BF#1166,purityMeanR2#1167,purityMinR2#1168,locusStart#1169,locusEnd#1170,sampleSize#1171,ldSet#1172,locus#1173,... 3 more fields] parquet
      :        :                       :     :        +- Project [studyId#973 AS rightStudyId#1199, geneId#974, projectId#975, studyType#976 AS indexStudyType#1200, condition#998, biosampleId#1002]
      :        :                       :     :           +- Relation [studyId#973,geneId#974,projectId#975,studyType#976,traitFromSource#977,traitFromSourceMappedIds#978,biosampleFromSourceId#979,pubmedId#980,publicationTitle#981,publicationFirstAuthor#982,publicationDate#983,publicationJournal#984,backgroundTraitFromSourceMappedIds#985,initialSampleSize#986,nCases#987,nControls#988,nSamples#989,cohorts#990,ldPopulationStructure#991,discoverySamples#992,replicationSamples#993,qualityControls#994,analysisFlags#995,summarystatsLocation#996,... 6 more fields] parquet
      :        :                       :     +- Project [studyLocusId#798 AS leftStudyLocusId#1828, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1845, variantId#1846, betaGwas#1780, pValueExponent#1853]
      :        :                       :        +- Project [studyLocusId#798, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1845, variantId#1846, betaGwas#1780, pValueExponent#1853]
      :        :                       :           +- Join LeftOuter, (studyLocusId#798 = studyLocusId#1844)
      :        :                       :              :- Project [datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, studyLocusId#798, diseaseId#800, id#801, score#802, sourceId#805]
      :        :                       :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
      :        :                       :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,beta#723,betaConfidenceIntervalLower#724,betaConfidenceIntervalUpper#725,biologicalModelAllelicComposition#726,biologicalModelGeneticBackground#727,biologicalModelId#728,biomarkerName#729,biomarkers#730,biosamplesFromSource#731,cellType#732,clinicalPhase#733,clinicalSignificances#734,clinicalStatus#735,cohortDescription#736,cohortId#737,cohortPhenotypes#738,cohortShortName#739,confidence#740,... 65 more fields] parquet
      :        :                       :              +- Project [studyLocusId#1844, studyId#1845, variantId#1846, beta#1850 AS betaGwas#1780, pValueExponent#1853]
      :        :                       :                 +- Relation [studyLocusId#1844,studyId#1845,variantId#1846,chromosome#1847,position#1848,region#1849,beta#1850,zScore#1851,pValueMantissa#1852,pValueExponent#1853,effectAlleleFrequencyFromSource#1854,standardError#1855,subStudyDescription#1856,qualityControls#1857,finemappingMethod#1858,credibleSetIndex#1859,credibleSetlog10BF#1860,purityMeanR2#1861,purityMinR2#1862,locusStart#1863,locusEnd#1864,sampleSize#1865,ldSet#1866,locus#1867,... 3 more fields] parquet
      :        :                       +- Project [id#689 AS diseaseId#1911, name#691, parents#694, therapeuticAreas#700]
      :        :                          +- Relation [id#689,code#690,name#691,description#692,dbXRefs#693,parents#694,synonyms#695,obsoleteTerms#696,obsoleteXRefs#697,children#698,ancestors#699,therapeuticAreas#700,descendants#701,ontology#702] parquet
      :        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, drugGoF_protect#11714L, LoF_protect#9894L AS drugLoF_protect#11720L]
      :           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L AS drugGoF_protect#11714L, LoF_protect#9894L]
      :              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L]
      :                 +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, GoF_risk#9908, LoF_risk#11522, coherencyDiagonal#11688, CASE WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnull(noEvaluable#9895L)) THEN noEvid WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnotnull(noEvaluable#9895L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11522) OR isnotnull(LoF_protect#9894L)) OR isnotnull(GoF_risk#9908)) OR isnotnull(GoF_protect#9893L)) THEN CASE WHEN (isnotnull(LoF_risk#11522) AND ((isnull(LoF_protect#9894L) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L))) THEN coherent WHEN (isnotnull(GoF_risk#9908) AND ((isnull(LoF_protect#9894L) AND isnull(LoF_risk#11522)) AND isnull(GoF_protect#9893L))) THEN coherent WHEN (isnotnull(LoF_protect#9894L) AND ((isnull(LoF_risk#11522) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L))) THEN coherent WHEN (isnotnull(GoF_protect#9893L) AND ((isnull(LoF_protect#9894L) AND isnull(GoF_risk#9908)) AND isnull(LoF_risk#11522))) THEN coherent ELSE dispar END END AS coherencyOneCell#11698]
      :                    +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, GoF_risk#9908, LoF_risk#11522, CASE WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnull(noEvaluable#9895L)) THEN noEvid WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnotnull(noEvaluable#9895L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11522) OR isnotnull(LoF_protect#9894L)) OR isnotnull(GoF_risk#9908)) OR isnotnull(GoF_protect#9893L)) THEN CASE WHEN (isnotnull(GoF_risk#9908) AND isnotnull(LoF_risk#11522)) THEN dispar WHEN (isnotnull(LoF_protect#9894L) AND isnotnull(LoF_risk#11522)) THEN dispar WHEN (isnotnull(GoF_protect#9893L) AND isnotnull(GoF_risk#9908)) THEN dispar WHEN (isnotnull(GoF_protect#9893L) AND isnotnull(LoF_protect#9894L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#11688]
      :                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, GoF_risk#9908, null AS LoF_risk#11522]
      :                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, null AS GoF_risk#9908]
      :                             +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, __pivot_count(targetId) AS `count(targetId)`#9892[0] AS GoF_protect#9893L, __pivot_count(targetId) AS `count(targetId)`#9892[1] AS LoF_protect#9894L, __pivot_count(targetId) AS `count(targetId)`#9892[2] AS noEvaluable#9895L]
      :                                +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#6197], [targetId#2088, diseaseId#2170, maxClinPhase#6197, pivotfirst(homogenized#4208, count(targetId)#9884L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#9892]
      :                                   +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#6197, homogenized#4208], [targetId#2088, diseaseId#2170, maxClinPhase#6197, homogenized#4208, count(targetId#2088) AS count(targetId)#9884L]
      :                                      +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 80 more fields]
      :                                         +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 81 more fields]
      :                                            +- Window [max(clinicalPhase#2103) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#6197], [targetId#2088, diseaseId#2170]
      :                                               +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
      :                                                  +- Filter (datasourceId#2087 = chembl)
      :                                                     +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
      :                                                        +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
      :                                                           +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
      :                                                              +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
      :                                                                 +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
      :                                                                    +- Window [collect_set(intogen_function#3791, 0, 0) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#3897], [targetId#2088, diseaseId#2170]
      :                                                                       +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 77 more fields]
      :                                                                          +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 77 more fields]
      :                                                                             +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 76 more fields]
      :                                                                                +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 75 more fields]
      :                                                                                   +- Join LeftOuter, ((drugId2#2875 = drugId#2121) AND (targetId2#2882 = targetId#2088))
      :                                                                                      :- Join LeftOuter, (target_id#2925 = targetId#2088)
      :                                                                                      :  :- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, concat_ws(,, clinicalSignificances#2104) AS clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
      :                                                                                      :  :  +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
      :                                                                                      :  :     +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, cast(beta#2093 as double) AS beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
      :                                                                                      :  :        +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2093, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
      :                                                                                      :  :           +- Filter datasourceId#2087 IN (gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
      :                                                                                      :  :              +- Relation [datasourceId#2087,targetId#2088,alleleOrigins#2089,allelicRequirements#2090,ancestry#2091,ancestryId#2092,beta#2093,betaConfidenceIntervalLower#2094,betaConfidenceIntervalUpper#2095,biologicalModelAllelicComposition#2096,biologicalModelGeneticBackground#2097,biologicalModelId#2098,biomarkerName#2099,biomarkers#2100,biosamplesFromSource#2101,cellType#2102,clinicalPhase#2103,clinicalSignificances#2104,clinicalStatus#2105,cohortDescription#2106,cohortId#2107,cohortPhenotypes#2108,cohortShortName#2109,confidence#2110,... 65 more fields] parquet
      :                                                                                      :  +- Project [id#2802 AS target_id#2925, approvedSymbol#2803, description#2910, description_splited#2914, TSorOncogene#2919]
      :                                                                                      :     +- Project [id#2802, approvedSymbol#2803, description#2910, description_splited#2914, CASE WHEN (RLIKE(description_splited#2914, ncogene) AND RLIKE(description_splited#2914, TSG)) THEN bivalent WHEN RLIKE(description_splited#2914, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2914, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2919]
      :                                                                                      :        +- Project [id#2802, approvedSymbol#2803, description#2910, concat_ws(,, description#2910) AS description_splited#2914]
      :                                                                                      :           +- Aggregate [id#2802, approvedSymbol#2803], [id#2802, approvedSymbol#2803, collect_set(description#2902, 0, 0) AS description#2910]
      :                                                                                      :              +- Filter description#2902 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
      :                                                                                      :                 +- Project [id#2802, approvedSymbol#2803, col#2897.description AS description#2902]
      :                                                                                      :                    +- Project [id#2802, approvedSymbol#2803, col#2897]
      :                                                                                      :                       +- Generate explode(hallmarks#2812.attributes), true, [col#2897]
      :                                                                                      :                          +- Relation [id#2802,approvedSymbol#2803,biotype#2804,transcriptIds#2805,canonicalTranscript#2806,canonicalExons#2807,genomicLocation#2808,alternativeGenes#2809,approvedName#2810,go#2811,hallmarks#2812,synonyms#2813,symbolSynonyms#2814,nameSynonyms#2815,functionDescriptions#2816,subcellularLocations#2817,targetClass#2818,obsoleteSymbols#2819,obsoleteNames#2820,constraint#2821,tep#2822,proteinIds#2823,dbXrefs#2824,chemicalProbes#2825,... 5 more fields] parquet
      :                                                                                      +- Aggregate [targetId2#2882, drugId2#2875], [targetId2#2882, drugId2#2875, collect_set(actionType#2860, 0, 0) AS actionType#2892]
      :                                                                                         +- Project [targetId2#2882, drugId2#2875, actionType#2860, mechanismOfAction#2861]
      :                                                                                            +- Generate explode(targets#2865), true, [targetId2#2882]
      :                                                                                               +- Project [drugId2#2875, actionType#2860, mechanismOfAction#2861, targets#2865]
      :                                                                                                  +- Generate explode(chemblIds#2862), true, [drugId2#2875]
      :                                                                                                     +- Relation [actionType#2860,mechanismOfAction#2861,chemblIds#2862,targetName#2863,targetType#2864,targets#2865,references#2866] parquet
      +- Project [biosampleId#1059, biosampleName#1060]
         +- Relation [biosampleId#1059,biosampleName#1060,description#1061,xrefs#1062,synonyms#1063,parents#1064,ancestors#1065,children#1066,descendants#1067] parquet


In [33]:
test.count()

                                                                                

74187

In [14]:
test.groupBy('NdiagonalYes','NOneCellYes').pivot('maxClinPhase').count().show()

                                                                                

+------------+-----------+---+---+---+---+---+
|NdiagonalYes|NOneCellYes|0.5|1.0|2.0|3.0|4.0|
+------------+-----------+---+---+---+---+---+
|         yes|         no|  1| 15| 41| 36| 38|
|          no|         no|  1| 21| 78| 56| 61|
|         yes|        yes|  3| 13| 34| 49| 41|
+------------+-----------+---+---+---+---+---+



In [48]:
test.groupBy('coherencyDiagonal').count().show()

+-----------------+-----+
|coherencyDiagonal|count|
+-----------------+-----+
|           noEvid|73699|
|           dispar|  225|
|         coherent|  263|
+-----------------+-----+



In [52]:
test.join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
    "PhaseT",
    F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
).withColumn(
    "phase4Clean",
    F.when(
        (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase3Clean",
    F.when(
        (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase2Clean",
    F.when(
        (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase1Clean",
    F.when(
        (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "hasGenetics",
    F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
).groupBy(
    "hasGenetics"
).pivot(
    "phase3Clean"
).count().sort(
    F.col("hasGenetics").desc()
).select(
    "hasGenetics", "yes", "no"
).show()



+-----------+-----+-----+
|hasGenetics|  yes|   no|
+-----------+-----+-----+
|        yes|  242|  246|
|         no|23145|50554|
+-----------+-----+-----+



                                                                                

In [53]:
print('NoneCellYes','Phase4Clean',(32*68628)/(5419*108))
print('NoneCellYes','Phase3Clean',(79*50739)/(23308*61))
print('NdiagonalYes','Phase4Clean',(64*68529)/(5387*207))
print('NdiagonalYes','Phase3Clean',(144*50673)/(23243*127))
print('hasGenetics','Phase4Clean',(113*68361)/(375*5338))
print('hasGenetics','Phase3Clean',(242*50554)/(23145*246))

NoneCellYes Phase4Clean 3.752393840601997
NoneCellYes Phase3Clean 2.819253644003185
NdiagonalYes Phase4Clean 3.933118645800545
NdiagonalYes Phase3Clean 2.471970055500581
hasGenetics Phase4Clean 3.8590198576245784
hasGenetics Phase3Clean 2.1487139226544567


### Now let´s curate the mechanism of action

In [3]:
### drug mechanism of action
mecact_path = f"{path_n}drug_mechanism_of_action/" #  mechanismOfAction == old version
mecact = spark.read.parquet(mecact_path)

inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",  ## lost at 31.01.2025
    "STABILISER",
    # "EXOGENOUS GENE", ## added 24.06.2025
    # "EXOGENOUS PROTEIN" ## added 24.06.2025
]


actionType = (
        mecact.select(
            F.explode_outer("chemblIds").alias("drugId"),
            "actionType",
            "mechanismOfAction",
            "targets",
        )
        .select(
            F.explode_outer("targets").alias("targetId"),
            "drugId",
            "actionType",
            "mechanismOfAction",
        )
        .groupBy("targetId", "drugId")
        .agg(F.collect_set("actionType").alias("actionType2"))
    ).withColumn('nMoA', F.size(F.col('actionType2')))

In [4]:
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl")).join(actionType, on=['targetId','drugId'], how='left')
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase",'actionType2')
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

                                                                                

In [5]:
benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

In [8]:
test2=discrepancifier(
    benchmark.groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect", 'actionType2'
    )
    .pivot("colocDoE")
    .count()
    .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
    .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')

).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
).withColumn("maxDoE_names", F.array(*conditions)
).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
).withColumn(
    "NoneCellYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
        ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
).withColumn(
    "NdiagonalYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).withColumn(
    "drugCoherency",
    F.when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
    )
    .otherwise(F.lit("other")),
).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
    "PhaseT",
    F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
).withColumn(
    "phase4Clean",
    F.when(
        (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase3Clean",
    F.when(
        (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase2Clean",
    F.when(
        (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase1Clean",
    F.when(
        (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "hasGenetics",
    F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
).withColumn( ### convert array column actionType2 to string
    "actionType",
    F.concat_ws(", ", F.col("actionType2"))
).persist()

### for NoneCellYes
test3=test2.groupBy('targetId','diseaseId','NoneCellYes','phase4Clean').pivot('actionType').agg(F.collect_set(F.col('NoneCellYes'))).fillna(0).withColumn('data', F.lit('Drug_NoneCellYes'))
### for NdiagonalYes
test4=test2.groupBy('targetId','diseaseId','NoneCellYes','phase4Clean').pivot('actionType').agg(F.collect_set(F.col('NdiagonalYes'))).fillna(0).withColumn('data', F.lit('Drug_NdiagonalYes'))


                                                                                

In [34]:
print(len(test3.columns),
len(test4.columns))

40 40


#### continue here on 14.07.2025
 - How to make the comparisons of the drug MoA
 - the column of clinical phases is curated with PhaseT

In [None]:
array_columns_to_transform=test3.columns[5:]
# Iterate over the columns and apply the transformation
for col_name in array_columns_to_transform:
    test3 = test3.withColumn(
        col_name,
        F.when(
            F.array_contains(F.col(col_name),'yes'), F.lit('yes')
        )
        .otherwise(
            "no" 
        )
    )
    test4 = test4.withColumn(
        col_name,
        F.when(
            F.array_contains(F.col(col_name),'yes'), F.lit('yes')
        )
        .otherwise(
            "no"
        )
    )

In [10]:
### Select columns to convert to yes/no
columns_to_transform = test3.columns[5:]

In [6]:
negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

#### testing pivoting operations


In [None]:
benchmark.persist()

DataFrame[biosampleId: string, targetId: string, diseaseId: string, leftStudyLocusId: string, rightStudyId: string, rightStudyLocusId: string, chromosome: string, rightStudyType: string, numberColocalisingVariants: bigint, clpp: double, colocalisationMethod: string, betaRatioSignAverage: double, h0: double, h1: double, h2: double, h3: double, h4: double, leftStudyId: string, leftVariantId: string, credibleLeftStudyType: string, rightVariantId: string, credibleRightStudyType: string, qtlPValueExponent: int, isTransQtl: boolean, projectId: string, indexStudyType: string, condition: string, datasourceId: string, datatypeId: string, diseaseFromSourceMappedId: string, resourceScore: double, targetFromSourceId: string, id: string, score: double, sourceId: string, studyId: string, variantId: string, betaGwas: double, pValueExponent: int, name: string, therapeuticAreas: array<string>, colocDoE: string, maxClinPhase: double, actionType2: array<string>, drugGoF_protect: bigint, drugLoF_protect: 



In [22]:
benchmark.columns

['biosampleId',
 'targetId',
 'diseaseId',
 'leftStudyLocusId',
 'rightStudyId',
 'rightStudyLocusId',
 'chromosome',
 'rightStudyType',
 'numberColocalisingVariants',
 'clpp',
 'colocalisationMethod',
 'betaRatioSignAverage',
 'h0',
 'h1',
 'h2',
 'h3',
 'h4',
 'leftStudyId',
 'leftVariantId',
 'credibleLeftStudyType',
 'rightVariantId',
 'credibleRightStudyType',
 'qtlPValueExponent',
 'isTransQtl',
 'projectId',
 'indexStudyType',
 'condition',
 'datasourceId',
 'datatypeId',
 'diseaseFromSourceMappedId',
 'resourceScore',
 'targetFromSourceId',
 'id',
 'score',
 'sourceId',
 'studyId',
 'variantId',
 'betaGwas',
 'pValueExponent',
 'name',
 'therapeuticAreas',
 'colocDoE',
 'maxClinPhase',
 'actionType2',
 'drugGoF_protect',
 'drugLoF_protect',
 'AgreeDrug',
 'biosampleName']

In [26]:
current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", "maxClinPhase", "projectId").orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())
benchmark.withColumn('qtlColocDoE',F.first('colocDoE').over(current_col_pvalue_order_window)).show()

+-----------+---------------+-------------+----------------+------------+-----------------+----------+--------------+--------------------------+----+--------------------+--------------------+----+----+----+----+----+-----------+-------------+---------------------+--------------+----------------------+-----------------+----------+---------+--------------+---------+------------+----------+-------------------------+-------------+------------------+----+-----+--------+-------+---------+--------+--------------+----+----------------+--------+------------+------------+---------------+---------------+---------+-------------+-----------+
|biosampleId|       targetId|    diseaseId|leftStudyLocusId|rightStudyId|rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|clpp|colocalisationMethod|betaRatioSignAverage|  h0|  h1|  h2|  h3|  h4|leftStudyId|leftVariantId|credibleLeftStudyType|rightVariantId|credibleRightStudyType|qtlPValueExponent|isTransQtl|projectId|indexStudyType|conditi

In [19]:
benchmark.filter(F.col('biosampleId').isNotNull()).show()

+--------------+---------------+-----------+--------------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+---------------+---------------------+-----------------+----------------------+-----------------+----------+--------------+--------------+------------+------------------+-------------------+-------------------------+------------------+------------------+--------------------+------------------+------------------+--------------------+---------------+-------------------+--------------+--------------------+--------------------+-----------+------------+--------------------+---------------+---------------+---------+--------------------+
|   biosampleId|       targetId|  diseaseId|    leftStudyLocusId|        rightStudyId|   rightStudyLocusId|chromosom

In [None]:
# --- Prepare dataset for iterative pivoting ---
current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", "maxClinPhase", col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())
test2=discrepancifier(
    benchmark
    .withColumn('colocDoE',)
    .groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",'actionType2', 'biosampleName', 'projectId', 'rightStudyType','colocalisationMethod','qtlPValueExponent'
    )
    .pivot("colocDoE")
    .count()
    .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
    .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')

).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
).withColumn("maxDoE_names", F.array(*conditions)
).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
).withColumn(
    "NoneCellYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
        ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
).withColumn(
    "NdiagonalYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).withColumn(
    "drugCoherency",
    F.when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
    )
    .otherwise(F.lit("other")),
).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
    "PhaseT",
    F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
).withColumn(
    "phase4Clean",
    F.when(
        (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase3Clean",
    F.when(
        (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase2Clean",
    F.when(
        (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase1Clean",
    F.when(
        (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "hasGenetics",
    F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
).withColumn( ### convert array column actionType2 to string
    "actionType2",
    F.concat_ws(", ", F.col("actionType2"))
).persist()


In [None]:
# --- Prepare dataset for iterative pivoting ---
test2=discrepancifier(
    benchmark.groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",'actionType2', 'biosampleName', 'projectId', 'rightStudyType',
    )
    .pivot("colocDoE")
    .count()
    .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
    .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')

).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
).withColumn("maxDoE_names", F.array(*conditions)
).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
).withColumn(
    "NoneCellYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
        ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
).withColumn(
    "NdiagonalYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).withColumn(
    "drugCoherency",
    F.when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
    )
    .otherwise(F.lit("other")),
).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
    "PhaseT",
    F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
).withColumn(
    "phase4Clean",
    F.when(
        (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase3Clean",
    F.when(
        (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase2Clean",
    F.when(
        (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase1Clean",
    F.when(
        (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "hasGenetics",
    F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
).withColumn( ### convert array column actionType2 to string
    "actionType2",
    F.concat_ws(", ", F.col("actionType2"))
).persist()


                                                                                

In [None]:
# --- Configuration for your iterative pivoting ---
group_by_columns = ['targetId', 'diseaseId','phase4Clean','phase3Clean','phase2Clean','phase1Clean','PhaseT']
columns_to_pivot_on = ['actionType2', 'biosampleName', 'projectId', 'rightStudyType']
columns_to_aggregate = ['NoneCellYes', 'NdiagonalYes','hasGenetics'] # The values you want to collect in the pivoted cells
all_pivoted_dfs = {}
current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())

In [29]:
benchmark.columns

['biosampleId',
 'targetId',
 'diseaseId',
 'leftStudyLocusId',
 'rightStudyId',
 'rightStudyLocusId',
 'chromosome',
 'rightStudyType',
 'numberColocalisingVariants',
 'clpp',
 'colocalisationMethod',
 'betaRatioSignAverage',
 'h0',
 'h1',
 'h2',
 'h3',
 'h4',
 'leftStudyId',
 'leftVariantId',
 'credibleLeftStudyType',
 'rightVariantId',
 'credibleRightStudyType',
 'qtlPValueExponent',
 'isTransQtl',
 'projectId',
 'indexStudyType',
 'condition',
 'datasourceId',
 'datatypeId',
 'diseaseFromSourceMappedId',
 'resourceScore',
 'targetFromSourceId',
 'id',
 'score',
 'sourceId',
 'studyId',
 'variantId',
 'betaGwas',
 'pValueExponent',
 'name',
 'therapeuticAreas',
 'colocDoE',
 'maxClinPhase',
 'actionType2',
 'drugGoF_protect',
 'drugLoF_protect',
 'AgreeDrug',
 'biosampleName']

In [68]:
# --- Nested Loops for Dynamic Pivoting ---
for agg_col_name in columns_to_aggregate:
    for pivot_col_name in columns_to_pivot_on:
        print(f"\n--- Creating DataFrame for Aggregation: '{agg_col_name}' and Pivot: '{pivot_col_name}' ---")
        current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", "maxClinPhase", pivot_col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())
        test2=discrepancifier(benchmark.withColumn('actionType2', F.concat_ws(",", F.col("actionType2"))).withColumn('qtlColocDoE',F.first('colocDoE').over(current_col_pvalue_order_window)).groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",pivot_col_name)
        .pivot("colocDoE")
        .count()
        .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
        .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
    ).withColumn(
        "maxDoE", F.array_max(F.col("arrayN"))
    ).withColumn("maxDoE_names", F.array(*conditions)
    ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
    ).withColumn(
        "NoneCellYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
        .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
            ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
    ).withColumn(
        "NdiagonalYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
            F.lit("yes")
        ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
            F.lit("yes")
        ).otherwise(F.lit('no'))
    ).withColumn(
        "drugCoherency",
        F.when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
        )
        .otherwise(F.lit("other")),
    ).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "phase4Clean",
        F.when(
            (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase3Clean",
        F.when(
            (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase2Clean",
        F.when(
            (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase1Clean",
        F.when(
            (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "hasGenetics",
        F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
    )
        # 1. Get distinct values for the pivot column (essential for pivot())
        # This brings a small amount of data to the driver, but is necessary for the pivot schema.
        distinct_pivot_values = [row[0] for row in test2.select(pivot_col_name).distinct().collect()]
        print(f"Distinct values for '{pivot_col_name}': {distinct_pivot_values}")

        # 2. Perform the groupBy, pivot, and aggregate operations
        # The .pivot() function requires the list of distinct values for better performance
        # and correct schema inference.
        pivoted_df = (
            test2.groupBy(*group_by_columns)
            .pivot(pivot_col_name, distinct_pivot_values) # Provide distinct values
            .agg(F.collect_set(F.col(agg_col_name))) # Collect all values into a set
            .fillna(0) # Fill cells that have no data with an empty list instead of null
        )

        # 3. Add the 'data' literal column dynamically
        # This column indicates which aggregation column was used.
        pivoted_df = pivoted_df.withColumn('data', F.lit(f'Drug_{agg_col_name}'))

        array_columns_to_convert = [
            field.name for field in pivoted_df.schema.fields
            if isinstance(field.dataType, ArrayType)
        ]
        print(f"Identified ArrayType columns for conversion: {array_columns_to_convert}")

        # 4. Apply the conversion logic to each identified array column
        df_after_conversion = pivoted_df # Start with the pivoted_df
        for col_to_convert in array_columns_to_convert:
            df_after_conversion = df_after_conversion.withColumn(
                col_to_convert,
                F.when(F.col(col_to_convert).isNull(), F.lit('no'))          # Handle NULLs (from pivot for no data)
                .when(F.size(F.col(col_to_convert)) == 0, F.lit('no'))       # Empty array -> 'no'
                .when(F.array_contains(F.col(col_to_convert), F.lit('yes')), F.lit('yes')) # Contains 'yes' -> 'yes'
                .when(F.array_contains(F.col(col_to_convert), F.lit('no')), F.lit('no'))   # Contains 'no' -> 'no'
                .otherwise(F.lit('no')) # Fallback for unexpected array content (e.g., ['other'], ['yes','no'])
            )

        # 4. Generate a unique name for this DataFrame and store it
        df_key = f"df_pivot_{agg_col_name.lower()}_by_{pivot_col_name.lower()}"
        all_pivoted_dfs[df_key] = df_after_conversion.withColumnRenamed( 'phase4Clean','Phase>=4'
        ).withColumnRenamed('phase3Clean','Phase>=3'
        ).withColumnRenamed('phase2Clean','Phase>=2'
        ).withColumnRenamed('phase1Clean','Phase>=1')


# --- Accessing your generated DataFrames ---
print("\n--- All generated DataFrames are stored in 'all_pivoted_dfs' dictionary ---")
print("Keys available:", all_pivoted_dfs.keys())

# Example of accessing a specific DataFrame:
#if 'df_pivot_nonecellyes_by_actiontype' in all_pivoted_dfs:
#    print("\nExample: Showing df_pivot_nonecellyes_by_actiontype:")
#    all_pivoted_dfs['df_pivot_nonecellyes_by_actiontype'].show(truncate=False)

#if 'df_pivot_ndiagonallyes_by_biosamplename' in all_pivoted_dfs:
#    print("\nExample: Showing df_pivot_ndiagonallyes_by_biosamplename:")
#    all_pivoted_dfs['df_pivot_ndiagonallyes_by_biosamplename'].show(truncate=False)



--- Creating DataFrame for Aggregation: 'NoneCellYes' and Pivot: 'actionType2' ---


25/07/15 08:24:24 WARN CacheManager: Asked to cache already cached data.        
                                                                                

Distinct values for 'actionType2': ['NEGATIVE ALLOSTERIC MODULATOR', 'CROSS-LINKING AGENT', 'ANTAGONIST', 'BINDING AGENT', 'ACTIVATOR', 'BLOCKER', 'INHIBITOR', 'POSITIVE MODULATOR', 'OTHER', 'AGONIST', 'OPENER', 'HYDROLYTIC ENZYME', 'STABILISER', 'DISRUPTING AGENT', 'POSITIVE ALLOSTERIC MODULATOR', 'INVERSE AGONIST', 'RELEASING AGENT', 'MODULATOR', 'EXOGENOUS PROTEIN', 'EXOGENOUS GENE', 'PARTIAL AGONIST', 'INHIBITOR,DISRUPTING AGENT', 'INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', 'BINDING AGENT,AGONIST', '', 'AGONIST,PARTIAL AGONIST', 'ALLOSTERIC ANTAGONIST', 'RNAI INHIBITOR', 'ANTISENSE INHIBITOR', 'AGONIST,MODULATOR', 'VACCINE ANTIGEN', 'NEGATIVE MODULATOR', 'SUBSTRATE', 'DEGRADER', 'ANTAGONIST,INHIBITOR', 'PROTEOLYTIC ENZYME']
Identified ArrayType columns for conversion: ['NEGATIVE ALLOSTERIC MODULATOR', 'CROSS-LINKING AGENT', 'ANTAGONIST', 'BINDING AGENT', 'ACTIVATOR', 'BLOCKER', 'INHIBITOR', 'POSITIVE MODULATOR', 'OTHER', 'AGONIST', 'OPENER', 'HYDROLYTIC ENZYME', 'STABILISER', 'DISRU

25/07/15 08:24:28 WARN CacheManager: Asked to cache already cached data.        
                                                                                

Distinct values for 'biosampleName': ['dorsolateral prefrontal cortex', 'lymphoblastoid cell line', 'skeletal muscle tissue', 'tibial nerve', 'upper lobe of left lung', 'prostate gland', 'gastroesophageal sphincter', 'T-helper 17 cell', 'ascending aorta', 'induced pluripotent stem cell', 'adipose tissue', 'right atrium auricular region', 'blood plasma', 'CD14-positive, CD16-negative classical monocyte', 'stomach', 'left ventricle myocardium', 'thyroid gland', 'breast epithelium', 'right lobe of liver', 'spleen', 'esophagus muscularis mucosa', 'omental fat pad', 'esophagus squamous epithelium', 'tibial artery', 'blood', 'skin of body', 'CD4-positive, alpha-beta T cell', 'cerebellum', 'suprapubic skin', "Ammon's horn", "Peyer's patch", 'amygdala', 'transverse colon', 'substantia nigra', 'nucleus accumbens', 'sigmoid colon', 'caudate nucleus', 'C1 segment of cervical spinal cord', 'ovary', 'fibroblast', 'hypothalamus', 'CD8-positive, alpha-beta T cell', 'body of pancreas', 'pituitary glan

25/07/15 08:24:36 WARN CacheManager: Asked to cache already cached data.


Distinct values for 'projectId': ['ROSMAP', 'GENCORD', 'HipSci', 'GTEx', 'Schmiedel_2018', 'Lepik_2017', 'TwinsUK', 'UKB_PPP_EUR', 'FUSION', 'Quach_2016', 'GEUVADIS', 'Cytoimmgen', 'CommonMind', 'PhLiPS', 'Peng_2018', 'BrainSeq', 'van_de_Bunt_2015', 'PISA', 'Fairfax_2014', 'Nedelec_2016', 'Bossini-Castillo_2019', 'BLUEPRINT', 'Alasoo_2018', 'Steinberg_2020', 'iPSCORE', 'CAP', 'OneK1K', 'Walker_2019', 'Gilchrist_2021', 'Aygun_2021', 'Schwartzentruber_2018', 'Sun_2018', 'Kim-Hellmuth_2017', 'Jerber_2021', 'CEDAR', 'Young_2019', 'Fairfax_2012', 'Perez_2022', 'Nathan_2022', 'Kasela_2017', 'Braineac2', 'Naranbhai_2015', 'Randolph_2021', None]
Identified ArrayType columns for conversion: ['ROSMAP', 'GENCORD', 'HipSci', 'GTEx', 'Schmiedel_2018', 'Lepik_2017', 'TwinsUK', 'UKB_PPP_EUR', 'FUSION', 'Quach_2016', 'GEUVADIS', 'Cytoimmgen', 'CommonMind', 'PhLiPS', 'Peng_2018', 'BrainSeq', 'van_de_Bunt_2015', 'PISA', 'Fairfax_2014', 'Nedelec_2016', 'Bossini-Castillo_2019', 'BLUEPRINT', 'Alasoo_2018',

25/07/15 08:24:40 WARN CacheManager: Asked to cache already cached data.        


Distinct values for 'rightStudyType': ['eqtl', 'tuqtl', 'pqtl', 'sqtl', 'sceqtl', None]
Identified ArrayType columns for conversion: ['eqtl', 'tuqtl', 'pqtl', 'sqtl', 'sceqtl', 'null']

--- Creating DataFrame for Aggregation: 'NdiagonalYes' and Pivot: 'actionType2' ---


25/07/15 08:24:43 WARN CacheManager: Asked to cache already cached data.        
                                                                                

Distinct values for 'actionType2': ['NEGATIVE ALLOSTERIC MODULATOR', 'CROSS-LINKING AGENT', 'ANTAGONIST', 'BINDING AGENT', 'ACTIVATOR', 'BLOCKER', 'INHIBITOR', 'POSITIVE MODULATOR', 'OTHER', 'AGONIST', 'OPENER', 'HYDROLYTIC ENZYME', 'STABILISER', 'DISRUPTING AGENT', 'POSITIVE ALLOSTERIC MODULATOR', 'INVERSE AGONIST', 'RELEASING AGENT', 'MODULATOR', 'EXOGENOUS PROTEIN', 'EXOGENOUS GENE', 'PARTIAL AGONIST', 'INHIBITOR,DISRUPTING AGENT', 'INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', 'BINDING AGENT,AGONIST', '', 'AGONIST,PARTIAL AGONIST', 'ALLOSTERIC ANTAGONIST', 'RNAI INHIBITOR', 'ANTISENSE INHIBITOR', 'AGONIST,MODULATOR', 'VACCINE ANTIGEN', 'NEGATIVE MODULATOR', 'SUBSTRATE', 'DEGRADER', 'ANTAGONIST,INHIBITOR', 'PROTEOLYTIC ENZYME']
Identified ArrayType columns for conversion: ['NEGATIVE ALLOSTERIC MODULATOR', 'CROSS-LINKING AGENT', 'ANTAGONIST', 'BINDING AGENT', 'ACTIVATOR', 'BLOCKER', 'INHIBITOR', 'POSITIVE MODULATOR', 'OTHER', 'AGONIST', 'OPENER', 'HYDROLYTIC ENZYME', 'STABILISER', 'DISRU

25/07/15 08:24:48 WARN CacheManager: Asked to cache already cached data.


Distinct values for 'biosampleName': ['dorsolateral prefrontal cortex', 'lymphoblastoid cell line', 'skeletal muscle tissue', 'tibial nerve', 'upper lobe of left lung', 'prostate gland', 'gastroesophageal sphincter', 'T-helper 17 cell', 'ascending aorta', 'induced pluripotent stem cell', 'adipose tissue', 'right atrium auricular region', 'blood plasma', 'CD14-positive, CD16-negative classical monocyte', 'stomach', 'left ventricle myocardium', 'thyroid gland', 'breast epithelium', 'right lobe of liver', 'spleen', 'esophagus muscularis mucosa', 'omental fat pad', 'esophagus squamous epithelium', 'tibial artery', 'blood', 'skin of body', 'CD4-positive, alpha-beta T cell', 'cerebellum', 'suprapubic skin', "Ammon's horn", "Peyer's patch", 'amygdala', 'transverse colon', 'substantia nigra', 'nucleus accumbens', 'sigmoid colon', 'caudate nucleus', 'C1 segment of cervical spinal cord', 'ovary', 'fibroblast', 'hypothalamus', 'CD8-positive, alpha-beta T cell', 'body of pancreas', 'pituitary glan

25/07/15 08:24:56 WARN CacheManager: Asked to cache already cached data.
                                                                                

Distinct values for 'projectId': ['Fairfax_2014', 'Nedelec_2016', 'OneK1K', 'GTEx', 'Schmiedel_2018', 'Lepik_2017', 'CEDAR', 'BLUEPRINT', 'TwinsUK', 'UKB_PPP_EUR', 'FUSION', 'Perez_2022', 'Quach_2016', 'HipSci', 'BrainSeq', 'ROSMAP', 'iPSCORE', 'PhLiPS', 'GEUVADIS', 'Kim-Hellmuth_2017', 'Alasoo_2018', 'Cytoimmgen', 'CAP', 'Peng_2018', 'Steinberg_2020', 'Kasela_2017', 'Bossini-Castillo_2019', 'Nathan_2022', 'CommonMind', 'Walker_2019', 'van_de_Bunt_2015', 'Schwartzentruber_2018', 'PISA', 'GENCORD', 'Aygun_2021', 'Fairfax_2012', 'Jerber_2021', 'Braineac2', 'Gilchrist_2021', 'Naranbhai_2015', 'Randolph_2021', 'Sun_2018', 'Young_2019', None]
Identified ArrayType columns for conversion: ['Fairfax_2014', 'Nedelec_2016', 'OneK1K', 'GTEx', 'Schmiedel_2018', 'Lepik_2017', 'CEDAR', 'BLUEPRINT', 'TwinsUK', 'UKB_PPP_EUR', 'FUSION', 'Perez_2022', 'Quach_2016', 'HipSci', 'BrainSeq', 'ROSMAP', 'iPSCORE', 'PhLiPS', 'GEUVADIS', 'Kim-Hellmuth_2017', 'Alasoo_2018', 'Cytoimmgen', 'CAP', 'Peng_2018', 'Stei

25/07/15 08:25:00 WARN CacheManager: Asked to cache already cached data.


Distinct values for 'rightStudyType': ['eqtl', 'tuqtl', 'pqtl', 'sqtl', 'sceqtl', None]
Identified ArrayType columns for conversion: ['eqtl', 'tuqtl', 'pqtl', 'sqtl', 'sceqtl', 'null']

--- Creating DataFrame for Aggregation: 'hasGenetics' and Pivot: 'actionType2' ---


25/07/15 08:25:03 WARN CacheManager: Asked to cache already cached data.        


Distinct values for 'actionType2': ['NEGATIVE ALLOSTERIC MODULATOR', 'ANTAGONIST', 'BINDING AGENT', 'EXOGENOUS PROTEIN', 'PARTIAL AGONIST', 'BLOCKER', 'INHIBITOR', 'POSITIVE MODULATOR', 'AGONIST', 'OPENER', 'HYDROLYTIC ENZYME', 'STABILISER', 'VACCINE ANTIGEN', 'DISRUPTING AGENT', 'POSITIVE ALLOSTERIC MODULATOR', 'MODULATOR', 'CROSS-LINKING AGENT', 'ACTIVATOR', 'OTHER', 'INHIBITOR,DISRUPTING AGENT', 'RNAI INHIBITOR', 'AGONIST,PARTIAL AGONIST', 'BINDING AGENT,AGONIST', 'SUBSTRATE', 'RELEASING AGENT', 'ANTISENSE INHIBITOR', 'INVERSE AGONIST', 'PROTEOLYTIC ENZYME', '', 'EXOGENOUS GENE', 'ALLOSTERIC ANTAGONIST', 'ANTAGONIST,INHIBITOR', 'NEGATIVE MODULATOR', 'INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', 'DEGRADER', 'AGONIST,MODULATOR']
Identified ArrayType columns for conversion: ['NEGATIVE ALLOSTERIC MODULATOR', 'ANTAGONIST', 'BINDING AGENT', 'EXOGENOUS PROTEIN', 'PARTIAL AGONIST', 'BLOCKER', 'INHIBITOR', 'POSITIVE MODULATOR', 'AGONIST', 'OPENER', 'HYDROLYTIC ENZYME', 'STABILISER', 'VACCINE AN

25/07/15 08:25:07 WARN CacheManager: Asked to cache already cached data.        


Distinct values for 'biosampleName': ['dorsolateral prefrontal cortex', 'lymphoblastoid cell line', 'skeletal muscle tissue', 'tibial nerve', 'upper lobe of left lung', 'prostate gland', 'gastroesophageal sphincter', 'T-helper 17 cell', 'ascending aorta', 'induced pluripotent stem cell', 'adipose tissue', 'right atrium auricular region', 'blood plasma', 'CD14-positive, CD16-negative classical monocyte', 'stomach', 'left ventricle myocardium', 'thyroid gland', 'breast epithelium', 'right lobe of liver', 'spleen', 'esophagus muscularis mucosa', 'omental fat pad', 'esophagus squamous epithelium', 'tibial artery', 'blood', 'skin of body', 'CD4-positive, alpha-beta T cell', 'cerebellum', 'suprapubic skin', "Ammon's horn", "Peyer's patch", 'amygdala', 'transverse colon', 'substantia nigra', 'nucleus accumbens', 'sigmoid colon', 'caudate nucleus', 'C1 segment of cervical spinal cord', 'ovary', 'fibroblast', 'hypothalamus', 'CD8-positive, alpha-beta T cell', 'body of pancreas', 'pituitary glan

25/07/15 08:25:15 WARN CacheManager: Asked to cache already cached data.


Distinct values for 'projectId': ['Fairfax_2014', 'Nedelec_2016', 'OneK1K', 'GTEx', 'Schmiedel_2018', 'Lepik_2017', 'CEDAR', 'BLUEPRINT', 'TwinsUK', 'UKB_PPP_EUR', 'FUSION', 'Perez_2022', 'Quach_2016', 'HipSci', 'BrainSeq', 'ROSMAP', 'iPSCORE', 'PhLiPS', 'GEUVADIS', 'Kim-Hellmuth_2017', 'Alasoo_2018', 'Cytoimmgen', 'CAP', 'Peng_2018', 'Steinberg_2020', 'Kasela_2017', 'Bossini-Castillo_2019', 'Nathan_2022', 'CommonMind', 'Walker_2019', 'van_de_Bunt_2015', 'Schwartzentruber_2018', 'PISA', 'GENCORD', 'Aygun_2021', 'Fairfax_2012', 'Jerber_2021', 'Braineac2', 'Gilchrist_2021', 'Naranbhai_2015', 'Randolph_2021', 'Sun_2018', 'Young_2019', None]
Identified ArrayType columns for conversion: ['Fairfax_2014', 'Nedelec_2016', 'OneK1K', 'GTEx', 'Schmiedel_2018', 'Lepik_2017', 'CEDAR', 'BLUEPRINT', 'TwinsUK', 'UKB_PPP_EUR', 'FUSION', 'Perez_2022', 'Quach_2016', 'HipSci', 'BrainSeq', 'ROSMAP', 'iPSCORE', 'PhLiPS', 'GEUVADIS', 'Kim-Hellmuth_2017', 'Alasoo_2018', 'Cytoimmgen', 'CAP', 'Peng_2018', 'Stei

25/07/15 08:25:20 WARN CacheManager: Asked to cache already cached data.
                                                                                

Distinct values for 'rightStudyType': ['tuqtl', 'pqtl', 'eqtl', 'sceqtl', 'sqtl', None]
Identified ArrayType columns for conversion: ['tuqtl', 'pqtl', 'eqtl', 'sceqtl', 'sqtl', 'null']

--- All generated DataFrames are stored in 'all_pivoted_dfs' dictionary ---
Keys available: dict_keys(['df_pivot_nonecellyes_by_actiontype2', 'df_pivot_nonecellyes_by_biosamplename', 'df_pivot_nonecellyes_by_projectid', 'df_pivot_nonecellyes_by_rightstudytype', 'df_pivot_ndiagonalyes_by_actiontype2', 'df_pivot_ndiagonalyes_by_biosamplename', 'df_pivot_ndiagonalyes_by_projectid', 'df_pivot_ndiagonalyes_by_rightstudytype', 'df_pivot_hasgenetics_by_actiontype2', 'df_pivot_hasgenetics_by_biosamplename', 'df_pivot_hasgenetics_by_projectid', 'df_pivot_hasgenetics_by_rightstudytype'])


In [57]:
unique_values

['NEGATIVE ALLOSTERIC MODULATOR',
 'CROSS-LINKING AGENT',
 'ANTAGONIST',
 'BINDING AGENT',
 'ACTIVATOR',
 'BLOCKER',
 'INHIBITOR',
 'POSITIVE MODULATOR',
 'OTHER',
 'AGONIST',
 'OPENER',
 'HYDROLYTIC ENZYME',
 'STABILISER',
 'DISRUPTING AGENT',
 'POSITIVE ALLOSTERIC MODULATOR',
 'INVERSE AGONIST',
 'RELEASING AGENT',
 'MODULATOR',
 'EXOGENOUS PROTEIN',
 'EXOGENOUS GENE',
 'PARTIAL AGONIST',
 'INHIBITOR,DISRUPTING AGENT',
 'INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR',
 'BINDING AGENT,AGONIST',
 '',
 'AGONIST,PARTIAL AGONIST',
 'ALLOSTERIC ANTAGONIST',
 'RNAI INHIBITOR',
 'ANTISENSE INHIBITOR',
 'AGONIST,MODULATOR',
 'VACCINE ANTIGEN',
 'NEGATIVE MODULATOR',
 'SUBSTRATE',
 'DEGRADER',
 'ANTAGONIST,INHIBITOR',
 'PROTEOLYTIC ENZYME']

In [56]:
all_pivoted_dfs[key].drop(*unique_values).columns

['targetId',
 'diseaseId',
 'phase4Clean',
 'phase3Clean',
 'phase2Clean',
 'phase1Clean',
 'PhaseT',
 'data']

In [54]:
len(all_pivoted_dfs[key].drop(*unique_values).columns[7:])

1

In [59]:
column_name

'actiontype2'

In [62]:
all_pivoted_dfs

{'df_pivot_nonecellyes_by_actiontype2': DataFrame[targetId: string, diseaseId: string, phase4Clean: string, phase3Clean: string, phase2Clean: string, phase1Clean: string, PhaseT: string, NEGATIVE ALLOSTERIC MODULATOR: string, CROSS-LINKING AGENT: string, ANTAGONIST: string, BINDING AGENT: string, ACTIVATOR: string, BLOCKER: string, INHIBITOR: string, POSITIVE MODULATOR: string, OTHER: string, AGONIST: string, OPENER: string, HYDROLYTIC ENZYME: string, STABILISER: string, DISRUPTING AGENT: string, POSITIVE ALLOSTERIC MODULATOR: string, INVERSE AGONIST: string, RELEASING AGENT: string, MODULATOR: string, EXOGENOUS PROTEIN: string, EXOGENOUS GENE: string, PARTIAL AGONIST: string, INHIBITOR,DISRUPTING AGENT: string, INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR: string, BINDING AGENT,AGONIST: string, : string, AGONIST,PARTIAL AGONIST: string, ALLOSTERIC ANTAGONIST: string, RNAI INHIBITOR: string, ANTISENSE INHIBITOR: string, AGONIST,MODULATOR: string, VACCINE ANTIGEN: string, NEGATIVE MODULATOR:

In [65]:
all_pivoted_dfs['df_pivot_ndiagonalyes_by_actiontype2'].columns[7:]

['NEGATIVE ALLOSTERIC MODULATOR',
 'ANTAGONIST',
 'BINDING AGENT',
 'EXOGENOUS PROTEIN',
 'PARTIAL AGONIST',
 'BLOCKER',
 'INHIBITOR',
 'POSITIVE MODULATOR',
 'AGONIST',
 'OPENER',
 'HYDROLYTIC ENZYME',
 'STABILISER',
 'VACCINE ANTIGEN',
 'DISRUPTING AGENT',
 'POSITIVE ALLOSTERIC MODULATOR',
 'MODULATOR',
 'CROSS-LINKING AGENT',
 'ACTIVATOR',
 'OTHER',
 'INHIBITOR,DISRUPTING AGENT',
 'RNAI INHIBITOR',
 'AGONIST,PARTIAL AGONIST',
 'BINDING AGENT,AGONIST',
 'SUBSTRATE',
 'RELEASING AGENT',
 'ANTISENSE INHIBITOR',
 'INVERSE AGONIST',
 'PROTEOLYTIC ENZYME',
 '',
 'EXOGENOUS GENE',
 'ALLOSTERIC ANTAGONIST',
 'ANTAGONIST,INHIBITOR',
 'NEGATIVE MODULATOR',
 'INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR',
 'DEGRADER',
 'AGONIST,MODULATOR',
 'data']

In [None]:
result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())
for key,df in all_pivoted_dfs.items():

    print(f'working with {key}')
    parts = key.split('_by_') ### take the part of key belonging to column name
    column_name = parts[1] ### take the last part which is column name
    all_pivoted_dfs[key].persist()
    unique_values = all_pivoted_dfs[key].drop('null').columns[7:]
    print('There are ', len(unique_values), 'columns to analyse with phases')
    rows = comparisons_df_iterative(unique_values)

    # If needed, now process the rest
    for row in rows:
        print('performing', row)
        results = aggregations_original(
            all_pivoted_dfs[key], "propagated", listado, *row, today_date
        )
        result_all.append(results)
        print('results appended')
    all_pivoted_dfs[key].unpersist()
    print('df unpersisted')

working with df_pivot_nonecellyes_by_actiontype2
There are  37 columns to analyse with phases
performing Row(comparison='NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE ALLOSTERIC MODULATOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='CROSS-LINKING AGENT', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/CROSS-LINKING AGENT_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='CROSS-LINKING AGENT', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/CROSS-LINKING AGENT_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='CROSS-LINKING AGENT', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/CROSS-LINKING AGENT_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='CROSS-LINKING AGENT', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/CROSS-LINKING AGENT_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='CROSS-LINKING AGENT', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/CROSS-LINKING AGENT_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='ANTAGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTAGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='ANTAGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTAGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='ANTAGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTAGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='ANTAGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTAGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='ANTAGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTAGONIST_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='ACTIVATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ACTIVATOR_predictor_Phase>=4.parquet
results appended
performing Row(comparison='ACTIVATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ACTIVATOR_predictor_Phase>=3.parquet
results appended
performing Row(comparison='ACTIVATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ACTIVATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='ACTIVATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ACTIVATOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='ACTIVATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ACTIVATOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='BLOCKER', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BLOCKER_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='BLOCKER', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BLOCKER_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='BLOCKER', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BLOCKER_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='BLOCKER', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BLOCKER_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='BLOCKER', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BLOCKER_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE MODULATOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE MODULATOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE MODULATOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE MODULATOR_predictor_PhaseT.parquet
results appended
performing Row(comparison='OTHER', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OTHER_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='OTHER', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OTHER_predictor_Phase>=3.parquet
results appended
performing Row(comparison='OTHER', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OTHER_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='OTHER', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OTHER_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='OTHER', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OTHER_predictor_PhaseT.parquet
results appended
performing Row(comparison='AGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='AGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='AGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='AGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='AGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST_predictor_PhaseT.parquet
results appended
performing Row(comparison='OPENER', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OPENER_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='OPENER', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OPENER_predictor_Phase>=3.parquet
results appended
performing Row(comparison='OPENER', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OPENER_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='OPENER', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OPENER_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='OPENER', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/OPENER_predictor_PhaseT.parquet
results appended
performing Row(comparison='HYDROLYTIC ENZYME', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/HYDROLYTIC ENZYME_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='HYDROLYTIC ENZYME', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/HYDROLYTIC ENZYME_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='HYDROLYTIC ENZYME', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/HYDROLYTIC ENZYME_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='HYDROLYTIC ENZYME', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/HYDROLYTIC ENZYME_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='HYDROLYTIC ENZYME', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/HYDROLYTIC ENZYME_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='STABILISER', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/STABILISER_predictor_Phase>=4.parquet
results appended
performing Row(comparison='STABILISER', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/STABILISER_predictor_Phase>=3.parquet
results appended
performing Row(comparison='STABILISER', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/STABILISER_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='STABILISER', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/STABILISER_predictor_Phase>=1.parquet
results appended
performing Row(comparison='STABILISER', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/STABILISER_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DISRUPTING AGENT_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DISRUPTING AGENT_predictor_Phase>=3.parquet
results appended
performing Row(comparison='DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DISRUPTING AGENT_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DISRUPTING AGENT_predictor_Phase>=1.parquet
results appended
performing Row(comparison='DISRUPTING AGENT', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DISRUPTING AGENT_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE ALLOSTERIC MODULATOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE ALLOSTERIC MODULATOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE ALLOSTERIC MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE ALLOSTERIC MODULATOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='POSITIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/POSITIVE ALLOSTERIC MODULATOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='INVERSE AGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INVERSE AGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='INVERSE AGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INVERSE AGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='INVERSE AGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INVERSE AGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='INVERSE AGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INVERSE AGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='INVERSE AGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INVERSE AGONIST_predictor_PhaseT.parquet
results appended
performing Row(comparison='RELEASING AGENT', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RELEASING AGENT_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='RELEASING AGENT', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RELEASING AGENT_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='RELEASING AGENT', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RELEASING AGENT_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='RELEASING AGENT', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RELEASING AGENT_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='RELEASING AGENT', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RELEASING AGENT_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/MODULATOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/MODULATOR_predictor_Phase>=3.parquet
results appended
performing Row(comparison='MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/MODULATOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/MODULATOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS PROTEIN', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS PROTEIN_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS PROTEIN', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS PROTEIN_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS PROTEIN', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS PROTEIN_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS PROTEIN', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS PROTEIN_predictor_Phase>=1.parquet
results appended
performing Row(comparison='EXOGENOUS PROTEIN', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS PROTEIN_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS GENE', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS GENE_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS GENE', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS GENE_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS GENE', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS GENE_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS GENE', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS GENE_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='EXOGENOUS GENE', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/EXOGENOUS GENE_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/PARTIAL AGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/PARTIAL AGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/PARTIAL AGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/PARTIAL AGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='PARTIAL AGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/PARTIAL AGONIST_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,DISRUPTING AGENT_predictor_Phase>=4.parquet
results appended
performing Row(comparison='INHIBITOR,DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,DISRUPTING AGENT_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,DISRUPTING AGENT_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,DISRUPTING AGENT', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,DISRUPTING AGENT_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,DISRUPTING AGENT', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,DISRUPTING AGENT_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR_predictor_Phase>=1.parquet
results appended
performing Row(comparison='INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/INHIBITOR,NEGATIVE ALLOSTERIC MODULATOR_predictor_PhaseT.parquet
results appended
performing Row(comparison='BINDING AGENT,AGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT,AGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT,AGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT,AGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT,AGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT,AGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT,AGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT,AGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='BINDING AGENT,AGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/BINDING AGENT,AGONIST_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/_predictor_Phase>=3.parquet
results appended
performing Row(comparison='', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='', comparisonType='predictor', _1='PhaseT', _2='clinical')
gs://ot-team/jroldan/2025-07-15_analysis/propagated/_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,PARTIAL AGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,PARTIAL AGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,PARTIAL AGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,PARTIAL AGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,PARTIAL AGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,PARTIAL AGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,PARTIAL AGONIST_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='ALLOSTERIC ANTAGONIST', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ALLOSTERIC ANTAGONIST_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='ALLOSTERIC ANTAGONIST', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ALLOSTERIC ANTAGONIST_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='ALLOSTERIC ANTAGONIST', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ALLOSTERIC ANTAGONIST_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='ALLOSTERIC ANTAGONIST', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ALLOSTERIC ANTAGONIST_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='ALLOSTERIC ANTAGONIST', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ALLOSTERIC ANTAGONIST_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='RNAI INHIBITOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RNAI INHIBITOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='RNAI INHIBITOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RNAI INHIBITOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='RNAI INHIBITOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RNAI INHIBITOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='RNAI INHIBITOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RNAI INHIBITOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='RNAI INHIBITOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/RNAI INHIBITOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='ANTISENSE INHIBITOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTISENSE INHIBITOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='ANTISENSE INHIBITOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTISENSE INHIBITOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='ANTISENSE INHIBITOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTISENSE INHIBITOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='ANTISENSE INHIBITOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTISENSE INHIBITOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='ANTISENSE INHIBITOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTISENSE INHIBITOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,MODULATOR_predictor_Phase>=4.parquet
results appended
performing Row(comparison='AGONIST,MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,MODULATOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='AGONIST,MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,MODULATOR_predictor_Phase>=1.parquet
results appended
performing Row(comparison='AGONIST,MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/AGONIST,MODULATOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='VACCINE ANTIGEN', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/VACCINE ANTIGEN_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='VACCINE ANTIGEN', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/VACCINE ANTIGEN_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='VACCINE ANTIGEN', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/VACCINE ANTIGEN_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='VACCINE ANTIGEN', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/VACCINE ANTIGEN_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='VACCINE ANTIGEN', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/VACCINE ANTIGEN_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE MODULATOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE MODULATOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE MODULATOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE MODULATOR_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE MODULATOR', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE MODULATOR_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE MODULATOR', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE MODULATOR_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='NEGATIVE MODULATOR', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/NEGATIVE MODULATOR_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='SUBSTRATE', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/SUBSTRATE_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='SUBSTRATE', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/SUBSTRATE_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='SUBSTRATE', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/SUBSTRATE_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='SUBSTRATE', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/SUBSTRATE_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='SUBSTRATE', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/SUBSTRATE_predictor_PhaseT.parquet


                                                                                

results appended
performing Row(comparison='DEGRADER', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DEGRADER_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='DEGRADER', comparisonType='predictor', _1='Phase>=3', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DEGRADER_predictor_Phase>=3.parquet


                                                                                

results appended
performing Row(comparison='DEGRADER', comparisonType='predictor', _1='Phase>=2', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DEGRADER_predictor_Phase>=2.parquet


                                                                                

results appended
performing Row(comparison='DEGRADER', comparisonType='predictor', _1='Phase>=1', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DEGRADER_predictor_Phase>=1.parquet


                                                                                

results appended
performing Row(comparison='DEGRADER', comparisonType='predictor', _1='PhaseT', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/DEGRADER_predictor_PhaseT.parquet
results appended
performing Row(comparison='ANTAGONIST,INHIBITOR', comparisonType='predictor', _1='Phase>=4', _2='clinical')


                                                                                

gs://ot-team/jroldan/2025-07-15_analysis/propagated/ANTAGONIST,INHIBITOR_predictor_Phase>=4.parquet


                                                                                

results appended
performing Row(comparison='ANTAGONIST,INHIBITOR', comparisonType='predictor', _1='Phase>=3', _2='clinical')


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/miniconda3/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

                                                                                

In [None]:
def comparisons_df_iterative(elements):
    # toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase>=4", "clinical"),
            ('Phase>=3','clinical'),
            ('Phase>=2','clinical'),
            ('Phase>=1','clinical'),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")

In [None]:

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)
import re

# Define the list of patterns to search for
patterns = [
    "_only",
    #"_tissue",
    #"_isSignalFromRightTissue",
    "_isRightTissueSignalAgreed",
]
# Create a regex pattern to match any of the substrings
regex_pattern = "(" + "|".join(map(re.escape, patterns)) + ")"

# Convert list of lists to DataFrame
df = (
    spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
    .withColumn(
        "prefix",
        F.regexp_replace(
            F.col("comparison"), regex_pattern + ".*", ""
        ),  # Extract part before the pattern
    )
    .withColumn(
        "suffix",
        F.regexp_extract(
            F.col("comparison"), regex_pattern, 0
        ),  # Extract the pattern itself
    )
)

### annotate projectId, tissue, qtl type and doe type:

from pyspark.sql.functions import create_map
from itertools import chain

mapping_expr=create_map([F.lit(x) for x in chain(*disdic.items())])

df_annot=df.withColumn('annotation',mapping_expr.getItem(F.col('prefix')))

df_annot.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis_filteredColocAndCaviarWithOthers4phases.csv"
)

print("dataframe written \n Analysis finished")

In [None]:

#bench2 = benchmark.join(
#    rightTissue, on=["name", "bioSampleName"], how="left"
#).withColumn(
#    "rightTissue",
#    F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
#)

print("built benchmark dataset")

## write the benchmark 
#name='benchmark'
#output_partitioned_path = f"gs://ot-team/jroldan/analysis/parquetFiles/{name}"
#benchmark.write.mode("overwrite").parquet(output_partitioned_path)
#print(f'written {name}')
#### Analysis

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE","colocalisationMethod"]

# List to hold temporary DataFrames
temp_dfs_for_union = []

# Iterate over the column names to prepare DataFrames for union
for col_name in variables_study:
    # Select the current column, alias it to 'distinct_value' for consistent schema
    # Filter out nulls, then get distinct values
    # Add a literal column with the original 'col_name'
    df_temp = (
        benchmark.select(F.col(col_name).alias("distinct_value"))
        .filter(F.col("distinct_value").isNotNull()) # Exclude None (null) values
        .distinct()
        .withColumn("column_name", F.lit(col_name))
    )
    temp_dfs_for_union.append(df_temp)

disdic = {}

if temp_dfs_for_union:
    # Union all the temporary DataFrames.
    # unionByName is crucial to handle potential schema differences (e.g., if columns have same name but different types)
    # and ensures columns are matched by name.
    combined_distinct_values_df = temp_dfs_for_union[0]
    for i in range(1, len(temp_dfs_for_union)):
        combined_distinct_values_df = combined_distinct_values_df.unionByName(temp_dfs_for_union[i])

    # Now, collect the combined distinct values.
    # This is a single collect operation on the aggregated DataFrame.
    print("Collecting combined distinct values from the cluster...")
    collected_rows = combined_distinct_values_df.collect()

    # Populate the dictionary from the collected rows
    for row in collected_rows:
        disdic[row.distinct_value] = row.column_name
else:
    print("variables_study list is empty, disdic will be empty.")


print("\nFinal disdic:", disdic)

# Assuming 'spark' session, 'benchmark' DataFrame, 'negativeTD' DataFrame, and 'disdic' dictionary are defined

# --- Step 1: Pre-compute 'hasboth' ONCE ---
# This is a shuffle, but only happens once.
print("Pre-computing 'hasboth' column...")
window_target_disease_only = Window.partitionBy('targetId', 'diseaseId')
benchmark_processed = benchmark.withColumn(
    'hasboth',
    F.size(F.collect_set('colocalisationMethod').over(window_target_disease_only))
)

# You might consider caching this intermediate result if 'benchmark' is very large
# and you have enough memory, to avoid re-reading from source if possible.
# benchmark_processed.cache() # or .persist(StorageLevel.MEMORY_AND_DISK)
# benchmark_processed.count() # Force computation if you cache

pivoted_dfs = {}

# --- Step 2: Loop for each variable_study column ---
for col_name in variables_study:
    print(f"Processing pivot for: {col_name}")

    # Define window specs for the current iteration, including 'col_name' in partition
    # (This shuffle is still per iteration, but unavoidable if 'resolvedAgreeDrug' depends on 'col_name' values)
    current_col_window_spec_qtl = Window.partitionBy("targetId", "diseaseId", col_name).orderBy(F.col("qtlPValueExponent").asc())
    current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())

    # Calculate 'resolvedAgreeDrug' for the current 'col_name'
    # This involves a shuffle per iteration.
    temp_df_with_resolved = benchmark_processed.withColumn('resolvedAgreeDrug',
        F.when(F.col('hasboth') > 1,
            F.first(F.col('AgreeDrug'), ignorenulls=True).over(current_col_pvalue_order_window)
        ).otherwise(F.first(F.col('AgreeDrug'), ignorenulls=True).over(current_col_window_spec_qtl))
    )

    # --- Step 3: Perform the pivot and join ---
    # This is an expensive operation (shuffle, potential wide dataframe)
    pivoted_df = (
        temp_df_with_resolved
        .groupBy(
            "targetId",
            "diseaseId",
            "maxClinPhase",
        )
        .pivot(col_name) # Pivoting on values of the 'col_name' column
        .agg(F.collect_set("resolvedAgreeDrug"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left") # Ensure negativeTD is broadcast if small
    )

    # --- Step 4: Add derived columns (these are generally cheap) ---
    for phase in [1, 2, 3, 4]:
        pivoted_df = pivoted_df.withColumn(
            f"Phase>={phase}",
            F.when(F.col("maxClinPhase") >= phase, F.lit("yes")).otherwise(F.lit("no")),
        )

    pivoted_df = pivoted_df.withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    )

    # Add _only columns dynamically based on disdic values matching current column
    matching_keys = [key for key, val in disdic.items() if val == col_name]

    for key in matching_keys:
        # F.col(key) assumes 'key' refers to a column that exists in pivoted_df after the pivot.
        pivoted_df = pivoted_df.withColumn(
            f"{key}_only",
            F.when(F.array_contains(F.col(key), "yes"), F.lit("yes")).otherwise(F.lit("no")),
        )

### making columns for the 

    # --- Step 5: Store result. Consider writing to GCS to break lineage if memory is an issue ---
    # This is highly recommended if 'variables_study' is very large.
    # Write to Parquet for efficient storage and schema preservation.
    # output_path = f"gs://your-bucket/temp_pivoted_results/{col_name}"
    # print(f"Writing results for {col_name} to {output_path}")
    # pivoted_df.write.mode("overwrite").parquet(output_path)
    # pivoted_dfs[col_name] = spark.read.parquet(output_path) # Read back if needed later
    # output_partitioned_path = f"gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_{col_name}"
    # pivoted_df.write.mode("overwrite").parquet(output_partitioned_path)
    # print(f"DataFrame successfully written and partitioned to {output_partitioned_path}")
    # If not writing to GCS, just store the DF in memory (be cautious for large number of DFs)

    pivoted_dfs[col_name] = pivoted_df

##### PROJECTID
project_keys=[f"{k}_only" for k,v in disdic.items() if v == 'projectId']
main=['GTEx_only', 'UKB_PPP_EUR_only']
others=[item for item in project_keys if item not in main]

# First condition: any "yes" in list1
condition1 = reduce(lambda acc, col: acc | (F.col(col) == "yes"), others[1:], F.col(others[0]) == "yes")
# Add both columns
pivoted_dfs['projectId'] = pivoted_dfs['projectId'].withColumn("othersProjectId_only", F.when(condition1, "yes").otherwise("no")) 

##### BIOSAMPLE NAME
biosample_keys=[f"{k}_only" for k,v in disdic.items() if v == 'biosampleName']
main=['tibial nerve_only', 'upper lobe of left lung_only','blood plasma_only','lymphoblastoid cell line_only']
others=[item for item in biosample_keys if item not in main]

# First condition: any "yes" in list1
condition1 = reduce(lambda acc, col: acc | (F.col(col) == "yes"), others[1:], F.col(others[0]) == "yes")
# Add both columns
pivoted_dfs['biosampleName'] = pivoted_dfs['biosampleName'].withColumn("othersBiosampleName_only", F.when(condition1, "yes").otherwise("no")) 


##### RIGHTSTUDYTYPE 
rightStudy_keys=[f"{k}_only" for k,v in disdic.items() if v == 'rightStudyType']
main=['eqtl_only', 'pqtl_only']
others=[item for item in rightStudy_keys if item not in main]

# First condition: any "yes" in list1
condition1 = reduce(lambda acc, col: acc | (F.col(col) == "yes"), others[1:], F.col(others[0]) == "yes")
# Add both columns
pivoted_dfs['rightStudyType'] = pivoted_dfs['rightStudyType'].withColumn("otherRightStudyType_only", F.when(condition1, "yes").otherwise("no")) 


###append to dictionary

disdic.update({'othersProjectId': 'projectId', 'othersBiosampleName_only': 'biosampleName', 'otherRightStudyType':'rightStudyType'})


result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())

##### PROJECT ID ###### 
print('working with projectId')
pivoted_dfs['projectId'].persist()
unique_values = benchmark.select('projectId').filter(F.col('projectId').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['projectId'].drop(*unique_values).columns[10:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['projectId'].columns[-filter:])

# If needed, now process the rest
for row in rows:
    results = aggregations_original(
        pivoted_dfs['projectId'], "propagated", listado, *row, today_date
    )
    result_all.append(results)

pivoted_dfs['projectId'].unpersist()
print('df unpersisted')

##### BIOSAMPLE NAME ###### 
print('working with biosampleName')
pivoted_dfs['biosampleName'].persist()
unique_values = benchmark.select('biosampleName').filter(F.col('biosampleName').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['biosampleName'].drop(*unique_values).columns[10:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['biosampleName'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['biosampleName'], "propagated", listado, *row, today_date
    )
    result_all.append(results)

pivoted_dfs['biosampleName'].unpersist()
print('df unpersisted')

##### RIGHTSTUDYTYPE  ###### 
print('working with rightStudyType')
pivoted_dfs['rightStudyType'].persist()
unique_values = benchmark.select('rightStudyType').filter(F.col('rightStudyType').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['rightStudyType'].drop(*unique_values).columns[10:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['rightStudyType'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['rightStudyType'], "propagated", listado, *row, today_date
    )
    result_all.append(results)
pivoted_dfs['rightStudyType'].unpersist()
print('df unpersisted')

##### COLOC DOE ######
print('working with colocDoE')
pivoted_dfs['colocDoE'].persist()
unique_values = benchmark.select('colocDoE').filter(F.col('colocDoE').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['colocDoE'].drop(*unique_values).columns[10:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['colocDoE'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['colocDoE'], "propagated", listado, *row, today_date
    )
    result_all.append(results)
pivoted_dfs['colocDoE'].unpersist()
print('df unpersisted')

##### COLOCALISATION METHOD ######
print('working with colocalisationMethod')
pivoted_dfs['colocalisationMethod'].persist()
unique_values = benchmark.select('colocalisationMethod').filter(F.col('colocalisationMethod').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['colocalisationMethod'].drop(*unique_values).columns[10:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['colocalisationMethod'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['colocalisationMethod'], "propagated", listado, *row, today_date
    )
    result_all.append(results)
pivoted_dfs['colocalisationMethod'].unpersist()
print('df unpersisted')

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)
import re

# Define the list of patterns to search for
patterns = [
    "_only",
    #"_tissue",
    #"_isSignalFromRightTissue",
    "_isRightTissueSignalAgreed",
]
# Create a regex pattern to match any of the substrings
regex_pattern = "(" + "|".join(map(re.escape, patterns)) + ")"

# Convert list of lists to DataFrame
df = (
    spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
    .withColumn(
        "prefix",
        F.regexp_replace(
            F.col("comparison"), regex_pattern + ".*", ""
        ),  # Extract part before the pattern
    )
    .withColumn(
        "suffix",
        F.regexp_extract(
            F.col("comparison"), regex_pattern, 0
        ),  # Extract the pattern itself
    )
)

### annotate projectId, tissue, qtl type and doe type:

from pyspark.sql.functions import create_map
from itertools import chain

mapping_expr=create_map([F.lit(x) for x in chain(*disdic.items())])

df_annot=df.withColumn('annotation',mapping_expr.getItem(F.col('prefix')))

df_annot.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis_filteredColocAndCaviarWithOthers4phases.csv"
)

print("dataframe written \n Analysis finished")

In [None]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
    buildColocData,
    gwasDataset,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd
from functools import reduce

spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed

path_n='gs://open-targets-data-releases/25.06/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

#### FIRST MODULE: BUILDING COLOC 
newColoc=buildColocData(all_coloc,credible,index)

print("loaded newColoc")

### SECOND MODULE: PROCESS EVIDENCES TO AVOID EXCESS OF COLUMNS 
gwasComplete = gwasDataset(evidences,credible)

#### THIRD MODULE: INCLUDE COLOC IN THE 
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")


#### FOURTH MODULE BUILDING CHEMBL ASSOCIATIONS - HERE TAKE CARE WITH FILTERING STEP 
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    print(total)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    # toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase>=4", "clinical"),
            #('Phase>=3','clinical'),
            #('Phase>=2','clinical'),
            #('Phase>=1','clinical'),
            #("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOING WITH CHEMBL 

### FIFTH MODULE: BUILDING BENCHMARK OF THE DATASET TO EXTRACT EHE ANALYSIS 

resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))
benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


### drug mechanism of action
mecact_path = f"{path_n}drug_mechanism_of_action/" #  mechanismOfAction == old version
mecact = spark.read.parquet(mecact_path)

inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",  ## lost at 31.01.2025
    "STABILISER",
    # "EXOGENOUS GENE", ## added 24.06.2025
    # "EXOGENOUS PROTEIN" ## added 24.06.2025
]


actionType = (
        mecact.select(
            F.explode_outer("chemblIds").alias("drugId"),
            "actionType",
            "mechanismOfAction",
            "targets",
        )
        .select(
            F.explode_outer("targets").alias("targetId"),
            "drugId",
            "actionType",
            "mechanismOfAction",
        )
        .groupBy("targetId", "drugId")
        .agg(F.collect_set("actionType").alias("actionType2"))
    ).withColumn('nMoA', F.size(F.col('actionType2')))

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl")).join(actionType, on=['targetId','drugId'], how='left')
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase",'actionType2')
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

# --- Configuration for your iterative pivoting ---
group_by_columns = ['targetId', 'diseaseId','phase4Clean','phase3Clean','phase2Clean','phase1Clean','PhaseT']
columns_to_pivot_on = ['actionType2', 'biosampleName', 'projectId', 'rightStudyType']
columns_to_aggregate = ['NoneCellYes', 'NdiagonalYes','hasGenetics'] # The values you want to collect in the pivoted cells
all_pivoted_dfs = {}

doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]

# --- Nested Loops for Dynamic Pivoting ---
for agg_col_name in columns_to_aggregate:
    for pivot_col_name in columns_to_pivot_on:
        print(f"\n--- Creating DataFrame for Aggregation: '{agg_col_name}' and Pivot: '{pivot_col_name}' ---")
        current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", "maxClinPhase", pivot_col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())
        test2=discrepancifier(benchmark.withColumn('actionType2', F.concat_ws(",", F.col("actionType2"))).withColumn('qtlColocDoE',F.first('colocDoE').over(current_col_pvalue_order_window)).groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",pivot_col_name)
        .pivot("colocDoE")
        .count()
        .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
        .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
    ).withColumn(
        "maxDoE", F.array_max(F.col("arrayN"))
    ).withColumn("maxDoE_names", F.array(*conditions)
    ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
    ).withColumn(
        "NoneCellYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
        .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
            ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
    ).withColumn(
        "NdiagonalYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
            F.lit("yes")
        ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
            F.lit("yes")
        ).otherwise(F.lit('no'))
    ).withColumn(
        "drugCoherency",
        F.when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
        )
        .otherwise(F.lit("other")),
    ).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "phase4Clean",
        F.when(
            (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase3Clean",
        F.when(
            (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase2Clean",
        F.when(
            (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase1Clean",
        F.when(
            (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "hasGenetics",
        F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
    )
        # 1. Get distinct values for the pivot column (essential for pivot())
        # This brings a small amount of data to the driver, but is necessary for the pivot schema.
        distinct_pivot_values = [row[0] for row in test2.select(pivot_col_name).distinct().collect()]
        print(f"Distinct values for '{pivot_col_name}': {distinct_pivot_values}")

        # 2. Perform the groupBy, pivot, and aggregate operations
        # The .pivot() function requires the list of distinct values for better performance
        # and correct schema inference.
        pivoted_df = (
            test2.groupBy(*group_by_columns)
            .pivot(pivot_col_name, distinct_pivot_values) # Provide distinct values
            .agg(F.collect_set(F.col(agg_col_name))) # Collect all values into a set
            .fillna(0) # Fill cells that have no data with an empty list instead of null
        )

        # 3. Add the 'data' literal column dynamically
        # This column indicates which aggregation column was used.
        pivoted_df = pivoted_df.withColumn('data', F.lit(f'Drug_{agg_col_name}'))

        array_columns_to_convert = [
            field.name for field in pivoted_df.schema.fields
            if isinstance(field.dataType, ArrayType)
        ]
        print(f"Identified ArrayType columns for conversion: {array_columns_to_convert}")

        # 4. Apply the conversion logic to each identified array column
        df_after_conversion = pivoted_df # Start with the pivoted_df
        for col_to_convert in array_columns_to_convert:
            df_after_conversion = df_after_conversion.withColumn(
                col_to_convert,
                F.when(F.col(col_to_convert).isNull(), F.lit('no'))          # Handle NULLs (from pivot for no data)
                .when(F.size(F.col(col_to_convert)) == 0, F.lit('no'))       # Empty array -> 'no'
                .when(F.array_contains(F.col(col_to_convert), F.lit('yes')), F.lit('yes')) # Contains 'yes' -> 'yes'
                .when(F.array_contains(F.col(col_to_convert), F.lit('no')), F.lit('no'))   # Contains 'no' -> 'no'
                .otherwise(F.lit('no')) # Fallback for unexpected array content (e.g., ['other'], ['yes','no'])
            )

        # 4. Generate a unique name for this DataFrame and store it
        df_key = f"df_pivot_{agg_col_name.lower()}_by_{pivot_col_name.lower()}"
        all_pivoted_dfs[df_key] = df_after_conversion.withColumnRenamed( 'phase4Clean','Phase>=4'
        ).withColumnRenamed('phase3Clean','Phase>=3'
        ).withColumnRenamed('phase2Clean','Phase>=2'
        ).withColumnRenamed('phase1Clean','Phase>=1')


# --- Accessing your generated DataFrames ---
print("\n--- All generated DataFrames are stored in 'all_pivoted_dfs' dictionary ---")
print("Keys available:", all_pivoted_dfs.keys())


result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())
for key,df in all_pivoted_dfs.items():

    print(f'working with {key}')
    parts = key.split('_by_') ### take the part of key belonging to column name
    column_name = parts[1] ### take the last part which is column name
    all_pivoted_dfs[key].persist()
    unique_values = all_pivoted_dfs[key].drop('null').columns[7:]
    print('There are ', len(unique_values), 'columns to analyse with phases')
    rows = comparisons_df_iterative(unique_values)

    # If needed, now process the rest
    for row in rows:
        print('performing', row)
        results = aggregations_original(
            all_pivoted_dfs[key], "propagated", listado, *row, today_date
        )
        result_all.append(results)
        print('results appended')
    all_pivoted_dfs[key].unpersist()
    print('df unpersisted')


schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)
import re

# Define the list of patterns to search for
patterns = [
    "_only",
    #"_tissue",
    #"_isSignalFromRightTissue",
    "_isRightTissueSignalAgreed",
]
# Create a regex pattern to match any of the substrings
regex_pattern = "(" + "|".join(map(re.escape, patterns)) + ")"

# Convert list of lists to DataFrame
df = (
    spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
    .withColumn(
        "prefix",
        F.regexp_replace(
            F.col("comparison"), regex_pattern + ".*", ""
        ),  # Extract part before the pattern
    )
    .withColumn(
        "suffix",
        F.regexp_extract(
            F.col("comparison"), regex_pattern, 0
        ),  # Extract the pattern itself
    )
)

### annotate projectId, tissue, qtl type and doe type:

from pyspark.sql.functions import create_map
from itertools import chain

#mapping_expr=create_map([F.lit(x) for x in chain(*disdic.items())])

#df_annot=df.withColumn('annotation',mapping_expr.getItem(F.col('prefix')))

df_annot.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis_filteredColocAndCaviarWithOthers4phases.csv"
)

print("dataframe written \n Analysis finished")

In [None]:
#### I need to incrase performance of dataset

In [1]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
    buildColocData,
    gwasDataset,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd
from functools import reduce

# --- YARN and Spark Configuration Parameters ---
# These parameters directly influence how Spark requests resources from YARN
# and how memory is managed within your Spark application.

# 1. spark.driver.memory: Memory allocated to the Spark driver program.
#    The driver is responsible for coordinating tasks, scheduling, and collecting
#    results. If you're doing operations like 'collect()' on large datasets,
#    or working with large broadcast variables, increase this.
#    General Guideline: Start with 2g-4g for interactive use, up to 8g-16g
#    for very large metadata or small result collection.
driver_memory = "4g"

# 2. spark.executor.memory: Memory allocated to each Spark executor JVM.
#    Executors are the worker processes that perform the actual data processing.
#    This is *the most critical* setting for memory-related YARN issues.
#    If your tasks are failing due to OOM errors, increase this significantly.
#    General Guideline: Depends on your node size and data. Common values are
#    4g, 8g, 16g, or even more. Ensure it doesn't exceed YARN's max container size.
executor_memory = "8g"

# 3. spark.executor.cores: Number of virtual cores (CPU) allocated to each executor.
#    More cores means an executor can run more tasks concurrently.
#    General Guideline: Typically 2-5 cores per executor. Avoid 1 core (poor parallelism)
#    and too many cores (can lead to fewer executors and memory contention).
executor_cores = "4"

# 4. spark.executor.instances: The total number of executors to launch.
#    This determines the overall parallelism of your application across the cluster.
#    General Guideline: Calculate based on your total cluster resources.
#    (Total available cores on cluster / executor_cores).
#    Start with a reasonable number, e.g., 5-20, and scale up.
num_executors = "10" # Example: 10 executors

# 5. spark.yarn.executor.memoryOverhead: Additional memory for the YARN container
#    beyond the JVM heap (spark.executor.memory). This includes off-heap memory,
#    PySpark's Python process memory, thread stacks, etc.
#    Crucial for PySpark! If this is too low, YARN can kill your containers
#    even if your Java heap (executor_memory) is fine.
#    General Guideline: 10-20% of spark.executor.memory, or a fixed amount like 1g-2g.
#    For PySpark, it's often safer to allocate more.
executor_memory_overhead = "2g" # For an 8g executor, 2g overhead is reasonable (25%)

# 6. spark.sql.shuffle.partitions: The number of partitions used for shuffling data
#    during operations like `groupBy`, `join`, `agg`, `sort`.
#    If this is too low: You can get OOM errors if partitions are too large,
#    or task failures due to data skew.
#    If this is too high: Creates many small tasks, leading to overhead.
#    General Guideline: A common heuristic is 2-4 times the total number of CPU cores
#    available in your application (executor_cores * num_executors).
#    For your current setup (4 cores * 10 executors = 40 cores), 400 is very high.
#    Consider (num_executors * executor_cores * 2) as a starting point.
#    Example: 10 executors * 4 cores/executor = 40 total cores. 40 * 2 = 80 partitions.
#    However, if you have *very* large datasets or significant data skew, 400 might be okay,
#    but it's usually better to start lower and increase if you see skew/large partition processing.
shuffle_partitions = "150" # Adjust based on data size and parallelism

# 7. spark.default.parallelism: This parameter is important for RDD operations (less so for DataFrames,
#    where spark.sql.shuffle.partitions is more relevant for shuffles). It suggests the default
#    number of partitions for RDDs created from scratch, and also influences the number of tasks.
#    It's often set to match or be a multiple of the total number of cores.
default_parallelism = str(int(executor_cores) * int(num_executors) * 2) # A common heuristic

# --- Build the SparkSession ---
# Use the .config() method to set these parameters before calling .getOrCreate()
# This ensures Spark requests the correct resources from YARN at the start.

spark = SparkSession.builder \
    .appName("MyOptimizedPySparkApp") \
    .config("spark.master", "yarn") \
    .config("spark.driver.memory", driver_memory) \
    .config("spark.executor.memory", executor_memory) \
    .config("spark.executor.cores", executor_cores) \
    .config("spark.executor.instances", num_executors) \
    .config("spark.yarn.executor.memoryOverhead", executor_memory_overhead) \
    .config("spark.sql.shuffle.partitions", shuffle_partitions) \
    .config("spark.default.parallelism", default_parallelism) \
    .getOrCreate()

print(f"SparkSession created successfully with the following configurations:")
print(f"  spark.driver.memory: {spark.conf.get('spark.driver.memory')}")
print(f"  spark.executor.memory: {spark.conf.get('spark.executor.memory')}")
print(f"  spark.executor.cores: {spark.conf.get('spark.executor.cores')}")
print(f"  spark.executor.instances: {spark.conf.get('spark.executor.instances')}")
print(f"  spark.yarn.executor.memoryOverhead: {spark.conf.get('spark.yarn.executor.memoryOverhead')}")
print(f"  spark.sql.shuffle.partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")
print(f"  spark.default.parallelism: {spark.conf.get('spark.default.parallelism')}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

# --- Your PySpark Code Here ---
# Now you can proceed with your data loading and processing.
# Example:
# df = spark.read.parquet("hdfs:///user/your_user/your_large_data.parquet")
# print(f"Number of rows in DataFrame: {df.count()}")
# df.groupBy("some_column").agg({"another_column": "sum"}).show()

# Remember to stop the SparkSession when you are done
# spark.stop()

path_n='gs://open-targets-data-releases/25.06/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

#### FIRST MODULE: BUILDING COLOC 
newColoc=buildColocData(all_coloc,credible,index)

print("loaded newColoc")

### SECOND MODULE: PROCESS EVIDENCES TO AVOID EXCESS OF COLUMNS 
gwasComplete = gwasDataset(evidences,credible)

#### THIRD MODULE: INCLUDE COLOC IN THE 
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")


#### FOURTH MODULE BUILDING CHEMBL ASSOCIATIONS - HERE TAKE CARE WITH FILTERING STEP 
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    #toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase>=4", "clinical"),
            #('Phase>=3','clinical'),
            #('Phase>=2','clinical'),
            #('Phase>=1','clinical'),
            #("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOING WITH CHEMBL 

### FIFTH MODULE: BUILDING BENCHMARK OF THE DATASET TO EXTRACT EHE ANALYSIS 

resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))
benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


### drug mechanism of action
mecact_path = f"{path_n}drug_mechanism_of_action/" #  mechanismOfAction == old version
mecact = spark.read.parquet(mecact_path)

inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",  ## lost at 31.01.2025
    "STABILISER",
    # "EXOGENOUS GENE", ## added 24.06.2025
    # "EXOGENOUS PROTEIN" ## added 24.06.2025
]


actionType = (
        mecact.select(
            F.explode_outer("chemblIds").alias("drugId"),
            "actionType",
            "mechanismOfAction",
            "targets",
        )
        .select(
            F.explode_outer("targets").alias("targetId"),
            "drugId",
            "actionType",
            "mechanismOfAction",
        )
        .groupBy("targetId", "drugId")
        .agg(F.collect_set("actionType").alias("actionType2"))
    ).withColumn('nMoA', F.size(F.col('actionType2')))

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl")).join(actionType, on=['targetId','drugId'], how='left')
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase",'actionType2')
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)


spark session created at 2025-07-16 07:36:45.161017
Analysis started on 2025-07-16 at  2025-07-16 07:36:45.161017


25/07/16 07:36:50 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/07/16 07:36:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


SparkSession created successfully with the following configurations:
  spark.driver.memory: 4g
  spark.executor.memory: 8g
  spark.executor.cores: 4
  spark.executor.instances: 10
  spark.yarn.executor.memoryOverhead: 2g
  spark.sql.shuffle.partitions: 150
  spark.default.parallelism: 80
Spark UI available at: http://jr-temp-doe-m.c.open-targets-eu-dev.internal:42303


                                                                                

loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study


In [4]:
current_col_pvalue_order_window=['actionType2', 'biosampleName', 'projectId', 'rightStudyType','colocalisationMethod']

In [5]:
benchmark.withColumn('qtlColocDoE',F.first('colocDoE').over(current_col_pvalue_order_window)).groupBy(
    "targetId",
    "diseaseId",
    "maxClinPhase",
    "drugLoF_protect",
    "drugGoF_protect",
    "actionType2",
    "biosampleName",
    "projectId",
    "rightStudyType",
    "colocalisationMethod",
).count().show()

PySparkTypeError: [NOT_WINDOWSPEC] Argument `window` should be a WindowSpec, got list.

In [None]:
### create disdic dictionary
disdic={}

# --- Configuration for your iterative pivoting ---
group_by_columns = ['targetId', 'diseaseId','phase4Clean','phase3Clean','phase2Clean','phase1Clean','PhaseT']
columns_to_pivot_on = ['actionType2', 'biosampleName', 'projectId', 'rightStudyType','colocalisationMethod']
columns_to_aggregate = ['NoneCellYes', 'NdiagonalYes','hasGenetics'] # The values you want to collect in the pivoted cells
all_pivoted_dfs = {}

doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]

# --- Nested Loops for Dynamic Pivoting ---
for agg_col_name in columns_to_aggregate:
    for pivot_col_name in columns_to_pivot_on:
        print(f"\n--- Creating DataFrame for Aggregation: '{agg_col_name}' and Pivot: '{pivot_col_name}' ---")
        current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", "maxClinPhase", pivot_col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())
        test2=discrepancifier(benchmark.withColumn('actionType2', F.concat_ws(",", F.col("actionType2"))).withColumn('qtlColocDoE',F.first('colocDoE').over(current_col_pvalue_order_window)).groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",pivot_col_name)
        .pivot("colocDoE")
        .count()
        .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
        .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
    ).withColumn(
        "maxDoE", F.array_max(F.col("arrayN"))
    ).withColumn("maxDoE_names", F.array(*conditions)
    ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
    ).withColumn(
        "NoneCellYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
        .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
            ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
    ).withColumn(
        "NdiagonalYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
            F.lit("yes")
        ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
            F.lit("yes")
        ).otherwise(F.lit('no'))
    ).withColumn(
        "drugCoherency",
        F.when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
        )
        .otherwise(F.lit("other")),
    ).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "phase4Clean",
        F.when(
            (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase3Clean",
        F.when(
            (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase2Clean",
        F.when(
            (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase1Clean",
        F.when(
            (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "hasGenetics",
        F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
    )
        # 1. Get distinct values for the pivot column (essential for pivot())
        # This brings a small amount of data to the driver, but is necessary for the pivot schema.
        distinct_pivot_values = [row[0] for row in test2.select(pivot_col_name).distinct().collect()]
        print(f"Distinct values for '{pivot_col_name}': {distinct_pivot_values}")

        # 2. Perform the groupBy, pivot, and aggregate operations
        # The .pivot() function requires the list of distinct values for better performance
        # and correct schema inference.
        pivoted_df = (
            test2.groupBy(*group_by_columns)
            .pivot(pivot_col_name, distinct_pivot_values) # Provide distinct values
            .agg(F.collect_set(F.col(agg_col_name))) # Collect all values into a set
            .fillna(0) # Fill cells that have no data with an empty list instead of null
        )
        # 3. Add items to dictionary to map the columns:
        # filter out None and 'null':
        filtered = [x for x in distinct_pivot_values if x is not None and x != 'null']
        # using list comprehension
        disdic = {item: pivot_col_name for item in filtered} 

        # 3. Add the 'data' literal column dynamically
        # This column indicates which aggregation column was used.
        #pivoted_df = pivoted_df.withColumn('data', F.lit(f'Drug_{agg_col_name}'))

        array_columns_to_convert = [
            field.name for field in pivoted_df.schema.fields
            if isinstance(field.dataType, ArrayType)
        ]
        print(f"Identified ArrayType columns for conversion: {array_columns_to_convert}")

        # 4. Apply the conversion logic to each identified array column
        df_after_conversion = pivoted_df # Start with the pivoted_df
        for col_to_convert in array_columns_to_convert:
            df_after_conversion = df_after_conversion.withColumn(
                col_to_convert,
                F.when(F.col(col_to_convert).isNull(), F.lit('no'))          # Handle NULLs (from pivot for no data)
                .when(F.size(F.col(col_to_convert)) == 0, F.lit('no'))       # Empty array -> 'no'
                .when(F.array_contains(F.col(col_to_convert), F.lit('yes')), F.lit('yes')) # Contains 'yes' -> 'yes'
                .when(F.array_contains(F.col(col_to_convert), F.lit('no')), F.lit('no'))   # Contains 'no' -> 'no'
                .otherwise(F.lit('no')) # Fallback for unexpected array content (e.g., ['other'], ['yes','no'])
            )

        # 4. Generate a unique name for this DataFrame and store it
        df_key = f"df_pivot_{agg_col_name.lower()}_by_{pivot_col_name.lower()}"
        all_pivoted_dfs[df_key] = df_after_conversion.withColumnRenamed( 'phase4Clean','Phase>=4'
        ).withColumnRenamed('phase3Clean','Phase>=3'
        ).withColumnRenamed('phase2Clean','Phase>=2'
        ).withColumnRenamed('phase1Clean','Phase>=1')


# --- Accessing your generated DataFrames ---
print("\n--- All generated DataFrames are stored in 'all_pivoted_dfs' dictionary ---")
print("Keys available:", all_pivoted_dfs.keys())

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType

# Assume 'spark' SparkSession is already created with optimized configurations
# as discussed in the previous responses.

# Make sure benchmark and negativeTD DataFrames are defined or loaded here
# Example (replace with your actual loading logic):
# benchmark = spark.read.parquet("hdfs:///path/to/your/benchmark_data.parquet")
# negativeTD = spark.read.parquet("hdfs:///path/to/your/negativeTD_data.parquet")


group_by_columns = ['targetId', 'diseaseId', 'phase4Clean', 'phase3Clean', 'phase2Clean', 'phase1Clean', 'PhaseT']
columns_to_pivot_on = ['actionType2', 'biosampleName', 'projectId', 'rightStudyType', 'colocalisationMethod']
columns_to_aggregate = ['NoneCellYes', 'NdiagonalYes', 'hasGenetics']
doe_columns = ["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof = ['LoF_protect', 'GoF_risk']
diagonal_gof = ['LoF_risk', 'GoF_protect']
conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
]

transformed_df = benchmark.withColumn('actionType2', F.concat_ws(",", F.col("actionType2")))

current_col_pvalue_order_window_unified = Window.partitionBy("targetId", "diseaseId", "maxClinPhase").orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())

transformed_df = (
    transformed_df
    .withColumn('qtlColocDoE', F.first('colocDoE').over(current_col_pvalue_order_window_unified))
    .groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",
        *columns_to_pivot_on
    )
    .pivot("colocDoE")
    .count()
    .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
    .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')
    .fillna(0)
)

transformed_df = transformed_df.withColumn(
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
).withColumn(
    "maxDoE_names", F.array(*conditions)
).withColumn(
    "maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
).withColumn(
    "NoneCellYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
        ).otherwise(F.lit('no'))
).withColumn(
    "NdiagonalYes",
    F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) &
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) &
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).withColumn(
    "drugCoherency",
    F.when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
    )
    .when(
        (F.col("LoF_protect_ch").isNotNull())
        & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
    )
    .otherwise(F.lit("other")),
)

transformed_df = transformed_df.join(negativeTD, on=["targetId", "diseaseId"], how="left")

transformed_df = transformed_df.withColumn(
    "PhaseT",
    F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
).withColumn(
    "phase4Clean",
    F.when(
        (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase3Clean",
    F.when(
        (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase2Clean",
    F.when(
        (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "phase1Clean",
    F.when(
        (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
    ).otherwise(F.lit("no")),
).withColumn(
    "hasGenetics",
    F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
)


base_df_for_unpivot = transformed_df.select(
    *group_by_columns,
    *columns_to_pivot_on,
    *columns_to_aggregate
)

pivot_col_expressions = [F.struct(F.lit(col).alias("pivot_col_name"), F.col(col).alias("pivot_col_value")) for col in columns_to_pivot_on]
agg_col_expressions = [F.struct(F.lit(col).alias("agg_col_name"), F.col(col).alias("agg_col_value")) for col in columns_to_aggregate]


unpivoted_pivot_cols = base_df_for_unpivot.select(
    *group_by_columns,
    F.explode(F.array(*pivot_col_expressions)).alias("pivot_data"),
    *columns_to_aggregate
).select(
    *group_by_columns,
    F.col("pivot_data.pivot_col_name"),
    F.col("pivot_data.pivot_col_value"),
    *columns_to_aggregate
)

final_unpivoted_df = unpivoted_pivot_cols.select(
    *group_by_columns,
    F.col("pivot_col_name"),
    F.col("pivot_col_value"),
    F.explode(F.array(*agg_col_expressions)).alias("agg_data")
).select(
    *group_by_columns,
    F.col("pivot_col_name"),
    F.col("pivot_col_value"),
    F.col("agg_data.agg_col_name"),
    F.col("agg_data.agg_col_value")
)

all_distinct_pivot_values = [
    row[0] for row in final_unpivoted_df.select("pivot_col_value").distinct().collect()
    if row[0] is not None
]


final_pivoted_df = (
    final_unpivoted_df
    .groupBy(*group_by_columns, "pivot_col_name", "agg_col_name")
    .pivot("pivot_col_value", all_distinct_pivot_values)
    .agg(F.collect_set(F.col("agg_col_value")))
    .fillna(F.array())
)

columns_to_process = [col_name for col_name in final_pivoted_df.columns if col_name in all_distinct_pivot_values]

for col_to_convert in columns_to_process:
    final_pivoted_df = final_pivoted_df.withColumn(
        col_to_convert,
        F.when(F.col(col_to_convert).isNull(), F.lit('no'))
        .when(F.size(F.col(col_to_convert)) == 0, F.lit('no'))
        .when(F.array_contains(F.col(col_to_convert), F.lit('yes')), F.lit('yes'))
        .when(F.array_contains(F.col(col_to_convert), F.lit('no')), F.lit('no'))
        .otherwise(F.lit('no'))
    )

unpivoted_values_df = final_pivoted_df.unpivot(
    pivot_cols=all_distinct_pivot_values,
    key_col="pivoted_value_key",
    value_col="pivoted_agg_result"
)

final_combined_df = unpivoted_values_df.withColumn(
    "final_col_name",
    F.concat_ws("_",
                F.lit("Drug"),
                F.col("agg_col_name"),
                F.col("pivot_col_name"),
                F.col("pivoted_value_key"))
)

all_final_column_names = [
    row[0] for row in final_combined_df.select("final_col_name").distinct().collect()
    if row[0] is not None
]

final_wide_df = (
    final_combined_df
    .groupBy(*group_by_columns)
    .pivot("final_col_name", all_final_column_names)
    .agg(F.first(F.col("pivoted_agg_result")))
    .fillna("no")
)

final_wide_df = final_wide_df.withColumnRenamed('phase4Clean', 'Phase>=4') \
                             .withColumnRenamed('phase3Clean', 'Phase>=3') \
                             .withColumnRenamed('phase2Clean', 'Phase>=2') \
                             .withColumnRenamed('phase1Clean', 'Phase>=1')

final_disdic_map = {col_name: parts[2] for col_name in all_final_column_names if len(parts := col_name.split('_')) >= 3 and parts[0] == 'Drug'}

                                                                                

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `coherencyDiagonal` cannot be resolved. Did you mean one of the following? [`phase1Clean`, `phase2Clean`, `phase3Clean`, `phase4Clean`, `NdiagonalYes`].;
'Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867, PhaseT#18190, phase4Clean#18214, ... 4 more fields]
+- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867, PhaseT#18190, phase4Clean#18214, ... 3 more fields]
   +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867, PhaseT#18190, phase4Clean#18214, ... 2 more fields]
      +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867, PhaseT#18190, phase4Clean#18214, CASE WHEN ((maxClinPhase#17918 >= cast(3 as double)) AND (PhaseT#18190 = no)) THEN yes ELSE no END AS phase3Clean#18239]
         +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867, PhaseT#18190, CASE WHEN ((maxClinPhase#17918 = cast(4 as double)) AND (PhaseT#18190 = no)) THEN yes ELSE no END AS phase4Clean#18214]
            +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867, CASE WHEN (stopReason#16867 = Negative) THEN yes ELSE no END AS PhaseT#18190]
               +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, drugCoherency#18056, stopReason#16867]
                  +- Join LeftOuter, ((targetId#2088 = targetId#18079) AND (diseaseId#2170 = diseaseId#18161))
                     :- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, NdiagonalYes#18035, CASE WHEN (isnotnull(LoF_protect_ch#17919L) AND isnull(GoF_protect_ch#17920L)) THEN coherent WHEN (isnull(LoF_protect_ch#17919L) AND isnotnull(GoF_protect_ch#17920L)) THEN coherent WHEN (isnotnull(LoF_protect_ch#17919L) AND isnotnull(GoF_protect_ch#17920L)) THEN dispar ELSE other END AS drugCoherency#18056]
                     :  +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, NoneCellYes#18015, CASE WHEN ((isnotnull(LoF_protect_ch#17919L) AND isnull(GoF_protect_ch#17920L)) AND (size(array_intersect(maxDoE_names#17995, array(LoF_protect, GoF_risk)), true) > 0)) THEN yes WHEN ((isnotnull(GoF_protect_ch#17920L) AND isnull(LoF_protect_ch#17919L)) AND (size(array_intersect(maxDoE_names#17995, array(LoF_risk, GoF_protect)), true) > 0)) THEN yes ELSE no END AS NdiagonalYes#18035]
                     :     +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, maxDoE_names#17995, CASE WHEN (((isnotnull(LoF_protect_ch#17919L) AND isnull(GoF_protect_ch#17920L)) AND array_contains(maxDoE_names#17995, LoF_protect)) = true) THEN yes WHEN (((isnotnull(GoF_protect_ch#17920L) AND isnull(LoF_protect_ch#17919L)) AND array_contains(maxDoE_names#17995, GoF_protect)) = true) THEN yes ELSE no END AS NoneCellYes#18015]
                     :        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, filter(maxDoE_names#17976, lambdafunction(isnotnull(lambda x#17996), lambda x#17996, false)) AS maxDoE_names#17995]
                     :           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, maxDoE#17958L, array(CASE WHEN (LoF_protect#17924L = maxDoE#17958L) THEN LoF_protect ELSE cast(null as string) END, CASE WHEN (GoF_risk#17923L = maxDoE#17958L) THEN GoF_risk ELSE cast(null as string) END, CASE WHEN (LoF_risk#17925L = maxDoE#17958L) THEN LoF_risk ELSE cast(null as string) END, CASE WHEN (GoF_protect#17922L = maxDoE#17958L) THEN GoF_protect ELSE cast(null as string) END) AS maxDoE_names#17976]
                     :              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, arrayN#17941, array_max(arrayN#17941) AS maxDoE#17958L]
                     :                 +- Project [targetId#2088, diseaseId#2170, maxClinPhase#17918, LoF_protect_ch#17919L, GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17921L, GoF_protect#17922L, GoF_risk#17923L, LoF_protect#17924L, LoF_risk#17925L, array(LoF_protect#17924L, GoF_risk#17923L, LoF_risk#17925L, GoF_protect#17922L) AS arrayN#17941]
                     :                    +- Project [targetId#2088, diseaseId#2170, coalesce(nanvl(maxClinPhase#12040, cast(null as double)), cast(0.0 as double)) AS maxClinPhase#17918, coalesce(LoF_protect_ch#17871L, cast(0.0 as bigint)) AS LoF_protect_ch#17919L, coalesce(GoF_protect_ch#17887L, cast(0.0 as bigint)) AS GoF_protect_ch#17920L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, coalesce(null#17841L, cast(0.0 as bigint)) AS null#17921L, coalesce(GoF_protect#17842L, cast(0.0 as bigint)) AS GoF_protect#17922L, coalesce(GoF_risk#17843L, cast(0.0 as bigint)) AS GoF_risk#17923L, coalesce(LoF_protect#17844L, cast(0.0 as bigint)) AS LoF_protect#17924L, coalesce(LoF_risk#17845L, cast(0.0 as bigint)) AS LoF_risk#17925L]
                     :                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, LoF_protect_ch#17871L, drugGoF_protect#16696L AS GoF_protect_ch#17887L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17841L, GoF_protect#17842L, GoF_risk#17843L, LoF_protect#17844L, LoF_risk#17845L]
                     :                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, drugLoF_protect#16703L AS LoF_protect_ch#17871L, drugGoF_protect#16696L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, null#17841L, GoF_protect#17842L, GoF_risk#17843L, LoF_protect#17844L, LoF_risk#17845L]
                     :                             +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, drugLoF_protect#16703L, drugGoF_protect#16696L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, __pivot_count(1) AS count AS `count(1) AS count`#17840[0] AS null#17841L, __pivot_count(1) AS count AS `count(1) AS count`#17840[1] AS GoF_protect#17842L, __pivot_count(1) AS count AS `count(1) AS count`#17840[2] AS GoF_risk#17843L, __pivot_count(1) AS count AS `count(1) AS count`#17840[3] AS LoF_protect#17844L, __pivot_count(1) AS count AS `count(1) AS count`#17840[4] AS LoF_risk#17845L]
                     :                                +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#12040, drugLoF_protect#16703L, drugGoF_protect#16696L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083], [targetId#2088, diseaseId#2170, maxClinPhase#12040, drugLoF_protect#16703L, drugGoF_protect#16696L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, pivotfirst(colocDoE#2044, count(1) AS count#17828L, null, GoF_protect, GoF_risk, LoF_protect, LoF_risk, 0, 0) AS __pivot_count(1) AS count AS `count(1) AS count`#17840]
                     :                                   +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#12040, drugLoF_protect#16703L, drugGoF_protect#16696L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, colocDoE#2044], [targetId#2088, diseaseId#2170, maxClinPhase#12040, drugLoF_protect#16703L, drugGoF_protect#16696L, actionType2#16878, biosampleName#1060, projectId#975, rightStudyType#1080, colocalisationMethod#1083, colocDoE#2044, count(1) AS count(1) AS count#17828L]
                     :                                      +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 25 more fields]
                     :                                         +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 26 more fields]
                     :                                            +- Window [first(colocDoE#2044, false) windowspecdefinition(targetId#2088, diseaseId#2170, maxClinPhase#12040, colocalisationMethod#1083 ASC NULLS FIRST, qtlPValueExponent#1143 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS qtlColocDoE#16928], [targetId#2088, diseaseId#2170, maxClinPhase#12040], [colocalisationMethod#1083 ASC NULLS FIRST, qtlPValueExponent#1143 ASC NULLS FIRST]
                     :                                               +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 24 more fields]
                     :                                                  +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 24 more fields]
                     :                                                     +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 24 more fields]
                     :                                                        +- Join LeftOuter, (biosampleId#1002 = biosampleId#1059)
                     :                                                           :- Project [targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 23 more fields]
                     :                                                           :  +- Project [targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 22 more fields]
                     :                                                           :     +- Join RightOuter, ((targetId#1801 = targetId#2088) AND (diseaseId#1960 = diseaseId#2170))
                     :                                                           :        :- Filter NOT (name#691 = COVID-19)
                     :                                                           :        :  +- Filter ((clpp#1082 >= 0.01) OR (h4#1098 >= 0.8))
                     :                                                           :        :     +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
                     :                                                           :        :        +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 17 more fields]
                     :                                                           :        :           +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
                     :                                                           :        :              +- Generate explode(concat(array(diseaseId#800), parents#694)), true, [diseaseId#1960]
                     :                                                           :        :                 +- Project [diseaseId#800, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
                     :                                                           :        :                    +- Join LeftOuter, (diseaseId#800 = diseaseId#1911)
                     :                                                           :        :                       :- Project [leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, indexStudyType#1200, ... 15 more fields]
                     :                                                           :        :                       :  +- Join Inner, ((leftStudyLocusId#1077 = leftStudyLocusId#1828) AND (targetId#1801 = targetId#718))
                     :                                                           :        :                       :     :- Project [rightStudyId#1140, rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, geneId#974 AS targetId#1801, projectId#975, indexStudyType#1200, ... 2 more fields]
                     :                                                           :        :                       :     :  +- Project [rightStudyId#1140, rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, geneId#974, projectId#975, indexStudyType#1200, ... 2 more fields]
                     :                                                           :        :                       :     :     +- Join LeftOuter, (rightStudyId#1140 = rightStudyId#1199)
                     :                                                           :        :                       :     :        :- Project [rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightStudyId#1140, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176]
                     :                                                           :        :                       :     :        :  +- Join LeftOuter, (rightStudyLocusId#1078 = rightStudyLocusId#1139)
                     :                                                           :        :                       :     :        :     :- Project [leftStudyLocusId#1077, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117]
                     :                                                           :        :                       :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#1077 = leftStudyLocusId#1114)
                     :                                                           :        :                       :     :        :     :     :- Union false, false
                     :                                                           :        :                       :     :        :     :     :  :- Project [leftStudyLocusId#1077, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, null AS h0#1094, null AS h1#1095, null AS h2#1096, null AS h3#1097, null AS h4#1098]
                     :                                                           :        :                       :     :        :     :     :  :  +- Relation [leftStudyLocusId#1077,rightStudyLocusId#1078,chromosome#1079,rightStudyType#1080,numberColocalisingVariants#1081L,clpp#1082,colocalisationMethod#1083,betaRatioSignAverage#1084] parquet
                     :                                                           :        :                       :     :        :     :     :  +- Project [leftStudyLocusId#949, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, null AS clpp#1093, colocalisationMethod#959, betaRatioSignAverage#960, h0#954, h1#955, h2#956, h3#957, h4#958]
                     :                                                           :        :                       :     :        :     :     :     +- Relation [leftStudyLocusId#949,rightStudyLocusId#950,chromosome#951,rightStudyType#952,numberColocalisingVariants#953L,h0#954,h1#955,h2#956,h3#957,h4#958,colocalisationMethod#959,betaRatioSignAverage#960] parquet
                     :                                                           :        :                       :     :        :     :     +- Project [studyLocusId#895 AS leftStudyLocusId#1114, StudyId#896 AS leftStudyId#1115, variantId#897 AS leftVariantId#1116, studyType#920 AS credibleLeftStudyType#1117]
                     :                                                           :        :                       :     :        :     :        +- Relation [studyLocusId#895,studyId#896,variantId#897,chromosome#898,position#899,region#900,beta#901,zScore#902,pValueMantissa#903,pValueExponent#904,effectAlleleFrequencyFromSource#905,standardError#906,subStudyDescription#907,qualityControls#908,finemappingMethod#909,credibleSetIndex#910,credibleSetlog10BF#911,purityMeanR2#912,purityMinR2#913,locusStart#914,locusEnd#915,sampleSize#916,ldSet#917,locus#918,... 3 more fields] parquet
                     :                                                           :        :                       :     :        :     +- Project [studyLocusId#1150 AS rightStudyLocusId#1139, studyId#1151 AS rightStudyId#1140, variantId#1152 AS rightVariantId#1141, studyType#1175 AS credibleRightStudyType#1142, pValueExponent#1159 AS qtlPValueExponent#1143, isTransQtl#1176]
                     :                                                           :        :                       :     :        :        +- Relation [studyLocusId#1150,studyId#1151,variantId#1152,chromosome#1153,position#1154,region#1155,beta#1156,zScore#1157,pValueMantissa#1158,pValueExponent#1159,effectAlleleFrequencyFromSource#1160,standardError#1161,subStudyDescription#1162,qualityControls#1163,finemappingMethod#1164,credibleSetIndex#1165,credibleSetlog10BF#1166,purityMeanR2#1167,purityMinR2#1168,locusStart#1169,locusEnd#1170,sampleSize#1171,ldSet#1172,locus#1173,... 3 more fields] parquet
                     :                                                           :        :                       :     :        +- Project [studyId#973 AS rightStudyId#1199, geneId#974, projectId#975, studyType#976 AS indexStudyType#1200, condition#998, biosampleId#1002]
                     :                                                           :        :                       :     :           +- Relation [studyId#973,geneId#974,projectId#975,studyType#976,traitFromSource#977,traitFromSourceMappedIds#978,biosampleFromSourceId#979,pubmedId#980,publicationTitle#981,publicationFirstAuthor#982,publicationDate#983,publicationJournal#984,backgroundTraitFromSourceMappedIds#985,initialSampleSize#986,nCases#987,nControls#988,nSamples#989,cohorts#990,ldPopulationStructure#991,discoverySamples#992,replicationSamples#993,qualityControls#994,analysisFlags#995,summarystatsLocation#996,... 6 more fields] parquet
                     :                                                           :        :                       :     +- Project [studyLocusId#798 AS leftStudyLocusId#1828, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1845, variantId#1846, betaGwas#1780, pValueExponent#1853]
                     :                                                           :        :                       :        +- Project [studyLocusId#798, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1845, variantId#1846, betaGwas#1780, pValueExponent#1853]
                     :                                                           :        :                       :           +- Join LeftOuter, (studyLocusId#798 = studyLocusId#1844)
                     :                                                           :        :                       :              :- Project [datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, studyLocusId#798, diseaseId#800, id#801, score#802, sourceId#805]
                     :                                                           :        :                       :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
                     :                                                           :        :                       :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,beta#723,betaConfidenceIntervalLower#724,betaConfidenceIntervalUpper#725,biologicalModelAllelicComposition#726,biologicalModelGeneticBackground#727,biologicalModelId#728,biomarkerName#729,biomarkers#730,biosamplesFromSource#731,cellType#732,clinicalPhase#733,clinicalSignificances#734,clinicalStatus#735,cohortDescription#736,cohortId#737,cohortPhenotypes#738,cohortShortName#739,confidence#740,... 65 more fields] parquet
                     :                                                           :        :                       :              +- Project [studyLocusId#1844, studyId#1845, variantId#1846, beta#1850 AS betaGwas#1780, pValueExponent#1853]
                     :                                                           :        :                       :                 +- Relation [studyLocusId#1844,studyId#1845,variantId#1846,chromosome#1847,position#1848,region#1849,beta#1850,zScore#1851,pValueMantissa#1852,pValueExponent#1853,effectAlleleFrequencyFromSource#1854,standardError#1855,subStudyDescription#1856,qualityControls#1857,finemappingMethod#1858,credibleSetIndex#1859,credibleSetlog10BF#1860,purityMeanR2#1861,purityMinR2#1862,locusStart#1863,locusEnd#1864,sampleSize#1865,ldSet#1866,locus#1867,... 3 more fields] parquet
                     :                                                           :        :                       +- Project [id#689 AS diseaseId#1911, name#691, parents#694, therapeuticAreas#700]
                     :                                                           :        :                          +- Relation [id#689,code#690,name#691,description#692,dbXRefs#693,parents#694,synonyms#695,obsoleteTerms#696,obsoleteXRefs#697,children#698,ancestors#699,therapeuticAreas#700,descendants#701,ontology#702] parquet
                     :                                                           :        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, drugGoF_protect#16696L, LoF_protect#14843L AS drugLoF_protect#16703L]
                     :                                                           :           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, GoF_protect#14842L AS drugGoF_protect#16696L, LoF_protect#14843L]
                     :                                                           :              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, GoF_protect#14842L, LoF_protect#14843L]
                     :                                                           :                 +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, GoF_protect#14842L, LoF_protect#14843L, noEvaluable#14844L, GoF_risk#14858, LoF_risk#16480, coherencyDiagonal#16667, CASE WHEN ((((isnull(LoF_risk#16480) AND isnull(LoF_protect#14843L)) AND isnull(GoF_risk#14858)) AND isnull(GoF_protect#14842L)) AND isnull(noEvaluable#14844L)) THEN noEvid WHEN ((((isnull(LoF_risk#16480) AND isnull(LoF_protect#14843L)) AND isnull(GoF_risk#14858)) AND isnull(GoF_protect#14842L)) AND isnotnull(noEvaluable#14844L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#16480) OR isnotnull(LoF_protect#14843L)) OR isnotnull(GoF_risk#14858)) OR isnotnull(GoF_protect#14842L)) THEN CASE WHEN (isnotnull(LoF_risk#16480) AND ((isnull(LoF_protect#14843L) AND isnull(GoF_risk#14858)) AND isnull(GoF_protect#14842L))) THEN coherent WHEN (isnotnull(GoF_risk#14858) AND ((isnull(LoF_protect#14843L) AND isnull(LoF_risk#16480)) AND isnull(GoF_protect#14842L))) THEN coherent WHEN (isnotnull(LoF_protect#14843L) AND ((isnull(LoF_risk#16480) AND isnull(GoF_risk#14858)) AND isnull(GoF_protect#14842L))) THEN coherent WHEN (isnotnull(GoF_protect#14842L) AND ((isnull(LoF_protect#14843L) AND isnull(GoF_risk#14858)) AND isnull(LoF_risk#16480))) THEN coherent ELSE dispar END END AS coherencyOneCell#16678]
                     :                                                           :                    +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, GoF_protect#14842L, LoF_protect#14843L, noEvaluable#14844L, GoF_risk#14858, LoF_risk#16480, CASE WHEN ((((isnull(LoF_risk#16480) AND isnull(LoF_protect#14843L)) AND isnull(GoF_risk#14858)) AND isnull(GoF_protect#14842L)) AND isnull(noEvaluable#14844L)) THEN noEvid WHEN ((((isnull(LoF_risk#16480) AND isnull(LoF_protect#14843L)) AND isnull(GoF_risk#14858)) AND isnull(GoF_protect#14842L)) AND isnotnull(noEvaluable#14844L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#16480) OR isnotnull(LoF_protect#14843L)) OR isnotnull(GoF_risk#14858)) OR isnotnull(GoF_protect#14842L)) THEN CASE WHEN (isnotnull(GoF_risk#14858) AND isnotnull(LoF_risk#16480)) THEN dispar WHEN (isnotnull(LoF_protect#14843L) AND isnotnull(LoF_risk#16480)) THEN dispar WHEN (isnotnull(GoF_protect#14842L) AND isnotnull(GoF_risk#14858)) THEN dispar WHEN (isnotnull(GoF_protect#14842L) AND isnotnull(LoF_protect#14843L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#16667]
                     :                                                           :                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, GoF_protect#14842L, LoF_protect#14843L, noEvaluable#14844L, GoF_risk#14858, null AS LoF_risk#16480]
                     :                                                           :                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, GoF_protect#14842L, LoF_protect#14843L, noEvaluable#14844L, null AS GoF_risk#14858]
                     :                                                           :                             +- Project [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, __pivot_count(targetId) AS `count(targetId)`#14841[0] AS GoF_protect#14842L, __pivot_count(targetId) AS `count(targetId)`#14841[1] AS LoF_protect#14843L, __pivot_count(targetId) AS `count(targetId)`#14841[2] AS noEvaluable#14844L]
                     :                                                           :                                +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925], [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, pivotfirst(homogenized#4208, count(targetId)#14833L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#14841]
                     :                                                           :                                   +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, homogenized#4208], [targetId#2088, diseaseId#2170, maxClinPhase#12040, actionType2#11925, homogenized#4208, count(targetId#2088) AS count(targetId)#14833L]
                     :                                                           :                                      +- Project [targetId#2088, drugId#2121, datasourceId#2087, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, ... 82 more fields]
                     :                                                           :                                         +- Project [targetId#2088, drugId#2121, datasourceId#2087, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, ... 83 more fields]
                     :                                                           :                                            +- Window [max(clinicalPhase#2103) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#12040], [targetId#2088, diseaseId#2170]
                     :                                                           :                                               +- Project [targetId#2088, drugId#2121, datasourceId#2087, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, ... 81 more fields]
                     :                                                           :                                                  +- Project [targetId#2088, drugId#2121, datasourceId#2087, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, ... 81 more fields]
                     :                                                           :                                                     +- Join LeftOuter, ((targetId#2088 = targetId#11915) AND (drugId#2121 = drugId#11908))
                     :                                                           :                                                        :- Filter (datasourceId#2087 = chembl)
                     :                                                           :                                                        :  +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
                     :                                                           :                                                        :     +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
                     :                                                           :                                                        :        +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
                     :                                                           :                                                        :           +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
                     :                                                           :                                                        :              +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
                     :                                                           :                                                        :                 +- Window [collect_set(intogen_function#3791, 0, 0) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#3897], [targetId#2088, diseaseId#2170]
                     :                                                           :                                                        :                    +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 77 more fields]
                     :                                                           :                                                        :                       +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 77 more fields]
                     :                                                           :                                                        :                          +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 76 more fields]
                     :                                                           :                                                        :                             +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 75 more fields]
                     :                                                           :                                                        :                                +- Join LeftOuter, ((drugId2#2875 = drugId#2121) AND (targetId2#2882 = targetId#2088))
                     :                                                           :                                                        :                                   :- Join LeftOuter, (target_id#2925 = targetId#2088)
                     :                                                           :                                                        :                                   :  :- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, concat_ws(,, clinicalSignificances#2104) AS clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                     :                                                           :                                                        :                                   :  :  +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                     :                                                           :                                                        :                                   :  :     +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, cast(beta#2093 as double) AS beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                     :                                                           :                                                        :                                   :  :        +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2093, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                     :                                                           :                                                        :                                   :  :           +- Filter datasourceId#2087 IN (gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                     :                                                           :                                                        :                                   :  :              +- Relation [datasourceId#2087,targetId#2088,alleleOrigins#2089,allelicRequirements#2090,ancestry#2091,ancestryId#2092,beta#2093,betaConfidenceIntervalLower#2094,betaConfidenceIntervalUpper#2095,biologicalModelAllelicComposition#2096,biologicalModelGeneticBackground#2097,biologicalModelId#2098,biomarkerName#2099,biomarkers#2100,biosamplesFromSource#2101,cellType#2102,clinicalPhase#2103,clinicalSignificances#2104,clinicalStatus#2105,cohortDescription#2106,cohortId#2107,cohortPhenotypes#2108,cohortShortName#2109,confidence#2110,... 65 more fields] parquet
                     :                                                           :                                                        :                                   :  +- Project [id#2802 AS target_id#2925, approvedSymbol#2803, description#2910, description_splited#2914, TSorOncogene#2919]
                     :                                                           :                                                        :                                   :     +- Project [id#2802, approvedSymbol#2803, description#2910, description_splited#2914, CASE WHEN (RLIKE(description_splited#2914, ncogene) AND RLIKE(description_splited#2914, TSG)) THEN bivalent WHEN RLIKE(description_splited#2914, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2914, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2919]
                     :                                                           :                                                        :                                   :        +- Project [id#2802, approvedSymbol#2803, description#2910, concat_ws(,, description#2910) AS description_splited#2914]
                     :                                                           :                                                        :                                   :           +- Aggregate [id#2802, approvedSymbol#2803], [id#2802, approvedSymbol#2803, collect_set(description#2902, 0, 0) AS description#2910]
                     :                                                           :                                                        :                                   :              +- Filter description#2902 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                     :                                                           :                                                        :                                   :                 +- Project [id#2802, approvedSymbol#2803, col#2897.description AS description#2902]
                     :                                                           :                                                        :                                   :                    +- Project [id#2802, approvedSymbol#2803, col#2897]
                     :                                                           :                                                        :                                   :                       +- Generate explode(hallmarks#2812.attributes), true, [col#2897]
                     :                                                           :                                                        :                                   :                          +- Relation [id#2802,approvedSymbol#2803,biotype#2804,transcriptIds#2805,canonicalTranscript#2806,canonicalExons#2807,genomicLocation#2808,alternativeGenes#2809,approvedName#2810,go#2811,hallmarks#2812,synonyms#2813,symbolSynonyms#2814,nameSynonyms#2815,functionDescriptions#2816,subcellularLocations#2817,targetClass#2818,obsoleteSymbols#2819,obsoleteNames#2820,constraint#2821,tep#2822,proteinIds#2823,dbXrefs#2824,chemicalProbes#2825,... 5 more fields] parquet
                     :                                                           :                                                        :                                   +- Aggregate [targetId2#2882, drugId2#2875], [targetId2#2882, drugId2#2875, collect_set(actionType#2860, 0, 0) AS actionType#2892]
                     :                                                           :                                                        :                                      +- Project [targetId2#2882, drugId2#2875, actionType#2860, mechanismOfAction#2861]
                     :                                                           :                                                        :                                         +- Generate explode(targets#2865), true, [targetId2#2882]
                     :                                                           :                                                        :                                            +- Project [drugId2#2875, actionType#2860, mechanismOfAction#2861, targets#2865]
                     :                                                           :                                                        :                                               +- Generate explode(chemblIds#2862), true, [drugId2#2875]
                     :                                                           :                                                        :                                                  +- Relation [actionType#2860,mechanismOfAction#2861,chemblIds#2862,targetName#2863,targetType#2864,targets#2865,references#2866] parquet
                     :                                                           :                                                        +- Project [targetId#11915, drugId#11908, actionType2#11925, size(actionType2#11925, true) AS nMoA#11929]
                     :                                                           :                                                           +- Aggregate [targetId#11915, drugId#11908], [targetId#11915, drugId#11908, collect_set(actionType#11893, 0, 0) AS actionType2#11925]
                     :                                                           :                                                              +- Project [targetId#11915, drugId#11908, actionType#11893, mechanismOfAction#11894]
                     :                                                           :                                                                 +- Generate explode(targets#11898), true, [targetId#11915]
                     :                                                           :                                                                    +- Project [drugId#11908, actionType#11893, mechanismOfAction#11894, targets#11898]
                     :                                                           :                                                                       +- Generate explode(chemblIds#11895), true, [drugId#11908]
                     :                                                           :                                                                          +- Relation [actionType#11893,mechanismOfAction#11894,chemblIds#11895,targetName#11896,targetType#11897,targets#11898,references#11899] parquet
                     :                                                           +- Project [biosampleId#1059, biosampleName#1060]
                     :                                                              +- Relation [biosampleId#1059,biosampleName#1060,description#1061,xrefs#1062,synonyms#1063,parents#1064,ancestors#1065,children#1066,descendants#1067] parquet
                     +- Project [targetId#18079, diseaseId#18161, stopReason#16867]
                        +- Project [targetId#18079, diseaseId#18161, count#16863L, Negative AS stopReason#16867]
                           +- Aggregate [targetId#18079, diseaseId#18161], [targetId#18079, diseaseId#18161, count(1) AS count#16863L]
                              +- Filter array_contains(studyStopReasonCategories#18143, Negative)
                                 +- Project [targetId#18079, diseaseId#18161, studyStopReason#18142, studyStopReasonCategories#18143]
                                    +- Filter (datasourceId#18078 = chembl)
                                       +- Project [datasourceId#18078, targetId#18079, alleleOrigins#18080, allelicRequirements#18081, ancestry#18082, ancestryId#18083, beta#18084, betaConfidenceIntervalLower#18085, betaConfidenceIntervalUpper#18086, biologicalModelAllelicComposition#18087, biologicalModelGeneticBackground#18088, biologicalModelId#18089, biomarkerName#18090, biomarkers#18091, biosamplesFromSource#18092, cellType#18093, clinicalPhase#18094, clinicalSignificances#18095, clinicalStatus#18096, cohortDescription#18097, cohortId#18098, cohortPhenotypes#18099, cohortShortName#18100, confidence#18101, ... 66 more fields]
                                          +- Filter datasourceId#18078 IN (gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                             +- Relation [datasourceId#18078,targetId#18079,alleleOrigins#18080,allelicRequirements#18081,ancestry#18082,ancestryId#18083,beta#18084,betaConfidenceIntervalLower#18085,betaConfidenceIntervalUpper#18086,biologicalModelAllelicComposition#18087,biologicalModelGeneticBackground#18088,biologicalModelId#18089,biomarkerName#18090,biomarkers#18091,biosamplesFromSource#18092,cellType#18093,clinicalPhase#18094,clinicalSignificances#18095,clinicalStatus#18096,cohortDescription#18097,cohortId#18098,cohortPhenotypes#18099,cohortShortName#18100,confidence#18101,... 65 more fields] parquet


In [None]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
    buildColocData,
    gwasDataset,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd
from functools import reduce

# --- YARN and Spark Configuration Parameters ---
# These parameters directly influence how Spark requests resources from YARN
# and how memory is managed within your Spark application.

# 1. spark.driver.memory: Memory allocated to the Spark driver program.
#    The driver is responsible for coordinating tasks, scheduling, and collecting
#    results. If you're doing operations like 'collect()' on large datasets,
#    or working with large broadcast variables, increase this.
#    General Guideline: Start with 2g-4g for interactive use, up to 8g-16g
#    for very large metadata or small result collection.
driver_memory = "4g"

# 2. spark.executor.memory: Memory allocated to each Spark executor JVM.
#    Executors are the worker processes that perform the actual data processing.
#    This is *the most critical* setting for memory-related YARN issues.
#    If your tasks are failing due to OOM errors, increase this significantly.
#    General Guideline: Depends on your node size and data. Common values are
#    4g, 8g, 16g, or even more. Ensure it doesn't exceed YARN's max container size.
executor_memory = "8g"

# 3. spark.executor.cores: Number of virtual cores (CPU) allocated to each executor.
#    More cores means an executor can run more tasks concurrently.
#    General Guideline: Typically 2-5 cores per executor. Avoid 1 core (poor parallelism)
#    and too many cores (can lead to fewer executors and memory contention).
executor_cores = "4"

# 4. spark.executor.instances: The total number of executors to launch.
#    This determines the overall parallelism of your application across the cluster.
#    General Guideline: Calculate based on your total cluster resources.
#    (Total available cores on cluster / executor_cores).
#    Start with a reasonable number, e.g., 5-20, and scale up.
num_executors = "10" # Example: 10 executors

# 5. spark.yarn.executor.memoryOverhead: Additional memory for the YARN container
#    beyond the JVM heap (spark.executor.memory). This includes off-heap memory,
#    PySpark's Python process memory, thread stacks, etc.
#    Crucial for PySpark! If this is too low, YARN can kill your containers
#    even if your Java heap (executor_memory) is fine.
#    General Guideline: 10-20% of spark.executor.memory, or a fixed amount like 1g-2g.
#    For PySpark, it's often safer to allocate more.
executor_memory_overhead = "2g" # For an 8g executor, 2g overhead is reasonable (25%)

# 6. spark.sql.shuffle.partitions: The number of partitions used for shuffling data
#    during operations like `groupBy`, `join`, `agg`, `sort`.
#    If this is too low: You can get OOM errors if partitions are too large,
#    or task failures due to data skew.
#    If this is too high: Creates many small tasks, leading to overhead.
#    General Guideline: A common heuristic is 2-4 times the total number of CPU cores
#    available in your application (executor_cores * num_executors).
#    For your current setup (4 cores * 10 executors = 40 cores), 400 is very high.
#    Consider (num_executors * executor_cores * 2) as a starting point.
#    Example: 10 executors * 4 cores/executor = 40 total cores. 40 * 2 = 80 partitions.
#    However, if you have *very* large datasets or significant data skew, 400 might be okay,
#    but it's usually better to start lower and increase if you see skew/large partition processing.
shuffle_partitions = "150" # Adjust based on data size and parallelism

# 7. spark.default.parallelism: This parameter is important for RDD operations (less so for DataFrames,
#    where spark.sql.shuffle.partitions is more relevant for shuffles). It suggests the default
#    number of partitions for RDDs created from scratch, and also influences the number of tasks.
#    It's often set to match or be a multiple of the total number of cores.
default_parallelism = str(int(executor_cores) * int(num_executors) * 2) # A common heuristic

# --- Build the SparkSession ---
# Use the .config() method to set these parameters before calling .getOrCreate()
# This ensures Spark requests the correct resources from YARN at the start.

spark = SparkSession.builder \
    .appName("MyOptimizedPySparkApp") \
    .config("spark.master", "yarn") \
    .config("spark.driver.memory", driver_memory) \
    .config("spark.executor.memory", executor_memory) \
    .config("spark.executor.cores", executor_cores) \
    .config("spark.executor.instances", num_executors) \
    .config("spark.yarn.executor.memoryOverhead", executor_memory_overhead) \
    .config("spark.sql.shuffle.partitions", shuffle_partitions) \
    .config("spark.default.parallelism", default_parallelism) \
    .getOrCreate()

print(f"SparkSession created successfully with the following configurations:")
print(f"  spark.driver.memory: {spark.conf.get('spark.driver.memory')}")
print(f"  spark.executor.memory: {spark.conf.get('spark.executor.memory')}")
print(f"  spark.executor.cores: {spark.conf.get('spark.executor.cores')}")
print(f"  spark.executor.instances: {spark.conf.get('spark.executor.instances')}")
print(f"  spark.yarn.executor.memoryOverhead: {spark.conf.get('spark.yarn.executor.memoryOverhead')}")
print(f"  spark.sql.shuffle.partitions: {spark.conf.get('spark.sql.shuffle.partitions')}")
print(f"  spark.default.parallelism: {spark.conf.get('spark.default.parallelism')}")
print(f"Spark UI available at: {spark.sparkContext.uiWebUrl}")

# --- Your PySpark Code Here ---
# Now you can proceed with your data loading and processing.
# Example:
# df = spark.read.parquet("hdfs:///user/your_user/your_large_data.parquet")
# print(f"Number of rows in DataFrame: {df.count()}")
# df.groupBy("some_column").agg({"another_column": "sum"}).show()

# Remember to stop the SparkSession when you are done
# spark.stop()

path_n='gs://open-targets-data-releases/25.06/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

#### FIRST MODULE: BUILDING COLOC 
newColoc=buildColocData(all_coloc,credible,index)

print("loaded newColoc")

### SECOND MODULE: PROCESS EVIDENCES TO AVOID EXCESS OF COLUMNS 
gwasComplete = gwasDataset(evidences,credible)

#### THIRD MODULE: INCLUDE COLOC IN THE 
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")


#### FOURTH MODULE BUILDING CHEMBL ASSOCIATIONS - HERE TAKE CARE WITH FILTERING STEP 
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    #toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase>=4", "clinical"),
            #('Phase>=3','clinical'),
            #('Phase>=2','clinical'),
            #('Phase>=1','clinical'),
            #("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOING WITH CHEMBL 

### FIFTH MODULE: BUILDING BENCHMARK OF THE DATASET TO EXTRACT EHE ANALYSIS 

resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))
benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


### drug mechanism of action
mecact_path = f"{path_n}drug_mechanism_of_action/" #  mechanismOfAction == old version
mecact = spark.read.parquet(mecact_path)

inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",  ## lost at 31.01.2025
    "STABILISER",
    # "EXOGENOUS GENE", ## added 24.06.2025
    # "EXOGENOUS PROTEIN" ## added 24.06.2025
]


actionType = (
        mecact.select(
            F.explode_outer("chemblIds").alias("drugId"),
            "actionType",
            "mechanismOfAction",
            "targets",
        )
        .select(
            F.explode_outer("targets").alias("targetId"),
            "drugId",
            "actionType",
            "mechanismOfAction",
        )
        .groupBy("targetId", "drugId")
        .agg(F.collect_set("actionType").alias("actionType2"))
    ).withColumn('nMoA', F.size(F.col('actionType2')))

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl")).join(actionType, on=['targetId','drugId'], how='left')
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase",'actionType2')
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

benchmark = (
    (
        resolvedColocFiltered.filter( ## .filter(F.col("betaGwas") < 0)
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

### create disdic dictionary
disdic={}

# --- Configuration for your iterative pivoting ---
group_by_columns = ['targetId', 'diseaseId','phase4Clean','phase3Clean','phase2Clean','phase1Clean','PhaseT']
columns_to_pivot_on = ['actionType2', 'biosampleName', 'projectId', 'rightStudyType','colocalisationMethod']
columns_to_aggregate = ['NoneCellYes', 'NdiagonalYes','hasGenetics'] # The values you want to collect in the pivoted cells
all_pivoted_dfs = {}

doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]

# --- Nested Loops for Dynamic Pivoting ---
for agg_col_name in columns_to_aggregate:
    for pivot_col_name in columns_to_pivot_on:
        print(f"\n--- Creating DataFrame for Aggregation: '{agg_col_name}' and Pivot: '{pivot_col_name}' ---")
        current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", "maxClinPhase", pivot_col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())
        test2=discrepancifier(benchmark.withColumn('actionType2', F.concat_ws(",", F.col("actionType2"))).withColumn('qtlColocDoE',F.first('colocDoE').over(current_col_pvalue_order_window)).groupBy(
        "targetId", "diseaseId", "maxClinPhase", "drugLoF_protect", "drugGoF_protect",pivot_col_name)
        .pivot("colocDoE")
        .count()
        .withColumnRenamed('drugLoF_protect', 'LoF_protect_ch')
        .withColumnRenamed('drugGoF_protect', 'GoF_protect_ch')).withColumn( ## .filter(F.col('coherencyDiagonal')!='noEvid')
    "arrayN", F.array(*[F.col(c) for c in doe_columns])
    ).withColumn(
        "maxDoE", F.array_max(F.col("arrayN"))
    ).withColumn("maxDoE_names", F.array(*conditions)
    ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
    ).withColumn(
        "NoneCellYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
        .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
            ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
    ).withColumn(
        "NdiagonalYes",
        F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
            F.lit("yes")
        ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
            (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
            F.lit("yes")
        ).otherwise(F.lit('no'))
    ).withColumn(
        "drugCoherency",
        F.when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("coherent")
        )
        .when(
            (F.col("LoF_protect_ch").isNotNull())
            & (F.col("GoF_protect_ch").isNotNull()), F.lit("dispar")
        )
        .otherwise(F.lit("other")),
    ).join(negativeTD, on=["targetId", "diseaseId"], how="left").withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    ).withColumn(
        "phase4Clean",
        F.when(
            (F.col("maxClinPhase") == 4) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase3Clean",
        F.when(
            (F.col("maxClinPhase") >= 3) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase2Clean",
        F.when(
            (F.col("maxClinPhase") >= 2) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "phase1Clean",
        F.when(
            (F.col("maxClinPhase") >= 1) & (F.col("PhaseT") == "no"), F.lit("yes")
        ).otherwise(F.lit("no")),
    ).withColumn(
        "hasGenetics",
        F.when(F.col("coherencyDiagonal") != "noEvid", F.lit("yes")).otherwise(F.lit("no")),
    )
        # 1. Get distinct values for the pivot column (essential for pivot())
        # This brings a small amount of data to the driver, but is necessary for the pivot schema.
        # distinct_pivot_values = [row[0] for row in test2.select(pivot_col_name).distinct().collect()]
        print(f"Distinct values for '{pivot_col_name}': {distinct_pivot_values}")

        # 2. Perform the groupBy, pivot, and aggregate operations
        # The .pivot() function requires the list of distinct values for better performance
        # and correct schema inference.
        pivoted_df = (
            test2.groupBy(*group_by_columns)
            .pivot(pivot_col_name) # Provide distinct values
            .agg(F.collect_set(F.col(agg_col_name))) # Collect all values into a set
            .fillna(0) # Fill cells that have no data with an empty list instead of null
        )
        # 3. Add items to dictionary to map the columns:
        # filter out None and 'null':
        filtered = [x for x in pivoted_df.columns if x is not None and x != 'null']
        # using list comprehension
        for item in filtered:
            disdic[item] = pivot_col_name 

        # 3. Add the 'data' literal column dynamically
        # This column indicates which aggregation column was used.
        #pivoted_df = pivoted_df.withColumn('data', F.lit(f'Drug_{agg_col_name}'))

        array_columns_to_convert = [
            field.name for field in pivoted_df.schema.fields
            if isinstance(field.dataType, ArrayType)
        ]
        print(f"Identified ArrayType columns for conversion: {array_columns_to_convert}")

        # 4. Apply the conversion logic to each identified array column
        df_after_conversion = pivoted_df # Start with the pivoted_df
        for col_to_convert in array_columns_to_convert:
            df_after_conversion = df_after_conversion.withColumn(
                col_to_convert,
                F.when(F.col(col_to_convert).isNull(), F.lit('no'))          # Handle NULLs (from pivot for no data)
                .when(F.size(F.col(col_to_convert)) == 0, F.lit('no'))       # Empty array -> 'no'
                .when(F.array_contains(F.col(col_to_convert), F.lit('yes')), F.lit('yes')) # Contains 'yes' -> 'yes'
                .when(F.array_contains(F.col(col_to_convert), F.lit('no')), F.lit('no'))   # Contains 'no' -> 'no'
                .otherwise(F.lit('no')) # Fallback for unexpected array content (e.g., ['other'], ['yes','no'])
            )

        # 4. Generate a unique name for this DataFrame and store it
        df_key = f"df_pivot_{agg_col_name.lower()}_by_{pivot_col_name.lower()}"
        all_pivoted_dfs[df_key] = df_after_conversion.withColumnRenamed( 'phase4Clean','Phase>=4'
        ).withColumnRenamed('phase3Clean','Phase>=3'
        ).withColumnRenamed('phase2Clean','Phase>=2'
        ).withColumnRenamed('phase1Clean','Phase>=1')


# --- Accessing your generated DataFrames ---
print("\n--- All generated DataFrames are stored in 'all_pivoted_dfs' dictionary ---")
print("Keys available:", all_pivoted_dfs.keys())

