In [None]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

print("loaded files")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

datasource_filter = [
    "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

chemblAssoc = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .count()
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

print("built chemblAssoc dataset")

benchmark = (
    (
        resolvedColoc.filter(F.col("betaGwas") < 0)
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="inner"
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    ).filter(
        F.col("name") != "COVID-19"
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


print("built benchmark dataset")

#### Analysis

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

# Initialize an empty dictionary
disdic = {}

# Iterate over the list of column names
for col_name in variables_study:
    # Extract distinct values for the column
    distinct_values = benchmark.select(col_name).distinct().collect()

    # Populate the dictionary
    for row in distinct_values:
        distinct_value = row[col_name]
        if distinct_value is not None:  # Exclude None (null) values
            disdic[distinct_value] = col_name

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    # toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ('Phase>=3','clinical'),
            ('Phase>=2','clinical'),
            ('Phase>=1','clinical'),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

rightTissue = spark.read.csv(
    'gs://ot-team/jroldan/analysis/20250402_rightTissueListMarchRelease.csv',
    header=True,
).drop("_c0")

print("loaded rightTissue dataset")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

bench2 = benchmark.join(
    rightTissue, on=["name", "bioSampleName"], how="left"
).withColumn(
    "rightTissue",
    F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
)

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")
# List of columns to analyze
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

# Dictionary to store results
pivoted_dfs = {}

# Loop over the columns
for col in variables_study:
    window_spec = Window.partitionBy("targetId", "diseaseId", col).orderBy(
        F.col("pValueExponent").asc()
    )
    print(f"Processing: {col}")

    pivoted_df = (
        bench2.withColumn(
            "rightTissue",
            F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        )
        .withColumn(
            "isRightTissueSignalAgreed",
            F.collect_set(
                F.when(F.col("rightTissue") == "yes", F.col("agree_lowestPval"))
            ).over(window_spec),
        )
        .withColumn(
            "isSignalFromRightTissue",
            F.first(
                F.when(
                    F.col("AgreeDrug") == F.col("agree_lowestPval"),
                    F.col("rightTissue"),
                ),
                ignorenulls=True,
            ).over(window_spec),
        )
        .groupBy(
            "targetId",
            "diseaseId",
            "maxClinPhase",
            "rightTissue",
            "isRightTissueSignalAgreed",
            "isSignalFromRightTissue",
        )
        .pivot(col)  # Pivot the column dynamically
        .agg(F.collect_set("agree_lowestPval"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(
                F.lit("no")
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.array_contains(F.col(x), "yes"), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}_only")
                    for x, value in [
                        (key, val) for key, val in disdic.items() if val == col
                    ]
                ]
            )
            + (
                [
                    F.when(
                        F.array_contains(F.col("isRightTissueSignalAgreed"), "yes"),
                        F.lit("yes"),
                    )
                    .otherwise(F.lit("no"))
                    .alias(f"{x}_isRightTissueSignalAgreed")
                    for x, value in [
                        (key, val) for key, val in disdic.items() if val == col
                    ]
                ]
            )
        )
    )  # Collect unique values

    # Store the DataFrame in the dictionary
    pivoted_dfs[col] = pivoted_df





''' 
result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

##### PROJECT ID ###### 
print('working with projectId')
pivoted_dfs['projectId'].persist()
unique_values = benchmark.select('projectId').distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['projectId'].drop(*unique_values).columns[12:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['projectId'].columns[-filter:])

# If needed, now process the rest
for row in rows:
    results = aggregations_original(
        pivoted_dfs['projectId'], "propagated", listado, *row, today_date
    )
    result_all.append(results)

pivoted_dfs['projectId'].unpersist()
print('df unpersisted')

##### BIOSAMPLE NAME ###### 
print('working with biosampleName')
pivoted_dfs['biosampleName'].persist()
unique_values = benchmark.select('biosampleName').distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['biosampleName'].drop(*unique_values).columns[12:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['biosampleName'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['biosampleName'], "propagated", listado, *row, today_date
    )
    result_all.append(results)

pivoted_dfs['biosampleName'].unpersist()
print('df unpersisted')

##### RIGHTSTUDYTYPE  ###### 
print('working with rightStudyType')
pivoted_dfs['rightStudyType'].persist()
unique_values = benchmark.select('rightStudyType').distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['rightStudyType'].drop(*unique_values).columns[12:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['rightStudyType'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['rightStudyType'], "propagated", listado, *row, today_date
    )
    result_all.append(results)
pivoted_dfs['rightStudyType'].unpersist()
print('df unpersisted')

##### COLOC DOE ######
print('working with colocDoE')
pivoted_dfs['colocDoE'].persist()
unique_values = benchmark.select('colocDoE').distinct().rdd.flatMap(lambda x: x).collect()
filter = len(pivoted_dfs['colocDoE'].drop(*unique_values).columns[12:])
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(pivoted_dfs['colocDoE'].columns[-filter:])

for row in rows:
    results = aggregations_original(
        pivoted_dfs['colocDoE'], "propagated", listado, *row, today_date
    )
    result_all.append(results)
pivoted_dfs['colocDoE'].unpersist()
print('df unpersisted')

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)
import re

# Define the list of patterns to search for
patterns = [
    "_only",
    #"_tissue",
    #"_isSignalFromRightTissue",
    "_isRightTissueSignalAgreed",
]
# Create a regex pattern to match any of the substrings
regex_pattern = "(" + "|".join(map(re.escape, patterns)) + ")"

# Convert list of lists to DataFrame
df = (
    spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
    .withColumn(
        "prefix",
        F.regexp_replace(
            F.col("comparison"), regex_pattern + ".*", ""
        ),  # Extract part before the pattern
    )
    .withColumn(
        "suffix",
        F.regexp_extract(
            F.col("comparison"), regex_pattern, 0
        ),  # Extract the pattern itself
    )
)

### annotate projectId, tissue, qtl type and doe type:

from pyspark.sql.functions import create_map
from itertools import chain

mapping_expr=create_map([F.lit(x) for x in chain(*disdic.items())])

df_annot=df.withColumn('annotation',mapping_expr.getItem(F.col('prefix')))

df_annot.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis_RightTissues.csv"
)

print("dataframe written \n Analysis finished")
'''

spark session created at 2025-05-16 20:31:59.245545
Analysis started on 2025-05-16 at  2025-05-16 20:31:59.245545


25/05/16 20:32:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

built chemblAssoc dataset
built benchmark dataset


                                                                                

load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study
Processing: projectId


                                                                                

Processing: biosampleName


                                                                                

Processing: rightStudyType


                                                                                

Processing: colocDoE


                                                                                

' \nresult = []\nresult_st = []\nresult_ci = []\narray2 = []\nlistado = []\nresult_all = []\ntoday_date = str(date.today())\nvariables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]\n\n##### PROJECT ID ###### \nprint(\'working with projectId\')\npivoted_dfs[\'projectId\'].persist()\nunique_values = benchmark.select(\'projectId\').distinct().rdd.flatMap(lambda x: x).collect()\nfilter = len(pivoted_dfs[\'projectId\'].drop(*unique_values).columns[12:])\nprint(\'There are \', filter, \'columns to analyse with phases\')\nrows = comparisons_df_iterative(pivoted_dfs[\'projectId\'].columns[-filter:])\n\n# If needed, now process the rest\nfor row in rows:\n    results = aggregations_original(\n        pivoted_dfs[\'projectId\'], "propagated", listado, *row, today_date\n    )\n    result_all.append(results)\n\npivoted_dfs[\'projectId\'].unpersist()\nprint(\'df unpersisted\')\n\n##### BIOSAMPLE NAME ###### \nprint(\'working with biosampleName\')\npivoted_dfs[\'biosampleName\

In [13]:
### dataframe Study X biosampleName

window_spec = Window.partitionBy("targetId", "diseaseId", 'projectId','biosampleName').orderBy(
    F.col("pValueExponent").asc()
)

pivoted_df = (
    bench2.withColumn(
        "rightTissue",
        F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "agree_lowestPval",
        F.first("AgreeDrug", ignorenulls=True).over(
            window_spec
        ),  ### ignore nulls aded 29.01.2025
        #### take directionality from lowest p value
    )
    .withColumn(
        "isRightTissueSignalAgreed",
        F.collect_set(
            F.when(F.col("rightTissue") == "yes", F.col("agree_lowestPval"))
        ).over(window_spec),
    )
    .withColumn(
        "isSignalFromRightTissue",
        F.first(
            F.when(
                F.col("AgreeDrug") == F.col("agree_lowestPval"),
                F.col("rightTissue"),
            ),
            ignorenulls=True,
        ).over(window_spec),
    )
    .groupBy(
        "targetId",
        "diseaseId",
        "maxClinPhase",
        #"rightTissue",
        #"isRightTissueSignalAgreed",
        #"isSignalFromRightTissue",
        'biosampleName'
    )
    .pivot('projectId')  # Pivot the column dynamically
    .agg(F.collect_set("agree_lowestPval"))
)    

# Store the DataFrame in the dictionary
pivoted_dfs['projectBiosample'] = pivoted_df

                                                                                

In [18]:
# 1. Create a struct of the columns you want to combine
df_combined = pivoted_dfs['projectBiosample'].withColumn("combined", F.struct("targetId", "diseaseId"))

# 2. Use dense_rank over a window partitioned by the combined struct
window_spec = Window.orderBy("combined")
df_with_rank = df_combined.withColumn("unique_id", F.dense_rank().over(window_spec))

# 3. Drop the intermediate 'combined' column (optional)
df_final = df_with_rank.drop("combined")

#pivoted_dfs['projectBiosample'].withColumn("unique_id", F.dense_rank().over(window_spec)).show()

In [24]:
array_columns = [field.name for field in df_final.schema if isinstance(field.dataType, ArrayType)]

In [25]:
array_columns

['Alasoo_2018',
 'Aygun_2021',
 'BLUEPRINT',
 'Bossini-Castillo_2019',
 'BrainSeq',
 'Braineac2',
 'CAP',
 'CEDAR',
 'CommonMind',
 'Cytoimmgen',
 'FUSION',
 'Fairfax_2014',
 'GENCORD',
 'GEUVADIS',
 'GTEx',
 'Gilchrist_2021',
 'HipSci',
 'Jerber_2021',
 'Kim-Hellmuth_2017',
 'Lepik_2017',
 'Nathan_2022',
 'Nedelec_2016',
 'OneK1K',
 'PISA',
 'Peng_2018',
 'Perez_2022',
 'PhLiPS',
 'Quach_2016',
 'ROSMAP',
 'Randolph_2021',
 'Schmiedel_2018',
 'Schwartzentruber_2018',
 'Sun_2018',
 'TwinsUK',
 'UKB_PPP_EUR',
 'Walker_2019',
 'Young_2019',
 'iPSCORE',
 'van_de_Bunt_2015']

In [27]:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

def transform_array_column_native_fixed(df, col_name):
    """
    Transforms the content of an array column using native Spark functions with consistent output type.
    """
    return df.withColumn(
        col_name + "_transformed",
        F.when(F.size(F.col(col_name)) == 0, F.lit("0"))
        .when(F.array_contains(F.col(col_name), "no") & F.array_contains(F.col(col_name), "yes"), F.lit("0"))
        .when(F.array_contains(F.col(col_name), "no"), F.lit("1"))
        .when(F.array_contains(F.col(col_name), "yes"), F.lit("2"))
        .when(F.size(F.col(col_name)) == 1, F.element_at(F.col(col_name), 1))
        .otherwise(F.array_join(F.col(col_name), ",")) # Or handle other arrays differently as strings
    )

# Identify array columns
array_columns = [field.name for field in df_final.schema if isinstance(field.dataType, ArrayType)]

# Transform each array column
for col_name in array_columns:
    df_final = transform_array_column_native_fixed(df_final, col_name)
    df_final = df_final.drop(col_name).withColumnRenamed(col_name + "_transformed", col_name)

df_final.printSchema()
df_final.show(truncate=False)

root
 |-- targetId: string (nullable = true)
 |-- diseaseId: string (nullable = true)
 |-- maxClinPhase: double (nullable = true)
 |-- biosampleName: string (nullable = true)
 |-- unique_id: integer (nullable = false)
 |-- Alasoo_2018: string (nullable = true)
 |-- Aygun_2021: string (nullable = true)
 |-- BLUEPRINT: string (nullable = true)
 |-- Bossini-Castillo_2019: string (nullable = true)
 |-- BrainSeq: string (nullable = true)
 |-- Braineac2: string (nullable = true)
 |-- CAP: string (nullable = true)
 |-- CEDAR: string (nullable = true)
 |-- CommonMind: string (nullable = true)
 |-- Cytoimmgen: string (nullable = true)
 |-- FUSION: string (nullable = true)
 |-- Fairfax_2014: string (nullable = true)
 |-- GENCORD: string (nullable = true)
 |-- GEUVADIS: string (nullable = true)
 |-- GTEx: string (nullable = true)
 |-- Gilchrist_2021: string (nullable = true)
 |-- HipSci: string (nullable = true)
 |-- Jerber_2021: string (nullable = true)
 |-- Kim-Hellmuth_2017: string (nullable =

25/05/16 21:16:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/16 21:16:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/16 21:16:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/16 21:16:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/16 21:16:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/16 21:16:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/05/16 2

+---------------+-------------+------------+------------------------------+---------+-----------+----------+---------+---------------------+--------+---------+---+-----+----------+----------+------+------------+-------+--------+----+--------------+------+-----------+-----------------+----------+-----------+------------+------+----+---------+----------+------+----------+------+-------------+--------------+---------------------+--------+-------+-----------+-----------+----------+-------+----------------+
|targetId       |diseaseId    |maxClinPhase|biosampleName                 |unique_id|Alasoo_2018|Aygun_2021|BLUEPRINT|Bossini-Castillo_2019|BrainSeq|Braineac2|CAP|CEDAR|CommonMind|Cytoimmgen|FUSION|Fairfax_2014|GENCORD|GEUVADIS|GTEx|Gilchrist_2021|HipSci|Jerber_2021|Kim-Hellmuth_2017|Lepik_2017|Nathan_2022|Nedelec_2016|OneK1K|PISA|Peng_2018|Perez_2022|PhLiPS|Quach_2016|ROSMAP|Randolph_2021|Schmiedel_2018|Schwartzentruber_2018|Sun_2018|TwinsUK|UKB_PPP_EUR|Walker_2019|Young_2019|iPSCORE|va

                                                                                

In [10]:
### mmatrix Study X biosampleName
pivoted_dfs['projectBiosample'].withColumn(
    "unique_batch_id",F.monotonically_increasing_id().over(Window.partitionBy('targetId','diseaseId'))).show()

AnalysisException: [UNSUPPORTED_EXPR_FOR_WINDOW] Expression "monotonically_increasing_id()" not supported within a window function.;
Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#22416, isRightTissueSignalAgreed#22515, isSignalFromRightTissue#22567, biosampleName#1060, Alasoo_2018#22887, Aygun_2021#22889, BLUEPRINT#22891, Bossini-Castillo_2019#22893, BrainSeq#22895, Braineac2#22897, CAP#22899, CEDAR#22901, CommonMind#22903, Cytoimmgen#22905, FUSION#22907, Fairfax_2014#22909, GENCORD#22911, GEUVADIS#22913, GTEx#22915, Gilchrist_2021#22917, HipSci#22919, ... 23 more fields]
+- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#22416, isRightTissueSignalAgreed#22515, isSignalFromRightTissue#22567, biosampleName#1060, Alasoo_2018#22887, Aygun_2021#22889, BLUEPRINT#22891, Bossini-Castillo_2019#22893, BrainSeq#22895, Braineac2#22897, CAP#22899, CEDAR#22901, CommonMind#22903, Cytoimmgen#22905, FUSION#22907, Fairfax_2014#22909, GENCORD#22911, GEUVADIS#22913, GTEx#22915, Gilchrist_2021#22917, HipSci#22919, ... 24 more fields]
   +- Window [monotonically_increasing_id() windowspecdefinition(targetId#1758, diseaseId#1911, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS unique_batch_id#27405L], [targetId#1758, diseaseId#1911]
      +- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#22416, isRightTissueSignalAgreed#22515, isSignalFromRightTissue#22567, biosampleName#1060, Alasoo_2018#22887, Aygun_2021#22889, BLUEPRINT#22891, Bossini-Castillo_2019#22893, BrainSeq#22895, Braineac2#22897, CAP#22899, CEDAR#22901, CommonMind#22903, Cytoimmgen#22905, FUSION#22907, Fairfax_2014#22909, GENCORD#22911, GEUVADIS#22913, GTEx#22915, Gilchrist_2021#22917, HipSci#22919, ... 22 more fields]
         +- Aggregate [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#22416, isRightTissueSignalAgreed#22515, isSignalFromRightTissue#22567, biosampleName#1060], [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#22416, isRightTissueSignalAgreed#22515, isSignalFromRightTissue#22567, biosampleName#1060, collect_set(if ((projectId#975 <=> cast(Alasoo_2018 as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Alasoo_2018#22887, collect_set(if ((projectId#975 <=> cast(Aygun_2021 as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Aygun_2021#22889, collect_set(if ((projectId#975 <=> cast(BLUEPRINT as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS BLUEPRINT#22891, collect_set(if ((projectId#975 <=> cast(Bossini-Castillo_2019 as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Bossini-Castillo_2019#22893, collect_set(if ((projectId#975 <=> cast(BrainSeq as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS BrainSeq#22895, collect_set(if ((projectId#975 <=> cast(Braineac2 as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Braineac2#22897, collect_set(if ((projectId#975 <=> cast(CAP as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS CAP#22899, collect_set(if ((projectId#975 <=> cast(CEDAR as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS CEDAR#22901, collect_set(if ((projectId#975 <=> cast(CommonMind as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS CommonMind#22903, collect_set(if ((projectId#975 <=> cast(Cytoimmgen as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Cytoimmgen#22905, collect_set(if ((projectId#975 <=> cast(FUSION as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS FUSION#22907, collect_set(if ((projectId#975 <=> cast(Fairfax_2014 as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Fairfax_2014#22909, collect_set(if ((projectId#975 <=> cast(GENCORD as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS GENCORD#22911, collect_set(if ((projectId#975 <=> cast(GEUVADIS as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS GEUVADIS#22913, collect_set(if ((projectId#975 <=> cast(GTEx as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS GTEx#22915, collect_set(if ((projectId#975 <=> cast(Gilchrist_2021 as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS Gilchrist_2021#22917, collect_set(if ((projectId#975 <=> cast(HipSci as string))) agree_lowestPval#22465 else cast(null as string), 0, 0) AS HipSci#22919, ... 22 more fields]
            +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 26 more fields]
               +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 28 more fields]
                  +- Window [first(_w0#22568, true) windowspecdefinition(targetId#1758, diseaseId#1911, projectId#975, biosampleName#1060, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS isSignalFromRightTissue#22567], [targetId#1758, diseaseId#1911, projectId#975, biosampleName#1060], [pValueExponent#1808 ASC NULLS FIRST]
                     +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 26 more fields]
                        +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 25 more fields]
                           +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 27 more fields]
                              +- Window [collect_set(_w0#22516, 0, 0) windowspecdefinition(targetId#1758, diseaseId#1911, projectId#975, biosampleName#1060, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS isRightTissueSignalAgreed#22515], [targetId#1758, diseaseId#1911, projectId#975, biosampleName#1060], [pValueExponent#1808 ASC NULLS FIRST]
                                 +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 25 more fields]
                                    +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 24 more fields]
                                       +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 25 more fields]
                                          +- Window [first(AgreeDrug#16463, true) windowspecdefinition(targetId#1758, diseaseId#1911, projectId#975, biosampleName#1060, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS agree_lowestPval#22465], [targetId#1758, diseaseId#1911, projectId#975, biosampleName#1060], [pValueExponent#1808 ASC NULLS FIRST]
                                             +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 23 more fields]
                                                +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 23 more fields]
                                                   +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 23 more fields]
                                                      +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 22 more fields]
                                                         +- Join LeftOuter, ((name#691 = name#17980) AND (biosampleName#1060 = biosampleName#17981))
                                                            :- Project [biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, ... 21 more fields]
                                                            :  +- Join LeftOuter, (biosampleId#1002 = biosampleId#1059)
                                                            :     :- Filter NOT (name#691 = COVID-19)
                                                            :     :  +- Project [targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 20 more fields]
                                                            :     :     +- Project [targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 19 more fields]
                                                            :     :        +- Join Inner, ((targetId#1758 = targetId#2033) AND (diseaseId#1911 = diseaseId#2115))
                                                            :     :           :- Filter (betaGwas#1737 < cast(0 as double))
                                                            :     :           :  +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                            :     :           :     +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 15 more fields]
                                                            :     :           :        +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                            :     :           :           +- Generate explode(concat(array(diseaseId#800), parents#694)), true, [diseaseId#1911]
                                                            :     :           :              +- Project [diseaseId#800, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                            :     :           :                 +- Join LeftOuter, (diseaseId#800 = diseaseId#1864)
                                                            :     :           :                    :- Project [leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002, ... 13 more fields]
                                                            :     :           :                    :  +- Join Inner, ((leftStudyLocusId#949 = leftStudyLocusId#1783) AND (targetId#1758 = targetId#718))
                                                            :     :           :                    :     :- Project [rightStudyId#1103, rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, geneId#974 AS targetId#1758, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002]
                                                            :     :           :                    :     :  +- Project [rightStudyId#1103, rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, geneId#974, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002]
                                                            :     :           :                    :     :     +- Join LeftOuter, (rightStudyId#1103 = rightStudyId#1158)
                                                            :     :           :                    :     :        :- Project [rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightStudyId#1103, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137]
                                                            :     :           :                    :     :        :  +- Join LeftOuter, (rightStudyLocusId#950 = rightStudyLocusId#1102)
                                                            :     :           :                    :     :        :     :- Project [leftStudyLocusId#949, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080]
                                                            :     :           :                    :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#949 = leftStudyLocusId#1077)
                                                            :     :           :                    :     :        :     :     :- Relation [leftStudyLocusId#949,rightStudyLocusId#950,chromosome#951,rightStudyType#952,numberColocalisingVariants#953L,h0#954,h1#955,h2#956,h3#957,h4#958,colocalisationMethod#959,betaRatioSignAverage#960] parquet
                                                            :     :           :                    :     :        :     :     +- Project [studyLocusId#895 AS leftStudyLocusId#1077, StudyId#896 AS leftStudyId#1078, variantId#897 AS leftVariantId#1079, studyType#920 AS credibleLeftStudyType#1080]
                                                            :     :           :                    :     :        :     :        +- Relation [studyLocusId#895,studyId#896,variantId#897,chromosome#898,position#899,region#900,beta#901,zScore#902,pValueMantissa#903,pValueExponent#904,effectAlleleFrequencyFromSource#905,standardError#906,subStudyDescription#907,qualityControls#908,finemappingMethod#909,credibleSetIndex#910,credibleSetlog10BF#911,purityMeanR2#912,purityMinR2#913,locusStart#914,locusEnd#915,sampleSize#916,ldSet#917,locus#918,... 3 more fields] parquet
                                                            :     :           :                    :     :        :     +- Project [studyLocusId#1111 AS rightStudyLocusId#1102, studyId#1112 AS rightStudyId#1103, variantId#1113 AS rightVariantId#1104, studyType#1136 AS credibleRightStudyType#1105, isTransQtl#1137]
                                                            :     :           :                    :     :        :        +- Relation [studyLocusId#1111,studyId#1112,variantId#1113,chromosome#1114,position#1115,region#1116,beta#1117,zScore#1118,pValueMantissa#1119,pValueExponent#1120,effectAlleleFrequencyFromSource#1121,standardError#1122,subStudyDescription#1123,qualityControls#1124,finemappingMethod#1125,credibleSetIndex#1126,credibleSetlog10BF#1127,purityMeanR2#1128,purityMinR2#1129,locusStart#1130,locusEnd#1131,sampleSize#1132,ldSet#1133,locus#1134,... 3 more fields] parquet
                                                            :     :           :                    :     :        +- Project [studyId#973 AS rightStudyId#1158, geneId#974, projectId#975, studyType#976 AS indexStudyType#1159, condition#998, biosampleId#1002]
                                                            :     :           :                    :     :           +- Relation [studyId#973,geneId#974,projectId#975,studyType#976,traitFromSource#977,traitFromSourceMappedIds#978,biosampleFromSourceId#979,pubmedId#980,publicationTitle#981,publicationFirstAuthor#982,publicationDate#983,publicationJournal#984,backgroundTraitFromSourceMappedIds#985,initialSampleSize#986,nCases#987,nControls#988,nSamples#989,cohorts#990,ldPopulationStructure#991,discoverySamples#992,replicationSamples#993,qualityControls#994,analysisFlags#995,summarystatsLocation#996,... 6 more fields] parquet
                                                            :     :           :                    :     +- Project [studyLocusId#798 AS leftStudyLocusId#1783, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1800, variantId#1801, betaGwas#1737, pValueExponent#1808]
                                                            :     :           :                    :        +- Project [studyLocusId#798, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1800, variantId#1801, betaGwas#1737, pValueExponent#1808]
                                                            :     :           :                    :           +- Join LeftOuter, (studyLocusId#798 = studyLocusId#1799)
                                                            :     :           :                    :              :- Project [datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, studyLocusId#798, diseaseId#800, id#801, score#802, sourceId#805]
                                                            :     :           :                    :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
                                                            :     :           :                    :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,beta#723,betaConfidenceIntervalLower#724,betaConfidenceIntervalUpper#725,biologicalModelAllelicComposition#726,biologicalModelGeneticBackground#727,biologicalModelId#728,biomarkerName#729,biomarkers#730,biosamplesFromSource#731,cellType#732,clinicalPhase#733,clinicalSignificances#734,clinicalStatus#735,cohortDescription#736,cohortId#737,cohortPhenotypes#738,cohortShortName#739,confidence#740,... 65 more fields] parquet
                                                            :     :           :                    :              +- Project [studyLocusId#1799, studyId#1800, variantId#1801, beta#1805 AS betaGwas#1737, pValueExponent#1808]
                                                            :     :           :                    :                 +- Relation [studyLocusId#1799,studyId#1800,variantId#1801,chromosome#1802,position#1803,region#1804,beta#1805,zScore#1806,pValueMantissa#1807,pValueExponent#1808,effectAlleleFrequencyFromSource#1809,standardError#1810,subStudyDescription#1811,qualityControls#1812,finemappingMethod#1813,credibleSetIndex#1814,credibleSetlog10BF#1815,purityMeanR2#1816,purityMinR2#1817,locusStart#1818,locusEnd#1819,sampleSize#1820,ldSet#1821,locus#1822,... 3 more fields] parquet
                                                            :     :           :                    +- Project [id#689 AS diseaseId#1864, name#691, parents#694, therapeuticAreas#700]
                                                            :     :           :                       +- Relation [id#689,code#690,name#691,description#692,dbXRefs#693,parents#694,synonyms#695,obsoleteTerms#696,obsoleteXRefs#697,children#698,ancestors#699,therapeuticAreas#700,descendants#701,ontology#702] parquet
                                                            :     :           +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, drugGoF_protect#11659L, LoF_protect#9839L AS drugLoF_protect#11665L]
                                                            :     :              +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L AS drugGoF_protect#11659L, LoF_protect#9839L]
                                                            :     :                 +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L]
                                                            :     :                    +- Filter (coherencyDiagonal#11633 = coherent)
                                                            :     :                       +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, GoF_risk#9853, LoF_risk#11467, coherencyDiagonal#11633, CASE WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnull(noEvaluable#9840L)) THEN noEvid WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnotnull(noEvaluable#9840L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11467) OR isnotnull(LoF_protect#9839L)) OR isnotnull(GoF_risk#9853)) OR isnotnull(GoF_protect#9838L)) THEN CASE WHEN (isnotnull(LoF_risk#11467) AND ((isnull(LoF_protect#9839L) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L))) THEN coherent WHEN (isnotnull(GoF_risk#9853) AND ((isnull(LoF_protect#9839L) AND isnull(LoF_risk#11467)) AND isnull(GoF_protect#9838L))) THEN coherent WHEN (isnotnull(LoF_protect#9839L) AND ((isnull(LoF_risk#11467) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L))) THEN coherent WHEN (isnotnull(GoF_protect#9838L) AND ((isnull(LoF_protect#9839L) AND isnull(GoF_risk#9853)) AND isnull(LoF_risk#11467))) THEN coherent ELSE dispar END END AS coherencyOneCell#11643]
                                                            :     :                          +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, GoF_risk#9853, LoF_risk#11467, CASE WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnull(noEvaluable#9840L)) THEN noEvid WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnotnull(noEvaluable#9840L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11467) OR isnotnull(LoF_protect#9839L)) OR isnotnull(GoF_risk#9853)) OR isnotnull(GoF_protect#9838L)) THEN CASE WHEN (isnotnull(GoF_risk#9853) AND isnotnull(LoF_risk#11467)) THEN dispar WHEN (isnotnull(LoF_protect#9839L) AND isnotnull(LoF_risk#11467)) THEN dispar WHEN (isnotnull(GoF_protect#9838L) AND isnotnull(GoF_risk#9853)) THEN dispar WHEN (isnotnull(GoF_protect#9838L) AND isnotnull(LoF_protect#9839L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#11633]
                                                            :     :                             +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, GoF_risk#9853, null AS LoF_risk#11467]
                                                            :     :                                +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, null AS GoF_risk#9853]
                                                            :     :                                   +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, __pivot_count(targetId) AS `count(targetId)`#9837[0] AS GoF_protect#9838L, __pivot_count(targetId) AS `count(targetId)`#9837[1] AS LoF_protect#9839L, __pivot_count(targetId) AS `count(targetId)`#9837[2] AS noEvaluable#9840L]
                                                            :     :                                      +- Aggregate [targetId#2033, diseaseId#2115, maxClinPhase#6142], [targetId#2033, diseaseId#2115, maxClinPhase#6142, pivotfirst(homogenized#4153, count(targetId)#9829L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#9837]
                                                            :     :                                         +- Aggregate [targetId#2033, diseaseId#2115, maxClinPhase#6142, homogenized#4153], [targetId#2033, diseaseId#2115, maxClinPhase#6142, homogenized#4153, count(targetId#2033) AS count(targetId)#9829L]
                                                            :     :                                            +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 80 more fields]
                                                            :     :                                               +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 81 more fields]
                                                            :     :                                                  +- Window [max(clinicalPhase#2048) windowspecdefinition(targetId#2033, diseaseId#2115, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#6142], [targetId#2033, diseaseId#2115]
                                                            :     :                                                     +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                                                            :     :                                                        +- Filter (datasourceId#2032 = chembl)
                                                            :     :                                                           +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                                                            :     :                                                              +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                                                            :     :                                                                 +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                                                            :     :                                                                    +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                                                            :     :                                                                       +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                                                            :     :                                                                          +- Window [collect_set(intogen_function#3736, 0, 0) windowspecdefinition(targetId#2033, diseaseId#2115, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#3842], [targetId#2033, diseaseId#2115]
                                                            :     :                                                                             +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 77 more fields]
                                                            :     :                                                                                +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 77 more fields]
                                                            :     :                                                                                   +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 76 more fields]
                                                            :     :                                                                                      +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 75 more fields]
                                                            :     :                                                                                         +- Join LeftOuter, ((drugId2#2820 = drugId#2066) AND (targetId2#2827 = targetId#2033))
                                                            :     :                                                                                            :- Join LeftOuter, (target_id#2870 = targetId#2033)
                                                            :     :                                                                                            :  :- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, concat_ws(,, clinicalSignificances#2049) AS clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                            :     :                                                                                            :  :  +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                            :     :                                                                                            :  :     +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, cast(beta#2038 as double) AS beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                            :     :                                                                                            :  :        +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2038, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                            :     :                                                                                            :  :           +- Filter datasourceId#2032 IN (ot_genetics_portal,gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                            :     :                                                                                            :  :              +- Relation [datasourceId#2032,targetId#2033,alleleOrigins#2034,allelicRequirements#2035,ancestry#2036,ancestryId#2037,beta#2038,betaConfidenceIntervalLower#2039,betaConfidenceIntervalUpper#2040,biologicalModelAllelicComposition#2041,biologicalModelGeneticBackground#2042,biologicalModelId#2043,biomarkerName#2044,biomarkers#2045,biosamplesFromSource#2046,cellType#2047,clinicalPhase#2048,clinicalSignificances#2049,clinicalStatus#2050,cohortDescription#2051,cohortId#2052,cohortPhenotypes#2053,cohortShortName#2054,confidence#2055,... 65 more fields] parquet
                                                            :     :                                                                                            :  +- Project [id#2747 AS target_id#2870, approvedSymbol#2748, description#2855, description_splited#2859, TSorOncogene#2864]
                                                            :     :                                                                                            :     +- Project [id#2747, approvedSymbol#2748, description#2855, description_splited#2859, CASE WHEN (RLIKE(description_splited#2859, ncogene) AND RLIKE(description_splited#2859, TSG)) THEN bivalent WHEN RLIKE(description_splited#2859, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2859, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2864]
                                                            :     :                                                                                            :        +- Project [id#2747, approvedSymbol#2748, description#2855, concat_ws(,, description#2855) AS description_splited#2859]
                                                            :     :                                                                                            :           +- Aggregate [id#2747, approvedSymbol#2748], [id#2747, approvedSymbol#2748, collect_set(description#2847, 0, 0) AS description#2855]
                                                            :     :                                                                                            :              +- Filter description#2847 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                            :     :                                                                                            :                 +- Project [id#2747, approvedSymbol#2748, col#2842.description AS description#2847]
                                                            :     :                                                                                            :                    +- Project [id#2747, approvedSymbol#2748, col#2842]
                                                            :     :                                                                                            :                       +- Generate explode(hallmarks#2757.attributes), true, [col#2842]
                                                            :     :                                                                                            :                          +- Relation [id#2747,approvedSymbol#2748,biotype#2749,transcriptIds#2750,canonicalTranscript#2751,canonicalExons#2752,genomicLocation#2753,alternativeGenes#2754,approvedName#2755,go#2756,hallmarks#2757,synonyms#2758,symbolSynonyms#2759,nameSynonyms#2760,functionDescriptions#2761,subcellularLocations#2762,targetClass#2763,obsoleteSymbols#2764,obsoleteNames#2765,constraint#2766,tep#2767,proteinIds#2768,dbXrefs#2769,chemicalProbes#2770,... 5 more fields] parquet
                                                            :     :                                                                                            +- Aggregate [targetId2#2827, drugId2#2820], [targetId2#2827, drugId2#2820, collect_set(actionType#2805, 0, 0) AS actionType#2837]
                                                            :     :                                                                                               +- Project [targetId2#2827, drugId2#2820, actionType#2805, mechanismOfAction#2806]
                                                            :     :                                                                                                  +- Generate explode(targets#2810), true, [targetId2#2827]
                                                            :     :                                                                                                     +- Project [drugId2#2820, actionType#2805, mechanismOfAction#2806, targets#2810]
                                                            :     :                                                                                                        +- Generate explode(chemblIds#2807), true, [drugId2#2820]
                                                            :     :                                                                                                           +- Relation [actionType#2805,mechanismOfAction#2806,chemblIds#2807,targetName#2808,targetType#2809,targets#2810,references#2811] parquet
                                                            :     +- Project [biosampleId#1059, biosampleName#1060]
                                                            :        +- Relation [biosampleId#1059,biosampleName#1060,description#1061,xrefs#1062,synonyms#1063,parents#1064,ancestors#1065,children#1066,descendants#1067] parquet
                                                            +- Project [name#17980, biosampleName#17981, rightTissue1#17982]
                                                               +- Relation [_c0#17979,name#17980,biosampleName#17981,rightTissue1#17982] csv


In [2]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

print("loaded files")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

datasource_filter = [
    "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

chemblAssoc = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .count()
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

print("built chemblAssoc dataset")

benchmark = (
    (
        resolvedColoc.filter(F.col("betaGwas") < 0)
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="inner"
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    ).filter(
        F.col("name") != "COVID-19"
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")


print("built benchmark dataset")

#### Analysis

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

# Initialize an empty dictionary
disdic = {}

# Iterate over the list of column names
for col_name in variables_study:
    # Extract distinct values for the column
    distinct_values = benchmark.select(col_name).distinct().collect()

    # Populate the dictionary
    for row in distinct_values:
        distinct_value = row[col_name]
        if distinct_value is not None:  # Exclude None (null) values
            disdic[distinct_value] = col_name

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    # toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    toAnalysis = [(col, "predictor") for col in elements]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ('Phase>=3','clinical'),
            ('Phase>=2','clinical'),
            ('Phase>=1','clinical'),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

rightTissue = spark.read.csv(
    'gs://ot-team/jroldan/analysis/jroldan_analysis_20250526_curatedRightTissue.csv',
    header=True,
).drop("_c0")

print("loaded rightTissue dataset")

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

bench2 = benchmark.join(
    rightTissue, on=["name", "bioSampleName"], how="left"
).withColumn(
    "rightTissue",
    F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
)

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")
# List of columns to analyze
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

# Dictionary to store results
pivoted_dfs = {}

# Loop over the columns
for col in variables_study:
    window_spec = Window.partitionBy("targetId", "diseaseId", col).orderBy(
        F.col("pValueExponent").asc()
    )
    print(f"Processing: {col}")

    pivoted_df = (
        bench2.withColumn(
            "rightTissue",
            F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        )
        .withColumn(
            "isRightTissueSignalAgreed",
            F.collect_set(
                F.when(F.col("rightTissue") == "yes", F.col("agree_lowestPval"))
            ).over(window_spec),
        )
        .withColumn(
            "isSignalFromRightTissue",
            F.first(
                F.when(
                    F.col("AgreeDrug") == F.col("agree_lowestPval"),
                    F.col("rightTissue"),
                ),
                ignorenulls=True,
            ).over(window_spec),
        )
        .groupBy(
            "targetId",
            "diseaseId",
            "maxClinPhase",
            "rightTissue",
            "isRightTissueSignalAgreed",
            "isSignalFromRightTissue",
        )
        .pivot(col)  # Pivot the column dynamically
        .agg(F.collect_set("agree_lowestPval"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(
                F.lit("no")
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        ))  # Collect unique values

    # Store the DataFrame in the dictionary
    pivoted_dfs[col] = pivoted_df

spark session created at 2025-05-27 06:51:58.577600
Analysis started on 2025-05-27 at  2025-05-27 06:51:58.577600


25/05/27 06:52:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

built chemblAssoc dataset
built benchmark dataset


                                                                                

load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study
Processing: projectId


                                                                                

Processing: biosampleName


                                                                                

Processing: rightStudyType


                                                                                

Processing: colocDoE


                                                                                

In [4]:
pivoted_dfs['projectId'].show()



+---------------+-------------+------------+-----------+-------------------------+-----------------------+-----------+----------+---------+---------------------+--------+---------+---+-----+----------+----------+------+------------+-------+--------+-----+--------------+------+-----------+-----------------+----------+-----------+------------+------+----+---------+----------+------+----------+------+-------------+--------------+---------------------+--------+-------+-----------+-----------+----------+-------+----------------+----------+------+------+--------+--------+--------+
|       targetId|    diseaseId|maxClinPhase|rightTissue|isRightTissueSignalAgreed|isSignalFromRightTissue|Alasoo_2018|Aygun_2021|BLUEPRINT|Bossini-Castillo_2019|BrainSeq|Braineac2|CAP|CEDAR|CommonMind|Cytoimmgen|FUSION|Fairfax_2014|GENCORD|GEUVADIS| GTEx|Gilchrist_2021|HipSci|Jerber_2021|Kim-Hellmuth_2017|Lepik_2017|Nathan_2022|Nedelec_2016|OneK1K|PISA|Peng_2018|Perez_2022|PhLiPS|Quach_2016|ROSMAP|Randolph_2021|Sch

                                                                                

In [None]:
disdic.items()

In [8]:
disdic.items()

dict_items([('Nedelec_2016', 'projectId'), ('BrainSeq', 'projectId'), ('GEUVADIS', 'projectId'), ('GTEx', 'projectId'), ('Schmiedel_2018', 'projectId'), ('Bossini-Castillo_2019', 'projectId'), ('CEDAR', 'projectId'), ('ROSMAP', 'projectId'), ('FUSION', 'projectId'), ('Cytoimmgen', 'projectId'), ('Quach_2016', 'projectId'), ('HipSci', 'projectId'), ('OneK1K', 'projectId'), ('TwinsUK', 'projectId'), ('iPSCORE', 'projectId'), ('CommonMind', 'projectId'), ('Sun_2018', 'projectId'), ('Alasoo_2018', 'projectId'), ('Randolph_2021', 'projectId'), ('GENCORD', 'projectId'), ('Lepik_2017', 'projectId'), ('Braineac2', 'projectId'), ('UKB_PPP_EUR', 'projectId'), ('Kim-Hellmuth_2017', 'projectId'), ('BLUEPRINT', 'projectId'), ('Perez_2022', 'projectId'), ('van_de_Bunt_2015', 'projectId'), ('Young_2019', 'projectId'), ('Jerber_2021', 'projectId'), ('PISA', 'projectId'), ('CAP', 'projectId'), ('Aygun_2021', 'projectId'), ('Walker_2019', 'projectId'), ('Schwartzentruber_2018', 'projectId'), ('Peng_2018

In [3]:
pivoted_dfs['projectId'].select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.array_contains(F.col(x), "yes"), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}_only")
                    for x, value in [
                        (key, val) for key, val in disdic.items() if val == col
                    ]
                ]
            )
            + (
                [
                    F.when(
                        F.array_contains(F.col("isRightTissueSignalAgreed"), "yes"),
                        F.lit("yes"),
                    )
                    .otherwise(F.lit("no"))
                    .alias(f"{x}_isRightTissueSignalAgreed")
                    for x, value in [
                        (key, val) for key, val in disdic.items() if val == col
                    ]
                ]
            )
        ).show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `GoF_protect` cannot be resolved. Did you mean one of the following? [`BrainSeq`, `FUSION`, `HipSci`, `Phase4`, `PhaseT`].;
'Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 31 more fields]
+- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 27 more fields]
   +- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 26 more fields]
      +- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 25 more fields]
         +- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 24 more fields]
            +- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 23 more fields]
               +- Project [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, Alasoo_2018#18575, Aygun_2021#18577, BLUEPRINT#18579, Bossini-Castillo_2019#18581, BrainSeq#18583, Braineac2#18585, CAP#18587, CEDAR#18589, CommonMind#18591, Cytoimmgen#18593, FUSION#18595, Fairfax_2014#18597, GENCORD#18599, GEUVADIS#18601, GTEx#18603, Gilchrist_2021#18605, HipSci#18607, Jerber_2021#18609, ... 22 more fields]
                  +- Join LeftOuter, ((targetId#1758 = targetId#18698) AND (diseaseId#1911 = diseaseId#18780))
                     :- Aggregate [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255], [targetId#1758, diseaseId#1911, maxClinPhase#6142, rightTissue#18104, isRightTissueSignalAgreed#18203, isSignalFromRightTissue#18255, collect_set(if ((projectId#975 <=> cast(Alasoo_2018 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Alasoo_2018#18575, collect_set(if ((projectId#975 <=> cast(Aygun_2021 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Aygun_2021#18577, collect_set(if ((projectId#975 <=> cast(BLUEPRINT as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS BLUEPRINT#18579, collect_set(if ((projectId#975 <=> cast(Bossini-Castillo_2019 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Bossini-Castillo_2019#18581, collect_set(if ((projectId#975 <=> cast(BrainSeq as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS BrainSeq#18583, collect_set(if ((projectId#975 <=> cast(Braineac2 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Braineac2#18585, collect_set(if ((projectId#975 <=> cast(CAP as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS CAP#18587, collect_set(if ((projectId#975 <=> cast(CEDAR as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS CEDAR#18589, collect_set(if ((projectId#975 <=> cast(CommonMind as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS CommonMind#18591, collect_set(if ((projectId#975 <=> cast(Cytoimmgen as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Cytoimmgen#18593, collect_set(if ((projectId#975 <=> cast(FUSION as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS FUSION#18595, collect_set(if ((projectId#975 <=> cast(Fairfax_2014 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Fairfax_2014#18597, collect_set(if ((projectId#975 <=> cast(GENCORD as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS GENCORD#18599, collect_set(if ((projectId#975 <=> cast(GEUVADIS as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS GEUVADIS#18601, collect_set(if ((projectId#975 <=> cast(GTEx as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS GTEx#18603, collect_set(if ((projectId#975 <=> cast(Gilchrist_2021 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Gilchrist_2021#18605, collect_set(if ((projectId#975 <=> cast(HipSci as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS HipSci#18607, collect_set(if ((projectId#975 <=> cast(Jerber_2021 as string))) agree_lowestPval#18153 else cast(null as string), 0, 0) AS Jerber_2021#18609, ... 21 more fields]
                     :  +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 26 more fields]
                     :     +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 28 more fields]
                     :        +- Window [first(_w0#18256, true) windowspecdefinition(targetId#1758, diseaseId#1911, projectId#975, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS isSignalFromRightTissue#18255], [targetId#1758, diseaseId#1911, projectId#975], [pValueExponent#1808 ASC NULLS FIRST]
                     :           +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 26 more fields]
                     :              +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 25 more fields]
                     :                 +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 27 more fields]
                     :                    +- Window [collect_set(_w0#18204, 0, 0) windowspecdefinition(targetId#1758, diseaseId#1911, projectId#975, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS isRightTissueSignalAgreed#18203], [targetId#1758, diseaseId#1911, projectId#975], [pValueExponent#1808 ASC NULLS FIRST]
                     :                       +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 25 more fields]
                     :                          +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 24 more fields]
                     :                             +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 25 more fields]
                     :                                +- Window [first(AgreeDrug#16463, true) windowspecdefinition(targetId#1758, diseaseId#1911, projectId#975, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS agree_lowestPval#18153], [targetId#1758, diseaseId#1911, projectId#975], [pValueExponent#1808 ASC NULLS FIRST]
                     :                                   +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 23 more fields]
                     :                                      +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 23 more fields]
                     :                                         +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 23 more fields]
                     :                                            +- Project [name#691, biosampleName#1060, biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, ... 22 more fields]
                     :                                               +- Join LeftOuter, ((name#691 = name#17979) AND (biosampleName#1060 = biosampleName#17980))
                     :                                                  :- Project [biosampleId#1002, targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, ... 21 more fields]
                     :                                                  :  +- Join LeftOuter, (biosampleId#1002 = biosampleId#1059)
                     :                                                  :     :- Filter NOT (name#691 = COVID-19)
                     :                                                  :     :  +- Project [targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 20 more fields]
                     :                                                  :     :     +- Project [targetId#1758, diseaseId#1911, leftStudyLocusId#949, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 19 more fields]
                     :                                                  :     :        +- Join Inner, ((targetId#1758 = targetId#2033) AND (diseaseId#1911 = diseaseId#2115))
                     :                                                  :     :           :- Filter (betaGwas#1737 < cast(0 as double))
                     :                                                  :     :           :  +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                     :                                                  :     :           :     +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 15 more fields]
                     :                                                  :     :           :        +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                     :                                                  :     :           :           +- Generate explode(concat(array(diseaseId#800), parents#694)), true, [diseaseId#1911]
                     :                                                  :     :           :              +- Project [diseaseId#800, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                     :                                                  :     :           :                 +- Join LeftOuter, (diseaseId#800 = diseaseId#1864)
                     :                                                  :     :           :                    :- Project [leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002, ... 13 more fields]
                     :                                                  :     :           :                    :  +- Join Inner, ((leftStudyLocusId#949 = leftStudyLocusId#1783) AND (targetId#1758 = targetId#718))
                     :                                                  :     :           :                    :     :- Project [rightStudyId#1103, rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, geneId#974 AS targetId#1758, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002]
                     :                                                  :     :           :                    :     :  +- Project [rightStudyId#1103, rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, geneId#974, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002]
                     :                                                  :     :           :                    :     :     +- Join LeftOuter, (rightStudyId#1103 = rightStudyId#1158)
                     :                                                  :     :           :                    :     :        :- Project [rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightStudyId#1103, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137]
                     :                                                  :     :           :                    :     :        :  +- Join LeftOuter, (rightStudyLocusId#950 = rightStudyLocusId#1102)
                     :                                                  :     :           :                    :     :        :     :- Project [leftStudyLocusId#949, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080]
                     :                                                  :     :           :                    :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#949 = leftStudyLocusId#1077)
                     :                                                  :     :           :                    :     :        :     :     :- Relation [leftStudyLocusId#949,rightStudyLocusId#950,chromosome#951,rightStudyType#952,numberColocalisingVariants#953L,h0#954,h1#955,h2#956,h3#957,h4#958,colocalisationMethod#959,betaRatioSignAverage#960] parquet
                     :                                                  :     :           :                    :     :        :     :     +- Project [studyLocusId#895 AS leftStudyLocusId#1077, StudyId#896 AS leftStudyId#1078, variantId#897 AS leftVariantId#1079, studyType#920 AS credibleLeftStudyType#1080]
                     :                                                  :     :           :                    :     :        :     :        +- Relation [studyLocusId#895,studyId#896,variantId#897,chromosome#898,position#899,region#900,beta#901,zScore#902,pValueMantissa#903,pValueExponent#904,effectAlleleFrequencyFromSource#905,standardError#906,subStudyDescription#907,qualityControls#908,finemappingMethod#909,credibleSetIndex#910,credibleSetlog10BF#911,purityMeanR2#912,purityMinR2#913,locusStart#914,locusEnd#915,sampleSize#916,ldSet#917,locus#918,... 3 more fields] parquet
                     :                                                  :     :           :                    :     :        :     +- Project [studyLocusId#1111 AS rightStudyLocusId#1102, studyId#1112 AS rightStudyId#1103, variantId#1113 AS rightVariantId#1104, studyType#1136 AS credibleRightStudyType#1105, isTransQtl#1137]
                     :                                                  :     :           :                    :     :        :        +- Relation [studyLocusId#1111,studyId#1112,variantId#1113,chromosome#1114,position#1115,region#1116,beta#1117,zScore#1118,pValueMantissa#1119,pValueExponent#1120,effectAlleleFrequencyFromSource#1121,standardError#1122,subStudyDescription#1123,qualityControls#1124,finemappingMethod#1125,credibleSetIndex#1126,credibleSetlog10BF#1127,purityMeanR2#1128,purityMinR2#1129,locusStart#1130,locusEnd#1131,sampleSize#1132,ldSet#1133,locus#1134,... 3 more fields] parquet
                     :                                                  :     :           :                    :     :        +- Project [studyId#973 AS rightStudyId#1158, geneId#974, projectId#975, studyType#976 AS indexStudyType#1159, condition#998, biosampleId#1002]
                     :                                                  :     :           :                    :     :           +- Relation [studyId#973,geneId#974,projectId#975,studyType#976,traitFromSource#977,traitFromSourceMappedIds#978,biosampleFromSourceId#979,pubmedId#980,publicationTitle#981,publicationFirstAuthor#982,publicationDate#983,publicationJournal#984,backgroundTraitFromSourceMappedIds#985,initialSampleSize#986,nCases#987,nControls#988,nSamples#989,cohorts#990,ldPopulationStructure#991,discoverySamples#992,replicationSamples#993,qualityControls#994,analysisFlags#995,summarystatsLocation#996,... 6 more fields] parquet
                     :                                                  :     :           :                    :     +- Project [studyLocusId#798 AS leftStudyLocusId#1783, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1800, variantId#1801, betaGwas#1737, pValueExponent#1808]
                     :                                                  :     :           :                    :        +- Project [studyLocusId#798, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1800, variantId#1801, betaGwas#1737, pValueExponent#1808]
                     :                                                  :     :           :                    :           +- Join LeftOuter, (studyLocusId#798 = studyLocusId#1799)
                     :                                                  :     :           :                    :              :- Project [datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, studyLocusId#798, diseaseId#800, id#801, score#802, sourceId#805]
                     :                                                  :     :           :                    :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
                     :                                                  :     :           :                    :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,beta#723,betaConfidenceIntervalLower#724,betaConfidenceIntervalUpper#725,biologicalModelAllelicComposition#726,biologicalModelGeneticBackground#727,biologicalModelId#728,biomarkerName#729,biomarkers#730,biosamplesFromSource#731,cellType#732,clinicalPhase#733,clinicalSignificances#734,clinicalStatus#735,cohortDescription#736,cohortId#737,cohortPhenotypes#738,cohortShortName#739,confidence#740,... 65 more fields] parquet
                     :                                                  :     :           :                    :              +- Project [studyLocusId#1799, studyId#1800, variantId#1801, beta#1805 AS betaGwas#1737, pValueExponent#1808]
                     :                                                  :     :           :                    :                 +- Relation [studyLocusId#1799,studyId#1800,variantId#1801,chromosome#1802,position#1803,region#1804,beta#1805,zScore#1806,pValueMantissa#1807,pValueExponent#1808,effectAlleleFrequencyFromSource#1809,standardError#1810,subStudyDescription#1811,qualityControls#1812,finemappingMethod#1813,credibleSetIndex#1814,credibleSetlog10BF#1815,purityMeanR2#1816,purityMinR2#1817,locusStart#1818,locusEnd#1819,sampleSize#1820,ldSet#1821,locus#1822,... 3 more fields] parquet
                     :                                                  :     :           :                    +- Project [id#689 AS diseaseId#1864, name#691, parents#694, therapeuticAreas#700]
                     :                                                  :     :           :                       +- Relation [id#689,code#690,name#691,description#692,dbXRefs#693,parents#694,synonyms#695,obsoleteTerms#696,obsoleteXRefs#697,children#698,ancestors#699,therapeuticAreas#700,descendants#701,ontology#702] parquet
                     :                                                  :     :           +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, drugGoF_protect#11659L, LoF_protect#9839L AS drugLoF_protect#11665L]
                     :                                                  :     :              +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L AS drugGoF_protect#11659L, LoF_protect#9839L]
                     :                                                  :     :                 +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L]
                     :                                                  :     :                    +- Filter (coherencyDiagonal#11633 = coherent)
                     :                                                  :     :                       +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, GoF_risk#9853, LoF_risk#11467, coherencyDiagonal#11633, CASE WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnull(noEvaluable#9840L)) THEN noEvid WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnotnull(noEvaluable#9840L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11467) OR isnotnull(LoF_protect#9839L)) OR isnotnull(GoF_risk#9853)) OR isnotnull(GoF_protect#9838L)) THEN CASE WHEN (isnotnull(LoF_risk#11467) AND ((isnull(LoF_protect#9839L) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L))) THEN coherent WHEN (isnotnull(GoF_risk#9853) AND ((isnull(LoF_protect#9839L) AND isnull(LoF_risk#11467)) AND isnull(GoF_protect#9838L))) THEN coherent WHEN (isnotnull(LoF_protect#9839L) AND ((isnull(LoF_risk#11467) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L))) THEN coherent WHEN (isnotnull(GoF_protect#9838L) AND ((isnull(LoF_protect#9839L) AND isnull(GoF_risk#9853)) AND isnull(LoF_risk#11467))) THEN coherent ELSE dispar END END AS coherencyOneCell#11643]
                     :                                                  :     :                          +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, GoF_risk#9853, LoF_risk#11467, CASE WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnull(noEvaluable#9840L)) THEN noEvid WHEN ((((isnull(LoF_risk#11467) AND isnull(LoF_protect#9839L)) AND isnull(GoF_risk#9853)) AND isnull(GoF_protect#9838L)) AND isnotnull(noEvaluable#9840L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11467) OR isnotnull(LoF_protect#9839L)) OR isnotnull(GoF_risk#9853)) OR isnotnull(GoF_protect#9838L)) THEN CASE WHEN (isnotnull(GoF_risk#9853) AND isnotnull(LoF_risk#11467)) THEN dispar WHEN (isnotnull(LoF_protect#9839L) AND isnotnull(LoF_risk#11467)) THEN dispar WHEN (isnotnull(GoF_protect#9838L) AND isnotnull(GoF_risk#9853)) THEN dispar WHEN (isnotnull(GoF_protect#9838L) AND isnotnull(LoF_protect#9839L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#11633]
                     :                                                  :     :                             +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, GoF_risk#9853, null AS LoF_risk#11467]
                     :                                                  :     :                                +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, GoF_protect#9838L, LoF_protect#9839L, noEvaluable#9840L, null AS GoF_risk#9853]
                     :                                                  :     :                                   +- Project [targetId#2033, diseaseId#2115, maxClinPhase#6142, __pivot_count(targetId) AS `count(targetId)`#9837[0] AS GoF_protect#9838L, __pivot_count(targetId) AS `count(targetId)`#9837[1] AS LoF_protect#9839L, __pivot_count(targetId) AS `count(targetId)`#9837[2] AS noEvaluable#9840L]
                     :                                                  :     :                                      +- Aggregate [targetId#2033, diseaseId#2115, maxClinPhase#6142], [targetId#2033, diseaseId#2115, maxClinPhase#6142, pivotfirst(homogenized#4153, count(targetId)#9829L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#9837]
                     :                                                  :     :                                         +- Aggregate [targetId#2033, diseaseId#2115, maxClinPhase#6142, homogenized#4153], [targetId#2033, diseaseId#2115, maxClinPhase#6142, homogenized#4153, count(targetId#2033) AS count(targetId)#9829L]
                     :                                                  :     :                                            +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 80 more fields]
                     :                                                  :     :                                               +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 81 more fields]
                     :                                                  :     :                                                  +- Window [max(clinicalPhase#2048) windowspecdefinition(targetId#2033, diseaseId#2115, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#6142], [targetId#2033, diseaseId#2115]
                     :                                                  :     :                                                     +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                     :                                                  :     :                                                        +- Filter (datasourceId#2032 = chembl)
                     :                                                  :     :                                                           +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                     :                                                  :     :                                                              +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                     :                                                  :     :                                                                 +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                     :                                                  :     :                                                                    +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                     :                                                  :     :                                                                       +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                     :                                                  :     :                                                                          +- Window [collect_set(intogen_function#3736, 0, 0) windowspecdefinition(targetId#2033, diseaseId#2115, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#3842], [targetId#2033, diseaseId#2115]
                     :                                                  :     :                                                                             +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 77 more fields]
                     :                                                  :     :                                                                                +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 77 more fields]
                     :                                                  :     :                                                                                   +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 76 more fields]
                     :                                                  :     :                                                                                      +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 75 more fields]
                     :                                                  :     :                                                                                         +- Join LeftOuter, ((drugId2#2820 = drugId#2066) AND (targetId2#2827 = targetId#2033))
                     :                                                  :     :                                                                                            :- Join LeftOuter, (target_id#2870 = targetId#2033)
                     :                                                  :     :                                                                                            :  :- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, concat_ws(,, clinicalSignificances#2049) AS clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                     :                                                  :     :                                                                                            :  :  +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                     :                                                  :     :                                                                                            :  :     +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, cast(beta#2038 as double) AS beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                     :                                                  :     :                                                                                            :  :        +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2038, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                     :                                                  :     :                                                                                            :  :           +- Filter datasourceId#2032 IN (ot_genetics_portal,gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                     :                                                  :     :                                                                                            :  :              +- Relation [datasourceId#2032,targetId#2033,alleleOrigins#2034,allelicRequirements#2035,ancestry#2036,ancestryId#2037,beta#2038,betaConfidenceIntervalLower#2039,betaConfidenceIntervalUpper#2040,biologicalModelAllelicComposition#2041,biologicalModelGeneticBackground#2042,biologicalModelId#2043,biomarkerName#2044,biomarkers#2045,biosamplesFromSource#2046,cellType#2047,clinicalPhase#2048,clinicalSignificances#2049,clinicalStatus#2050,cohortDescription#2051,cohortId#2052,cohortPhenotypes#2053,cohortShortName#2054,confidence#2055,... 65 more fields] parquet
                     :                                                  :     :                                                                                            :  +- Project [id#2747 AS target_id#2870, approvedSymbol#2748, description#2855, description_splited#2859, TSorOncogene#2864]
                     :                                                  :     :                                                                                            :     +- Project [id#2747, approvedSymbol#2748, description#2855, description_splited#2859, CASE WHEN (RLIKE(description_splited#2859, ncogene) AND RLIKE(description_splited#2859, TSG)) THEN bivalent WHEN RLIKE(description_splited#2859, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2859, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2864]
                     :                                                  :     :                                                                                            :        +- Project [id#2747, approvedSymbol#2748, description#2855, concat_ws(,, description#2855) AS description_splited#2859]
                     :                                                  :     :                                                                                            :           +- Aggregate [id#2747, approvedSymbol#2748], [id#2747, approvedSymbol#2748, collect_set(description#2847, 0, 0) AS description#2855]
                     :                                                  :     :                                                                                            :              +- Filter description#2847 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                     :                                                  :     :                                                                                            :                 +- Project [id#2747, approvedSymbol#2748, col#2842.description AS description#2847]
                     :                                                  :     :                                                                                            :                    +- Project [id#2747, approvedSymbol#2748, col#2842]
                     :                                                  :     :                                                                                            :                       +- Generate explode(hallmarks#2757.attributes), true, [col#2842]
                     :                                                  :     :                                                                                            :                          +- Relation [id#2747,approvedSymbol#2748,biotype#2749,transcriptIds#2750,canonicalTranscript#2751,canonicalExons#2752,genomicLocation#2753,alternativeGenes#2754,approvedName#2755,go#2756,hallmarks#2757,synonyms#2758,symbolSynonyms#2759,nameSynonyms#2760,functionDescriptions#2761,subcellularLocations#2762,targetClass#2763,obsoleteSymbols#2764,obsoleteNames#2765,constraint#2766,tep#2767,proteinIds#2768,dbXrefs#2769,chemicalProbes#2770,... 5 more fields] parquet
                     :                                                  :     :                                                                                            +- Aggregate [targetId2#2827, drugId2#2820], [targetId2#2827, drugId2#2820, collect_set(actionType#2805, 0, 0) AS actionType#2837]
                     :                                                  :     :                                                                                               +- Project [targetId2#2827, drugId2#2820, actionType#2805, mechanismOfAction#2806]
                     :                                                  :     :                                                                                                  +- Generate explode(targets#2810), true, [targetId2#2827]
                     :                                                  :     :                                                                                                     +- Project [drugId2#2820, actionType#2805, mechanismOfAction#2806, targets#2810]
                     :                                                  :     :                                                                                                        +- Generate explode(chemblIds#2807), true, [drugId2#2820]
                     :                                                  :     :                                                                                                           +- Relation [actionType#2805,mechanismOfAction#2806,chemblIds#2807,targetName#2808,targetType#2809,targets#2810,references#2811] parquet
                     :                                                  :     +- Project [biosampleId#1059, biosampleName#1060]
                     :                                                  :        +- Relation [biosampleId#1059,biosampleName#1060,description#1061,xrefs#1062,synonyms#1063,parents#1064,ancestors#1065,children#1066,descendants#1067] parquet
                     :                                                  +- Relation [name#17979,biosampleName#17980,rightTissue1#17981] csv
                     +- Project [targetId#18698, diseaseId#18780, stopReason#18001]
                        +- Project [targetId#18698, diseaseId#18780, count#17997L, Negative AS stopReason#18001]
                           +- Aggregate [targetId#18698, diseaseId#18780], [targetId#18698, diseaseId#18780, count(1) AS count#17997L]
                              +- Filter array_contains(studyStopReasonCategories#18762, Negative)
                                 +- Project [targetId#18698, diseaseId#18780, studyStopReason#18761, studyStopReasonCategories#18762]
                                    +- Filter (datasourceId#18697 = chembl)
                                       +- Project [datasourceId#18697, targetId#18698, alleleOrigins#18699, allelicRequirements#18700, ancestry#18701, ancestryId#18702, beta#18703, betaConfidenceIntervalLower#18704, betaConfidenceIntervalUpper#18705, biologicalModelAllelicComposition#18706, biologicalModelGeneticBackground#18707, biologicalModelId#18708, biomarkerName#18709, biomarkers#18710, biosamplesFromSource#18711, cellType#18712, clinicalPhase#18713, clinicalSignificances#18714, clinicalStatus#18715, cohortDescription#18716, cohortId#18717, cohortPhenotypes#18718, cohortShortName#18719, confidence#18720, ... 66 more fields]
                                          +- Filter datasourceId#18697 IN (ot_genetics_portal,gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                             +- Relation [datasourceId#18697,targetId#18698,alleleOrigins#18699,allelicRequirements#18700,ancestry#18701,ancestryId#18702,beta#18703,betaConfidenceIntervalLower#18704,betaConfidenceIntervalUpper#18705,biologicalModelAllelicComposition#18706,biologicalModelGeneticBackground#18707,biologicalModelId#18708,biomarkerName#18709,biomarkers#18710,biosamplesFromSource#18711,cellType#18712,clinicalPhase#18713,clinicalSignificances#18714,clinicalStatus#18715,cohortDescription#18716,cohortId#18717,cohortPhenotypes#18718,cohortShortName#18719,confidence#18720,... 65 more fields] parquet


In [None]:
bench2

In [19]:
####
### 
value='projectId'
window_spec = Window.partitionBy("targetId", "diseaseId",'projectId').orderBy(
        F.col("pValueExponent").asc()
    )

bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        ).groupBy('targetId','diseaseId','maxClinPhase').pivot('projectId').agg(F.collect_set('agree_lowestPval')).select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.array_contains(F.col(x), "yes"), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}_only")
                    for x,value  in [
                        (key, val) for key, val in disdic.items() if val == value
                    ]
                ]
            )).withColumn('Phase4', F.when(F.col('maxClinPhase')==4, F.lit('yes')).otherwise(F.lit('no'))).show()
                          
##).groupBy('GTEx_only').pivot('Phase4').count().show()



+---------------+-------------+------------+-----------+----------+---------+---------------------+--------+---------+---+-----+----------+----------+------+------------+-------+--------+-----+--------------+------+-----------+-----------------+----------+-----------+------------+------+----+---------+----------+------+----------+------+-------------+--------------+---------------------+--------+-------+-----------+-----------+----------+-------+----------------+-----------------+-------------+-------------+---------+-------------------+--------------------------+----------+-----------+-----------+---------------+---------------+-----------+-----------+------------+------------+---------------+-------------+----------------+------------------+------------+---------------+--------------+----------------+----------------------+--------------+---------------+---------------------+---------------+----------------+---------+--------+---------------+----------------+-------------------------

                                                                                

In [5]:
(bench2.withColumn(
            "rightTissue",
            F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        )
        .withColumn(
            "isRightTissueSignalAgreed",
            F.collect_set(
                F.when(F.col("rightTissue") == "yes", F.col("agree_lowestPval"))
            ).over(window_spec),
        )
        .withColumn(
            "isSignalFromRightTissue",
            F.first(
                F.when(
                    F.col("AgreeDrug") == F.col("agree_lowestPval"),
                    F.col("rightTissue"),
                ).otherwise(F.lit('no')),
                ignorenulls=True,
            ).over(window_spec),
        ).show())



+--------------------+--------------------+--------------+---------------+-------------+--------------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------+---------------------+--------------------+----------------------+----------+---------+--------------+---------+------------------+-------------------+-------------------------+-------------------+------------------+--------------------+-------------------+------------------+--------------------+---------------+--------------------+--------------+--------------------+-----------+------------+---------------+---------------+---------+------------+-----------+----------------+-------------------------+-----------------------+
|                name|       biosampleName|   biosampleId|       targetId|    dise

                                                                                

In [15]:
window_spec

<pyspark.sql.window.WindowSpec at 0x7fc37dc576d0>

In [None]:
bench2.select('')

In [None]:
bench2.groupBy('targetId','diseaseId').pivot('biosampleName').agg(F.collect_set(''))

                                                                                

+--------------------+--------------------+--------------+---------------+-------------+--------------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+---------------+---------------------+----------------+----------------------+----------+---------+--------------+---------+------------------+-------------------+-------------------------+------------------+------------------+--------------------+------------------+------------------+--------------------+---------------+--------------------+--------------+--------------------+-----------+------------+---------------+---------------+---------+------------+-----------+
|                name|       biosampleName|   biosampleId|       targetId|    diseaseId|    leftStudyLocusId|        rightStudyId|   rightStudyLocusId|chromo

In [22]:
window_spec = Window.partitionBy("targetId", "diseaseId",'biosampleName').orderBy(
        F.col("pValueExponent").asc()
    )

bench2.withColumn('agree_tissue', F.when((F.col('rightTissue1')=='yes') & (F.col('AgreeDrug')=='yes'), F.lit('yes')).otherwise(F.lit('no'))).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).withColumn(
            "agree_lowestPval",
            F.first("agree_tissue", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        ).select('targetId','diseaseId','agree_lowestPval','Phase4').distinct().groupBy('agree_lowestPval').pivot('Phase4').count().show()



+----------------+---+---+
|agree_lowestPval| no|yes|
+----------------+---+---+
|              no| 96| 55|
|             yes| 48| 34|
+----------------+---+---+



                                                                                

In [23]:
bench2.show()

                                                                                ]

+--------------------+--------------------+--------------+---------------+-------------+--------------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+---------------+---------------------+----------------+----------------------+----------+---------+--------------+---------+------------------+-------------------+-------------------------+------------------+------------------+--------------------+------------------+------------------+--------------------+---------------+--------------------+--------------+--------------------+-----------+------------+---------------+---------------+---------+------------+-----------+
|                name|       biosampleName|   biosampleId|       targetId|    diseaseId|    leftStudyLocusId|        rightStudyId|   rightStudyLocusId|chromo

In [25]:
bench2.select('targetId','diseaseId').distinct().count()

                                                                                

172

In [26]:
bench2.show()



+--------------------+--------------------+--------------+---------------+-------------+--------------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+---------------+---------------------+----------------+----------------------+----------+---------+--------------+---------+------------------+-------------------+-------------------------+------------------+------------------+--------------------+------------------+------------------+--------------------+---------------+--------------------+--------------+--------------------+-----------+------------+---------------+---------------+---------+------------+-----------+
|                name|       biosampleName|   biosampleId|       targetId|    diseaseId|    leftStudyLocusId|        rightStudyId|   rightStudyLocusId|chromo

                                                                                

In [30]:
window_spec = Window.partitionBy("targetId", "diseaseId",'biosampleName','rightStudyType').orderBy(
        F.col("pValueExponent").asc()
    )
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('RightTissue_Agree', 'rightStudyType').pivot('Phase4').count().sort(F.col('rightStudyType').desc(),F.col('RightTissue_Agree').desc()).show(20)

                                                                                

+-----------------+--------------+---+----+
|RightTissue_Agree|rightStudyType| no| yes|
+-----------------+--------------+---+----+
|              yes|         tuqtl|201| 174|
|               no|         tuqtl|727| 860|
|              yes|          sqtl|  4|  12|
|               no|          sqtl|218| 237|
|              yes|        sceqtl|  9|  35|
|               no|        sceqtl| 37|  44|
|              yes|          pqtl| 56|  45|
|               no|          pqtl| 12|  66|
|              yes|          eqtl|157| 151|
|               no|          eqtl|887|1283|
+-----------------+--------------+---+----+



In [39]:
window_spec = Window.partitionBy("targetId", "diseaseId",'colocDoE').orderBy(
        F.col("pValueExponent").asc()
    )
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ), 
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('targetId', 'diseaseId','Phase4').pivot('colocDoE').agg(F.collect_set('agree_lowestPval')).show()



+---------------+-------------+------+-----------+-----------+
|       targetId|    diseaseId|Phase4|GoF_protect|LoF_protect|
+---------------+-------------+------+-----------+-----------+
|ENSG00000006071|  EFO_0000400|   yes|       [no]|      [yes]|
|ENSG00000006071|  EFO_0000537|    no|       [no]|      [yes]|
|ENSG00000006071|MONDO_0005148|   yes|       [no]|      [yes]|
|ENSG00000010310|  EFO_0000400|   yes|      [yes]|       [no]|
|ENSG00000010310|MONDO_0005148|   yes|      [yes]|       [no]|
|ENSG00000064989|  EFO_0001073|    no|         []|       [no]|
|ENSG00000065989|  EFO_0000540|   yes|         []|      [yes]|
|ENSG00000065989|  EFO_0000676|   yes|         []|      [yes]|
|ENSG00000065989|MONDO_0002406|    no|         []|      [yes]|
|ENSG00000066468|MONDO_0008315|    no|       [no]|         []|
|ENSG00000068078|  EFO_0000313|    no|       [no]|      [yes]|
|ENSG00000068078|MONDO_0001187|    no|       [no]|      [yes]|
|ENSG00000068078|MONDO_0004986|    no|       [no]|     

                                                                                

In [37]:
window_spec = Window.partitionBy("targetId", "diseaseId",'colocDoE').orderBy(
        F.col("pValueExponent").asc()
    )
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ), 
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('targetId', 'diseaseId','Phase4').pivot('colocDoE').agg(F.collect_set('agree_lowestPval')).groupBy('GoF_protect').pivot('Phase4').count().show()



+-----------+---+---+
|GoF_protect| no|yes|
+-----------+---+---+
|         []| 34| 21|
|       [no]| 62| 39|
|      [yes]| 10|  6|
+-----------+---+---+



                                                                                

In [38]:
window_spec = Window.partitionBy("targetId", "diseaseId",'colocDoE').orderBy(
        F.col("pValueExponent").asc()
    )
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ), 
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('targetId', 'diseaseId','Phase4').pivot('colocDoE').agg(F.collect_set('agree_lowestPval')).groupBy('LoF_protect').pivot('Phase4').count().show()



+-----------+---+---+
|LoF_protect| no|yes|
+-----------+---+---+
|         []| 24|  9|
|       [no]|  8|  8|
|      [yes]| 74| 49|
+-----------+---+---+



                                                                                

In [None]:
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
        F.col("pValueExponent").asc()
    )
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('agree_lowestPval', 'colocDoE').pivot('Phase4').count().sort(F.col('colocDoE').desc(),F.col('agree_lowestPval').desc()).show(20)



+----------------+-----------+----+----+
|agree_lowestPval|   colocDoE|  no| yes|
+----------------+-----------+----+----+
|             yes|LoF_protect|1348|1879|
|              no|LoF_protect|  76| 104|
|             yes|GoF_protect|  60|  84|
|              no|GoF_protect| 824| 840|
+----------------+-----------+----+----+



                                                                                

In [31]:
window_spec = Window.partitionBy("targetId", "diseaseId",'rightStudyType').orderBy(
        F.col("pValueExponent").asc()
    )
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('agree_lowestPval', 'rightStudyType').pivot('Phase4').count().sort(F.col('rightStudyType').desc(),F.col('agree_lowestPval').desc()).show(20)



+----------------+--------------+---+----+
|agree_lowestPval|rightStudyType| no| yes|
+----------------+--------------+---+----+
|             yes|         tuqtl|686| 767|
|              no|         tuqtl|242| 267|
|             yes|          sqtl| 39|  47|
|              no|          sqtl|183| 202|
|             yes|        sceqtl| 41|  79|
|              no|        sceqtl|  5|NULL|
|             yes|          pqtl| 59|  48|
|              no|          pqtl|  9|  63|
|             yes|          eqtl|475| 798|
|              no|          eqtl|569| 636|
+----------------+--------------+---+----+



                                                                                

In [16]:
window_spec = Window.partitionBy("targetId", "diseaseId",'biosampleName')
bench2.withColumn(
            "agree_lowestPval",
            F.first("AgreeDrug", ignorenulls=True).over(
                window_spec
            ),  ### ignore nulls aded 29.01.2025
            #### take directionality from lowest p value
        ).withColumn(
            "RightTissue_Agree",
            F.when((F.col('agree_lowestPval')=='yes') & (F.col('rightTissue1')=='yes'), F.lit('yes')).otherwise(F.lit('no')),
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        ).groupBy('RightTissue_Agree').pivot('Phase4').count().show()



+-----------------+----+----+
|RightTissue_Agree|  no| yes|
+-----------------+----+----+
|               no|1881|2486|
|              yes| 427| 421|
+-----------------+----+----+



                                                                                

In [2]:
#######
##
####
##
#######

pivoted_dfs['biosampleName'].show()



+---------------+-------------+------------+-----------+-------------------------+-----------------------+------------+------+----------------------------------+--------------------------------+-----------------------------------------------+-------------------------------+--------------------------------------+-------------------------------+-------------+------+----------------+---------------+--------------+-------------+--------+-------------------------+----------------------+---------------+-----+------------+----------------+-----------------+---------------+----------------------------------------------+----------+---------------+-------------------+------------------------------+-----------------------------------------------+---------------------------+-----------------------------+----------+-----------+--------------+--------------------------+----------+------------+-----+-----------------------------+-------------------+-------------------------+------------------------+-

                                                                                

In [10]:
benchmark.join(
    rightTissue, on=["name", "bioSampleName"], how="left"
).select('name','biosampleName','rightTissue1').distinct().toPandas().to_csv('gs://ot-team/jroldan/analysis/20250526_rightTissue.csv')

                                                                                

In [3]:
pivoted_dfs['projectId'].show(vertical=True)



-RECORD 0----------------------------------------------------------
 targetId                                        | ENSG00000006071 
 diseaseId                                       | EFO_0000400     
 maxClinPhase                                    | 4.0             
 rightTissue                                     | no              
 isRightTissueSignalAgreed                       | []              
 isSignalFromRightTissue                         | no              
 Alasoo_2018                                     | []              
 Aygun_2021                                      | []              
 BLUEPRINT                                       | []              
 Bossini-Castillo_2019                           | []              
 BrainSeq                                        | [no]            
 Braineac2                                       | []              
 CAP                                             | []              
 CEDAR                                          

                                                                                