In [None]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

# Obtain the Spark session that every cell below works with.
spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


# Common path prefix for every parquet dataset read below.
path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

# Stack the eCAVIAR and COLOC colocalisation tables by column name; columns
# that exist in only one of the two inputs are allowed (allowMissingColumns).
all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

# Annotate every colocalisation row with:
#   * study / variant / study type of the LEFT study locus (from `credible`),
#   * study / variant / study type / p-value exponent / trans-QTL flag of the
#     RIGHT study locus (from `credible`),
#   * gene, project, study type, condition and biosample of the RIGHT study
#     (from the study `index`).
# All three joins are left joins, so colocalisation rows without a match are kept.
newColoc = (
    all_coloc.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            "pValueExponent as qtlPValueExponent",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# Restrict the evidence to the GWAS credible-set datasource, then discard the
# columns that hold no value at all for that datasource.
df = evidences.filter(F.col("datasourceId") == "gwas_credible_sets")

# Single aggregation pass: one output column per input column, holding the
# number of rows in which that input column is not null.
non_null_counts = df.select(
    *[
        F.sum(F.col(column_name).isNotNull().cast("int")).alias(column_name)
        for column_name in df.columns
    ]
)

# Read the one-row aggregate back as {column name: populated rows} and keep
# the names of the columns that have at least one populated row.
non_null_columns = [
    column_name
    for column_name, populated_rows in non_null_counts.first().asDict().items()
    if populated_rows > 0
]

# Projection of the evidence onto the populated columns only.
filtered_df = df.select(*non_null_columns)  # .persist()

# Attach studyId, variantId, the GWAS beta (renamed betaGwas) and the p-value
# exponent of the credible set behind each evidence row.
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId",
        "studyId",
        "variantId",
        "beta as betaGwas",
        "pValueExponent",
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

# Combine the annotated colocalisations with the GWAS credible-set evidence and
# derive a direction-of-effect label (`colocDoE`) per row.
#   1. The QTL gene (`geneId`) is renamed `targetId` and inner-joined to the
#      GWAS evidence on (leftStudyLocusId, targetId).
#   2. Each row is repeated for the evidence disease and for every entry of the
#      disease's `parents` array (explode_outer of [diseaseId] + parents).
#   3. `colocDoE` is set from the signs of `betaGwas` and `betaRatioSignAverage`;
#      rows whose rightStudyType is in neither list, or whose signs are zero or
#      null, get a null label because no `otherwise` is given.
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        # NOTE(review): no "oldDiseaseId" column is created in this cell, so that
        # part of the drop looks like a leftover no-op - confirm.
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### splicing QTLs: GoF/LoF labels are swapped relative to the eqtl/pqtl branch above
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

# Datasources handed to `temporary_directionOfEffect`
# ("ot_genetics_portal" is deliberately commented out).
datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

# Helper imported from the project-local `functions` module; it returns four
# objects. NOTE: this rebinds `evidences`, replacing the raw evidence frame
# that was read from parquet earlier in the notebook.
assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


# NOTE(review): nothing named drugApproved is built before this message is
# printed - it looks like a leftover from a removed step; confirm.
print("built drugApproved dataset")

# ChEMBL evidence summarised per (targetId, diseaseId):
#   * `maxClinPhase` is the highest `clinicalPhase` seen for the pair,
#   * the `homogenized` values are pivoted into count columns,
#   * the project helper `discrepancifier` is applied and only rows it marks
#     `coherencyDiagonal == "coherent"` are kept,
#   * the helper/risk columns are dropped and the two protective columns are
#     renamed with a `drug` prefix.
# Unlike `chemblAssoc`, rows with homogenized == "noEvaluable" are NOT
# filtered out before the pivot here.
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

# Same summary as `analysis_chembl_indication`, except that ChEMBL rows with
# homogenized == "noEvaluable" are removed before the pivot and the pivot
# cells are plain row counts (`.count()`).
chemblAssoc = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .count()
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

print("built chemblAssoc dataset")



spark session created at 2025-06-18 17:58:48.685820
Analysis started on 2025-06-18 at  2025-06-18 17:58:48.685820


25/06/18 17:59:06 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
25/06/18 17:59:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

built chemblAssoc dataset


In [None]:
#### Row counts of newColoc per isTransQtl value (NULL / false / true).
#### Author's note: trans-QTLs exist here, but not in the data used for the coloc benchmark.
newColoc.groupBy('isTransQtl').count().show()



+----------+--------+
|isTransQtl|   count|
+----------+--------+
|      NULL|53109519|
|     false|26428266|
|      true| 4591745|
+----------+--------+



                                                                                

In [135]:
# Release cached DataFrames.
# NOTE(review): `benchmarck2` and `bench` are not defined in the visible cells;
# presumably they come from an earlier interactive session - confirm before
# running this cell on a fresh kernel.
benchmarck2.unpersist()
bench.unpersist()

DataFrame[targetId: string, diseaseId: string, leftStudyLocusId: string, rightStudyId: string, rightStudyLocusId: string, chromosome: string, rightStudyType: string, numberColocalisingVariants: bigint, clpp: double, colocalisationMethod: string, betaRatioSignAverage: double, h0: double, h1: double, h2: double, h3: double, h4: double, leftStudyId: string, leftVariantId: string, credibleLeftStudyType: string, rightVariantId: string, credibleRightStudyType: string, qtlPValueExponent: int, isTransQtl: boolean, projectId: string, indexStudyType: string, condition: string, biosampleId: string, datasourceId: string, datatypeId: string, diseaseFromSourceMappedId: string, resourceScore: double, targetFromSourceId: string, id: string, score: double, sourceId: string, studyId: string, variantId: string, betaGwas: double, pValueExponent: int, name: string, therapeuticAreas: array<string>, colocDoE: string, maxClinPhase: double, drugGoF_protect: bigint, drugLoF_protect: bigint, AgreeDrug: string, h

In [None]:
#### 2 Define aggregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    """Return the items of *tup*, each rendered with ``str``, joined by commas."""
    return ",".join(str(item) for item in tup)


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    """Tabulate one prediction/comparison column pair, persist it and test it.

    The function counts rows of ``df`` per (comparison, prediction) value pair,
    writes those counts to parquet, then builds a contingency table from them
    and runs Fisher's exact test, an odds-ratio confidence interval and the
    project helper ``relative_success`` on it.

    Besides its arguments it relies on module-level names: ``full_data``
    (joined in so that every yes/no combination is present), ``result_st`` and
    ``result_ci`` (lists that receive the comma-joined test result strings),
    ``convertTuple`` and ``relative_success``.

    Args:
        df: Spark DataFrame containing ``targetId`` plus the two columns named
            by ``comparisonColumn`` and ``predictionColumn``.
        data: Dataset label; stored in the ``dataset`` column and used as a
            directory component of the output path.
        listado: List to which the written parquet path is appended.
        comparisonColumn: Name of the comparison column in ``df``.
        comparisonType: Label stored in the ``comparisonType`` column and used
            in the output file name.
        predictionColumn: Name of the prediction column in ``df``.
        predictionType: Label stored in the ``predictionType`` column.
        today_date: Date tag used as prefix of the output directory.

    Returns:
        A list, in this order: comparisonType, comparisonColumn,
        predictionColumn, Fisher statistic (rounded to 2 decimals), Fisher
        p-value, lower and upper bound of the 95% odds-ratio confidence
        interval (rounded), total count as a string, the contingency table as
        nested lists, the relative-success value with its two interval bounds
        (rounded), and the parquet path that was written.
    """
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)

    # "a" is the row count per (comparison, prediction) pair; the two *Total
    # columns are the marginal counts per prediction / per comparison value.
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    # Build the destination once and reuse it for the write, the bookkeeping
    # list, the log line and the returned record, so they can never disagree.
    path = (
        f"gs://ot-team/jroldan/{today_date}_analysis/{data}/"
        f"{comparisonColumn}_{comparisonType}_{predictionColumn}.parquet"
    )
    out.write.mode("overwrite").parquet(path)
    listado.append(path)
    print(path)

    ### making analysis
    # The outer join with full_data guarantees the "yes" and "no" pivot columns
    # exist; missing combinations become 0. Rows are sorted by comparison in
    # descending order and the leading "comparison" label column is removed,
    # leaving only the counts (columns: prediction yes, prediction no).
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    # Module-level accumulators keep the raw comma-joined result strings.
    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    return [
        comparisonType,
        comparisonColumn,
        predictionColumn,
        round(float(resX.split(",")[0]), 2),
        float(resX.split(",")[1]),
        round(float(resx_CI.split(",")[0]), 2),
        round(float(resx_CI.split(",")[1]), 2),
        str(total),
        res_npPhaseX.tolist(),
        round(float(rs_result), 2),
        round(float(rs_ci[0]), 2),
        round(float(rs_ci[1]), 2),
        # studies,
        # tissues,
        path,
    ]


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    """Pair each column name in *elements* with the enabled clinical predictions.

    Every element is tagged ``"predictor"`` and placed in a two-column
    (``comparison``, ``comparisonType``) DataFrame. That frame is joined with
    ``how="full"`` and no join keys to a one-row predictions frame, which is
    created without an explicit schema, and the collected rows are returned.
    """
    comparison_schema = StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("comparison", "comparisonType")
        ]
    )
    comparison_rows = [(element, "predictor") for element in elements]
    comparisons = spark.createDataFrame(comparison_rows, schema=comparison_schema)

    # Only "Phase>=4" is active; the author kept Phase>=3, Phase>=2, Phase>=1
    # and PhaseT as disabled alternatives.
    prediction_rows = [("Phase>=4", "clinical")]
    predictions = spark.createDataFrame(data=prediction_rows)

    return comparisons.join(predictions, how="full").collect()


print("load comparisons_df_iterative function")


# Every (prediction, comparison) yes/no combination, in the order
# yes/yes, yes/no, no/yes, no/no.
full_data = spark.createDataFrame(
    data=[
        (prediction, comparison)
        for prediction in ("yes", "no")
        for comparison in ("yes", "no")
    ],
    schema=StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("prediction", "comparison")
        ]
    ),
)
print("created full_data and lists")

# Disabled: loading of the rightTissue lookup table.
#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

# (targetId, diseaseId) pairs having at least one ChEMBL evidence row whose
# studyStopReasonCategories contains "Negative", tagged stopReason="Negative".
negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .select("targetId", "diseaseId", F.lit("Negative").alias("stopReason"))
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")


load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study


In [3]:
#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOING WITH CHEMBL 

# Keep colocalisations supported by either method: clpp >= 0.01 or h4 >= 0.8.
resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))
# Benchmark table:
#   * only rows with a negative GWAS beta and a disease name other than
#     "COVID-19" are taken from the colocalisation side,
#   * the RIGHT join keeps every (targetId, diseaseId) pair of
#     analysis_chembl_indication, with or without colocalisation support,
#   * AgreeDrug is "yes" when the drug column that is populated matches the
#     colocalisation direction of effect, and "no" otherwise (this includes
#     pairs that have no colocalisation row at all),
#   * the biosample name is attached through biosampleId.
benchmark = (
    (
        resolvedColocFiltered.filter(F.col("betaGwas") < 0).filter(  ### select just GWAS giving protection
        F.col("name") != "COVID-19"  #### remove COVID-19 associations
    )
        .join(
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

# Disabled: enrichment of the benchmark with the rightTissue lookup.
#bench2 = benchmark.join(
#    rightTissue, on=["name", "bioSampleName"], how="left"
#).withColumn(
#    "rightTissue",
#    F.when(F.col("rightTissue1") == "yes", F.lit("yes")).otherwise(F.lit("no")),
#)

print("built benchmark dataset")

## write the benchmark 
name='benchmark'
output_partitioned_path = f"gs://ot-team/jroldan/analysis/parquetFiles/{name}"
benchmark.write.mode("overwrite").parquet(output_partitioned_path)
print(f'written {name}')
#### Analysis

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE","colocalisationMethod"]

# One small DataFrame per studied column, all sharing the same two-column
# layout: (distinct_value, column_name).
temp_dfs_for_union = []

for col_name in variables_study:
    # Distinct non-null values of the column, tagged with the column they came from.
    df_temp = (
        benchmark.select(F.col(col_name).alias("distinct_value"))
        .filter(F.col("distinct_value").isNotNull()) # Exclude None (null) values
        .distinct()
        .withColumn("column_name", F.lit(col_name))
    )
    temp_dfs_for_union.append(df_temp)

# Maps each distinct value to the name of the column it was found in.
disdic = {}

if temp_dfs_for_union:
    # Stack the per-column frames. unionByName matches columns by name only;
    # it does not reconcile differing column types.
    combined_distinct_values_df = temp_dfs_for_union[0]
    for df_temp in temp_dfs_for_union[1:]:
        combined_distinct_values_df = combined_distinct_values_df.unionByName(df_temp)

    # Single collect on the stacked frame instead of one collect per column.
    print("Collecting combined distinct values from the cluster...")
    collected_rows = combined_distinct_values_df.collect()

    for row in collected_rows:
        # A value present under two different columns can only keep one entry
        # in the dictionary; report it instead of overwriting silently.
        previous_column = disdic.get(row.distinct_value)
        if previous_column is not None and previous_column != row.column_name:
            print(
                f"WARNING: value {row.distinct_value!r} found in both "
                f"{previous_column!r} and {row.column_name!r}; "
                f"keeping {row.column_name!r}"
            )
        disdic[row.distinct_value] = row.column_name
else:
    print("variables_study list is empty, disdic will be empty.")


print("\nFinal disdic:", disdic)

built benchmark dataset


                                                                                

written benchmark
Collecting combined distinct values from the cluster...





Final disdic: {'HipSci': 'projectId', 'Nedelec_2016': 'projectId', 'GTEx': 'projectId', 'Schmiedel_2018': 'projectId', 'Jerber_2021': 'projectId', 'BLUEPRINT': 'projectId', 'Quach_2016': 'projectId', 'BrainSeq': 'projectId', 'UKB_PPP_EUR': 'projectId', 'FUSION': 'projectId', 'Sun_2018': 'projectId', 'ROSMAP': 'projectId', 'Alasoo_2018': 'projectId', 'GENCORD': 'projectId', 'GEUVADIS': 'projectId', 'Lepik_2017': 'projectId', 'TwinsUK': 'projectId', 'CommonMind': 'projectId', 'PhLiPS': 'projectId', 'van_de_Bunt_2015': 'projectId', 'Bossini-Castillo_2019': 'projectId', 'Aygun_2021': 'projectId', 'Fairfax_2012': 'projectId', 'Peng_2018': 'projectId', 'CAP': 'projectId', 'Fairfax_2014': 'projectId', 'Cytoimmgen': 'projectId', 'Schwartzentruber_2018': 'projectId', 'PISA': 'projectId', 'Walker_2019': 'projectId', 'CEDAR': 'projectId', 'Braineac2': 'projectId', 'iPSCORE': 'projectId', 'Young_2019': 'projectId', 'Kim-Hellmuth_2017': 'projectId', 'Perez_2022': 'projectId', 'OneK1K': 'projectId'

                                                                                

In [None]:
# Assuming 'spark' session, 'benchmark' DataFrame, 'negativeTD' DataFrame, and 'disdic' dictionary are defined
#
# For every column in `variables_study` this cell builds one wide table with a
# row per (targetId, diseaseId, maxClinPhase) and a column per distinct value
# of the studied column, writes it to parquet and keeps it in `pivoted_dfs`.

# --- Step 1: Pre-compute 'hasboth' ONCE ---
# This is a shuffle, but only happens once.
# 'hasboth' = number of distinct colocalisationMethod values seen for the
# (targetId, diseaseId) pair.
print("Pre-computing 'hasboth' column...")
window_target_disease_only = Window.partitionBy('targetId', 'diseaseId')
benchmark_processed = benchmark.withColumn(
    'hasboth',
    F.size(F.collect_set('colocalisationMethod').over(window_target_disease_only))
)

# You might consider caching this intermediate result if 'benchmark' is very large
# and you have enough memory, to avoid re-reading from source if possible.
# benchmark_processed.cache() # or .persist(StorageLevel.MEMORY_AND_DISK)
# benchmark_processed.count() # Force computation if you cache

# Result tables keyed by the studied column name.
pivoted_dfs = {}

# --- Step 2: Loop for each variable_study column ---
for col_name in variables_study:
    print(f"Processing pivot for: {col_name}")

    # Define window specs for the current iteration, including 'col_name' in partition
    # (This shuffle is still per iteration, but unavoidable if 'resolvedAgreeDrug' depends on 'col_name' values)
    # First window: ordered by the QTL p-value exponent only.
    # Second window: ordered by colocalisationMethod first, then by the QTL p-value exponent.
    current_col_window_spec_qtl = Window.partitionBy("targetId", "diseaseId", col_name).orderBy(F.col("qtlPValueExponent").asc())
    current_col_pvalue_order_window = Window.partitionBy("targetId", "diseaseId", col_name).orderBy(F.col('colocalisationMethod').asc(), F.col("qtlPValueExponent").asc())

    # Calculate 'resolvedAgreeDrug' for the current 'col_name'
    # This involves a shuffle per iteration.
    # When more than one colocalisation method exists for the pair, the
    # method-then-p-value ordering is used; otherwise the p-value ordering alone.
    temp_df_with_resolved = benchmark_processed.withColumn('resolvedAgreeDrug',
        F.when(F.col('hasboth') > 1,
            F.first(F.col('AgreeDrug'), ignorenulls=True).over(current_col_pvalue_order_window)
        ).otherwise(F.first(F.col('AgreeDrug'), ignorenulls=True).over(current_col_window_spec_qtl))
    )

    # --- Step 3: Perform the pivot and join ---
    # This is an expensive operation (shuffle, potential wide dataframe)
    # Each pivot cell holds the set of resolvedAgreeDrug values of that group.
    pivoted_df = (
        temp_df_with_resolved
        .groupBy(
            "targetId",
            "diseaseId",
            "maxClinPhase",
        )
        .pivot(col_name) # Pivoting on values of the 'col_name' column
        .agg(F.collect_set("resolvedAgreeDrug"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left") # Ensure negativeTD is broadcast if small
    )

    # --- Step 4: Add derived columns (these are generally cheap) ---
    # "Phase>=N" is "yes" when maxClinPhase reaches N, "no" otherwise (including null).
    for phase in [1, 2, 3, 4]:
        pivoted_df = pivoted_df.withColumn(
            f"Phase>={phase}",
            F.when(F.col("maxClinPhase") >= phase, F.lit("yes")).otherwise(F.lit("no")),
        )

    # "PhaseT" flags pairs present in negativeTD (stopReason == "Negative").
    pivoted_df = pivoted_df.withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    )

    # Add _only columns dynamically based on disdic values matching current column
    matching_keys = [key for key, val in disdic.items() if val == col_name]

    # "<value>_only" is "yes" when the pivot cell for <value> contains "yes".
    for key in matching_keys:
        # F.col(key) assumes 'key' refers to a column that exists in pivoted_df after the pivot.
        pivoted_df = pivoted_df.withColumn(
            f"{key}_only",
            F.when(F.array_contains(F.col(key), "yes"), F.lit("yes")).otherwise(F.lit("no")),
        )

    # --- Step 5: Store result. Consider writing to GCS to break lineage if memory is an issue ---
    # This is highly recommended if 'variables_study' is very large.
    # Write to Parquet for efficient storage and schema preservation.
    # output_path = f"gs://your-bucket/temp_pivoted_results/{col_name}"
    # print(f"Writing results for {col_name} to {output_path}")
    # pivoted_df.write.mode("overwrite").parquet(output_path)
    # pivoted_dfs[col_name] = spark.read.parquet(output_path) # Read back if needed later
    output_partitioned_path = f"gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_{col_name}"
    pivoted_df.write.mode("overwrite").parquet(output_partitioned_path)
    print(f"DataFrame successfully written and partitioned to {output_partitioned_path}")
    # If not writing to GCS, just store the DF in memory (be cautious for large number of DFs)
    # NOTE(review): the object stored here is the DataFrame that was just
    # written, not a re-read of the parquet output - confirm this is intended.
    pivoted_dfs[col_name] = pivoted_df

# Example of how to access a result
# if 'some_col_name' in pivoted_dfs:
#     pivoted_dfs['some_col_name'].show()

# If benchmark_processed was cached, unpersist it after the loop
# benchmark_processed.unpersist()

Pre-computing 'hasboth' column...
Processing pivot for: projectId


                                                                                

DataFrame successfully written and partitioned to gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_projectId
Processing pivot for: biosampleName


                                                                                

DataFrame successfully written and partitioned to gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_biosampleName
Processing pivot for: rightStudyType


25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_161 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_156_340 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_393 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_51 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_156_133 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_269 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_14 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_279 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_300 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_246 !
25/06/18 18:07:48 WARN BlockManagerMasterEndpoi

DataFrame successfully written and partitioned to gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_rightStudyType
Processing pivot for: colocDoE


                                                                                0]

DataFrame successfully written and partitioned to gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_colocDoE
Processing pivot for: colocalisationMethod


                                                                                

DataFrame successfully written and partitioned to gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_colocalisationMethod


In [None]:
#### NOT TO TAKE
# Older variant of the pivot loop, marked by the author as not to be used.
# Differences visible in this cell: 'hasboth' is recomputed inside every
# iteration, and the "<value>_only" columns are added BEFORE the Phase columns,
# so the resulting column order is different.
# Dictionary to store results
pivoted_dfs = {}

# Loop over the columns
for col in variables_study:
    print(f"Processing: {col}")

    window_spec = Window.partitionBy("targetId", "diseaseId", col).orderBy(
        F.col("qtlPValueExponent").asc() #### here the pvalue is the one from the QTL variant
    )
    # NOTE(review): window_spec_pvaluegwas is not referenced anywhere in this cell.
    window_spec_pvaluegwas = Window.partitionBy("targetId", "diseaseId", col).orderBy(
        F.col("pValueExponent").asc()
    )
    pvalue_order_window = Window.partitionBy("targetId", "diseaseId",col).orderBy(
    F.col('colocalisationMethod').asc(),F.col("qtlPValueExponent").asc()
    )

    # Build the pivoted dataframe
    pivoted_df = (
        benchmark
        .withColumn(
        'hasboth', F.size(F.collect_set('colocalisationMethod').over(Window.partitionBy('targetId','diseaseId')))
        )
        .withColumn('resolvedAgreeDrug',
            F.when(F.col('hasboth')>1, 
                F.first(F.col('AgreeDrug'),ignorenulls=True).over(pvalue_order_window) ### order by Coloc First and then Lowest PValue
            ).otherwise(F.first(F.col('AgreeDrug'),ignorenulls=True).over(window_spec))) ## Lowest PValue
        .groupBy(
            "targetId",
            "diseaseId",
            "maxClinPhase",
        )
        .pivot(col)
        .agg(F.collect_set("resolvedAgreeDrug"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
    )


    # Add _only columns dynamically based on disdic values matching current column
    matching_keys = [key for key, val in disdic.items() if val == col]

    # "<value>_only" is "yes" when the pivot cell for <value> contains "yes".
    for key in matching_keys:
        pivoted_df = pivoted_df.withColumn(
            f"{key}_only",
            F.when(F.array_contains(F.col(key), "yes"), F.lit("yes")).otherwise(F.lit("no")),
        )

    # Add Phase columns in a loop
    for phase in [1, 2, 3, 4]:
        pivoted_df = pivoted_df.withColumn(
            f"Phase>={phase}",
            F.when(F.col("maxClinPhase") >= phase, F.lit("yes")).otherwise(F.lit("no")),
        )

    # "PhaseT" flags pairs present in negativeTD (stopReason == "Negative").
    pivoted_df = pivoted_df.withColumn(
        "PhaseT",
        F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
    )
    output_partitioned_path = f"gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_{col}"
    pivoted_df.write.mode("overwrite").parquet(output_partitioned_path)
    print(f"DataFrame successfully written and partitioned to {output_partitioned_path}")

    # Store result
    pivoted_dfs[col] = pivoted_df




built benchmark dataset


                                                                                

Processing: projectId


25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_368 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_29 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_37 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_140_271 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_179 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_45 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_134_168 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_134_170 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_149 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_140_233 !
25/06/18 17:49:40 WARN BlockManagerMasterEndpoint

In [139]:
# Inspection only: disdic keys recorded under the column currently bound to
# `col`. `col` is whatever the last `for col in variables_study` loop left behind.
[key for key, val in disdic.items() if val == col]

['GoF_protect', 'LoF_protect']

In [None]:
# Reset the module-level accumulators used by the analysis functions.
result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())

##### PROJECT ID ###### 
print('working with projectId')
pivoted_dfs['projectId'].persist()
# Distinct non-null projectId values; these are the names of the pivot value columns.
unique_values = benchmark.select('projectId').filter(F.col('projectId').isNotNull()).distinct().rdd.flatMap(lambda x: x).collect()
# Columns to analyse: whatever remains after removing the pivot value columns
# and skipping the first 10 remaining columns.
# NOTE(review): the hard-coded 10 depends on the exact column layout of
# pivoted_dfs['projectId'] - confirm whenever the pivot cell changes.
# The list is computed once and passed on directly: the previous
# `.columns[-filter:]` slice returned EVERY column when `filter` was 0.
columns_to_analyse = pivoted_dfs['projectId'].drop(*unique_values).columns[10:]
# NOTE: `filter` shadows the Python builtin; the name is kept because other
# cells may read it.
filter = len(columns_to_analyse)
print('There are ', filter, 'columns to analyse with phases')
rows = comparisons_df_iterative(columns_to_analyse)


working with projectId


25/06/18 18:15:50 WARN CacheManager: Asked to cache already cached data.
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_136 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_156_44 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_150_247 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_185 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_12 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_156_339 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_156_374 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_156_340 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_38_29 !
25/06/18 18:17:23 WARN BlockManagerMasterEndpoint: No more replicas av

There are  36 columns to analyse with phases


                                                                                

In [23]:
# Inspect the projectId pivot: column names from position 10 onward after
# dropping the columns listed in `unique_values`.
# NOTE(review): assumes `unique_values` still holds the projectId values from
# the last executed cell; the offset 10 presumably skips id/phase columns - confirm.
pivoted_dfs['projectId'].drop(*unique_values).columns[10:]

['HipSci_only',
 'Nedelec_2016_only',
 'GTEx_only',
 'Schmiedel_2018_only',
 'Jerber_2021_only',
 'BLUEPRINT_only',
 'Quach_2016_only',
 'BrainSeq_only',
 'UKB_PPP_EUR_only',
 'FUSION_only',
 'Sun_2018_only',
 'ROSMAP_only',
 'Alasoo_2018_only',
 'GENCORD_only',
 'GEUVADIS_only',
 'Lepik_2017_only',
 'TwinsUK_only',
 'CommonMind_only',
 'PhLiPS_only',
 'van_de_Bunt_2015_only',
 'Bossini-Castillo_2019_only',
 'Aygun_2021_only',
 'Fairfax_2012_only',
 'Peng_2018_only',
 'CAP_only',
 'Fairfax_2014_only',
 'Cytoimmgen_only',
 'Schwartzentruber_2018_only',
 'PISA_only',
 'Walker_2019_only',
 'CEDAR_only',
 'Braineac2_only',
 'iPSCORE_only',
 'Young_2019_only',
 'Kim-Hellmuth_2017_only',
 'Perez_2022_only',
 'OneK1K_only',
 'Kasela_2017_only']

In [28]:
# Run the comparison/aggregation step for every pivoted grouping, collect the
# per-comparison results into one Spark DataFrame, annotate each comparison
# with its `disdic` entry and export the table as CSV.
import re
from itertools import chain

from pyspark.sql.functions import create_map
# The file header imports `ArrayType` from the stdlib `array` module, which is
# not a Spark data type; the schema below needs the pyspark one. It is aliased
# so the name bound at file level is left untouched.
from pyspark.sql.types import ArrayType as SparkArrayType

result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())

# Keys of `pivoted_dfs` (and columns of `benchmark`) to analyse, in order.
group_columns = [
    'projectId',
    'biosampleName',
    'rightStudyType',
    'colocDoE',
    'colocalisationMethod',
]

for group_column in group_columns:
    print(f'working with {group_column}')
    pivoted = pivoted_dfs[group_column]
    pivoted.persist()
    try:
        unique_values = (
            benchmark.select(group_column)
            .filter(F.col(group_column).isNotNull())
            .distinct()
            .rdd.flatMap(lambda x: x)
            .collect()
        )
        # Number of columns left from position 10 onward once the columns
        # named after the group values are dropped. Not stored in `filter`,
        # which would shadow the builtin.
        n_columns = len(pivoted.drop(*unique_values).columns[10:])
        print('There are ', n_columns, 'columns to analyse with phases')
        if n_columns == 0:
            # `columns[-0:]` would select every column of the pivot, so a
            # grouping with nothing to compare is skipped instead.
            continue
        rows = comparisons_df_iterative(pivoted.columns[-n_columns:])
        for row in rows:
            results = aggregations_original(
                pivoted, "propagated", listado, *row, today_date
            )
            result_all.append(results)
    finally:
        # Always release the cache, even when a comparison fails; otherwise
        # the frame stays cached and the next run re-persists cached data.
        pivoted.unpersist()
        print('df unpersisted')

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField(
            "values", SparkArrayType(SparkArrayType(IntegerType())), True
        ),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

# Define the list of patterns to search for
patterns = [
    "_only",
    #"_tissue",
    #"_isSignalFromRightTissue",
    "_isRightTissueSignalAgreed",
]
# Create a regex pattern to match any of the substrings
regex_pattern = "(" + "|".join(map(re.escape, patterns)) + ")"

# Convert list of lists to DataFrame
df = (
    spreadSheetFormatter(spark.createDataFrame(result_all, schema=schema))
    .withColumn(
        "prefix",
        F.regexp_replace(
            F.col("comparison"), regex_pattern + ".*", ""
        ),  # Keep the part of the comparison name before the pattern
    )
    .withColumn(
        "suffix",
        F.regexp_extract(
            F.col("comparison"), regex_pattern, 0
        ),  # Keep the matched pattern itself
    )
)

### annotate projectId, tissue, qtl type and doe type:
# Literal Spark map built from the alternating keys and values of `disdic`.
mapping_expr = create_map([F.lit(x) for x in chain(*disdic.items())])

df_annot = df.withColumn('annotation', mapping_expr.getItem(F.col('prefix')))

df_annot.toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis.csv"
)

print("dataframe written \n Analysis finished")

working with projectId


25/06/18 18:24:56 WARN CacheManager: Asked to cache already cached data.
                                                                                

There are  38 columns to analyse with phases


                                                                                

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Phase4` cannot be resolved. Did you mean one of the following? [`PhaseT`, `Phase>=4`, `Phase>=1`, `Phase>=2`, `Phase>=3`].;
'Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 66 more fields]
+- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 65 more fields]
   +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 64 more fields]
      +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 63 more fields]
         +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 62 more fields]
            +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 61 more fields]
               +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 60 more fields]
                  +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 59 more fields]
                     +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 58 more fields]
                        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 57 more fields]
                           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 56 more fields]
                              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 55 more fields]
                                 +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 54 more fields]
                                    +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 53 more fields]
                                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 52 more fields]
                                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 51 more fields]
                                             +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 50 more fields]
                                                +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 49 more fields]
                                                   +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 48 more fields]
                                                      +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 47 more fields]
                                                         +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 46 more fields]
                                                            +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 45 more fields]
                                                               +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 44 more fields]
                                                                  +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 43 more fields]
                                                                     +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 42 more fields]
                                                                        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 41 more fields]
                                                                           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 40 more fields]
                                                                              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 39 more fields]
                                                                                 +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 38 more fields]
                                                                                    +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 37 more fields]
                                                                                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 36 more fields]
                                                                                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 35 more fields]
                                                                                             +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 34 more fields]
                                                                                                +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 33 more fields]
                                                                                                   +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 32 more fields]
                                                                                                      +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 31 more fields]
                                                                                                         +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 30 more fields]
                                                                                                            +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 29 more fields]
                                                                                                               +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 28 more fields]
                                                                                                                  +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 27 more fields]
                                                                                                                     +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 26 more fields]
                                                                                                                        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 25 more fields]
                                                                                                                           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 24 more fields]
                                                                                                                              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 23 more fields]
                                                                                                                                 +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 22 more fields]
                                                                                                                                    +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 21 more fields]
                                                                                                                                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 20 more fields]
                                                                                                                                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, null#23898, Alasoo_2018#23900, Aygun_2021#23902, BLUEPRINT#23904, Bossini-Castillo_2019#23906, BrainSeq#23908, Braineac2#23910, CAP#23912, CEDAR#23914, CommonMind#23916, Cytoimmgen#23918, FUSION#23920, Fairfax_2012#23922, Fairfax_2014#23924, GENCORD#23926, GEUVADIS#23928, GTEx#23930, HipSci#23932, Jerber_2021#23934, Kasela_2017#23936, Kim-Hellmuth_2017#23938, ... 19 more fields]
                                                                                                                                             +- Join LeftOuter, ((targetId#2088 = targetId#24018) AND (diseaseId#2170 = diseaseId#24100))
                                                                                                                                                :- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#6197], [targetId#2088, diseaseId#2170, maxClinPhase#6197, collect_set(if ((projectId#975 <=> cast(null as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS null#23898, collect_set(if ((projectId#975 <=> cast(Alasoo_2018 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Alasoo_2018#23900, collect_set(if ((projectId#975 <=> cast(Aygun_2021 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Aygun_2021#23902, collect_set(if ((projectId#975 <=> cast(BLUEPRINT as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS BLUEPRINT#23904, collect_set(if ((projectId#975 <=> cast(Bossini-Castillo_2019 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Bossini-Castillo_2019#23906, collect_set(if ((projectId#975 <=> cast(BrainSeq as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS BrainSeq#23908, collect_set(if ((projectId#975 <=> cast(Braineac2 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Braineac2#23910, collect_set(if ((projectId#975 <=> cast(CAP as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS CAP#23912, collect_set(if ((projectId#975 <=> cast(CEDAR as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS CEDAR#23914, collect_set(if ((projectId#975 <=> cast(CommonMind as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS CommonMind#23916, collect_set(if ((projectId#975 <=> cast(Cytoimmgen as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Cytoimmgen#23918, collect_set(if ((projectId#975 <=> cast(FUSION as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS FUSION#23920, collect_set(if ((projectId#975 <=> cast(Fairfax_2012 as 
string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Fairfax_2012#23922, collect_set(if ((projectId#975 <=> cast(Fairfax_2014 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Fairfax_2014#23924, collect_set(if ((projectId#975 <=> cast(GENCORD as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS GENCORD#23926, collect_set(if ((projectId#975 <=> cast(GEUVADIS as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS GEUVADIS#23928, collect_set(if ((projectId#975 <=> cast(GTEx as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS GTEx#23930, collect_set(if ((projectId#975 <=> cast(HipSci as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS HipSci#23932, collect_set(if ((projectId#975 <=> cast(Jerber_2021 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Jerber_2021#23934, collect_set(if ((projectId#975 <=> cast(Kasela_2017 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Kasela_2017#23936, collect_set(if ((projectId#975 <=> cast(Kim-Hellmuth_2017 as string))) resolvedAgreeDrug#23579 else cast(null as string), 0, 0) AS Kim-Hellmuth_2017#23938, ... 18 more fields]
                                                                                                                                                :  +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 25 more fields]
                                                                                                                                                :     +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 27 more fields]
                                                                                                                                                :        +- Window [first(AgreeDrug#16545, true) windowspecdefinition(targetId#2088, diseaseId#2170, projectId#975, qtlPValueExponent#1143 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS _we1#23581], [targetId#2088, diseaseId#2170, projectId#975], [qtlPValueExponent#1143 ASC NULLS FIRST]
                                                                                                                                                :           +- Window [first(AgreeDrug#16545, true) windowspecdefinition(targetId#2088, diseaseId#2170, projectId#975, colocalisationMethod#1083 ASC NULLS FIRST, qtlPValueExponent#1143 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS _we0#23580], [targetId#2088, diseaseId#2170, projectId#975], [colocalisationMethod#1083 ASC NULLS FIRST, qtlPValueExponent#1143 ASC NULLS FIRST]
                                                                                                                                                :              +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 24 more fields]
                                                                                                                                                :                 +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 24 more fields]
                                                                                                                                                :                    +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 25 more fields]
                                                                                                                                                :                       +- Window [collect_set(colocalisationMethod#1083, 0, 0) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#23528], [targetId#2088, diseaseId#2170]
                                                                                                                                                :                          +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 23 more fields]
                                                                                                                                                :                             +- Project [biosampleId#1002, targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, ... 23 more fields]
                                                                                                                                                :                                +- Join LeftOuter, (biosampleId#1002 = biosampleId#1059)
                                                                                                                                                :                                   :- Project [targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 22 more fields]
                                                                                                                                                :                                   :  +- Project [targetId#2088, diseaseId#2170, leftStudyLocusId#1077, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 21 more fields]
                                                                                                                                                :                                   :     +- Join RightOuter, ((targetId#1801 = targetId#2088) AND (diseaseId#1960 = diseaseId#2170))
                                                                                                                                                :                                   :        :- Filter NOT (name#691 = COVID-19)
                                                                                                                                                :                                   :        :  +- Filter (betaGwas#1780 < cast(0 as double))
                                                                                                                                                :                                   :        :     +- Filter ((clpp#1082 >= 0.01) OR (h4#1098 >= 0.8))
                                                                                                                                                :                                   :        :        +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
                                                                                                                                                :                                   :        :           +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 17 more fields]
                                                                                                                                                :                                   :        :              +- Project [diseaseId#1960, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
                                                                                                                                                :                                   :        :                 +- Generate explode(concat(array(diseaseId#800), parents#694)), true, [diseaseId#1960]
                                                                                                                                                :                                   :        :                    +- Project [diseaseId#800, leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, ... 18 more fields]
                                                                                                                                                :                                   :        :                       +- Join LeftOuter, (diseaseId#800 = diseaseId#1911)
                                                                                                                                                :                                   :        :                          :- Project [leftStudyLocusId#1077, targetId#1801, rightStudyId#1140, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, projectId#975, indexStudyType#1200, ... 15 more fields]
                                                                                                                                                :                                   :        :                          :  +- Join Inner, ((leftStudyLocusId#1077 = leftStudyLocusId#1828) AND (targetId#1801 = targetId#718))
                                                                                                                                                :                                   :        :                          :     :- Project [rightStudyId#1140, rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, geneId#974 AS targetId#1801, projectId#975, indexStudyType#1200, ... 2 more fields]
                                                                                                                                                :                                   :        :                          :     :  +- Project [rightStudyId#1140, rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176, geneId#974, projectId#975, indexStudyType#1200, ... 2 more fields]
                                                                                                                                                :                                   :        :                          :     :     +- Join LeftOuter, (rightStudyId#1140 = rightStudyId#1199)
                                                                                                                                                :                                   :        :                          :     :        :- Project [rightStudyLocusId#1078, leftStudyLocusId#1077, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117, rightStudyId#1140, rightVariantId#1141, credibleRightStudyType#1142, qtlPValueExponent#1143, isTransQtl#1176]
                                                                                                                                                :                                   :        :                          :     :        :  +- Join LeftOuter, (rightStudyLocusId#1078 = rightStudyLocusId#1139)
                                                                                                                                                :                                   :        :                          :     :        :     :- Project [leftStudyLocusId#1077, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, h0#1094, h1#1095, h2#1096, h3#1097, h4#1098, leftStudyId#1115, leftVariantId#1116, credibleLeftStudyType#1117]
                                                                                                                                                :                                   :        :                          :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#1077 = leftStudyLocusId#1114)
                                                                                                                                                :                                   :        :                          :     :        :     :     :- Union false, false
                                                                                                                                                :                                   :        :                          :     :        :     :     :  :- Project [leftStudyLocusId#1077, rightStudyLocusId#1078, chromosome#1079, rightStudyType#1080, numberColocalisingVariants#1081L, clpp#1082, colocalisationMethod#1083, betaRatioSignAverage#1084, null AS h0#1094, null AS h1#1095, null AS h2#1096, null AS h3#1097, null AS h4#1098]
                                                                                                                                                :                                   :        :                          :     :        :     :     :  :  +- Relation [leftStudyLocusId#1077,rightStudyLocusId#1078,chromosome#1079,rightStudyType#1080,numberColocalisingVariants#1081L,clpp#1082,colocalisationMethod#1083,betaRatioSignAverage#1084] parquet
                                                                                                                                                :                                   :        :                          :     :        :     :     :  +- Project [leftStudyLocusId#949, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, null AS clpp#1093, colocalisationMethod#959, betaRatioSignAverage#960, h0#954, h1#955, h2#956, h3#957, h4#958]
                                                                                                                                                :                                   :        :                          :     :        :     :     :     +- Relation [leftStudyLocusId#949,rightStudyLocusId#950,chromosome#951,rightStudyType#952,numberColocalisingVariants#953L,h0#954,h1#955,h2#956,h3#957,h4#958,colocalisationMethod#959,betaRatioSignAverage#960] parquet
                                                                                                                                                :                                   :        :                          :     :        :     :     +- Project [studyLocusId#895 AS leftStudyLocusId#1114, StudyId#896 AS leftStudyId#1115, variantId#897 AS leftVariantId#1116, studyType#920 AS credibleLeftStudyType#1117]
                                                                                                                                                :                                   :        :                          :     :        :     :        +- Relation [studyLocusId#895,studyId#896,variantId#897,chromosome#898,position#899,region#900,beta#901,zScore#902,pValueMantissa#903,pValueExponent#904,effectAlleleFrequencyFromSource#905,standardError#906,subStudyDescription#907,qualityControls#908,finemappingMethod#909,credibleSetIndex#910,credibleSetlog10BF#911,purityMeanR2#912,purityMinR2#913,locusStart#914,locusEnd#915,sampleSize#916,ldSet#917,locus#918,... 3 more fields] parquet
                                                                                                                                                :                                   :        :                          :     :        :     +- Project [studyLocusId#1150 AS rightStudyLocusId#1139, studyId#1151 AS rightStudyId#1140, variantId#1152 AS rightVariantId#1141, studyType#1175 AS credibleRightStudyType#1142, pValueExponent#1159 AS qtlPValueExponent#1143, isTransQtl#1176]
                                                                                                                                                :                                   :        :                          :     :        :        +- Relation [studyLocusId#1150,studyId#1151,variantId#1152,chromosome#1153,position#1154,region#1155,beta#1156,zScore#1157,pValueMantissa#1158,pValueExponent#1159,effectAlleleFrequencyFromSource#1160,standardError#1161,subStudyDescription#1162,qualityControls#1163,finemappingMethod#1164,credibleSetIndex#1165,credibleSetlog10BF#1166,purityMeanR2#1167,purityMinR2#1168,locusStart#1169,locusEnd#1170,sampleSize#1171,ldSet#1172,locus#1173,... 3 more fields] parquet
                                                                                                                                                :                                   :        :                          :     :        +- Project [studyId#973 AS rightStudyId#1199, geneId#974, projectId#975, studyType#976 AS indexStudyType#1200, condition#998, biosampleId#1002]
                                                                                                                                                :                                   :        :                          :     :           +- Relation [studyId#973,geneId#974,projectId#975,studyType#976,traitFromSource#977,traitFromSourceMappedIds#978,biosampleFromSourceId#979,pubmedId#980,publicationTitle#981,publicationFirstAuthor#982,publicationDate#983,publicationJournal#984,backgroundTraitFromSourceMappedIds#985,initialSampleSize#986,nCases#987,nControls#988,nSamples#989,cohorts#990,ldPopulationStructure#991,discoverySamples#992,replicationSamples#993,qualityControls#994,analysisFlags#995,summarystatsLocation#996,... 6 more fields] parquet
                                                                                                                                                :                                   :        :                          :     +- Project [studyLocusId#798 AS leftStudyLocusId#1828, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1845, variantId#1846, betaGwas#1780, pValueExponent#1853]
                                                                                                                                                :                                   :        :                          :        +- Project [studyLocusId#798, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1845, variantId#1846, betaGwas#1780, pValueExponent#1853]
                                                                                                                                                :                                   :        :                          :           +- Join LeftOuter, (studyLocusId#798 = studyLocusId#1844)
                                                                                                                                                :                                   :        :                          :              :- Project [datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, studyLocusId#798, diseaseId#800, id#801, score#802, sourceId#805]
                                                                                                                                                :                                   :        :                          :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
                                                                                                                                                :                                   :        :                          :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,beta#723,betaConfidenceIntervalLower#724,betaConfidenceIntervalUpper#725,biologicalModelAllelicComposition#726,biologicalModelGeneticBackground#727,biologicalModelId#728,biomarkerName#729,biomarkers#730,biosamplesFromSource#731,cellType#732,clinicalPhase#733,clinicalSignificances#734,clinicalStatus#735,cohortDescription#736,cohortId#737,cohortPhenotypes#738,cohortShortName#739,confidence#740,... 65 more fields] parquet
                                                                                                                                                :                                   :        :                          :              +- Project [studyLocusId#1844, studyId#1845, variantId#1846, beta#1850 AS betaGwas#1780, pValueExponent#1853]
                                                                                                                                                :                                   :        :                          :                 +- Relation [studyLocusId#1844,studyId#1845,variantId#1846,chromosome#1847,position#1848,region#1849,beta#1850,zScore#1851,pValueMantissa#1852,pValueExponent#1853,effectAlleleFrequencyFromSource#1854,standardError#1855,subStudyDescription#1856,qualityControls#1857,finemappingMethod#1858,credibleSetIndex#1859,credibleSetlog10BF#1860,purityMeanR2#1861,purityMinR2#1862,locusStart#1863,locusEnd#1864,sampleSize#1865,ldSet#1866,locus#1867,... 3 more fields] parquet
                                                                                                                                                :                                   :        :                          +- Project [id#689 AS diseaseId#1911, name#691, parents#694, therapeuticAreas#700]
                                                                                                                                                :                                   :        :                             +- Relation [id#689,code#690,name#691,description#692,dbXRefs#693,parents#694,synonyms#695,obsoleteTerms#696,obsoleteXRefs#697,children#698,ancestors#699,therapeuticAreas#700,descendants#701,ontology#702] parquet
                                                                                                                                                :                                   :        +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, drugGoF_protect#11714L, LoF_protect#9894L AS drugLoF_protect#11720L]
                                                                                                                                                :                                   :           +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L AS drugGoF_protect#11714L, LoF_protect#9894L]
                                                                                                                                                :                                   :              +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L]
                                                                                                                                                :                                   :                 +- Filter (coherencyDiagonal#11688 = coherent)
                                                                                                                                                :                                   :                    +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, GoF_risk#9908, LoF_risk#11522, coherencyDiagonal#11688, CASE WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnull(noEvaluable#9895L)) THEN noEvid WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnotnull(noEvaluable#9895L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11522) OR isnotnull(LoF_protect#9894L)) OR isnotnull(GoF_risk#9908)) OR isnotnull(GoF_protect#9893L)) THEN CASE WHEN (isnotnull(LoF_risk#11522) AND ((isnull(LoF_protect#9894L) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L))) THEN coherent WHEN (isnotnull(GoF_risk#9908) AND ((isnull(LoF_protect#9894L) AND isnull(LoF_risk#11522)) AND isnull(GoF_protect#9893L))) THEN coherent WHEN (isnotnull(LoF_protect#9894L) AND ((isnull(LoF_risk#11522) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L))) THEN coherent WHEN (isnotnull(GoF_protect#9893L) AND ((isnull(LoF_protect#9894L) AND isnull(GoF_risk#9908)) AND isnull(LoF_risk#11522))) THEN coherent ELSE dispar END END AS coherencyOneCell#11698]
                                                                                                                                                :                                   :                       +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, GoF_risk#9908, LoF_risk#11522, CASE WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnull(noEvaluable#9895L)) THEN noEvid WHEN ((((isnull(LoF_risk#11522) AND isnull(LoF_protect#9894L)) AND isnull(GoF_risk#9908)) AND isnull(GoF_protect#9893L)) AND isnotnull(noEvaluable#9895L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#11522) OR isnotnull(LoF_protect#9894L)) OR isnotnull(GoF_risk#9908)) OR isnotnull(GoF_protect#9893L)) THEN CASE WHEN (isnotnull(GoF_risk#9908) AND isnotnull(LoF_risk#11522)) THEN dispar WHEN (isnotnull(LoF_protect#9894L) AND isnotnull(LoF_risk#11522)) THEN dispar WHEN (isnotnull(GoF_protect#9893L) AND isnotnull(GoF_risk#9908)) THEN dispar WHEN (isnotnull(GoF_protect#9893L) AND isnotnull(LoF_protect#9894L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#11688]
                                                                                                                                                :                                   :                          +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, GoF_risk#9908, null AS LoF_risk#11522]
                                                                                                                                                :                                   :                             +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, GoF_protect#9893L, LoF_protect#9894L, noEvaluable#9895L, null AS GoF_risk#9908]
                                                                                                                                                :                                   :                                +- Project [targetId#2088, diseaseId#2170, maxClinPhase#6197, __pivot_count(targetId) AS `count(targetId)`#9892[0] AS GoF_protect#9893L, __pivot_count(targetId) AS `count(targetId)`#9892[1] AS LoF_protect#9894L, __pivot_count(targetId) AS `count(targetId)`#9892[2] AS noEvaluable#9895L]
                                                                                                                                                :                                   :                                   +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#6197], [targetId#2088, diseaseId#2170, maxClinPhase#6197, pivotfirst(homogenized#4208, count(targetId)#9884L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#9892]
                                                                                                                                                :                                   :                                      +- Aggregate [targetId#2088, diseaseId#2170, maxClinPhase#6197, homogenized#4208], [targetId#2088, diseaseId#2170, maxClinPhase#6197, homogenized#4208, count(targetId#2088) AS count(targetId)#9884L]
                                                                                                                                                :                                   :                                         +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 80 more fields]
                                                                                                                                                :                                   :                                            +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 81 more fields]
                                                                                                                                                :                                   :                                               +- Window [max(clinicalPhase#2103) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#6197], [targetId#2088, diseaseId#2170]
                                                                                                                                                :                                   :                                                  +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
                                                                                                                                                :                                   :                                                     +- Filter (datasourceId#2087 = chembl)
                                                                                                                                                :                                   :                                                        +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
                                                                                                                                                :                                   :                                                           +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
                                                                                                                                                :                                   :                                                              +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
                                                                                                                                                :                                   :                                                                 +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 78 more fields]
                                                                                                                                                :                                   :                                                                    +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 79 more fields]
                                                                                                                                                :                                   :                                                                       +- Window [collect_set(intogen_function#3791, 0, 0) windowspecdefinition(targetId#2088, diseaseId#2170, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#3897], [targetId#2088, diseaseId#2170]
                                                                                                                                                :                                   :                                                                          +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 77 more fields]
                                                                                                                                                :                                   :                                                                             +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 77 more fields]
                                                                                                                                                :                                   :                                                                                +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 76 more fields]
                                                                                                                                                :                                   :                                                                                   +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 75 more fields]
                                                                                                                                                :                                   :                                                                                      +- Join LeftOuter, ((drugId2#2875 = drugId#2121) AND (targetId2#2882 = targetId#2088))
                                                                                                                                                :                                   :                                                                                         :- Join LeftOuter, (target_id#2925 = targetId#2088)
                                                                                                                                                :                                   :                                                                                         :  :- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, concat_ws(,, clinicalSignificances#2104) AS clinicalSignificances#3113, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                                                                                                                                                :                                   :                                                                                         :  :  +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                                                                                                                                                :                                   :                                                                                         :  :     +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, cast(beta#2093 as double) AS beta#2931, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                                                                                                                                                :                                   :                                                                                         :  :        +- Project [datasourceId#2087, targetId#2088, alleleOrigins#2089, allelicRequirements#2090, ancestry#2091, ancestryId#2092, beta#2093, betaConfidenceIntervalLower#2094, betaConfidenceIntervalUpper#2095, biologicalModelAllelicComposition#2096, biologicalModelGeneticBackground#2097, biologicalModelId#2098, biomarkerName#2099, biomarkers#2100, biosamplesFromSource#2101, cellType#2102, clinicalPhase#2103, clinicalSignificances#2104, clinicalStatus#2105, cohortDescription#2106, cohortId#2107, cohortPhenotypes#2108, cohortShortName#2109, confidence#2110, ... 66 more fields]
                                                                                                                                                :                                   :                                                                                         :  :           +- Filter datasourceId#2087 IN (gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                                                                                                                :                                   :                                                                                         :  :              +- Relation [datasourceId#2087,targetId#2088,alleleOrigins#2089,allelicRequirements#2090,ancestry#2091,ancestryId#2092,beta#2093,betaConfidenceIntervalLower#2094,betaConfidenceIntervalUpper#2095,biologicalModelAllelicComposition#2096,biologicalModelGeneticBackground#2097,biologicalModelId#2098,biomarkerName#2099,biomarkers#2100,biosamplesFromSource#2101,cellType#2102,clinicalPhase#2103,clinicalSignificances#2104,clinicalStatus#2105,cohortDescription#2106,cohortId#2107,cohortPhenotypes#2108,cohortShortName#2109,confidence#2110,... 65 more fields] parquet
                                                                                                                                                :                                   :                                                                                         :  +- Project [id#2802 AS target_id#2925, approvedSymbol#2803, description#2910, description_splited#2914, TSorOncogene#2919]
                                                                                                                                                :                                   :                                                                                         :     +- Project [id#2802, approvedSymbol#2803, description#2910, description_splited#2914, CASE WHEN (RLIKE(description_splited#2914, ncogene) AND RLIKE(description_splited#2914, TSG)) THEN bivalent WHEN RLIKE(description_splited#2914, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2914, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2919]
                                                                                                                                                :                                   :                                                                                         :        +- Project [id#2802, approvedSymbol#2803, description#2910, concat_ws(,, description#2910) AS description_splited#2914]
                                                                                                                                                :                                   :                                                                                         :           +- Aggregate [id#2802, approvedSymbol#2803], [id#2802, approvedSymbol#2803, collect_set(description#2902, 0, 0) AS description#2910]
                                                                                                                                                :                                   :                                                                                         :              +- Filter description#2902 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                                                                                                                :                                   :                                                                                         :                 +- Project [id#2802, approvedSymbol#2803, col#2897.description AS description#2902]
                                                                                                                                                :                                   :                                                                                         :                    +- Project [id#2802, approvedSymbol#2803, col#2897]
                                                                                                                                                :                                   :                                                                                         :                       +- Generate explode(hallmarks#2812.attributes), true, [col#2897]
                                                                                                                                                :                                   :                                                                                         :                          +- Relation [id#2802,approvedSymbol#2803,biotype#2804,transcriptIds#2805,canonicalTranscript#2806,canonicalExons#2807,genomicLocation#2808,alternativeGenes#2809,approvedName#2810,go#2811,hallmarks#2812,synonyms#2813,symbolSynonyms#2814,nameSynonyms#2815,functionDescriptions#2816,subcellularLocations#2817,targetClass#2818,obsoleteSymbols#2819,obsoleteNames#2820,constraint#2821,tep#2822,proteinIds#2823,dbXrefs#2824,chemicalProbes#2825,... 5 more fields] parquet
                                                                                                                                                :                                   :                                                                                         +- Aggregate [targetId2#2882, drugId2#2875], [targetId2#2882, drugId2#2875, collect_set(actionType#2860, 0, 0) AS actionType#2892]
                                                                                                                                                :                                   :                                                                                            +- Project [targetId2#2882, drugId2#2875, actionType#2860, mechanismOfAction#2861]
                                                                                                                                                :                                   :                                                                                               +- Generate explode(targets#2865), true, [targetId2#2882]
                                                                                                                                                :                                   :                                                                                                  +- Project [drugId2#2875, actionType#2860, mechanismOfAction#2861, targets#2865]
                                                                                                                                                :                                   :                                                                                                     +- Generate explode(chemblIds#2862), true, [drugId2#2875]
                                                                                                                                                :                                   :                                                                                                        +- Relation [actionType#2860,mechanismOfAction#2861,chemblIds#2862,targetName#2863,targetType#2864,targets#2865,references#2866] parquet
                                                                                                                                                :                                   +- Project [biosampleId#1059, biosampleName#1060]
                                                                                                                                                :                                      +- Relation [biosampleId#1059,biosampleName#1060,description#1061,xrefs#1062,synonyms#1063,parents#1064,ancestors#1065,children#1066,descendants#1067] parquet
                                                                                                                                                +- Project [targetId#24018, diseaseId#24100, stopReason#16492]
                                                                                                                                                   +- Project [targetId#24018, diseaseId#24100, count#16488L, Negative AS stopReason#16492]
                                                                                                                                                      +- Aggregate [targetId#24018, diseaseId#24100], [targetId#24018, diseaseId#24100, count(1) AS count#16488L]
                                                                                                                                                         +- Filter array_contains(studyStopReasonCategories#24082, Negative)
                                                                                                                                                            +- Project [targetId#24018, diseaseId#24100, studyStopReason#24081, studyStopReasonCategories#24082]
                                                                                                                                                               +- Filter (datasourceId#24017 = chembl)
                                                                                                                                                                  +- Project [datasourceId#24017, targetId#24018, alleleOrigins#24019, allelicRequirements#24020, ancestry#24021, ancestryId#24022, beta#24023, betaConfidenceIntervalLower#24024, betaConfidenceIntervalUpper#24025, biologicalModelAllelicComposition#24026, biologicalModelGeneticBackground#24027, biologicalModelId#24028, biomarkerName#24029, biomarkers#24030, biosamplesFromSource#24031, cellType#24032, clinicalPhase#24033, clinicalSignificances#24034, clinicalStatus#24035, cohortDescription#24036, cohortId#24037, cohortPhenotypes#24038, cohortShortName#24039, confidence#24040, ... 66 more fields]
                                                                                                                                                                     +- Filter datasourceId#24017 IN (gwas_credible_sets,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                                                                                                                                        +- Relation [datasourceId#24017,targetId#24018,alleleOrigins#24019,allelicRequirements#24020,ancestry#24021,ancestryId#24022,beta#24023,betaConfidenceIntervalLower#24024,betaConfidenceIntervalUpper#24025,biologicalModelAllelicComposition#24026,biologicalModelGeneticBackground#24027,biologicalModelId#24028,biomarkerName#24029,biomarkers#24030,biosamplesFromSource#24031,cellType#24032,clinicalPhase#24033,clinicalSignificances#24034,clinicalStatus#24035,cohortDescription#24036,cohortId#24037,cohortPhenotypes#24038,cohortShortName#24039,confidence#24040,... 65 more fields] parquet


In [None]:
# Reload the eCAVIAR colocalisation results (release 25.03) and stack them on
# top of the coloc results held in `new`. Columns that exist in only one of the
# two tables are kept and filled with nulls on the other side
# (allowMissingColumns=True).
ecaviar = spark.read.parquet(
    "gs://open-targets-data-releases/25.03/output/colocalisation_ecaviar/"
)
all_coloc = ecaviar.unionByName(
    new,
    allowMissingColumns=True,
)

In [4]:
# Total number of rows in the eCAVIAR colocalisation table (no filtering).
ecaviar.count()

59153506

In [6]:
# How many eCAVIAR rows reach the clpp >= 0.01 threshold.
ecaviar.where(
    F.col("clpp") >= 0.01
).count()

                                                                                

40631820

In [11]:
# Rebuild the combined colocalisation table: eCAVIAR rows followed by the rows
# of `new`, matched by column name; columns missing on one side are allowed.
all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

In [None]:
### Print the column names and types of newColoc
newColoc.printSchema()
###


root
 |-- rightStudyId: string (nullable = true)
 |-- rightStudyLocusId: string (nullable = true)
 |-- leftStudyLocusId: string (nullable = true)
 |-- chromosome: string (nullable = true)
 |-- rightStudyType: string (nullable = true)
 |-- numberColocalisingVariants: long (nullable = true)
 |-- h0: double (nullable = true)
 |-- h1: double (nullable = true)
 |-- h2: double (nullable = true)
 |-- h3: double (nullable = true)
 |-- h4: double (nullable = true)
 |-- colocalisationMethod: string (nullable = true)
 |-- betaRatioSignAverage: double (nullable = true)
 |-- leftStudyId: string (nullable = true)
 |-- leftVariantId: string (nullable = true)
 |-- credibleLeftStudyType: string (nullable = true)
 |-- rightVariantId: string (nullable = true)
 |-- credibleRightStudyType: string (nullable = true)
 |-- isTransQtl: boolean (nullable = true)
 |-- geneId: string (nullable = true)
 |-- projectId: string (nullable = true)
 |-- indexStudyType: string (nullable = true)
 |-- condition: string (nul

In [15]:
# Print the first rows of resolvedColoc for a visual check of its columns.
resolvedColoc.show()



+-----------+--------------------+---------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+------------+----------------+---------------------+-------------------+----------------------+----------+-----------+--------------+--------------------+--------------+------------------+-------------------+-------------------------+-------------------+------------------+--------------------+-------------------+------------------+------------+----------------+--------------------+--------------+--------------------+----------------+-----------+
|  diseaseId|    leftStudyLocusId|       targetId|        rightStudyId|   rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|                  h0|                  h1|                  h2|                  h3|                 h4|colocalisa

                                                                                

In [None]:
### TASKS
# make the right join with chembl -- CHECK IT!
# Test new dataset pivoted_df generated using chatgpt
# create new columns asking the questions below
# generate the same in the genetic evidence


## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOIN WITH CHEMBL


In [14]:
# Number of rows in gwasCredibleAssoc_qtlPValue.
# NOTE(review): presumably compared against gwasCredibleAssoc.count() to check
# that no rows were lost or duplicated — confirm.
gwasCredibleAssoc_qtlPValue.count()

                                                                                ]]

20821482

In [16]:
# Print the first rows of gwasCredibleAssoc_qtlPValue for a visual check.
gwasCredibleAssoc_qtlPValue.show()



+---------------+-----------+-----------+-----------+
|       targetId|  diseaseId|leftStudyId|homogenized|
+---------------+-----------+-----------+-----------+
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683| GCST003219|noEvaluable|
|ENSG00000000971|EFO_0004683

                                                                                

In [15]:
# Number of rows in gwasCredibleAssoc.
# NOTE(review): presumably the reference figure for the
# gwasCredibleAssoc_qtlPValue row count — confirm.
gwasCredibleAssoc.count()

                                                                                ]]

20821482

In [28]:
# Count evidence rows per (targetId, diseaseId) for each direction-of-effect
# label found in `homogenized`, one output column per label.
#
# The pivot values are listed explicitly (same labels and order as the columns
# the inferred pivot produced) so that:
#   * the output schema is fixed even if a label has no rows in the input, which
#     keeps the later withColumnRenamed / F.col("..G") steps from failing;
#   * Spark does not need an extra pass over the data to discover the labels.
# NOTE(review): any `homogenized` value outside this list is dropped — confirm
# that these five labels are the complete set.
assoc2 = (
    gwasCredibleAssoc_qtlPValue.groupBy("targetId", "diseaseId")
    .pivot(
        "homogenized",
        ["GoF_protect", "GoF_risk", "LoF_protect", "LoF_risk", "noEvaluable"],
    )
    .count()
    .persist()
)

                                                                                58]]

In [29]:
# Attach the genetic-evidence counts (assoc2) to the ChEMBL analysis table.
#
# Every pivot column gets a "G" suffix before the join so it cannot collide
# with the same-named column of analysis_chembl. "noEvaluable" used to be left
# un-renamed, which produced two columns called "noEvaluable" in the result.
benchGenEvid2 = (
    assoc2.withColumnRenamed("GoF_protect", "GoF_protectG")
    .withColumnRenamed("LoF_protect", "LoF_protectG")
    .withColumnRenamed("LoF_risk", "LoF_riskG")
    .withColumnRenamed("GoF_risk", "GoF_riskG")
    .withColumnRenamed("noEvaluable", "noEvaluableG")
    # Right join: keep every (targetId, diseaseId) row of analysis_chembl.
    .join(analysis_chembl, on=["targetId", "diseaseId"], how="right")
    .persist()
)
# NOTE(review): persist() is lazy, so benchGenEvid2 has not been materialised
# yet when the assoc2 cache is released here — confirm this order is intended.
assoc2.unpersist()

25/06/18 21:52:45 WARN CacheManager: Asked to cache already cached data.


DataFrame[targetId: string, diseaseId: string, GoF_protect: bigint, GoF_risk: bigint, LoF_protect: bigint, LoF_risk: bigint, noEvaluable: bigint]

In [24]:
# Attach the genetic-evidence counts (assoc) to the ChEMBL analysis table.
#
# Every pivot column gets a "G" suffix before the join so it cannot collide
# with the same-named column of analysis_chembl. "noEvaluable" used to be left
# un-renamed, which produced two columns called "noEvaluable" in the result.
benchGenEvid = (
    assoc.withColumnRenamed("GoF_protect", "GoF_protectG")
    .withColumnRenamed("LoF_protect", "LoF_protectG")
    .withColumnRenamed("LoF_risk", "LoF_riskG")
    .withColumnRenamed("GoF_risk", "GoF_riskG")
    .withColumnRenamed("noEvaluable", "noEvaluableG")
    # Right join: keep every (targetId, diseaseId) row of analysis_chembl.
    .join(analysis_chembl, on=["targetId", "diseaseId"], how="right")
    .persist()
)
# NOTE(review): persist() is lazy, so benchGenEvid has not been materialised
# yet when the assoc cache is released here — confirm this order is intended.
assoc.unpersist()

DataFrame[targetId: string, diseaseId: string, GoF_protect: bigint, GoF_risk: bigint, LoF_protect: bigint, LoF_risk: bigint, noEvaluable: bigint]

In [30]:
# Count the rows of benchGenEvid2 that carry at least one genetic
# direction-of-effect count: coalesce() is non-null exactly when any of the
# four "G" columns is non-null.
benchGenEvid2.filter(
    F.coalesce(
        F.col("GoF_protectG"),
        F.col("GoF_riskG"),
        F.col("LoF_protectG"),
        F.col("LoF_riskG"),
    ).isNotNull()
).count()

341

In [25]:

# Count the rows of benchGenEvid that carry at least one genetic
# direction-of-effect count: coalesce() is non-null exactly when any of the
# four "G" columns is non-null.
benchGenEvid.filter(
    F.coalesce(
        F.col("GoF_protectG"),
        F.col("GoF_riskG"),
        F.col("LoF_protectG"),
        F.col("LoF_riskG"),
    ).isNotNull()
).count()

25/06/18 21:41:15 WARN BatchHelper: Forcibly shutting down batch helper thread pool.
                                                                                

+---------------+-------------+------------+---------+------------+---------+-----------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|       targetId|    diseaseId|GoF_protectG|GoF_riskG|LoF_protectG|LoF_riskG|noEvaluable|maxClinPhase|GoF_protect|LoF_protect|noEvaluable|GoF_risk|LoF_risk|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+------------+---------+------------+---------+-----------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|ENSG00000112541|  EFO_0000537|           5|     NULL|        NULL|     NULL|       NULL|         1.0|       NULL|          1|       NULL|    NULL|    NULL|         coherent|        coherent|
|ENSG00000115232|  EFO_0003767|        NULL|        2|         104|     NULL|       NULL|         1.0|       NULL|          1|       NULL|    NULL|    NULL|         coherent|        coherent|
|ENSG00000132855|  EFO_0000589|        N

In [26]:
# Re-run of the check above: rows of benchGenEvid with at least one non-null
# genetic direction-of-effect column (coalesce() is non-null exactly when any
# of the four "G" columns is non-null).
benchGenEvid.filter(
    F.coalesce(
        F.col("GoF_protectG"),
        F.col("GoF_riskG"),
        F.col("LoF_protectG"),
        F.col("LoF_riskG"),
    ).isNotNull()
).count()

341

In [19]:
# Exploratory preview: pivot the evidence rows into one count column per
# `homogenized` label, right-join onto analysis_chembl (keeping all of its
# rows) and print the first rows.
# NOTE(review): the pivot columns are not renamed here, so labels that also
# exist in analysis_chembl show up twice in the printed table.
(
    gwasCredibleAssoc_qtlPValue.groupBy("targetId", "diseaseId")
    .pivot("homogenized")
    .count()
    .join(
        analysis_chembl,
        on=["targetId", "diseaseId"],
        how="right",
    )
    .show()
)

                                                                                / 67]

+---------------+-------------+-----------+--------+-----------+--------+-----------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|       targetId|    diseaseId|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|maxClinPhase|GoF_protect|LoF_protect|noEvaluable|GoF_risk|LoF_risk|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+-----------+--------+-----------+--------+-----------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|ENSG00000007314|  EFO_0000555|       NULL|    NULL|       NULL|    NULL|       NULL|         2.0|       NULL|          3|       NULL|    NULL|    NULL|         coherent|        coherent|
|ENSG00000007314|  EFO_1000249|       NULL|    NULL|       NULL|    NULL|       NULL|         3.0|       NULL|          1|       NULL|    NULL|    NULL|         coherent|        coherent|
|ENSG00000008018|  EFO_1000453|       NULL|    NULL|       N

In [17]:
# Exploratory preview: right join of analysis_chembl onto the un-aggregated
# evidence table, i.e. every row of gwasCredibleAssoc_qtlPValue is kept and
# ChEMBL columns are attached where (targetId, diseaseId) matches.
# NOTE(review): the recorded run of this cell was interrupted
# (KeyboardInterrupt) — consider aggregating the evidence first.
(
    analysis_chembl.join(
        gwasCredibleAssoc_qtlPValue,
        on=["targetId", "diseaseId"],
        how="right",
    ).show()
)

25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_76128_204 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_76140_204 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_75731_295 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_94956_344 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_76134_104 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_94956_44 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_76128_29 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_87632_83 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_94956_79 !
25/06/18 21:29:06 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_75731_45 !
25/06/18 21:29:06 WARN Bl

KeyboardInterrupt: 

In [27]:
def _doe_from_signs(both_positive, gwas_positive_only, ratio_positive_only, both_negative):
    """Build a four-branch direction-of-effect label from two sign tests.

    The first sign is that of ``betaGwas`` and the second that of
    ``betaRatioSignAverage``. A row where either value is zero or NULL matches
    no branch and therefore yields NULL.
    """
    gwas_beta = F.col("betaGwas")
    sign_ratio = F.col("betaRatioSignAverage")
    return (
        F.when((gwas_beta > 0) & (sign_ratio > 0), F.lit(both_positive))
        .when((gwas_beta > 0) & (sign_ratio < 0), F.lit(gwas_positive_only))
        .when((gwas_beta < 0) & (sign_ratio > 0), F.lit(ratio_positive_only))
        .when((gwas_beta < 0) & (sign_ratio < 0), F.lit(both_negative))
    )


# Colocalisations matched to GWAS credible-set evidence on the left study
# locus and the target gene.
# NOTE(review): the join type was left as an open question by the author
# ("has to be right?"); this cell uses an inner join.
# NOTE: propagation of diseaseId to parent terms is disabled at this stage.
_coloc_with_gwas = newColoc.withColumnRenamed("geneId", "targetId").join(
    gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
    on=["leftStudyLocusId", "targetId"],
    how="inner",
)

# Study types in the first list map a same-sign pair to "GoF"; the splicing
# study types in the second list are labelled with the opposite mechanism.
# Any other rightStudyType leaves colocDoE NULL.
resolvedColoc = _coloc_with_gwas.withColumn(
    "colocDoE",
    F.when(
        F.col("rightStudyType").isin(["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]),
        _doe_from_signs("GoF_risk", "LoF_risk", "LoF_protect", "GoF_protect"),
    ).when(
        F.col("rightStudyType").isin(["sqtl", "scsqtl"]),
        _doe_from_signs("LoF_risk", "GoF_risk", "GoF_protect", "LoF_protect"),
    ),
)
print("loaded resolvedColloc")

# Datasources handed to temporary_directionOfEffect. "gwas_credible_set" is
# deliberately absent: it is added later with its own DoE assessment, which
# avoids potential duplicates.
datasource_filter = [
    "gene_burden", "eva", "eva_somatic",
    "gene2phenotype", "orphanet",
    "cancer_gene_census", "intogen",
    "impc", "chembl",
]

# NOTE: this rebinds ``evidences`` to the frame returned by the helper.
(assessment, evidences, actionType, oncolabel) = temporary_directionOfEffect(
    path_n,
    datasource_filter,
)

print("run temporary direction of effect")

# One group per (target, disease, GWAS study), ordered by the GWAS p-value
# exponent (ascending).
window_spec = Window.partitionBy("targetId", "diseaseId", "leftStudyId").orderBy(
    F.col("pValueExponent").asc()
)

# Same grouping, ordered by the QTL credible-set p-value exponent instead.
window_spec_qtl = Window.partitionBy("targetId", "diseaseId", "leftStudyId").orderBy(
    F.col("qtlPValueExponent").asc()
)


def _homogenize_doe(resolved, ordered_window):
    """Give every row of a group the first non-null ``colocDoE`` of that group.

    Args:
        resolved: DataFrame with ``targetId``, ``diseaseId``, ``leftStudyId``,
            ``colocDoE`` and the column ``ordered_window`` orders by.
        ordered_window: partitioned and ordered ``WindowSpec``.

    Returns:
        DataFrame with ``targetId``, ``diseaseId``, ``leftStudyId`` and
        ``homogenized``; groups without any non-null ``colocDoE`` get
        ``"noEvaluable"``.
    """
    # An ordered window without an explicit frame only spans
    # unboundedPreceding..currentRow, so rows sorted before the first non-null
    # colocDoE would see NULL and be labelled "noEvaluable" while later rows of
    # the same group get a real label. Spanning the whole partition makes the
    # label identical for every row of the group. The frame is applied here,
    # not on the shared window specs, so other uses of them are unaffected.
    whole_group = ordered_window.rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing
    )
    return (
        resolved.withColumn(
            "homogenized", F.first("colocDoE", ignorenulls=True).over(whole_group)
        )
        .select("targetId", "diseaseId", "leftStudyId", "homogenized")
        .withColumn(
            "homogenized",
            F.coalesce(F.col("homogenized"), F.lit("noEvaluable")),
        )
    )


# DoE label chosen by GWAS p-value ordering (added 30.01.2025).
gwasCredibleAssoc = _homogenize_doe(resolvedColoc, window_spec)

# DoE label chosen by QTL p-value ordering (added 30.01.2025).
gwasCredibleAssoc_qtlPValue = _homogenize_doe(resolvedColoc, window_spec_qtl)

loaded resolvedColloc


25/06/18 21:51:29 WARN CacheManager: Asked to cache already cached data.


run temporary direction of effect


25/06/18 21:51:30 WARN CacheManager: Asked to cache already cached data.


In [None]:
#### testing ecaviar for genevid analysis
import time
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd


spark = SparkSession.builder.getOrCreate()
# Default is 200, increase if needed
spark.conf.set("spark.sql.shuffle.partitions", "400")

# Root of the release output that every dataset below is read from.
path_n = "gs://open-targets-data-releases/25.03/output/"

target = spark.read.parquet(f"{path_n}target/")
diseases = spark.read.parquet(f"{path_n}disease/")
evidences = spark.read.parquet(f"{path_n}evidence")
credible = spark.read.parquet(f"{path_n}credible_set")
new = spark.read.parquet(f"{path_n}colocalisation_coloc")
index = spark.read.parquet(f"{path_n}study/")
variantIndex = spark.read.parquet(f"{path_n}variant")
biosample = spark.read.parquet(f"{path_n}biosample")
ecaviar = spark.read.parquet(f"{path_n}colocalisation_ecaviar")

# Stack both colocalisation datasets; columns present in only one of them are
# filled with NULL in the other.
all_coloc = ecaviar.unionByName(new, allowMissingColumns=True)

# Reported once (the message used to be printed twice in a row).
print("loaded files")

# Credible-set columns for the left study locus of each colocalisation.
_left_credible = credible.selectExpr(
    "studyLocusId as leftStudyLocusId",
    "StudyId as leftStudyId",
    "variantId as leftVariantId",
    "studyType as credibleLeftStudyType",
)

# Credible-set columns for the right study locus, including its p-value
# exponent (exposed as qtlPValueExponent) and the trans-QTL flag.
_right_credible = credible.selectExpr(
    "studyLocusId as rightStudyLocusId",
    "studyId as rightStudyId",
    "variantId as rightVariantId",
    "studyType as credibleRightStudyType",
    "pValueExponent as qtlPValueExponent",
    'isTransQtl',
)

# Study-index columns for the right study: brings in the gene (geneId) plus
# project, condition and biosample.
_right_study = index.selectExpr(
    "studyId as rightStudyId",
    "geneId",
    "projectId",
    "studyType as indexStudyType",
    "condition",
    "biosampleId",
)

# All three lookups are left joins, so every colocalisation row is kept.
newColoc = (
    all_coloc.join(_left_credible, on="leftStudyLocusId", how="left")
    .join(_right_credible, on="rightStudyLocusId", how="left")
    .join(_right_study, on="rightStudyId", how="left")
)

print("loaded newColoc")

# GWAS credible-set evidence only; columns that are entirely NULL for this
# datasource are dropped below.
df = evidences.filter(F.col("datasourceId") == "gwas_credible_sets")

# One aggregation pass: F.count(column) counts the non-null values of each
# column and returns 0 (never NULL) when there are none, so the comparison
# below cannot hit ``None > 0``.
non_null_counts = df.select(
    *[F.count(F.col(name)).alias(name) for name in df.columns]
)

# Names of the columns holding at least one value.
non_null_columns = [
    name
    for name, n_values in non_null_counts.collect()[0].asDict().items()
    if n_values > 0
]

# Evidence restricted to the populated columns.
filtered_df = df.select(*non_null_columns)  # .persist()

# Bring studyId, variantId, the GWAS beta (as betaGwas) and pValueExponent
# from the credible set onto each evidence row.
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

def _doe_from_signs(both_positive, gwas_positive_only, ratio_positive_only, both_negative):
    """Build a four-branch direction-of-effect label from two sign tests.

    The first sign is that of ``betaGwas`` and the second that of
    ``betaRatioSignAverage``. A row where either value is zero or NULL matches
    no branch and therefore yields NULL.
    """
    gwas_beta = F.col("betaGwas")
    sign_ratio = F.col("betaRatioSignAverage")
    return (
        F.when((gwas_beta > 0) & (sign_ratio > 0), F.lit(both_positive))
        .when((gwas_beta > 0) & (sign_ratio < 0), F.lit(gwas_positive_only))
        .when((gwas_beta < 0) & (sign_ratio > 0), F.lit(ratio_positive_only))
        .when((gwas_beta < 0) & (sign_ratio < 0), F.lit(both_negative))
    )


# Colocalisations matched to GWAS credible-set evidence on the left study
# locus and the target gene. The right join keeps every gwasComplete row,
# including those without a matching colocalisation.
# NOTE(review): the join type was left as an open question by the author
# ("has to be right?").
# NOTE: propagation of diseaseId to parent terms is disabled at this stage.
_coloc_with_gwas = newColoc.withColumnRenamed("geneId", "targetId").join(
    gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
    on=["leftStudyLocusId", "targetId"],
    how="right",
)

# Study types in the first list map a same-sign pair to "GoF"; the splicing
# study types in the second list are labelled with the opposite mechanism.
# Any other rightStudyType leaves colocDoE NULL.
resolvedColoc = _coloc_with_gwas.withColumn(
    "colocDoE",
    F.when(
        F.col("rightStudyType").isin(["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]),
        _doe_from_signs("GoF_risk", "LoF_risk", "LoF_protect", "GoF_protect"),
    ).when(
        F.col("rightStudyType").isin(["sqtl", "scsqtl"]),
        _doe_from_signs("LoF_risk", "GoF_risk", "GoF_protect", "LoF_protect"),
    ),
)
print("loaded resolvedColloc")

# Datasources handed to temporary_directionOfEffect. "gwas_credible_set" is
# deliberately absent: it is added later with its own DoE assessment, which
# avoids potential duplicates.
datasource_filter = [
    "gene_burden", "eva", "eva_somatic",
    "gene2phenotype", "orphanet",
    "cancer_gene_census", "intogen",
    "impc", "chembl",
]

# NOTE: this rebinds ``evidences`` to the frame returned by the helper.
(assessment, evidences, actionType, oncolabel) = temporary_directionOfEffect(
    path_n,
    datasource_filter,
)

print("run temporary direction of effect")

# One group per (target, disease, GWAS study), ordered by the GWAS p-value
# exponent (ascending).
window_spec = Window.partitionBy("targetId", "diseaseId", "leftStudyId").orderBy(
    F.col("pValueExponent").asc()
)

# Same grouping, ordered by the QTL credible-set p-value exponent instead.
window_spec_qtl = Window.partitionBy("targetId", "diseaseId", "leftStudyId").orderBy(
    F.col("qtlPValueExponent").asc()
)


def _homogenize_doe(resolved, ordered_window):
    """Give every row of a group the first non-null ``colocDoE`` of that group.

    Args:
        resolved: DataFrame with ``targetId``, ``diseaseId``, ``leftStudyId``,
            ``colocDoE`` and the column ``ordered_window`` orders by.
        ordered_window: partitioned and ordered ``WindowSpec``.

    Returns:
        DataFrame with ``targetId``, ``diseaseId``, ``leftStudyId`` and
        ``homogenized``; groups without any non-null ``colocDoE`` get
        ``"noEvaluable"``.
    """
    # An ordered window without an explicit frame only spans
    # unboundedPreceding..currentRow, so rows sorted before the first non-null
    # colocDoE would see NULL and be labelled "noEvaluable" while later rows of
    # the same group get a real label. Spanning the whole partition makes the
    # label identical for every row of the group. The frame is applied here,
    # not on the shared window specs, so other uses of them are unaffected.
    whole_group = ordered_window.rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing
    )
    return (
        resolved.withColumn(
            "homogenized", F.first("colocDoE", ignorenulls=True).over(whole_group)
        )
        .select("targetId", "diseaseId", "leftStudyId", "homogenized")
        .withColumn(
            "homogenized",
            F.coalesce(F.col("homogenized"), F.lit("noEvaluable")),
        )
    )


# DoE label chosen by GWAS p-value ordering (added 30.01.2025).
gwasCredibleAssoc = _homogenize_doe(resolvedColoc, window_spec)

# DoE label chosen by QTL p-value ordering (added 30.01.2025).
gwasCredibleAssoc_qtlPValue = _homogenize_doe(resolvedColoc, window_spec_qtl)


print("Moving to step 2")

# Direction-of-effect labels kept for the ChEMBL side.
columns_chembl = ["LoF_protect", "GoF_protect"]

# ChEMBL labels followed by the risk labels and the "evidenceDif" column.
columns_dataset = columns_chembl + ["LoF_risk", "GoF_risk", "evidenceDif"]

# The four direction-of-effect labels.
columns = [
    "GoF_risk",
    "LoF_protect",
    "LoF_risk",
    "GoF_protect",
]

# Labels other than the four direction-of-effect ones.
terms = [
    "noEvaluable",
    "bivalent_risk",
    "null",
    "dispar",
]

# Therapeutic areas as (taId, taLabel, taLabelSimple). The list order matters:
# taRank is assigned from it and the lowest rank is later preferred per disease.
_ta_rows = [
    ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
    ("EFO_0005741", "infectious disease", "Other"),
    ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
    ("EFO_0005932", "animal disease", "Other"),
    ("MONDO_0024458", "disease of visual system", "Other"),
    ("EFO_0000319", "cardiovascular disease", "Other"),
    ("EFO_0009605", "pancreas disease", "Other"),
    ("EFO_0010282", "gastrointestinal disease", "Other"),
    ("OTAR_0000017", "reproductive system or breast disease", "Other"),
    ("EFO_0010285", "integumentary system disease", "Other"),
    ("EFO_0001379", "endocrine system disease", "Other"),
    ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
    ("EFO_0009690", "urinary system disease", "Other"),
    ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
    ("MONDO_0021205", "disease of ear", "Other"),
    ("EFO_0000540", "immune system disease", "Other"),
    ("EFO_0005803", "hematologic disease", "Other"),
    ("EFO_0000618", "nervous system disease", "Other"),
    ("MONDO_0002025", "psychiatric disorder", "Other"),
    ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
    ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
    ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
    ("EFO_0000651", "phenotype", "Other"),
    ("EFO_0001444", "measurement", "Other"),
    ("GO_0008150", "biological process", "Other"),
]

# Three nullable string columns.
_ta_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("taId", "taLabel", "taLabelSimple")
    ]
)

taDf = spark.createDataFrame(data=_ta_rows, schema=_ta_schema).withColumn(
    "taRank", F.monotonically_increasing_id()
)

# Oncology vs. non-oncology label per disease (checked 31.05.2023): explode
# the therapeutic areas, attach their rank from taDf and keep, per disease,
# only the row(s) whose rank equals the lowest rank seen for that disease.
wByDisease = Window.partitionBy("diseaseId")

_disease_by_ta = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
)

diseaseTA = (
    _disease_by_ta.withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

# Disease propagation table: one row per (disease, itself-or-parent). The
# outer explode still emits a row (with NULL) when the concatenation is NULL.
_self_and_parents = F.concat(F.array(F.col("id")), F.col("parents"))

diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated", F.explode_outer(_self_and_parents)
)

# Highest clinical phase per target-disease pair among ChEMBL assessments.
_chembl_assessment = assessment.filter(F.col("datasourceId").isin(["chembl"]))
chembl_trials = _chembl_assessment.groupBy("targetId", "diseaseId").agg(
    F.max(F.col("clinicalPhase")).alias("maxClinPhase")
)

# Target-disease pairs having at least one ChEMBL evidence row whose stop
# reason categories include "Negative"; one row per pair.
negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .select("targetId", "diseaseId")
    .distinct()
    .withColumn("stopReason", F.lit("Negative"))
)

# Assessment extended with the coloc-derived DoE rows, tagged as their own
# datasource; columns missing on either side are filled with NULL.
_coloc_assessment = gwasCredibleAssoc_qtlPValue.withColumn(
    "datasourceId", F.lit("gwas_credible_set")
)
assessment_all = assessment.unionByName(_coloc_assessment, allowMissingColumns=True)

print("defining non propagated,propagated and analysis_drugs functions")

def analysis_nonPropagated(assessment_all, analysisDatasources):
    """Count DoE labels per target-disease pair without disease propagation.

    Args:
        assessment_all: DataFrame with at least ``datasourceId``, ``targetId``,
            ``diseaseId`` and ``homogenized``.
        analysisDatasources: datasource ids to keep.

    Returns:
        Whatever ``discrepancifier`` returns for the pivoted counts (one row
        per ``targetId``/``diseaseId``, one count column per ``homogenized``
        value).
    """
    selected = assessment_all.filter(
        F.col("datasourceId").isin(analysisDatasources)
    )
    # The former per-pair ``datasources`` window column was dropped: the
    # groupBy/pivot below never carried it into the result.
    doe_counts = (
        selected.groupBy("targetId", "diseaseId")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    return discrepancifier(doe_counts)


def analysis_propagated(assessment_all, analysisDatasources):
    """Count DoE labels per target-disease pair after disease propagation.

    Each assessment row is repeated for every ``diseaseIdPropagated`` that
    ``diseases2`` lists for its disease; the propagated id then replaces
    ``diseaseId`` before counting.

    Args:
        assessment_all: DataFrame with at least ``datasourceId``, ``targetId``,
            ``diseaseId`` and ``homogenized``.
        analysisDatasources: datasource ids to keep.

    Returns:
        Whatever ``discrepancifier`` returns for the pivoted counts (one row
        per ``targetId``/propagated ``diseaseId``).
    """
    propagation_map = diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated")
    # The former per-pair ``datasources`` window column was dropped: the
    # groupBy/pivot below never carried it into the result.
    propagated = (
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .join(propagation_map, on="diseaseId", how="left")
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
    )
    doe_counts = (
        propagated.groupBy("targetId", "diseaseId")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    return discrepancifier(doe_counts)

# Datasource ids that make up the drug side of the comparison.
chembl_ds = ["chembl"]


def analysis_drugs(assessment_all, chembl_ds):
    """Count DoE labels per target-disease pair for the drug datasources.

    Rows whose ``datasourceId`` is in ``chembl_ds`` get the maximum
    ``clinicalPhase`` of their target-disease pair as ``maxClinPhase``; the
    rows are then pivoted on ``homogenized`` and counted. The pivoted frame is
    persisted before being handed to ``discrepancifier``.
    """
    per_pair = Window.partitionBy("targetId", "diseaseId")
    drug_rows = assessment_all.filter(F.col("datasourceId").isin(chembl_ds))
    with_phase = drug_rows.withColumn(
        "maxClinPhase", F.max(F.col("clinicalPhase")).over(per_pair)
    )
    doe_counts = (
        with_phase.groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )
    return discrepancifier(doe_counts)


analysis_chembl = analysis_drugs(assessment_all, chembl_ds)

# ---------------------------------------------------------------
# Analysis configuration
# ---------------------------------------------------------------

# Placeholder; starts empty.
analysisDatasources = []

print("defining full_analysis_propagation")

# Label pairs used by the "NdiagonalYes" comparison: diagonal_lof is checked
# when the drug side has LoF_protect, diagonal_gof when it has GoF_protect.
diagonal_lof = ["LoF_protect", "GoF_risk"]
diagonal_gof = ["LoF_risk", "GoF_protect"]

# All four direction-of-effect count columns, in the order
# LoF_protect, GoF_risk, LoF_risk, GoF_protect.
doe_columns = diagonal_lof + diagonal_gof

def _yes_no(condition):
    """Turn a boolean column into 'yes'/'no'; a NULL condition gives 'no'."""
    return F.when(condition, F.lit("yes")).otherwise(F.lit("no"))


def _agreement_with_drugs(both_coherent, genetics_match_lof_drug, genetics_match_gof_drug):
    """Return 'coherent'/'dispar' when ``both_coherent`` holds, else NULL.

    'coherent' requires a drug-side count (``LoF_protect_ch`` or
    ``GoF_protect_ch``) together with the matching genetic condition.
    """
    return F.when(
        both_coherent,
        F.when(
            F.col("LoF_protect_ch").isNotNull() & genetics_match_lof_drug,
            F.lit("coherent"),
        )
        .when(
            F.col("GoF_protect_ch").isNotNull() & genetics_match_gof_drug,
            F.lit("coherent"),
        )
        .otherwise(F.lit("dispar")),
    )


def _genetics_vs_drugs(
    genetic_counts,
    doe_columns,
    analysis_chembl,
    negativeTD,
    diseaseTA,
    diagonal_lof,
    diagonal_gof,
    *,
    exclusive_drug_doe,
):
    """Annotate every drug target-disease pair with genetic-support flags.

    Shared implementation of ``full_analysis_propagation`` and
    ``full_analysis_noPropagation``.

    Args:
        genetic_counts: output of ``analysis_propagated`` or
            ``analysis_nonPropagated``.
        doe_columns: names of the DoE count columns compared for the maximum.
        analysis_chembl: drug-side counts; right-joined, so all of its rows
            are kept.
        negativeTD: pairs carrying ``stopReason`` (left-joined).
        diseaseTA: disease to ``taLabelSimple`` lookup (left-joined).
        diagonal_lof: genetic labels accepted for an LoF_protect drug in
            ``NdiagonalYes``.
        diagonal_gof: genetic labels accepted for a GoF_protect drug in
            ``NdiagonalYes``.
        exclusive_drug_doe: when true, ``NoneCellYes``/``NdiagonalYes`` only
            accept a drug DoE if the opposite drug DoE column is NULL.

    Returns:
        The joined DataFrame with the added annotation columns.
    """
    drug_side = analysis_chembl.selectExpr(
        "targetId",
        "diseaseId",
        "maxClinPhase",
        "coherencyDiagonal as coherencyDiagonal_ch",
        "coherencyOneCell as coherencyOneCell_ch",
        "LoF_protect as LoF_protect_ch",
        "GoF_protect as GoF_protect_ch",
    )

    # Genetic conditions matching an LoF_protect / GoF_protect drug.
    diagonal_match_lof = (
        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
    )
    diagonal_match_gof = (
        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
    )
    only_lof_protect = (
        F.col("LoF_protect").isNotNull()
        & F.col("LoF_risk").isNull()
        & F.col("GoF_protect").isNull()
        & F.col("GoF_risk").isNull()
    )
    only_gof_protect = (
        F.col("GoF_protect").isNotNull()
        & F.col("LoF_risk").isNull()
        & F.col("LoF_protect").isNull()
        & F.col("GoF_risk").isNull()
    )

    # Drug-side conditions used by the "max DoE" comparisons.
    lof_drug_for_max = F.col("LoF_protect_ch").isNotNull()
    gof_drug_for_max = F.col("GoF_protect_ch").isNotNull()
    if exclusive_drug_doe:
        lof_drug_for_max = lof_drug_for_max & F.col("GoF_protect_ch").isNull()
        gof_drug_for_max = gof_drug_for_max & F.col("LoF_protect_ch").isNull()

    # One entry per DoE column: its name when its count equals the row
    # maximum, NULL otherwise (NULL entries are filtered out afterwards).
    names_at_max = [
        F.when(F.col(name) == F.col("maxDoE"), F.lit(name)).otherwise(F.lit(None))
        for name in doe_columns
    ]

    def _max_doe_overlaps(labels):
        # True when at least one of ``labels`` is among the max-count labels.
        return (
            F.size(
                F.array_intersect(
                    F.col("maxDoE_names"),
                    F.array([F.lit(label) for label in labels]),
                )
            )
            > 0
        )

    return (
        genetic_counts.join(drug_side, on=["targetId", "diseaseId"], how="right")
        # NOTE(review): the author flagged that the coherencyDiagonal.isNotNull()
        # test should be removed; kept as is.
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        .withColumn(
            "diagonalAgreeWithDrugs",
            _agreement_with_drugs(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                diagonal_match_lof,
                diagonal_match_gof,
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            _agreement_with_drugs(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                only_lof_protect,
                only_gof_protect,
            ),
        )
        .withColumn("arrayN", F.array(*[F.col(name) for name in doe_columns]))
        .withColumn("maxDoE", F.array_max(F.col("arrayN")))
        .withColumn("maxDoE_names", F.array(*names_at_max))
        .withColumn(
            "maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        )
        .withColumn("Phase4", _yes_no(F.col("maxClinPhase") == 4))
        .withColumn("Phase>=3", _yes_no(F.col("maxClinPhase") >= 3))
        .withColumn("Phase>=2", _yes_no(F.col("maxClinPhase") >= 2))
        .withColumn("Phase>=1", _yes_no(F.col("maxClinPhase") >= 1))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn("PhaseT", _yes_no(F.col("stopReason") == "Negative"))
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            _yes_no(F.col("geneticEvidence") == "hasGeneticEvidence"),
        )
        # 'yes' only with genetic evidence AND a 'coherent' agreement; a
        # 'dispar' or NULL agreement gives 'no'.
        .withColumn(
            "diagonalYes",
            _yes_no(
                (F.col("hasGeneticEvidence") == "yes")
                & (F.col("diagonalAgreeWithDrugs") == "coherent")
            ),
        )
        .withColumn(
            "oneCellYes",
            _yes_no(
                (F.col("hasGeneticEvidence") == "yes")
                & (F.col("oneCellAgreeWithDrugs") == "coherent")
            ),
        )
        # Number of DoE columns whose count equals the row maximum.
        .withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))"),
        )
        # 'yes' when the drug's own DoE label is among the max-count labels.
        .withColumn(
            "NoneCellYes",
            _yes_no(
                (
                    lof_drug_for_max
                    & F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect"))
                )
                | (
                    gof_drug_for_max
                    & F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect"))
                )
            ),
        )
        # 'yes' when any label of the drug's diagonal is among the max-count
        # labels.
        .withColumn(
            "NdiagonalYes",
            _yes_no(
                (lof_drug_for_max & _max_doe_overlaps(diagonal_lof))
                | (gof_drug_for_max & _max_doe_overlaps(diagonal_gof))
            ),
        )
        # .persist()
    )


def full_analysis_propagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    """Compare disease-propagated genetic DoE counts with the drug side.

    Uses ``analysis_propagated`` for the genetic counts. In this variant the
    ``NoneCellYes``/``NdiagonalYes`` flags only accept a drug DoE when the
    opposite drug DoE column is NULL.

    NOTE(review): ``full_analysis_noPropagation`` does not apply that
    exclusivity check; confirm whether the difference is intended.
    """
    return _genetics_vs_drugs(
        analysis_propagated(assessment_all, analysisDatasources),
        doe_columns,
        analysis_chembl,
        negativeTD,
        diseaseTA,
        diagonal_lof,
        diagonal_gof,
        exclusive_drug_doe=True,
    )


#####
## no propag
#####
print("defining full analysis no propagation")


def full_analysis_noPropagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    """Compare non-propagated genetic DoE counts with the drug side.

    Uses ``analysis_nonPropagated`` for the genetic counts. In this variant
    the ``NoneCellYes``/``NdiagonalYes`` flags accept a drug DoE even when the
    opposite drug DoE column is also populated.

    NOTE(review): ``full_analysis_propagation`` additionally requires the
    opposite drug DoE column to be NULL; confirm whether the difference is
    intended.
    """
    return _genetics_vs_drugs(
        analysis_nonPropagated(assessment_all, analysisDatasources),
        doe_columns,
        analysis_chembl,
        negativeTD,
        diseaseTA,
        diagonal_lof,
        diagonal_gof,
        exclusive_drug_doe=False,
    )

print("moving to Step 3")

from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk

# Every (prediction, comparison) combination of "yes"/"no", in the order
# yes/yes, yes/no, no/yes, no/no.
_yes_no_grid = [
    (prediction, comparison)
    for prediction in ("yes", "no")
    for comparison in ("yes", "no")
]
_grid_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("prediction", "comparison")
    ]
)
full_data = spark.createDataFrame(data=_yes_no_grid, schema=_grid_schema)

# Wall-clock time at which the dictionary-building stage starts.
c = datetime.now()
print("starting dictionaries at", c)

#### continue here on 10.07.2024

# Result holders filled by the loop over datasources: non-propagated frames
# in dfs_dict (checked and changed on 01.06.2023), propagated ones in
# dfs_dict_propag.
dfs_dict, dfs_dict_propag = {}, {}


# Datasources of the "WOcgc" selection (no cancer_gene_census).
wocgc_list = [
    "gene_burden", "intogen", "eva", "eva_somatic",
    "impc", "orphanet", "gene2phenotype", "gwas_credible_set",
]

# Datasources of the "wCgc" selection: the list above plus cancer_gene_census.
wCgc_list = wocgc_list + ["cancer_gene_census"]

# Selections iterated by the analysis loop. Currently switched off:
# gene_burden, intogen, cancer_gene_census, eva, eva_somatic, impc, orphanet,
# gene2phenotype, WOcgc, wCgc.
datasource_list = ["gwas_credible_set", "somatic", "germline"]

# Datasources of the "germline" selection.
germline_list = [
    "gene_burden", "eva", "gwas_credible_set",
    "impc", "orphanet", "gene2phenotype",
]

# Datasources of the "somatic" selection.
somatic_list = [
    "intogen",
    "cancer_gene_census",
    "eva_somatic",
]


# assessment = prueba_assessment.filter(F.col("datasourceId").isin(datasources_analysis))
def dataset_builder(assessment_all, value, analysis_chembl, negativeTD, diseaseTA):
    """Build the non-propagated and the propagated dataset for one selection.

    Both builders are called with the same arguments, completed with the
    module-level ``doe_columns``, ``diagonal_lof`` and ``diagonal_gof``.

    Args:
        assessment_all: Forwarded unchanged to both builders.
        value: Datasource id, or list of datasource ids, to analyse.
        analysis_chembl: Forwarded unchanged to both builders.
        negativeTD: Forwarded unchanged to both builders.
        diseaseTA: Forwarded unchanged to both builders.

    Returns:
        tuple: ``(non_propagated, propagated)``. Only the unfiltered ("All")
        datasets are returned; the per-therapeutic-area subsets
        (Other / Other&Null / Oncology on ``taLabelSimple``) are disabled.
    """
    shared_args = (
        doe_columns,
        assessment_all,
        value,
        analysis_chembl,
        negativeTD,
        diseaseTA,
        diagonal_lof,
        diagonal_gof,
    )
    without_propagation = full_analysis_noPropagation(*shared_args)
    with_propagation = full_analysis_propagation(*shared_args)
    return without_propagation, with_propagation


# Named datasource groups. Any other entry of datasource_list is handed to
# dataset_builder unchanged, as a single datasource id.
datasource_groups = {
    "WOcgc": wocgc_list,
    "wCgc": wCgc_list,
    "germline": germline_list,
    "somatic": somatic_list,
}

for value in datasource_list:
    print(value)
    # dataset_builder returns exactly two dataframes (non-propagated and
    # propagated "All" datasets). The former "WOcgc" / "wCgc" branches unpacked
    # eight values and would have raised ValueError as soon as those groups
    # were re-enabled, so every entry now goes through the same two-value
    # unpacking.
    (
        dfs_dict[f"df_{value}_All_original"],
        dfs_dict_propag[f"df_{value}_All_propag"],
    ) = dataset_builder(
        assessment_all,
        datasource_groups.get(value, value),
        analysis_chembl,
        negativeTD,
        diseaseTA,
    )


def comparisons_df() -> list:
    """Return the list of comparison/prediction setups used in the analysis.

    Each collected Row carries, in order, a comparison column name, its
    comparison type, a clinical-phase column name and its prediction type.
    """
    comparison_names = (
        "hasGeneticEvidence",
        "diagonalYes",
        "oneCellYes",
        "NdiagonalYes",
        "NoneCellYes",
    )
    phase_names = ("Phase4", "Phase>=3", "Phase>=2", "Phase>=1", "PhaseT")

    comparison_schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )
    comparisons = spark.createDataFrame(
        data=[(name, "byDatatype") for name in comparison_names],
        schema=comparison_schema,
    )

    # No schema is given here, so Spark assigns its default column names.
    predictions = spark.createDataFrame(
        data=[(phase, "clinical") for phase in phase_names]
    )

    # The join has no key; presumably this pairs every comparison with every
    # phase -- confirm against the Spark version in use.
    paired = comparisons.join(predictions, how="full")
    return paired.collect()



# Module-level accumulators; aggregations_original appends to result_st,
# result_ci and results.
result, result_st, result_ci, array2, results = ([] for _ in range(5))


def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    """Count, store and test one comparison column against one prediction column.

    The per-cell counts are written as parquet under
    ``gs://ot-team/jroldan/<today_date>_analysis/<data>/``. The resulting 2x2
    table is tested with Fisher's exact test and one summary row is appended
    to the module-level ``results`` list.

    Args:
        df: Spark dataframe holding ``targetId``, ``diseaseId`` and the columns
            named by ``comparisonColumn`` and ``predictionColumn``.
        data: Dataset label; used as output sub-folder and as first field of
            the summary row.
        listado: List receiving the written parquet path (mutated in place).
        comparisonColumn: Name of the comparison column.
        comparisonType: Label stored in the ``comparisonType`` output column.
        predictionColumn: Name of the prediction column.
        predictionType: Label stored in the ``predictionType`` output column.
        today_date: Date prefix of the output folder.

    Returns:
        list: The module-level ``results`` list, i.e. every summary row
        accumulated so far including the one just added.
    """
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)

    uniqIds = df.select("targetId", "diseaseId").distinct().count()

    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        # "a" is the row count of each (comparison, prediction) cell.
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    # Build the output location once instead of re-concatenating it for the
    # write, the bookkeeping list, the log line and the summary row.
    relativePath = (
        f"{today_date}_analysis/{data}/"
        f"{comparisonColumn}_{predictionColumn}.parquet"
    )
    filePath = "gs://ot-team/jroldan/" + relativePath

    out.write.mode("overwrite").parquet(filePath)
    listado.append(filePath)
    print(relativePath)
    print(datetime.now())

    # Outer join with full_data so all four yes/no cells exist, then pivot to a
    # 2x2 table: rows = comparison (sorted descending), columns = prediction
    # yes / no. np.delete drops the leading "comparison" label column.
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    # Both results are flattened to "x,y" strings and split again below.
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    results.append(
        [
            data,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            filePath,
        ]
    )
    return results


c = datetime.now()
print("start doing aggregations and writing")
today_date = str(date.today())
aggSetups_original = comparisons_df()
listado = []
results = []

# Non-propagated and propagated datasets go through the same procedure.
# Timestamps are taken when each message is printed; previously one timestamp
# captured before the loops was reused, so every "at" message showed the same
# time.
# For debugging, restrict a run with itertools.islice(datasets.items(), 1).
for startMessage, endMessage, datasets in (
    (
        "starting with non-propagated aggregations at",
        "non propagated files wroten succesfully at",
        dfs_dict,
    ),
    (
        "starting with propagated aggregations at",
        "propagated files wroten succesfully at",
        dfs_dict_propag,
    ),
):
    print(startMessage, datetime.now())
    for key, df in datasets.items():
        df = df.persist()
        try:
            for row in aggSetups_original:
                aggregations_original(df, key, listado, *row, today_date)
        finally:
            # Release the cache even if one aggregation fails.
            df.unpersist()
        print(key + " df unpersisted")
    print(endMessage, datetime.now())


print("creating pandas dataframe with resulting rows")
df_results = pd.DataFrame(
    results,
    columns=[
        "group",
        "comparison",
        "phase",
        "OR",
        "pValue",
        "LowCI",
        "HighCI",
        "total",
        "array",
        "rs",
        "lowRs",
        "HighRs",
        "path",
    ],
)
print("created pandas dataframe")
print("converting to spark dataframe")
print("preparing dataframe")

# The file head imports ArrayType from the standard-library ``array`` module,
# which is the array class and raises TypeError when called with
# IntegerType(). Bind the Spark SQL type explicitly so the schema below does
# not depend on a later import having overridden the name.
from pyspark.sql.types import ArrayType

# Fields are listed in the same order as the df_results columns above.
schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

print("read pattern variables")
df = spreadSheetFormatter(spark.createDataFrame(df_results, schema=schema))
print("processed spreadsheet")
print("writting the dataframe")

# Regular expressions describing the "df_<value>_<area>_<suffix>" group names.
# These three are not used below; the extraction uses inline patterns.
value_pattern = r"df_([^_]+)_"  # Extracts {value}
middle_pattern = r"df_[^_]+_([^_]+)_"  # Extracts middle part (All, Other, etc.)
suffix_pattern = r"(original|propag)$"  # Extracts suffix (original or propag)

# Split the group label into datasource, therapeutic area and dataset type,
# then export everything as a single CSV.
df.withColumn(
    "datasource",
    F.regexp_extract(F.col("group"), r"df_(.*?)_(All|Other|OtherNull|Oncology)_(propag|original)", 1)
).withColumn(
    "therArea",
    F.regexp_extract(F.col("group"), r"_(All|Other|OtherNull|Oncology)_", 1)
).withColumn(
    "type",
    F.regexp_extract(F.col("group"), r"_(propag|original)$", 1)
).toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_genEvidAnalysis_new_rightJoin.csv"
)

print("dataframe written \n Analysis finished")


loaded files
loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc


25/06/18 22:11:39 WARN CacheManager: Asked to cache already cached data.
25/06/18 22:11:41 WARN CacheManager: Asked to cache already cached data.        


run temporary direction of effect
Moving to step 2
defining non propagated,propagated and analysis_drugs functions


25/06/18 22:11:43 WARN CacheManager: Asked to cache already cached data.        
25/06/18 22:11:43 WARN CacheManager: Asked to cache already cached data.
25/06/18 22:11:43 WARN CacheManager: Asked to cache already cached data.


defining full_analysis_propagation
defining full analysis no propagation
moving to Step 3
starting dictionaries at 2025-06-18 22:11:43.205945
gwas_credible_set


                                                                                 207]

somatic


25/06/18 22:13:03 WARN CacheManager: Asked to cache already cached data.
25/06/18 22:13:03 WARN CacheManager: Asked to cache already cached data.
25/06/18 22:13:04 WARN CacheManager: Asked to cache already cached data.
25/06/18 22:13:04 WARN CacheManager: Asked to cache already cached data.


germline


[Stage 94453:>                                                    (0 + 64) / 85] 207]

In [13]:

print("creating pandas dataframe with resulting rows")
df_results = pd.DataFrame(
    results,
    columns=[
        "group",
        "comparison",
        "phase",
        "OR",
        "pValue",
        "LowCI",
        "HighCI",
        "total",
        "array",
        "rs",
        "lowRs",
        "HighRs",
        "path",
    ],
)
print("created pandas dataframe")
print("converting to spark dataframe")
print("preparing dataframe")

# Make this cell independent of session state: the notebook also imports
# ArrayType from the standard-library ``array`` module, which cannot build a
# Spark schema. Bind the Spark SQL type explicitly.
from pyspark.sql.types import ArrayType

# Fields are listed in the same order as the df_results columns above.
schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

print("read pattern variables")
df = spreadSheetFormatter(spark.createDataFrame(df_results, schema=schema))
print("processed spreadsheet")
print("writting the dataframe")

# Regular expressions describing the "df_<value>_<area>_<suffix>" group names.
# These three are not used below; the extraction uses inline patterns.
value_pattern = r"df_([^_]+)_"  # Extracts {value}
middle_pattern = r"df_[^_]+_([^_]+)_"  # Extracts middle part (All, Other, etc.)
suffix_pattern = r"(original|propag)$"  # Extracts suffix (original or propag)

# Split the group label into datasource, therapeutic area and dataset type,
# then export everything as a single CSV.
df.withColumn(
    "datasource",
    F.regexp_extract(F.col("group"), r"df_(.*?)_(All|Other|OtherNull|Oncology)_(propag|original)", 1)
).withColumn(
    "therArea",
    F.regexp_extract(F.col("group"), r"_(All|Other|OtherNull|Oncology)_", 1)
).withColumn(
    "type",
    F.regexp_extract(F.col("group"), r"_(propag|original)$", 1)
).toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_genEvidAnalysis_new.csv"
)

print("dataframe written \n Analysis finished")


creating pandas dataframe with resulting rows
created pandas dataframe
converting to spark dataframe
preparing dataframe
read pattern variables
importing functions
imported functions
processed spreadsheet
writting the dataframe
dataframe written 
 Analysis finished


In [7]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()
# Spark's default is 200 shuffle partitions; increase further if needed.
spark.conf.set("spark.sql.shuffle.partitions", "400")

# Root folder of the Open Targets release outputs read below.
path_n = 'gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(path_n + "target/")
diseases = spark.read.parquet(path_n + "disease/")
evidences = spark.read.parquet(path_n + "evidence")
credible = spark.read.parquet(path_n + "credible_set")
new = spark.read.parquet(path_n + "colocalisation_coloc")
index = spark.read.parquet(path_n + "study/")
variantIndex = spark.read.parquet(path_n + "variant")
biosample = spark.read.parquet(path_n + "biosample")
ecaviar = spark.read.parquet(path_n + "colocalisation_ecaviar")

# Stack both colocalisation outputs; columns present in only one of them are
# filled with nulls in the other.
all_coloc = ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

# Datasource ids of interest: genetic evidence sources followed by "chembl".
# "ot_genetics_portal" is deliberately left out.
# NOTE(review): this list spells "gwas_credible_sets" while the lists further
# up in the file use "gwas_credible_set" -- confirm which id the data carries.
datasource_filter = [
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
] + ["chembl"]

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    """Return the items of ``tup`` as one comma-separated string."""
    return ",".join(str(item) for item in tup)


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    """Count, store and test one comparison column against one prediction column.

    The per-cell counts are written as parquet under
    ``gs://ot-team/jroldan/<today_date>_analysis/<data>/``. The resulting 2x2
    table is tested with Fisher's exact test and the summary is returned as a
    flat list. The function also appends to the module-level ``result_st`` and
    ``result_ci`` lists and reads the module-level ``full_data`` dataframe.

    Args:
        df: Spark dataframe holding ``targetId`` and the columns named by
            ``comparisonColumn`` and ``predictionColumn``.
        data: Dataset label; stored in the ``dataset`` column and used as
            output sub-folder.
        listado: List receiving the written parquet path (mutated in place).
        comparisonColumn: Name of the comparison column.
        comparisonType: Label stored in ``comparisonType``; also part of the
            output file name and the first field of the returned row.
        predictionColumn: Name of the prediction column.
        predictionType: Label stored in the ``predictionType`` output column.
        today_date: Date prefix of the output folder.

    Returns:
        list: Thirteen values -- comparison type, comparison column, prediction
        column, odds ratio, p-value, CI bounds, total, the 2x2 table, relative
        success with its interval, and the parquet path.
    """
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        # "a" is the row count of each (comparison, prediction) cell.
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    # Build the output location once and reuse it for the write, the
    # bookkeeping list, the log line and the returned row.
    path = (
        f"gs://ot-team/jroldan/{today_date}_analysis/{data}/"
        f"{comparisonColumn}_{comparisonType}_{predictionColumn}.parquet"
    )
    out.write.mode("overwrite").parquet(path)
    listado.append(path)
    print(path)

    ### making analysis
    # Outer join with full_data so all four yes/no cells exist, then pivot to a
    # 2x2 table: rows = comparison (sorted descending), columns = prediction
    # yes / no. np.delete drops the leading "comparison" label column.
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    # Both results are flattened to "x,y" strings and split again below.
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    """Return the comparison/prediction setups for the given predictor columns.

    Every name in ``elements`` is tagged as a "predictor" comparison and
    joined, without a join key, to the clinical-phase prediction columns.

    Args:
        elements: Iterable of column names to use as comparisons.

    Returns:
        list: Collected Rows holding, in order, comparison, comparisonType and
        the two unnamed prediction fields (phase column name, "clinical").
    """
    comparison_schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )
    # Every requested column is included as a predictor.
    comparisons = spark.createDataFrame(
        [(column, "predictor") for column in elements],
        schema=comparison_schema,
    )

    phase_names = ("Phase>=4", "Phase>=3", "Phase>=2", "Phase>=1", "PhaseT")
    # No schema is given here, so Spark assigns its default column names.
    predictions = spark.createDataFrame(
        data=[(phase, "clinical") for phase in phase_names]
    )

    paired = comparisons.join(predictions, how="full")
    return paired.collect()


print("load comparisons_df_iterative function")


# Reference grid with every (prediction, comparison) yes/no combination.
# aggregations_original outer-joins its counts against it before pivoting.
full_data = spark.createDataFrame(
    data=[
        (prediction, comparison)
        for prediction in ("yes", "no")
        for comparison in ("yes", "no")
    ],
    schema=StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("prediction", "comparison")
        ]
    ),
)
print("created full_data and lists")

# The rightTissue input is currently disabled; the log line below is still
# printed.
#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

# Distinct target-disease pairs from ChEMBL evidence whose stop-reason
# categories include "Negative", each flagged with stopReason = "Negative".
negativeTD = (
    evidences.filter(
        (F.col("datasourceId") == "chembl")
        & F.array_contains(F.col("studyStopReasonCategories"), "Negative")
    )
    .groupBy("targetId", "diseaseId")
    .count()
    .drop("count")
    .withColumn("stopReason", F.lit("Negative"))
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC 
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOIN WITH CHEMBL


## read the benchmark dataset from its parquet folder
name = 'benchmark'
input_partitioned_path = "gs://ot-team/jroldan/analysis/parquetFiles/" + name
benchmark = spark.read.parquet(input_partitioned_path)
print(f'read {name}')
#### Analysis

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE","colocalisationMethod"]

# One small dataframe per studied column: its non-null distinct values under a
# common "distinct_value" name, tagged with the column they come from.
temp_dfs_for_union = [
    benchmark.select(F.col(col_name).alias("distinct_value"))
    .filter(F.col("distinct_value").isNotNull())
    .distinct()
    .withColumn("column_name", F.lit(col_name))
    for col_name in variables_study
]

disdic = {}

if not temp_dfs_for_union:
    print("variables_study list is empty, disdic will be empty.")
else:
    # Stack the per-column dataframes, matching columns by name, so that a
    # single collect brings every distinct value to the driver.
    combined_distinct_values_df = temp_dfs_for_union[0]
    for extra_df in temp_dfs_for_union[1:]:
        combined_distinct_values_df = combined_distinct_values_df.unionByName(extra_df)

    print("Collecting combined distinct values from the cluster...")
    collected_rows = combined_distinct_values_df.collect()

    # value -> originating column; a value present under several columns keeps
    # the column of the last collected row.
    disdic.update((row.distinct_value, row.column_name) for row in collected_rows)


print("\nFinal disdic:", disdic)

# Pivoted dataframes, one per study variable, read back from parquet and
# kept in a dict keyed by the variable name.
pivoted_dfs = {}
for col_name in variables_study:
    input_partitioned_path = (
        "gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_" + col_name
    )
    pivoted_df = pivoted_dfs[col_name] = spark.read.parquet(input_partitioned_path)
    print(f"DataFrame successfully read and saved as pivoted_dfs[{col_name}]")

# Access a result with e.g. pivoted_dfs['some_col_name'].show()

# Module-level accumulators for the aggregation runs below, plus the date
# string used to prefix output names.
result, result_st, result_ci, array2, listado, result_all = ([] for _ in range(6))
today_date = date.today().isoformat()

##### PROJECT ID, BIOSAMPLE NAME, RIGHTSTUDYTYPE, COLOC DOE, COLOCALISATION METHOD ######
# The five study variables are processed identically, so they share one loop
# instead of five copy-pasted sections.
for col_name in (
    'projectId',
    'biosampleName',
    'rightStudyType',
    'colocDoE',
    'colocalisationMethod',
):
    print(f'working with {col_name}')
    pivoted_df = pivoted_dfs[col_name]
    pivoted_df.persist()
    try:
        # Distinct non-null values of the variable in the benchmark.
        unique_values = [
            value_row[0]
            for value_row in benchmark.select(col_name)
            .filter(F.col(col_name).isNotNull())
            .distinct()
            .collect()
        ]
        # Columns left after dropping the per-value columns and skipping the
        # first ten -- presumably identifier/phase columns, TODO confirm.
        # (The count used to be stored in a variable named ``filter``, which
        # shadowed the builtin.)
        n_analysis_columns = len(pivoted_df.drop(*unique_values).columns[10:])
        print('There are ', n_analysis_columns, 'columns to analyse with phases')

        # Take the last n columns. An explicit start index is used because
        # columns[-0:] would return every column when n is 0.
        all_columns = pivoted_df.columns
        analysis_columns = all_columns[len(all_columns) - n_analysis_columns:]
        rows = comparisons_df_iterative(analysis_columns)

        for row in rows:
            results = aggregations_original(
                pivoted_df, "propagated", listado, *row, today_date
            )
            result_all.append(results)
    finally:
        # Release the cache even if one aggregation fails.
        pivoted_df.unpersist()
    print('df unpersisted')

# Layout of one result row, in the order aggregations_original returns it.
schema = (
    StructType()
    .add("group", StringType(), True)
    .add("comparison", StringType(), True)
    .add("phase", StringType(), True)
    .add("oddsRatio", DoubleType(), True)
    .add("pValue", DoubleType(), True)
    .add("lowerInterval", DoubleType(), True)
    .add("upperInterval", DoubleType(), True)
    .add("total", StringType(), True)
    .add("values", ArrayType(ArrayType(IntegerType())), True)
    .add("relSuccess", DoubleType(), True)
    .add("rsLower", DoubleType(), True)
    .add("rsUpper", DoubleType(), True)
    .add("path", StringType(), True)
)
import re

# Substrings that mark where a comparison name splits into prefix and suffix.
# "_tissue" and "_isSignalFromRightTissue" are currently disabled.
patterns = [
    "_only",
    "_isRightTissueSignalAgreed",
]
# One capturing group matching any of the (escaped) substrings.
regex_pattern = "(%s)" % "|".join(re.escape(pattern) for pattern in patterns)

# Turn the list of result rows into a dataframe and split "comparison" into
# the text before the first marker (prefix) and the marker itself (suffix).
formatted_results = spreadSheetFormatter(
    spark.createDataFrame(result_all, schema=schema)
)
comparison_col = F.col("comparison")
df = formatted_results.withColumn(
    "prefix", F.regexp_replace(comparison_col, regex_pattern + ".*", "")
).withColumn(
    "suffix", F.regexp_extract(comparison_col, regex_pattern, 0)
)

### annotate projectId, tissue, qtl type and doe type:

from pyspark.sql.functions import create_map
from itertools import chain

# Literal Spark map built from disdic, flattened as key1, value1, key2, value2...
map_entries = [F.lit(item) for item in chain.from_iterable(disdic.items())]
mapping_expr = create_map(map_entries)

# Look each prefix up in the map to find the study variable it belongs to.
df_annot = df.withColumn('annotation', mapping_expr.getItem(F.col('prefix')))

output_csv = f"gs://ot-team/jroldan/analysis/{today_date}_credibleSetColocDoEanalysis.csv"
df_annot.toPandas().to_csv(output_csv)

print("dataframe written \n Analysis finished")

loaded files
load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study
read benchmark
Collecting combined distinct values from the cluster...


                                                                                ]


Final disdic: {'Fairfax_2014': 'projectId', 'Walker_2019': 'projectId', 'Sun_2018': 'projectId', 'HipSci': 'projectId', 'van_de_Bunt_2015': 'projectId', 'Nedelec_2016': 'projectId', 'BrainSeq': 'projectId', 'Aygun_2021': 'projectId', 'GEUVADIS': 'projectId', 'OneK1K': 'projectId', 'Kim-Hellmuth_2017': 'projectId', 'GTEx': 'projectId', 'Schmiedel_2018': 'projectId', 'Young_2019': 'projectId', 'Lepik_2017': 'projectId', 'Schwartzentruber_2018': 'projectId', 'Peng_2018': 'projectId', 'Bossini-Castillo_2019': 'projectId', 'CEDAR': 'projectId', 'ROSMAP': 'projectId', 'Braineac2': 'projectId', 'BLUEPRINT': 'projectId', 'TwinsUK': 'projectId', 'UKB_PPP_EUR': 'projectId', 'Alasoo_2018': 'projectId', 'FUSION': 'projectId', 'Perez_2022': 'projectId', 'Cytoimmgen': 'projectId', 'Quach_2016': 'projectId', 'iPSCORE': 'projectId', 'CommonMind': 'projectId', 'PhLiPS': 'projectId', 'GENCORD': 'projectId', 'CAP': 'projectId', 'PISA': 'projectId', 'Jerber_2021': 'projectId', 'Fairfax_2012': 'projectId'

                                                                                

DataFrame successfully read and saved as pivoted_dfs[rightStudyType]
DataFrame successfully read and saved as pivoted_dfs[colocDoE]
DataFrame successfully read and saved as pivoted_dfs[colocalisationMethod]
working with projectId


                                                                                

There are  38 columns to analyse with phases


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/HipSci_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/HipSci_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/HipSci_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/HipSci_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/HipSci_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Nedelec_2016_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Nedelec_2016_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Nedelec_2016_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Nedelec_2016_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Nedelec_2016_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GTEx_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GTEx_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GTEx_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GTEx_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GTEx_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schmiedel_2018_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schmiedel_2018_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schmiedel_2018_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schmiedel_2018_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schmiedel_2018_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Jerber_2021_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Jerber_2021_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Jerber_2021_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Jerber_2021_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Jerber_2021_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BLUEPRINT_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BLUEPRINT_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BLUEPRINT_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BLUEPRINT_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BLUEPRINT_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Quach_2016_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Quach_2016_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Quach_2016_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Quach_2016_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Quach_2016_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BrainSeq_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BrainSeq_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BrainSeq_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BrainSeq_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/BrainSeq_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/UKB_PPP_EUR_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/UKB_PPP_EUR_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/UKB_PPP_EUR_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/UKB_PPP_EUR_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/UKB_PPP_EUR_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/FUSION_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/FUSION_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/FUSION_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/FUSION_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/FUSION_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Sun_2018_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Sun_2018_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Sun_2018_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Sun_2018_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Sun_2018_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ROSMAP_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ROSMAP_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ROSMAP_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ROSMAP_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ROSMAP_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Alasoo_2018_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Alasoo_2018_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Alasoo_2018_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Alasoo_2018_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Alasoo_2018_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GENCORD_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GENCORD_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GENCORD_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GENCORD_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GENCORD_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GEUVADIS_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GEUVADIS_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GEUVADIS_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GEUVADIS_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GEUVADIS_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Lepik_2017_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Lepik_2017_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Lepik_2017_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Lepik_2017_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Lepik_2017_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/TwinsUK_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/TwinsUK_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/TwinsUK_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/TwinsUK_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/TwinsUK_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CommonMind_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/CommonMind_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CommonMind_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CommonMind_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CommonMind_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/PhLiPS_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PhLiPS_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PhLiPS_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PhLiPS_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PhLiPS_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/van_de_Bunt_2015_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/van_de_Bunt_2015_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/van_de_Bunt_2015_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/van_de_Bunt_2015_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/van_de_Bunt_2015_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Bossini-Castillo_2019_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Bossini-Castillo_2019_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Bossini-Castillo_2019_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Bossini-Castillo_2019_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Bossini-Castillo_2019_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Aygun_2021_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Aygun_2021_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Aygun_2021_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Aygun_2021_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Aygun_2021_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2012_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2012_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2012_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2012_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2012_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peng_2018_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peng_2018_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peng_2018_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peng_2018_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peng_2018_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CAP_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CAP_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/CAP_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CAP_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/CAP_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2014_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2014_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2014_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2014_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Fairfax_2014_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Cytoimmgen_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Cytoimmgen_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Cytoimmgen_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Cytoimmgen_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Cytoimmgen_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schwartzentruber_2018_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schwartzentruber_2018_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schwartzentruber_2018_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schwartzentruber_2018_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Schwartzentruber_2018_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/PISA_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PISA_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PISA_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PISA_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/PISA_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Walker_2019_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Walker_2019_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Walker_2019_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Walker_2019_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Walker_2019_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/CEDAR_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/CEDAR_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CEDAR_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CEDAR_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CEDAR_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Braineac2_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Braineac2_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Braineac2_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Braineac2_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Braineac2_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/iPSCORE_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/iPSCORE_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/iPSCORE_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/iPSCORE_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/iPSCORE_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Young_2019_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Young_2019_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Young_2019_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Young_2019_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Young_2019_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kim-Hellmuth_2017_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kim-Hellmuth_2017_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kim-Hellmuth_2017_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kim-Hellmuth_2017_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kim-Hellmuth_2017_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Perez_2022_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Perez_2022_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Perez_2022_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Perez_2022_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Perez_2022_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/OneK1K_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/OneK1K_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/OneK1K_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/OneK1K_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/OneK1K_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kasela_2017_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kasela_2017_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kasela_2017_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kasela_2017_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Kasela_2017_only_predictor_PhaseT.parquet
df unpersisted
working with biosampleName


                                                                                

There are  73 columns to analyse with phases


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skeletal muscle tissue_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skeletal muscle tissue_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skeletal muscle tissue_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skeletal muscle tissue_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/skeletal muscle tissue_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial nerve_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial nerve_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial nerve_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial nerve_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial nerve_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/placenta_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/placenta_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/placenta_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/placenta_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/placenta_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/induced pluripotent stem cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/induced pluripotent stem cell_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/induced pluripotent stem cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/induced pluripotent stem cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/induced pluripotent stem cell_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/adipose tissue_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/adipose tissue_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/adipose tissue_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/adipose tissue_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/adipose tissue_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right atrium auricular region_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right atrium auricular region_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right atrium auricular region_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right atrium auricular region_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right atrium auricular region_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sigmoid colon_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sigmoid colon_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sigmoid colon_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sigmoid colon_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/sigmoid colon_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/left ventricle myocardium_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/left ventricle myocardium_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/left ventricle myocardium_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/left ventricle myocardium_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/left ventricle myocardium_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/macrophage_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/macrophage_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/macrophage_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/macrophage_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/macrophage_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/frontal cortex_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/frontal cortex_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/frontal cortex_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/frontal cortex_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/frontal cortex_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/fibroblast_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/fibroblast_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/fibroblast_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/fibroblast_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/fibroblast_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus muscularis mucosa_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus muscularis mucosa_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus muscularis mucosa_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus muscularis mucosa_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus muscularis mucosa_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/vagina_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/vagina_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/vagina_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/vagina_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/vagina_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/body of pancreas_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/body of pancreas_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/body of pancreas_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/body of pancreas_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/body of pancreas_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/pituitary gland_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/pituitary gland_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/pituitary gland_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/pituitary gland_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/pituitary gland_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/dorsolateral prefrontal cortex_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/dorsolateral prefrontal cortex_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/dorsolateral prefrontal cortex_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/dorsolateral prefrontal cortex_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/dorsolateral prefrontal cortex_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial artery_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial artery_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial artery_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial artery_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tibial artery_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD4-positive, alpha-beta T cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD4-positive, alpha-beta T cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD4-positive, alpha-beta T cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD4-positive, alpha-beta T cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD4-positive, alpha-beta T cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cerebellum_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cerebellum_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cerebellum_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cerebellum_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cerebellum_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/upper lobe of left lung_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/upper lobe of left lung_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/upper lobe of left lung_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/upper lobe of left lung_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/upper lobe of left lung_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/prostate gland_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/prostate gland_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/prostate gland_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/prostate gland_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/prostate gland_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/gastroesophageal sphincter_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/gastroesophageal sphincter_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/gastroesophageal sphincter_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/gastroesophageal sphincter_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/gastroesophageal sphincter_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/transverse colon_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/transverse colon_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/transverse colon_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/transverse colon_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/transverse colon_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/lymphoblastoid cell line_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/lymphoblastoid cell line_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/lymphoblastoid cell line_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/lymphoblastoid cell line_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/lymphoblastoid cell line_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-positive, CD16-negative classical monocyte_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-positive, CD16-negative classical monocyte_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-positive, CD16-negative classical monocyte_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-positive, CD16-negative classical monocyte_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-positive, CD16-negative classical monocyte_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/thyroid gland_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/thyroid gland_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/thyroid gland_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/thyroid gland_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/thyroid gland_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/breast epithelium_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/breast epithelium_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/breast epithelium_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/breast epithelium_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/breast epithelium_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hypothalamus_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hypothalamus_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hypothalamus_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hypothalamus_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hypothalamus_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD8-positive, alpha-beta T cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD8-positive, alpha-beta T cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD8-positive, alpha-beta T cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD8-positive, alpha-beta T cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD8-positive, alpha-beta T cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/memory regulatory T cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/memory regulatory T cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/memory regulatory T cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/memory regulatory T cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/memory regulatory T cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/testis_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/testis_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/testis_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/testis_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/testis_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neutrophil_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neutrophil_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neutrophil_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neutrophil_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neutrophil_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/caudate nucleus_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/caudate nucleus_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/caudate nucleus_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/caudate nucleus_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/caudate nucleus_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cortex of kidney_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cortex of kidney_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cortex of kidney_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cortex of kidney_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/cortex of kidney_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus squamous epithelium_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus squamous epithelium_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus squamous epithelium_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus squamous epithelium_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/esophagus squamous epithelium_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-low, CD16-positive monocyte_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-low, CD16-positive monocyte_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-low, CD16-positive monocyte_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-low, CD16-positive monocyte_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/CD14-low, CD16-positive monocyte_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ascending aorta_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ascending aorta_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ascending aorta_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ascending aorta_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/ascending aorta_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood plasma_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood plasma_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood plasma_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood plasma_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/blood plasma_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/adrenal gland_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/adrenal gland_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/adrenal gland_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/adrenal gland_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/adrenal gland_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/omental fat pad_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/omental fat pad_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/omental fat pad_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/omental fat pad_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/omental fat pad_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neocortex_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neocortex_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neocortex_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neocortex_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neocortex_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skin of body_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skin of body_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skin of body_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/skin of body_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/skin of body_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/nucleus accumbens_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/nucleus accumbens_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/nucleus accumbens_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/nucleus accumbens_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/nucleus accumbens_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/C1 segment of cervical spinal cord_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/C1 segment of cervical spinal cord_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/C1 segment of cervical spinal cord_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/C1 segment of cervical spinal cord_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/C1 segment of cervical spinal cord_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/naive regulatory T cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/naive regulatory T cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/naive regulatory T cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/naive regulatory T cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/naive regulatory T cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 2 cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 2 cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 2 cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 2 cell_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 2 cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sensory neuron_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sensory neuron_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sensory neuron_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sensory neuron_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/sensory neuron_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/uterus_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/uterus_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/uterus_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/uterus_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/uterus_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/floor plate_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/floor plate_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/floor plate_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/floor plate_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/floor plate_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/coronary artery_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/coronary artery_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/coronary artery_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/coronary artery_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/coronary artery_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/suprapubic skin_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/suprapubic skin_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/suprapubic skin_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/suprapubic skin_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/suprapubic skin_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 17 cell_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 17 cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 17 cell_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 17 cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 17 cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/stomach_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/stomach_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/stomach_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/stomach_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/stomach_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/putamen_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/putamen_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/putamen_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/putamen_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/putamen_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/amygdala_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/amygdala_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/amygdala_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/amygdala_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/amygdala_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neural progenitor cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neural progenitor cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neural progenitor cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/neural progenitor cell_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/neural progenitor cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior lingual gland_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior lingual gland_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior lingual gland_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior lingual gland_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior lingual gland_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/spleen_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/spleen_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/spleen_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/spleen_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/spleen_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/microglial cell_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/microglial cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/microglial cell_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/microglial cell_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/microglial cell_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peyer's patch_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peyer's patch_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peyer's patch_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peyer's patch_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Peyer's patch_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior cingulate cortex_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior cingulate cortex_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior cingulate cortex_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior cingulate cortex_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/anterior cingulate cortex_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 1 cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 1 cell_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 1 cell_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 1 cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T-helper 1 cell_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T follicular helper cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T follicular helper cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T follicular helper cell_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/T follicular helper cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/T follicular helper cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right lobe of liver_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/right lobe of liver_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right lobe of liver_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/right lobe of liver_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/right lobe of liver_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/islet of Langerhans_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/islet of Langerhans_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/islet of Langerhans_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/islet of Langerhans_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/islet of Langerhans_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ovary_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ovary_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/ovary_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/ovary_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/ovary_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/central memory CD4-positive, alpha-beta T cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/central memory CD4-positive, alpha-beta T cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/central memory CD4-positive, alpha-beta T cell_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/central memory CD4-positive, alpha-beta T cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/central memory CD4-positive, alpha-beta T cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/B cell_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/B cell_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/B cell_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/B cell_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/B cell_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/substantia nigra_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/substantia nigra_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/substantia nigra_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/substantia nigra_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/substantia nigra_only_predictor_PhaseT.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Ammon's horn_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Ammon's horn_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Ammon's horn_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/Ammon's horn_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/Ammon's horn_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hepatocyte_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hepatocyte_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hepatocyte_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/hepatocyte_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/hepatocyte_only_predictor_PhaseT.parquet
df unpersisted
working with rightStudyType


                                                                                

There are  5 columns to analyse with phases


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tuqtl_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tuqtl_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/tuqtl_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/tuqtl_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/tuqtl_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sqtl_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sqtl_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sqtl_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sqtl_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sqtl_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/pqtl_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/pqtl_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/pqtl_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/pqtl_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/pqtl_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/eqtl_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/eqtl_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/eqtl_only_predictor_Phase>=2.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/eqtl_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/eqtl_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sceqtl_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sceqtl_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sceqtl_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/sceqtl_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/sceqtl_only_predictor_PhaseT.parquet
df unpersisted
working with colocDoE


                                                                                

There are  2 columns to analyse with phases


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GoF_protect_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GoF_protect_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/GoF_protect_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GoF_protect_only_predictor_Phase>=1.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/GoF_protect_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/LoF_protect_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/LoF_protect_only_predictor_Phase>=3.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/LoF_protect_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/LoF_protect_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/LoF_protect_only_predictor_PhaseT.parquet
df unpersisted
working with colocalisationMethod


                                                                                

There are  2 columns to analyse with phases


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/eCAVIAR_only_predictor_Phase>=4.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/eCAVIAR_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/eCAVIAR_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/eCAVIAR_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/eCAVIAR_only_predictor_PhaseT.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/COLOC_only_predictor_Phase>=4.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/COLOC_only_predictor_Phase>=3.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/COLOC_only_predictor_Phase>=2.parquet
gs://ot-team/jroldan/2025-06-18_analysis/propagated/COLOC_only_predictor_Phase>=1.parquet


                                                                                

gs://ot-team/jroldan/2025-06-18_analysis/propagated/COLOC_only_predictor_PhaseT.parquet
df unpersisted
importing functions
imported functions




dataframe written 
 Analysis finished


In [2]:
#### testing ecaviar for genevid analysis
import time
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd


spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed
print('This time we want to have all Coloc and ecaviar >0.01')

path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

print("loaded files")

newColoc = (
    all_coloc.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            "pValueExponent as qtlPValueExponent",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# Keep only the evidence columns that hold at least one value for the
# "gwas_credible_sets" datasource (columns that are entirely null are dropped).
df = evidences.filter(F.col("datasourceId") == "gwas_credible_sets")

# Single aggregated row: for every column, how many rows are non-null.
non_null_counts = df.select(
    *(
        F.sum(F.col(column_name).isNotNull().cast("int")).alias(column_name)
        for column_name in df.columns
    )
)

# Pull that one row to the driver and keep the names of the populated columns.
non_null_columns = [
    column_name
    for column_name, populated in non_null_counts.collect()[0].asDict().items()
    if populated > 0
]

# Project the evidence onto the populated columns only.
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

# Attach the colocalisation rows to the GWAS credible-set evidence and derive a
# direction-of-effect label ("colocDoE") for every row.
#
# The label combines the sign of the GWAS beta (betaGwas) with the sign of
# betaRatioSignAverage, and the mapping depends on the QTL study type:
#   expression/protein-like QTLs (eqtl, pqtl, tuqtl, sceqtl, sctuqtl)
#       betaGwas > 0, ratio > 0 -> GoF_risk      betaGwas > 0, ratio < 0 -> LoF_risk
#       betaGwas < 0, ratio > 0 -> LoF_protect   betaGwas < 0, ratio < 0 -> GoF_protect
#   splicing QTLs (sqtl, scsqtl): same table with GoF and LoF swapped.
# Rows matching no branch (other study types, null or zero betas) get a null
# colocDoE, because neither the outer nor the inner `when` chains have an
# `otherwise`.
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="right", ## has to be right to have the whole genetic evidence subset (having doe or not)
        )
        # Disease propagation to parent terms is switched off in this variant:
        #.join(  ### propagated using parent terms
        #    diseases.selectExpr(
        #        "id as diseaseId", "name", "parents", "therapeuticAreas"
        #    ),
        #    on="diseaseId",
        #    how="left",
        #)
        #.withColumn(
        #    "diseaseId",
        #    F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        #)
        #.drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### splicing QTLs: GoF/LoF labels are swapped relative to the eqtl/pqtl branch above
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")
resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | F.col('h4').isNotNull()) #| (F.col('h4')>=0.8))
datasource_filter = [
    #"gwas_credible_set", remove so avoid potential duplicates as it will be incorporated later (DoE is done separately)
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")

# Two variants of the same "homogenized" direction-of-effect assignment, which
# differ only in the column used to order rows inside each
# (targetId, diseaseId, leftStudyId) group:
#   gwasCredibleAssoc            -> ordered by the GWAS pValueExponent
#   gwasCredibleAssoc_qtlPValue  -> ordered by the QTL qtlPValueExponent
# Ascending order puts the smallest exponent first (presumably the most
# significant signal - confirm against the credible_set schema).
#
# NOTE(review): an ordered window with no explicit frame uses Spark's default
# running frame (unbounded preceding -> current row). With
# F.first(..., ignorenulls=True) this means rows sorted *before* the first
# non-null colocDoE still get null and are turned into "noEvaluable" below,
# so one group can contain both "noEvaluable" and a real label. If a single
# value per group is intended, an explicit
# rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) frame
# would be needed - confirm the intent before changing.
window_spec = Window.partitionBy("targetId", "diseaseId",'leftStudyId').orderBy( ### include gwas study
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    resolvedColocFiltered.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId",'leftStudyId', "homogenized")
    # Replace a missing label by the explicit "noEvaluable" category.
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

# Same grouping, but rows are ranked by the QTL credible-set p-value exponent.
window_spec_qtl = Window.partitionBy("targetId", "diseaseId",'leftStudyId').orderBy( ### include gwas study
    F.col("qtlPValueExponent").asc()
)
# qtlPValueExponent
gwasCredibleAssoc_qtlPValue = (
    resolvedColocFiltered.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec_qtl)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId",'leftStudyId', "homogenized")
    # Replace a missing label by the explicit "noEvaluable" category.
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)


print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

assessment_all = assessment.unionByName(
    gwasCredibleAssoc_qtlPValue.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)


This time we want to have all Coloc and ecaviar >0.01
loaded files
loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc


25/06/19 08:04:24 WARN CacheManager: Asked to cache already cached data.


run temporary direction of effect
Moving to step 2


25/06/19 08:04:25 WARN CacheManager: Asked to cache already cached data.


In [4]:

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

assessment_all = assessment.unionByName(
    gwasCredibleAssoc_qtlPValue.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)

print("defining non propagated,propagated and analysis_drugs functions")

def analysis_nonPropagated(assessment_all, analysisDatasources):
    """Count direction-of-effect labels per target-disease pair, without disease propagation.

    Args:
        assessment_all: DataFrame with at least ``datasourceId``, ``targetId``,
            ``diseaseId`` and ``homogenized`` columns.
        analysisDatasources: iterable of ``datasourceId`` values to keep.

    Returns:
        Whatever ``discrepancifier`` (project helper imported from ``functions``)
        returns for the pivoted table: one row per (targetId, diseaseId) and one
        count column per distinct ``homogenized`` value.
    """
    selected = assessment_all.filter(
        F.col("datasourceId").isin(analysisDatasources)
    )
    # The previous version also built a "datasources" collect_set window column
    # here. It was discarded by the groupBy/pivot right below and never reached
    # the output, so it has been removed as dead work.
    label_counts = (
        selected.groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )
    return discrepancifier(label_counts)


def analysis_propagated(assessment_all, analysisDatasources):
    """Count direction-of-effect labels per target-disease pair, with disease propagation.

    Every evidence row is duplicated onto the disease itself and each of its
    parent terms, using the module-level ``diseases2`` table
    (``id`` -> ``diseaseIdPropagated``), before counting.

    Args:
        assessment_all: DataFrame with at least ``datasourceId``, ``targetId``,
            ``diseaseId`` and ``homogenized`` columns.
        analysisDatasources: iterable of ``datasourceId`` values to keep.

    Returns:
        Whatever ``discrepancifier`` (project helper imported from ``functions``)
        returns for the pivoted table: one row per (targetId, propagated
        diseaseId) and one count column per distinct ``homogenized`` value.
    """
    selected = assessment_all.filter(
        F.col("datasourceId").isin(analysisDatasources)
    )
    # The previous version also built a "datasources" collect_set window column
    # here. It was discarded by the groupBy/pivot below and never reached the
    # output, so it has been removed as dead work.
    propagated = (
        selected.join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        # From here on "diseaseId" is the propagated term; the original id is
        # kept under "oldDiseaseId" until the aggregation drops it.
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
    )
    label_counts = (
        propagated.groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )
    return discrepancifier(label_counts)

chembl_ds = ["chembl"]

def analysis_drugs(assessment_all, chembl_ds):
    """Build the drug-side label counts per target, disease and maximum clinical phase.

    Args:
        assessment_all: DataFrame with ``datasourceId``, ``targetId``,
            ``diseaseId``, ``clinicalPhase`` and ``homogenized`` columns.
        chembl_ds: iterable of ``datasourceId`` values treated as drug evidence.

    Returns:
        The result of ``discrepancifier`` applied to the persisted pivot table
        (one count column per distinct ``homogenized`` value).
    """
    per_pair = Window.partitionBy("targetId", "diseaseId")

    drug_rows = assessment_all.filter(F.col("datasourceId").isin(chembl_ds))

    # Highest clinical phase reached by each target-disease pair.
    with_max_phase = drug_rows.withColumn(
        "maxClinPhase", F.max(F.col("clinicalPhase")).over(per_pair)
    )

    # The pivoted counts are persisted before being handed to discrepancifier.
    label_counts = (
        with_max_phase.groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )
    return discrepancifier(label_counts)


analysis_chembl = analysis_drugs(assessment_all, chembl_ds)


defining non propagated,propagated and analysis_drugs functions


                                                                                

In [2]:
all_coloc.show()

                                                                                

+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+----+----+----+----+----+
|    leftStudyLocusId|   rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|                clpp|colocalisationMethod|betaRatioSignAverage|  h0|  h1|  h2|  h3|  h4|
+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+----+----+----+----+----+
|000d05393cd5863b3...|9651cca8bc8386a72...|        17|          eqtl|                        14|0.041950637555999475|             eCAVIAR|                 1.0|NULL|NULL|NULL|NULL|NULL|
|000d24cc7709a9f40...|6a985202a00d48c5a...|        11|          pqtl|                        23|0.023020449155915983|             eCAVIAR|                -1.0|NULL|NULL|NULL|NULL|NULL|
|000eab0795d970c4e...|8b80e731719d98526...|        16|          eqtl|      

In [3]:
newColoc.show()

[Stage 34:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+------------+----------------+---------------------+--------------------+----------------------+-----------------+----------+---------------+------------+--------------+-----------+--------------+
|        rightStudyId|   rightStudyLocusId|    leftStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|                clpp|colocalisationMethod|betaRatioSignAverage|                  h0|                  h1|                  h2|                  h3|                h4| leftStudyId|   leftVariantId|credibleLeftStudyType|      rightVariantId|credibleRightStudyType|qtlPValueExponent|isTransQtl|         geneId|   projectId|indexStudyType|  condition|   biosampleId|
+--------------------+------------------

                                                                                

In [3]:
resolvedColocFiltered.groupBy('targetId','diseaseId').pivot('colocalisationMethod').count().show()



+---------------+-------------+-----+-------+
|       targetId|    diseaseId|COLOC|eCAVIAR|
+---------------+-------------+-----+-------+
|ENSG00000082641|  EFO_0007629|   89|     38|
|ENSG00000093010|  EFO_0006794|   30|     38|
|ENSG00000148655|  EFO_0004309|  251|    358|
|ENSG00000130203|  EFO_0022323|   14|     18|
|ENSG00000092445|  EFO_0004713|    6|     10|
|ENSG00000214300|  EFO_0004509|    3|      5|
|ENSG00000134824|MONDO_0008903|    2|      2|
|ENSG00000182263|  EFO_0000612|    1|      1|
|ENSG00000167840|MONDO_0004979| NULL|      2|
|ENSG00000240230|  EFO_0004518|    1|      2|
|ENSG00000151617|  EFO_0000612|    3|      5|
|ENSG00000091583|  EFO_0004339| NULL|      1|
|ENSG00000132763|  EFO_0004509|   16|     10|
|ENSG00000118271|MONDO_0100096| NULL|      1|
|ENSG00000124207|  EFO_0004735|    8|      9|
|ENSG00000198945|  EFO_0004696|  189|     68|
|ENSG00000178202|  EFO_0004526|    5|      8|
|ENSG00000092929|  EFO_0000537|   12|      6|
|ENSG00000134365|  EFO_0004346| NU

                                                                                

In [5]:
### numbers of ecaviar and coloc (FILTERED )

import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

newColoc = (
    all_coloc.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            "pValueExponent as qtlPValueExponent",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# Keep only the evidence columns that hold at least one value for the
# "gwas_credible_sets" datasource (columns that are entirely null are dropped).
df = evidences.filter(F.col("datasourceId") == "gwas_credible_sets")

# Single aggregated row: for every column, how many rows are non-null.
non_null_counts = df.select(
    *(
        F.sum(F.col(column_name).isNotNull().cast("int")).alias(column_name)
        for column_name in df.columns
    )
)

# Pull that one row to the driver and keep the names of the populated columns.
non_null_columns = [
    column_name
    for column_name, populated in non_null_counts.collect()[0].asDict().items()
    if populated > 0
]

# Project the evidence onto the populated columns only.
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

# Propagated variant: keep only colocalisations that match a GWAS credible-set
# evidence row (inner join), copy every row onto the disease and each of its
# parent terms, then derive a direction-of-effect label ("colocDoE").
#
# The label combines the sign of the GWAS beta (betaGwas) with the sign of
# betaRatioSignAverage, and the mapping depends on the QTL study type:
#   expression/protein-like QTLs (eqtl, pqtl, tuqtl, sceqtl, sctuqtl)
#       betaGwas > 0, ratio > 0 -> GoF_risk      betaGwas > 0, ratio < 0 -> LoF_risk
#       betaGwas < 0, ratio > 0 -> LoF_protect   betaGwas < 0, ratio < 0 -> GoF_protect
#   splicing QTLs (sqtl, scsqtl): same table with GoF and LoF swapped.
# Rows matching no branch (other study types, null or zero betas) get a null
# colocDoE, because neither the outer nor the inner `when` chains have an
# `otherwise`.
resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        # One output row per element of [diseaseId] + parents; the exploded
        # value overwrites "diseaseId".
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### splicing QTLs: GoF/LoF labels are swapped relative to the eqtl/pqtl branch above
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")

# Drug-side reference table: one row per (targetId, diseaseId, maxClinPhase)
# for ChEMBL evidence, restricted to pairs flagged "coherent" in the
# coherencyDiagonal column (that column and coherencyOneCell are expected to be
# added by the project helper `discrepancifier` - confirm in functions.py).
# Only the two "protect" count columns survive, renamed with a "drug" prefix.
analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        # highest clinical phase reached by each target-disease pair
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        # one count column per distinct "homogenized" label
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

# Same construction as analysis_chembl_indication, except that ChEMBL rows
# labelled "noEvaluable" are removed *before* the maximum clinical phase and
# the label counts are computed, so pairs with only non-evaluable evidence do
# not appear here at all.
chemblAssoc = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        # highest clinical phase among the remaining rows of each pair
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        # one count column per distinct "homogenized" label
        .pivot("homogenized")
        .count()
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

print("built chemblAssoc dataset")

####2 Define aggregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    """Return the items of *tup*, each converted with ``str``, joined by commas."""
    return ",".join(str(item) for item in tup)


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    """Cross-tabulate one comparison column against one prediction column and test it.

    The function:
      1. counts rows per (comparison, prediction) value pair and writes that
         table as parquet under ``gs://ot-team/jroldan/``;
      2. turns the counts into a 2x2 table (rows: comparison "yes"/"no",
         columns: prediction "yes"/"no");
      3. runs Fisher's exact test, the odds-ratio 95% confidence interval and
         the project helper ``relative_success`` on that table.

    Besides its arguments it relies on module-level names: ``full_data`` (the
    four yes/no combinations, so that empty cells appear as 0), ``result_st``
    and ``result_ci`` (lists that receive the stringified test result and
    confidence interval), ``convertTuple`` and ``relative_success``.

    Args:
        df: DataFrame holding ``targetId`` plus the two analysed columns.
        data: dataset label; used as an output column and as a path segment.
        listado: list that receives the parquet path that was written.
        comparisonColumn: name of the column used as comparison ("yes"/"no").
        comparisonType: label stored with the result and used in the path.
        predictionColumn: name of the column used as prediction ("yes"/"no").
        predictionType: label stored in the written table.
        today_date: date prefix of the output folder.

    Returns:
        list: ``[comparisonType, comparisonColumn, predictionColumn,
        odds ratio (2 dp), p-value, CI low (2 dp), CI high (2 dp),
        total count as str, 2x2 table as nested list,
        relative success (2 dp), its CI low (2 dp), its CI high (2 dp), path]``.
    """
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    # Build the output location once; it used to be assembled three separate
    # times (write, bookkeeping list, print), which invited drift between them.
    path = (
        f"gs://ot-team/jroldan/{today_date}_analysis/{data}/"
        f"{comparisonColumn}_{comparisonType}_{predictionColumn}.parquet"
    )
    out.write.mode("overwrite").parquet(path)
    listado.append(path)
    print(path)

    ### making analysis
    # Outer-join with full_data so that all four yes/no cells exist, pivot to a
    # comparison x prediction table ("yes" row first because of the descending
    # sort) and drop the leading "comparison" label column.
    contingency = (
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy()
    )
    array1 = np.delete(contingency, [0], 1)
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)

    # Both results are kept as comma-joined strings because that is the format
    # the module-level result_st / result_ci lists collect.
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )
    result_st.append(resX)
    result_ci.append(resx_CI)

    (rs_result, rs_ci) = relative_success(array1)

    odds_value, p_value = (float(field) for field in resX.split(",")[:2])
    ci_low, ci_high = (float(field) for field in resx_CI.split(",")[:2])

    return [
        comparisonType,
        comparisonColumn,
        predictionColumn,
        round(odds_value, 2),
        p_value,
        round(ci_low, 2),
        round(ci_high, 2),
        str(total),
        res_npPhaseX.tolist(),
        round(float(rs_result), 2),
        round(float(rs_ci[0]), 2),
        round(float(rs_ci[1]), 2),
        # studies,
        # tissues,
        path,
    ]


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    """Pair every column name in *elements* with each enabled clinical prediction.

    Args:
        elements: iterable of column names; each one is tagged with the
            comparison type ``"predictor"``.

    Returns:
        list of Rows, collected to the driver, from joining the comparisons
        table (``comparison``, ``comparisonType``) with the predictions table
        (no schema given, so Spark names its columns ``_1``, ``_2``).
    """
    comparison_schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )
    comparison_rows = [(name, "predictor") for name in elements]
    comparisons = spark.createDataFrame(comparison_rows, schema=comparison_schema)

    # Only "Phase>=4" is enabled; the other phases are switched off:
    #   ('Phase>=3','clinical'), ('Phase>=2','clinical'),
    #   ('Phase>=1','clinical'), ("PhaseT", "clinical")
    prediction_rows = [
        ("Phase>=4", "clinical"),
    ]
    predictions = spark.createDataFrame(data=prediction_rows)

    # No join keys are given: every comparison is combined with every prediction.
    paired = comparisons.join(predictions, how="full")
    return paired.collect()


print("load comparisons_df_iterative function")


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

# NOTE(review): the rightTissue load below is commented out, so nothing is
# read at this point; the print that follows still reports the dataset as
# loaded and is only a progress marker.
#rightTissue = spark.read.csv(
#    'gs://ot-team/jroldan/analysis/20250526_rightTissue.csv',
#    header=True,
#).drop("_c0")

print("loaded rightTissue dataset")

# Distinct target-disease pairs that have at least one ChEMBL evidence row
# whose stop-reason categories include "Negative"; each pair is labelled with
# stopReason = "Negative".
negativeTD = (
    evidences.filter(
        (F.col("datasourceId") == "chembl")
        & F.array_contains(F.col("studyStopReasonCategories"), "Negative")
    )
    .select("targetId", "diseaseId")
    .distinct()
    .withColumn("stopReason", F.lit("Negative"))
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part with chatgpt -- TEST

## QUESTIONS TO ANSWER:
# HAVE ECAVIAR >=0.8
# HAVE COLOC
# HAVE COLOC >= 0.8
# HAVE COLOC + ECAVIAR >= 0.01
# HAVE COLOC >= 0.8 + ECAVIAR >= 0.01
# RIGHT JOIN WITH CHEMBL

# Keep colocalisation rows that pass either threshold: h4 >= 0.8 or
# clpp >= 0.01 (per the questions above, h4 is the COLOC score and clpp the
# eCAVIAR score).
resolvedColocFiltered = resolvedColoc.filter(
    (F.col("h4") >= 0.8) | (F.col("clpp") >= 0.01)
)

benchmark = (
    resolvedColocFiltered.filter(
        # select just GWAS giving protection, and remove COVID-19 associations
        (F.col("betaGwas") < 0) & (F.col("name") != "COVID-19")
    )
    # RIGHT SIDE: every analysis_chembl_indication row is kept, whether or
    # not it has a matching colocalisation row.
    .join(analysis_chembl_indication, on=["targetId", "diseaseId"], how="right")
    .withColumn(
        "AgreeDrug",
        # "yes" when colocDoE names a protective direction and the matching
        # drug column (drugGoF_protect / drugLoF_protect) is populated.
        F.when(
            (
                F.col("drugGoF_protect").isNotNull()
                & (F.col("colocDoE") == "GoF_protect")
            )
            | (
                F.col("drugLoF_protect").isNotNull()
                & (F.col("colocDoE") == "LoF_protect")
            ),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    # Attach the biosample name for each biosampleId.
    .join(
        biosample.select("biosampleId", "biosampleName"),
        on="biosampleId",
        how="left",
    )
)


loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

built chemblAssoc dataset
load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study


In [10]:
# Variant of the colocalisation filter without an h4 threshold: keep any row
# with a non-null h4, or with clpp >= 0.01.
resolvedColocFiltered = resolvedColoc.filter(
    F.col("h4").isNotNull() | (F.col("clpp") >= 0.01)
)

benchmark = (
    resolvedColocFiltered.filter(
        # select just GWAS giving protection, and remove COVID-19 associations
        (F.col("betaGwas") < 0) & (F.col("name") != "COVID-19")
    )
    # RIGHT SIDE: every analysis_chembl_indication row is kept, whether or
    # not it has a matching colocalisation row.
    .join(analysis_chembl_indication, on=["targetId", "diseaseId"], how="right")
    .withColumn(
        "AgreeDrug",
        # "yes" when colocDoE names a protective direction and the matching
        # drug column (drugGoF_protect / drugLoF_protect) is populated.
        F.when(
            (
                F.col("drugGoF_protect").isNotNull()
                & (F.col("colocDoE") == "GoF_protect")
            )
            | (
                F.col("drugLoF_protect").isNotNull()
                & (F.col("colocDoE") == "LoF_protect")
            ),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    # Attach the biosample name for each biosampleId.
    .join(
        biosample.select("biosampleId", "biosampleName"),
        on="biosampleId",
        how="left",
    )
)
#### Coloc no filtered

# Count target-disease pairs by which colocalisation methods support them.
# The pivot values are listed explicitly so Spark does not need an extra pass
# to discover them, and so the COLOC / eCAVIAR columns referenced below exist
# even when one of the methods has no rows in `benchmark`.
benchmark.groupBy("targetId", "diseaseId").pivot(
    "colocalisationMethod", ["COLOC", "eCAVIAR"]
).count().withColumn(
    # A pair "has" a method when at least one row was counted for it.
    "hasColoc", F.when(F.col("COLOC").isNotNull(), F.lit("yes")).otherwise(F.lit('no'))
).withColumn(
    "hasEcaviar", F.when(F.col("eCAVIAR").isNotNull(), F.lit("yes")).otherwise(F.lit('no'))
).groupBy('hasColoc','hasEcaviar').count().show()



+--------+----------+-----+
|hasColoc|hasEcaviar|count|
+--------+----------+-----+
|      no|        no|68453|
|      no|       yes|   77|
|     yes|        no|   26|
|     yes|       yes|  146|
+--------+----------+-----+



                                                                                

In [None]:

### Coloc >=0.8
# NOTE(review): this cell reads whichever `benchmark` is currently defined in
# the session; the ">=0.8" label presumably refers to the benchmark built with
# the h4 >= 0.8 filter -- confirm the cell run order before comparing tables.

# Count target-disease pairs by which colocalisation methods support them.
# The pivot values are listed explicitly so Spark does not need an extra pass
# to discover them, and so the COLOC / eCAVIAR columns referenced below exist
# even when one of the methods has no rows in `benchmark`.
benchmark.groupBy("targetId", "diseaseId").pivot(
    "colocalisationMethod", ["COLOC", "eCAVIAR"]
).count().withColumn(
    # A pair "has" a method when at least one row was counted for it.
    "hasColoc", F.when(F.col("COLOC").isNotNull(), F.lit("yes")).otherwise(F.lit('no'))
).withColumn(
    "hasEcaviar", F.when(F.col("eCAVIAR").isNotNull(), F.lit("yes")).otherwise(F.lit('no'))
).groupBy('hasColoc','hasEcaviar').count().show()



+--------+----------+-----+
|hasColoc|hasEcaviar|count|
+--------+----------+-----+
|      no|        no|68464|
|     yes|       yes|  144|
|      no|       yes|   79|
|     yes|        no|   15|
+--------+----------+-----+



                                                                                