In [6]:
# Sanity check: number of rows in `chembl_trials`.
# NOTE(review): `chembl_trials` is not defined in any cell visible here, and this cell
# (execution count 6) sits above the setup cell (execution count 3) — confirm the
# notebook still runs top to bottom on a fresh kernel.
chembl_trials.count()

74187

In [3]:
import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd
from functools import reduce

# Reuse the active Spark session, or create one with the default configuration.
spark = SparkSession.builder.getOrCreate()
# Tuning hint kept from the original cell: "spark.sql.shuffle.partitions" defaults to
# 200 and can be raised (e.g. to "400") with spark.conf.set(...) if needed.

# Root folder of the release outputs every table below is read from.
path_n = "gs://open-targets-data-releases/25.06/output/"


def _read_release(relative_path):
    """Read the parquet dataset stored at ``path_n`` + ``relative_path``."""
    return spark.read.parquet(path_n + relative_path)


target = _read_release("target/")
diseases = _read_release("disease/")
evidences = _read_release("evidence")
credible = _read_release("credible_set")
new = _read_release("colocalisation_coloc")
index = _read_release("study/")
variantIndex = _read_release("variant")
biosample = _read_release("biosample")
ecaviar = _read_release("colocalisation_ecaviar")

# Stack both colocalisation tables by column name; a column present in only one of
# them is accepted (allowMissingColumns=True).
all_coloc = ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

# Credible-set columns describing the LEFT study locus of each colocalisation row.
left_credible_set = credible.selectExpr(
    "studyLocusId as leftStudyLocusId",
    "StudyId as leftStudyId",
    "variantId as leftVariantId",
    "studyType as credibleLeftStudyType",
)

# Credible-set columns describing the RIGHT study locus, plus its p-value exponent
# (exposed as qtlPValueExponent) and the isTransQtl flag.
right_credible_set = credible.selectExpr(
    "studyLocusId as rightStudyLocusId",
    "studyId as rightStudyId",
    "variantId as rightVariantId",
    "studyType as credibleRightStudyType",
    "pValueExponent as qtlPValueExponent",
    "isTransQtl",
)

# Study-index annotation of the right-hand study: modulated gene, project,
# study type, condition and biosample.
right_study_annotation = index.selectExpr(
    "studyId as rightStudyId",
    "geneId",
    "projectId",
    "studyType as indexStudyType",
    "condition",
    "biosampleId",
)

# Every lookup is a left join, so colocalisation rows without a match are kept.
coloc_with_left_locus = all_coloc.join(
    left_credible_set, on="leftStudyLocusId", how="left"
)
coloc_with_both_loci = coloc_with_left_locus.join(
    right_credible_set, on="rightStudyLocusId", how="left"
)
newColoc = coloc_with_both_loci.join(
    right_study_annotation, on="rightStudyId", how="left"
)

print("loaded newColoc")

# Keep only the GWAS credible-set evidence and drop the columns that carry no data
# for it (columns holding nothing but nulls).
df = evidences.filter(F.col("datasourceId") == "gwas_credible_sets")

# F.count(column) counts the non-null values of that column, so a single aggregation
# pass yields the number of populated rows per column. Unlike summing a 0/1 flag it
# returns 0 (not NULL) when `df` has no rows, so the `> 0` test below never compares
# None with an int.
non_null_counts = df.select(
    *[F.count(F.col(column_name)).alias(column_name) for column_name in df.columns]
)

# The aggregation returns exactly one row; keep the columns with at least one value.
non_null_columns = [
    column_name
    for column_name, populated_rows in non_null_counts.collect()[0].asDict().items()
    if populated_rows > 0
]

# Select only the populated columns
filtered_df = df.select(*non_null_columns)  # .persist()

## Bring studyId, variantId, the GWAS beta (as betaGwas) and pValueExponent from the
## credible sets; left join so evidence rows without a credible set are kept.
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

# Sign conditions shared by the two direction-of-effect rule sets below.
gwas_beta_positive = F.col("betaGwas") > 0
gwas_beta_negative = F.col("betaGwas") < 0
beta_ratio_positive = F.col("betaRatioSignAverage") > 0
beta_ratio_negative = F.col("betaRatioSignAverage") < 0

# Rules applied to eqtl / pqtl / tuqtl / sceqtl / sctuqtl studies.
expression_qtl_doe = (
    F.when(gwas_beta_positive & beta_ratio_positive, F.lit("GoF_risk"))
    .when(gwas_beta_positive & beta_ratio_negative, F.lit("LoF_risk"))
    .when(gwas_beta_negative & beta_ratio_positive, F.lit("LoF_protect"))
    .when(gwas_beta_negative & beta_ratio_negative, F.lit("GoF_protect"))
)

# Rules applied to sqtl / scsqtl studies: GoF and LoF are swapped with respect to
# the expression-QTL rules above.
splicing_qtl_doe = (
    F.when(gwas_beta_positive & beta_ratio_positive, F.lit("LoF_risk"))
    .when(gwas_beta_positive & beta_ratio_negative, F.lit("GoF_risk"))
    .when(gwas_beta_negative & beta_ratio_positive, F.lit("GoF_protect"))
    .when(gwas_beta_negative & beta_ratio_negative, F.lit("LoF_protect"))
)

resolvedColoc = (
    newColoc.withColumnRenamed("geneId", "targetId")
    # Keep the colocalisations whose left locus and gene have GWAS evidence.
    .join(
        gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
        on=["leftStudyLocusId", "targetId"],
        how="inner",
    )
    # Propagate every association to the parent terms of its disease.
    .join(
        diseases.selectExpr("id as diseaseId", "name", "parents", "therapeuticAreas"),
        on="diseaseId",
        how="left",
    )
    .withColumn(
        "diseaseId",
        F.explode_outer(
            # concat() is NULL as soon as one input is NULL, so a disease with no
            # `parents` value (or missing from `diseases`) used to end up with a
            # NULL diseaseId. Fall back to the disease itself in that case.
            F.coalesce(
                F.concat(F.array(F.col("diseaseId")), F.col("parents")),
                F.array(F.col("diseaseId")),
            )
        ),
    )
    # "oldDiseaseId" is not created in this cell; drop() ignores missing names.
    .drop("parents", "oldDiseaseId")
    # Rows whose right study type is in neither list, or whose betas are zero or
    # null, get a NULL colocDoE (no otherwise() branch).
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            expression_qtl_doe,
        ).when(
            F.col("rightStudyType").isin(["sqtl", "scsqtl"]),
            splicing_qtl_doe,
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

# Evidence sources handed to temporary_directionOfEffect
# ("ot_genetics_portal" is intentionally left out of the list).
datasource_filter = [
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

# NOTE: this rebinds `evidences`; from here on the name refers to the frame
# returned by temporary_directionOfEffect, not to the raw evidence table read earlier.
assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")

# Pieces shared by the two ChEMBL summaries below.
target_disease_window = Window.partitionBy("targetId", "diseaseId")
is_chembl_evidence = F.col("datasourceId") == "chembl"
dropped_summary_columns = (
    "coherencyDiagonal",
    "coherencyOneCell",
    "noEvaluable",
    "GoF_risk",
    "LoF_risk",
)

# All ChEMBL rows (including "noEvaluable"): non-null targetId counts per
# homogenized label, one row per target / disease / maximum clinical phase.
chembl_label_counts = (
    assessment.filter(is_chembl_evidence)
    .withColumn(
        "maxClinPhase", F.max(F.col("clinicalPhase")).over(target_disease_window)
    )
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("homogenized")
    .agg(F.count("targetId"))
)

# A filter on coherencyDiagonal == "coherent" used to sit here and is disabled.
analysis_chembl_indication = (
    discrepancifier(chembl_label_counts)
    .drop(*dropped_summary_columns)
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

# Same summary restricted to evaluable rows, using plain row counts per label.
evaluable_chembl_label_counts = (
    assessment.filter(is_chembl_evidence & (F.col("homogenized") != "noEvaluable"))
    .withColumn("maxClinPhase", F.max("clinicalPhase").over(target_disease_window))
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("homogenized")
    .count()
)

# A filter on coherencyDiagonal == "coherent" used to sit here and is disabled.
chemblAssoc = (
    discrepancifier(evaluable_chembl_label_counts)
    .drop(*dropped_summary_columns)
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

print("built chemblAssoc dataset")

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    """Return the items of ``tup``, each rendered with ``str``, joined by commas."""
    return ",".join(str(item) for item in tup)


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    """Cross-tabulate one predictor against one outcome, save the counts and test them.

    The per-(comparison, prediction) row counts of ``df`` are written as parquet,
    completed into a 2x2 "yes"/"no" table with the module-level ``full_data`` frame
    and evaluated with Fisher's exact test, the odds-ratio confidence interval and
    ``relative_success``.

    Args:
        df: Spark DataFrame with a ``targetId`` column plus the columns named by
            ``comparisonColumn`` and ``predictionColumn``.
        data: dataset label; stored in the ``dataset`` column and used as a folder
            name in the output path.
        listado: list that receives the output path (mutated in place).
        comparisonColumn: name of the column that forms the rows of the 2x2 table.
        comparisonType: label stored with the counts and used in the file name.
        predictionColumn: name of the column that forms the columns of the table.
        predictionType: label stored with the counts.
        today_date: run identifier used as prefix of the output folder. Any value
            with a string representation is accepted.

    Returns:
        list: ``[comparisonType, comparisonColumn, predictionColumn, Fisher
        statistic (rounded), Fisher p-value, odds-ratio CI low, odds-ratio CI high,
        total count (as str), 2x2 table (nested list), relative-success value,
        first and second items of the interval returned by relative_success,
        output path]``.

    Side effects:
        Overwrites the parquet dataset at the output path, prints that path,
        appends it to ``listado`` and appends the formatted test results to the
        module-level lists ``result_st`` and ``result_ci``.
    """
    # Build the destination once (it used to be spelled out three times). The
    # f-string stringifies each component, so ``today_date`` no longer has to be a
    # ``str`` for the concatenation to work.
    path = (
        f"gs://ot-team/jroldan/{today_date}_analysis/{data}/"
        f"{comparisonColumn}_{comparisonType}_{predictionColumn}.parquet"
    )

    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)

    # One row per (prediction, comparison) pair: "a" is the pair count, the two
    # totals are the marginal counts of each column. Null labels are discarded.
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(path)
    listado.append(path)
    print(path)

    ### making analysis
    # The outer join with `full_data` guarantees that all four yes/no combinations
    # exist (missing ones become 0). Rows are sorted so "yes" comes before "no".
    contingency_with_labels = (
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy()
    )
    # Drop the leading "comparison" label column, leaving the 2x2 counts.
    array1 = np.delete(contingency_with_labels, [0], 1)
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    # NOTE(review): `result_st` and `result_ci` are module-level lists that are not
    # created in the code visible here; they must exist before this function runs.
    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    fisher_statistic, fisher_p_value = resX.split(",")[:2]
    ci_low, ci_high = resx_CI.split(",")[:2]
    return [
        comparisonType,
        comparisonColumn,
        predictionColumn,
        round(float(fisher_statistic), 2),
        float(fisher_p_value),
        round(float(ci_low), 2),
        round(float(ci_high), 2),
        str(total),
        np.array(res_npPhaseX).tolist(),
        round(float(rs_result), 2),
        round(float(rs_ci[0]), 2),
        round(float(rs_ci[1]), 2),
        path,
    ]


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(elements):
    """Pair every predictor column in ``elements`` with the active clinical endpoint.

    Args:
        elements: iterable of column names; each one is labelled "predictor".

    Returns:
        list of Rows collected from joining the comparisons frame (columns
        ``comparison`` and ``comparisonType``) with the predictions frame, which
        is created without an explicit schema.
    """
    comparison_schema = StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("comparison", "comparisonType")
        ]
    )
    predictor_rows = [(column_name, "predictor") for column_name in elements]
    comparisons = spark.createDataFrame(predictor_rows, schema=comparison_schema)

    # Only "Phase>=4" is active; the Phase>=3, Phase>=2, Phase>=1 and PhaseT
    # endpoints are disabled.
    active_endpoints = [("Phase>=4", "clinical")]
    predictions = spark.createDataFrame(data=active_endpoints)

    # The join is requested as "full" and without any join key.
    paired = comparisons.join(predictions, how="full")
    return paired.collect()


print("load comparisons_df_iterative function")


# The four prediction/comparison label pairs, in the order
# (yes, yes), (yes, no), (no, yes), (no, no).
yes_no_labels = ("yes", "no")
full_data = spark.createDataFrame(
    data=[
        (prediction_label, comparison_label)
        for prediction_label in yes_no_labels
        for comparison_label in yes_no_labels
    ],
    schema=StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("prediction", "comparison")
        ]
    ),
)
print("created full_data and lists")

# The rightTissue CSV (gs://ot-team/jroldan/analysis/20250526_rightTissue.csv) is
# no longer read here; only the progress message below remains.
print("loaded rightTissue dataset")

# Target-disease pairs with at least one ChEMBL evidence whose stop-reason
# categories include "Negative"; each pair is labelled stopReason = "Negative".
has_negative_stop_reason = F.array_contains(
    F.col("studyStopReasonCategories"), "Negative"
)
chembl_stop_reasons = evidences.filter(F.col("datasourceId") == "chembl").select(
    "targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories"
)
negativeTD = (
    chembl_stop_reasons.filter(has_negative_stop_reason)
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

print("built negativeTD dataset")

print("built bench2 dataset")

###### cut from here
print("looping for variables_study")

#### new part -- TEST
## Questions to answer:
#  - have eCAVIAR >= 0.8
#  - have COLOC
#  - have COLOC >= 0.8
#  - have COLOC + eCAVIAR >= 0.01
#  - have COLOC >= 0.8 + eCAVIAR >= 0.01
#  - right join with ChEMBL

# Keep colocalisations passing either threshold: clpp >= 0.01 or h4 >= 0.8.
passes_coloc_threshold = (F.col("clpp") >= 0.01) | (F.col("h4") >= 0.8)
resolvedColocFiltered = resolvedColoc.filter(passes_coloc_threshold)

# Shared base of both benchmarks: GWAS signals with a negative beta (protective),
# excluding the disease named "COVID-19", right-joined to the ChEMBL target-disease
# table so every ChEMBL pair is kept whether or not it has colocalisation support.
protective_coloc_vs_chembl = resolvedColocFiltered.filter(
    (F.col("betaGwas") < 0) & (F.col("name") != "COVID-19")
).join(analysis_chembl_indication, on=["targetId", "diseaseId"], how="right")

biosample_names = biosample.select("biosampleId", "biosampleName")

# "yes" when the drug direction recorded for the pair matches the coloc direction.
drug_agrees_with_coloc = (
    F.col("drugGoF_protect").isNotNull() & (F.col("colocDoE") == "GoF_protect")
) | (F.col("drugLoF_protect").isNotNull() & (F.col("colocDoE") == "LoF_protect"))

benchmark = protective_coloc_vs_chembl.withColumn(
    "AgreeDrug",
    F.when(drug_agrees_with_coloc, F.lit("yes")).otherwise(F.lit("no")),
).join(biosample_names, on="biosampleId", how="left")


#### benchmark genetics
# Same base, flagged by whether a coloc direction of effect exists at all.
benchmarkGenetics = protective_coloc_vs_chembl.withColumn(
    "hasColoc",
    F.when(F.col("colocDoE").isNotNull(), F.lit("yes")).otherwise(F.lit("no")),
).join(biosample_names, on="biosampleId", how="left")


print("built benchmark dataset")


spark session created at 2025-06-29 20:07:26.933668
Analysis started on 2025-06-29 at  2025-06-29 20:07:26.933668


25/06/29 20:07:32 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
built drugApproved dataset


                                                                                

built chemblAssoc dataset
load comparisons_df_iterative function
created full_data and lists
loaded rightTissue dataset
built negativeTD dataset
built bench2 dataset
looping for variables_study
built benchmark dataset


In [None]:
###########################################################################################################################################################
######################################################################## BENCHMARK DOE ####################################################################
###########################################################################################################################################################

# Writing `benchmark` to gs://ot-team/jroldan/analysis/parquetFiles/benchmark is
# currently disabled.

#### Analysis
#### 1 Build a dictionary: distinct value -> name of the column it comes from
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE","colocalisationMethod"]

# One frame per studied column holding its distinct non-null values (as
# "distinct_value") tagged with the source column name (as "column_name").
temp_dfs_for_union = [
    benchmark.select(F.col(col_name).alias("distinct_value"))
    .filter(F.col("distinct_value").isNotNull())
    .distinct()
    .withColumn("column_name", F.lit(col_name))
    for col_name in variables_study
]

disdic = {}

if not temp_dfs_for_union:
    print("variables_study list is empty, disdic will be empty.")
else:
    # Stack the per-column frames so that a single collect() brings everything back.
    combined_distinct_values_df = reduce(
        lambda stacked, following: stacked.unionByName(following),
        temp_dfs_for_union,
    )

    print("Collecting combined distinct values from the cluster...")
    collected_rows = combined_distinct_values_df.collect()

    # A value seen under several columns keeps the column collected last.
    disdic.update((row.distinct_value, row.column_name) for row in collected_rows)


print("\nFinal disdic:", disdic)

# Requires `benchmark`, `negativeTD`, `variables_study` and `disdic` defined above.

# --- Step 1: number of distinct colocalisation methods per target-disease pair ---
print("Pre-computing 'hasboth' column...")
window_target_disease_only = Window.partitionBy("targetId", "diseaseId")
methods_per_pair = F.collect_set("colocalisationMethod").over(window_target_disease_only)
benchmark_processed = benchmark.withColumn("hasboth", F.size(methods_per_pair))
# Caching `benchmark_processed` here is an option if recomputation becomes costly.

pivoted_dfs = {}

# --- Step 2: build one pivoted frame per studied column ---
for col_name in variables_study:
    print(f"Processing pivot for: {col_name}")

    # Both windows partition by target, disease and the value of the studied column.
    # One orders by qtlPValueExponent only, the other by method name first.
    value_partition = Window.partitionBy("targetId", "diseaseId", col_name)
    current_col_window_spec_qtl = value_partition.orderBy(
        F.col("qtlPValueExponent").asc()
    )
    current_col_pvalue_order_window = value_partition.orderBy(
        F.col("colocalisationMethod").asc(), F.col("qtlPValueExponent").asc()
    )

    # Pairs supported by more than one method use the method-first ordering.
    first_agreement = F.first(F.col("AgreeDrug"), ignorenulls=True)
    resolved_agreement = F.when(
        F.col("hasboth") > 1,
        first_agreement.over(current_col_pvalue_order_window),
    ).otherwise(first_agreement.over(current_col_window_spec_qtl))
    temp_df_with_resolved = benchmark_processed.withColumn(
        "resolvedAgreeDrug", resolved_agreement
    )

    # --- Step 3: one array column per distinct value of `col_name`, then attach
    # the negative stop reason where there is one ---
    pivoted_df = (
        temp_df_with_resolved.groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot(col_name)
        .agg(F.collect_set("resolvedAgreeDrug"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
    )

    # --- Step 4: yes/no flags derived from the clinical phase and stop reason ---
    for phase in [1, 2, 3, 4]:
        reached_phase = F.col("maxClinPhase") >= phase
        pivoted_df = pivoted_df.withColumn(
            f"Phase>={phase}",
            F.when(reached_phase, F.lit("yes")).otherwise(F.lit("no")),
        )

    stopped_negative = F.col("stopReason") == "Negative"
    pivoted_df = pivoted_df.withColumn(
        "PhaseT",
        F.when(stopped_negative, F.lit("yes")).otherwise(F.lit("no")),
    )

    # For every value of this column, "<value>_only" is "yes" when its pivoted
    # array contains "yes". Each key must exist as a column after the pivot.
    matching_keys = [key for key, val in disdic.items() if val == col_name]

    for key in matching_keys:
        contains_yes = F.array_contains(F.col(key), "yes")
        pivoted_df = pivoted_df.withColumn(
            f"{key}_only",
            F.when(contains_yes, F.lit("yes")).otherwise(F.lit("no")),
        )

    # --- Step 5: keep the (lazy) result in memory. Writing each frame to
    # gs://ot-team/jroldan/analysis/parquetFiles/pivoted_df_<col_name> and reading
    # it back is a disabled alternative. ---
    pivoted_dfs[col_name] = pivoted_df

def any_yes(columns):
    """Return a boolean Column that is true when any of ``columns`` equals "yes".

    The fold starts from a literal False, so an empty ``columns`` list yields a
    condition that is never true instead of failing on ``columns[0]``.
    """
    return reduce(
        lambda acc, column_name: acc | (F.col(column_name) == "yes"),
        columns,
        F.lit(False),
    )


##### PROJECTID
# "othersProjectId_only" is "yes" when any project other than the main ones is "yes".
project_keys=[f"{k}_only" for k,v in disdic.items() if v == 'projectId']
main=['GTEx_only', 'UKB_PPP_EUR_only']
others=[item for item in project_keys if item not in main]

condition1 = any_yes(others)
pivoted_dfs['projectId'] = pivoted_dfs['projectId'].withColumn("othersProjectId_only", F.when(condition1, "yes").otherwise("no"))

##### BIOSAMPLE NAME
# Same flag for biosamples outside the main list.
biosample_keys=[f"{k}_only" for k,v in disdic.items() if v == 'biosampleName']
main=['tibial nerve_only', 'upper lobe of left lung_only','blood plasma_only','lymphoblastoid cell line_only']
others=[item for item in biosample_keys if item not in main]

condition1 = any_yes(others)
pivoted_dfs['biosampleName'] = pivoted_dfs['biosampleName'].withColumn("othersBiosampleName_only", F.when(condition1, "yes").otherwise("no"))


##### RIGHTSTUDYTYPE
# Same flag for study types other than eqtl and pqtl.
rightStudy_keys=[f"{k}_only" for k,v in disdic.items() if v == 'rightStudyType']
main=['eqtl_only', 'pqtl_only']
others=[item for item in rightStudy_keys if item not in main]

condition1 = any_yes(others)
pivoted_dfs['rightStudyType'] = pivoted_dfs['rightStudyType'].withColumn("otherRightStudyType_only", F.when(condition1, "yes").otherwise("no"))


### append to dictionary
# Keys are stored WITHOUT the "_only" suffix, like every other disdic key: the
# column name is always derived as f"{key}_only". The biosample entry used to be
# registered as 'othersBiosampleName_only', which would resolve to the
# non-existent column 'othersBiosampleName_only_only'.
disdic.update({'othersProjectId': 'projectId', 'othersBiosampleName': 'biosampleName', 'otherRightStudyType':'rightStudyType'})


In [6]:
# For each target-disease pair, count its rows under each hasColoc value ("no" / "yes"
# columns), then tally how many pairs share each combination of the two counts.
rows_per_pair_by_flag = (
    benchmarkGenetics.groupBy('targetId', 'diseaseId').pivot('hasColoc').count()
)
rows_per_pair_by_flag.groupBy('no', 'yes').count().show()



+----+----+-----+
|  no| yes|count|
+----+----+-----+
|   1|NULL|73881|
|NULL|   9|    4|
|NULL| 406|    1|
|   8|  52|    1|
|NULL|   1|   31|
|NULL|  12|    5|
|NULL|  28|    2|
|NULL|   6|   21|
|NULL|   4|   15|
|  49| 360|    1|
|NULL|  13|    3|
|   9|   9|    2|
|NULL|   8|   11|
|NULL|   2|   31|
|NULL|  15|    3|
|NULL|  10|    8|
|  82| 428|    1|
|NULL|  74|    1|
|  31| 264|    1|
|   2| 101|    3|
+----+----+-----+
only showing top 20 rows



                                                                                

In [8]:
# Number of target-disease pairs that have at least one row with each hasColoc value.
pair_flag_combinations = benchmarkGenetics.groupBy(
    'targetId', 'diseaseId', 'hasColoc'
).count()
pair_flag_combinations.groupBy('hasColoc').count().show()



+--------+-----+
|hasColoc|count|
+--------+-----+
|      no|73964|
|     yes|  301|
+--------+-----+



                                                                                

In [9]:
# Peek at the first per-column frame of distinct values (the one built for the
# first entry of `variables_study`).
temp_dfs_for_union[0].show()

25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_93 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_147_609 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_147_329 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_147_700 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_153_726 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_436 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_153_13 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_153_683 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_926 !
25/06/29 15:47:02 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_153_462 !
25/06/29 15:47:02 WARN BlockManagerMasterEndp

+--------------------+-----------+
|      distinct_value|column_name|
+--------------------+-----------+
|           Braineac2|  projectId|
|           BLUEPRINT|  projectId|
|            GEUVADIS|  projectId|
|              OneK1K|  projectId|
|         UKB_PPP_EUR|  projectId|
|      Schmiedel_2018|  projectId|
|          CommonMind|  projectId|
|               CEDAR|  projectId|
|        Nedelec_2016|  projectId|
|                 CAP|  projectId|
|         Alasoo_2018|  projectId|
|       Randolph_2021|  projectId|
|         Nathan_2022|  projectId|
|              HipSci|  projectId|
|          Perez_2022|  projectId|
|Bossini-Castillo_...|  projectId|
|    van_de_Bunt_2015|  projectId|
|            BrainSeq|  projectId|
|                GTEx|  projectId|
|          Quach_2016|  projectId|
+--------------------+-----------+
only showing top 20 rows



                                                                                

In [9]:
########################################################################################################################################################
################################################################### NOW BENCHMARK GENETICS #############################################################
########################################################################################################################################################

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE","colocalisationMethod"]

# NOTE: `keys_to_remove` is not applied anywhere in this cell. The filter that used
# it is disabled and `disdic` is rebuilt from scratch below.
keys_to_remove = {'othersProjectId', 'othersBiosampleName', 'othersRightStudyType'}
# disdic = {k: v for k, v in disdic.items() if k not in keys_to_remove}

# Both containers are reset here, so entries stored in them by earlier cells are
# discarded.
disdic = {}
pivoted_dfs={}

# One frame per studied column: its distinct non-null values (as "distinct_value")
# tagged with the source column name (as "column_name").
# A leftover copy of this loop also scanned `benchmark`. That frame comes from the
# same filtered colocalisation / ChEMBL join as `benchmarkGenetics` (only the derived
# flag column differs), so it produced the same values at twice the cost.
temp_dfs_for_union = []
for col_name in variables_study:
    df_temp = (
        benchmarkGenetics.select(F.col(col_name).alias("distinct_value"))
        .filter(F.col("distinct_value").isNotNull()) # Exclude None (null) values
        .distinct()
        .withColumn("column_name", F.lit(col_name))
    )
    temp_dfs_for_union.append(df_temp)

if temp_dfs_for_union:
    # Stack the per-column frames (matched by column name) so that one collect()
    # retrieves every distinct value.
    combined_distinct_values_df = temp_dfs_for_union[0]
    for i in range(1, len(temp_dfs_for_union)):
        combined_distinct_values_df = combined_distinct_values_df.unionByName(temp_dfs_for_union[i])

    print("Collecting combined distinct values from the cluster...")
    collected_rows = combined_distinct_values_df.collect()

    # Populate the dictionary: distinct value -> name of the column it came from.
    for row in collected_rows:
        disdic[row.distinct_value] = row.column_name
else:
    print("variables_study list is empty, disdic will be empty.")


print("\nFinal disdic:", disdic)

# Requires `benchmarkGenetics`, `negativeTD`, `variables_study`, `disdic` and
# `pivoted_dfs` defined above.

# --- Step 1: number of distinct colocalisation methods per target-disease pair ---
print("Pre-computing 'hasboth' column...")
window_target_disease_only = Window.partitionBy("targetId", "diseaseId")
methods_per_pair = F.collect_set("colocalisationMethod").over(window_target_disease_only)
benchmark_processed = benchmarkGenetics.withColumn("hasboth", F.size(methods_per_pair))

# --- Step 2: build one pivoted frame per studied column ---
for col_name in variables_study:
    print(f"Processing pivot for: {col_name}")

    # Both windows partition by target, disease and the value of the studied column.
    # One orders by qtlPValueExponent only, the other by method name first.
    value_partition = Window.partitionBy("targetId", "diseaseId", col_name)
    current_col_window_spec_qtl = value_partition.orderBy(
        F.col("qtlPValueExponent").asc()
    )
    current_col_pvalue_order_window = value_partition.orderBy(
        F.col("colocalisationMethod").asc(), F.col("qtlPValueExponent").asc()
    )

    # 'resolvedHasColoc': pairs supported by more than one method use the
    # method-first ordering, the rest use the p-value ordering.
    first_has_coloc = F.first(F.col("hasColoc"), ignorenulls=True)
    resolved_has_coloc = F.when(
        F.col("hasboth") > 1,
        first_has_coloc.over(current_col_pvalue_order_window),
    ).otherwise(first_has_coloc.over(current_col_window_spec_qtl))
    temp_df_with_resolved = benchmark_processed.withColumn(
        "resolvedHasColoc", resolved_has_coloc
    )

    # --- Step 3: one array column per distinct value of `col_name`, then attach
    # the negative stop reason where there is one ---
    pivoted_df = (
        temp_df_with_resolved.groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot(col_name)
        .agg(F.collect_set("resolvedHasColoc"))
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
    )

    # --- Step 4: yes/no flags derived from the clinical phase and stop reason ---
    for phase in [1, 2, 3, 4]:
        reached_phase = F.col("maxClinPhase") >= phase
        pivoted_df = pivoted_df.withColumn(
            f"Phase>={phase}",
            F.when(reached_phase, F.lit("yes")).otherwise(F.lit("no")),
        )

    stopped_negative = F.col("stopReason") == "Negative"
    pivoted_df = pivoted_df.withColumn(
        "PhaseT",
        F.when(stopped_negative, F.lit("yes")).otherwise(F.lit("no")),
    )

    # For every value of this column, "<value>_only" is "yes" when its pivoted
    # array contains "yes". Each key must exist as a column after the pivot.
    matching_keys = [key for key, val in disdic.items() if val == col_name]

    for key in matching_keys:
        contains_yes = F.array_contains(F.col(key), "yes")
        pivoted_df = pivoted_df.withColumn(
            f"{key}_only",
            F.when(contains_yes, F.lit("yes")).otherwise(F.lit("no")),
        )

    # Stored under a "_gen" suffix to tell these frames apart from the DoE ones.
    pivoted_dfs[f"{col_name}_gen"] = pivoted_df


def _with_others_flag(pivoted_df, candidate_columns, main_columns, flag_column):
    """Add a yes/no column that rolls up every non-main `<value>_only` column.

    Args:
        pivoted_df: DataFrame that already holds the `<value>_only` columns.
        candidate_columns: names of all `<value>_only` columns of the group.
        main_columns: names reported on their own; they are left out of the roll-up.
        flag_column: name of the roll-up column to add (replaced if already present).

    Returns:
        `pivoted_df` with `flag_column` equal to "yes" when at least one of the
        remaining columns equals "yes", and "no" otherwise.
    """
    # Leave out the roll-up column itself: once its key has been registered in disdic
    # (end of this cell), a re-run would otherwise feed the flag into its own definition.
    others = [
        name
        for name in candidate_columns
        if name not in main_columns and name != flag_column
    ]
    if not others:
        # Nothing to roll up; seeding the fold with others[0] would raise IndexError here.
        return pivoted_df.withColumn(flag_column, F.lit("no"))
    any_other_yes = reduce(
        lambda acc, cond: acc | cond, [F.col(name) == "yes" for name in others]
    )
    return pivoted_df.withColumn(
        flag_column, F.when(any_other_yes, "yes").otherwise("no")
    )


##### PROJECTID
project_keys = [f"{k}_only" for k, v in disdic.items() if v == 'projectId']
pivoted_dfs['projectId_gen'] = _with_others_flag(
    pivoted_dfs['projectId_gen'],
    project_keys,
    main_columns=['GTEx_only', 'UKB_PPP_EUR_only'],
    flag_column="othersProjectId_only",
)

##### BIOSAMPLE NAME
biosample_keys = [f"{k}_only" for k, v in disdic.items() if v == 'biosampleName']
pivoted_dfs['biosampleName_gen'] = _with_others_flag(
    pivoted_dfs['biosampleName_gen'],
    biosample_keys,
    main_columns=[
        'tibial nerve_only',
        'upper lobe of left lung_only',
        'blood plasma_only',
        'lymphoblastoid cell line_only',
    ],
    flag_column="othersBiosampleName_only",
)

##### RIGHTSTUDYTYPE
rightStudy_keys = [f"{k}_only" for k, v in disdic.items() if v == 'rightStudyType']
pivoted_dfs['rightStudyType_gen'] = _with_others_flag(
    pivoted_dfs['rightStudyType_gen'],
    rightStudy_keys,
    main_columns=['eqtl_only', 'pqtl_only'],
    flag_column="othersRightStudyType_only",
)

### Register the roll-up groups in disdic. Keys carry no `_only` suffix because the
### lookups above (and in later cells) append it themselves.
disdic.update({'othersProjectId': 'projectId', 'othersBiosampleName': 'biosampleName', 'othersRightStudyType':'rightStudyType'})

Collecting combined distinct values from the cluster...


                                                                                0]


Final disdic: {'Braineac2': 'projectId', 'Kasela_2017': 'projectId', 'BLUEPRINT': 'projectId', 'GEUVADIS': 'projectId', 'OneK1K': 'projectId', 'UKB_PPP_EUR': 'projectId', 'Schmiedel_2018': 'projectId', 'Steinberg_2020': 'projectId', 'CommonMind': 'projectId', 'GENCORD': 'projectId', 'CEDAR': 'projectId', 'Nedelec_2016': 'projectId', 'TwinsUK': 'projectId', 'Lepik_2017': 'projectId', 'Cytoimmgen': 'projectId', 'CAP': 'projectId', 'Alasoo_2018': 'projectId', 'Randolph_2021': 'projectId', 'Jerber_2021': 'projectId', 'ROSMAP': 'projectId', 'Nathan_2022': 'projectId', 'HipSci': 'projectId', 'Kim-Hellmuth_2017': 'projectId', 'Perez_2022': 'projectId', 'PhLiPS': 'projectId', 'Bossini-Castillo_2019': 'projectId', 'Fairfax_2014': 'projectId', 'van_de_Bunt_2015': 'projectId', 'BrainSeq': 'projectId', 'GTEx': 'projectId', 'FUSION': 'projectId', 'Fairfax_2012': 'projectId', 'Quach_2016': 'projectId', 'Schwartzentruber_2018': 'projectId', 'PISA': 'projectId', 'Peng_2018': 'projectId', 'iPSCORE': '

25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_93 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145_327 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_413 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145_556 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_139_825 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_139_965 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_926 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145_650 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_896 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_11 !
25/06/29 20:31:29 WARN BlockManagerMasterEndpoi

Processing pivot for: biosampleName


                                                                                

Processing pivot for: rightStudyType


                                                                                

Processing pivot for: colocDoE


                                                                                

Processing pivot for: colocalisationMethod


                                                                                

In [10]:
# Preview the pivoted biosampleName table (show() prints the first rows as a text table).
# NOTE(review): the table is very wide, so the printed header is hard to read.
pivoted_dfs['biosampleName_gen'].show()

25/06/29 20:50:47 ERROR TransportRequestHandler: Error sending result RpcResponse[requestId=5055745209643946975,body=NioManagedBuffer[buf=java.nio.HeapByteBuffer[pos=0 lim=81 cap=156]]] to /10.132.0.18:35836; closing connection
io.netty.channel.StacklessClosedChannelException: null
	at io.netty.channel.AbstractChannel$AbstractUnsafe.write(Object, ChannelPromise)(Unknown Source) ~[netty-transport-4.1.100.Final.jar:4.1.100.Final]
25/06/29 20:50:47 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145_830 !
25/06/29 20:50:47 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_139_901 !
25/06/29 20:50:47 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145_782 !
25/06/29 20:50:47 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_71_413 !
25/06/29 20:50:47 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145_556 !
25/06/29 20:50:47 WARN BlockManagerMasterEndpoint: No more replicas available for rdd_145

+---------------+-------------+------------+----+------------+------+----------------------------------+--------------------------------+-----------------------------------------------+-----------------------------------------------------+-------------------------------+-----------------------------------------+--------------------------------------+-------------------------------+-------------+------+------------------------+---------------+----------------+---------------+--------------+-------------+--------+-------------------------+----------------------+---------------+---------+-----+------------+----------------+-----------------+----------------+---------------+----------------------------------------------+----------------------------------------------+----------+---------------+----------------+--------------+-------------------+------------------------------+-------------------------+-----------------------------------------------+-------------------------------------------

                                                                                

In [None]:
##### 

In [2]:
# Register the roll-up groups in disdic. Keys are stored WITHOUT the `_only` suffix:
# consumers build column names as f"{k}_only", so a key that already ends in `_only`
# would resolve to a non-existent "<name>_only_only" column.
disdic.update({'othersProjectId': 'projectId', 'othersBiosampleName': 'biosampleName', 'otherRightStudyType':'rightStudyType'})

In [40]:
# Roll the minor `<value>_only` columns of each pivoted table into one yes/no column.
project_keys=[f"{k}_only" for k,v in disdic.items() if v == 'projectId']
biosample_keys=[f"{k}_only" for k,v in disdic.items() if v == 'biosampleName']
rightStudy_keys=[f"{k}_only" for k,v in disdic.items() if v == 'rightStudyType']

# (pivoted_dfs key, candidate columns, columns reported on their own, roll-up column)
others_specs = [
    ('projectId', project_keys, ['GTEx_only', 'UKB_PPP_EUR_only'], "othersProjectId_only"),
    (
        'biosampleName',
        biosample_keys,
        ['tibial nerve_only', 'upper lobe of left lung_only','blood plasma_only','lymphoblastoid cell line_only'],
        "othersBiosampleName_only",
    ),
    ('rightStudyType', rightStudy_keys, ['eqtl_only', 'pqtl_only'], "otherRightStudyType_only"),
]

for table_key, group_keys, main, others_column in others_specs:
    # Skip the roll-up column itself: when its key is already registered in disdic,
    # it would otherwise be folded into its own definition.
    others=[item for item in group_keys if item not in main and item != others_column]

    # "yes" if any remaining column is "yes". Seeding the fold with a literal False keeps
    # it valid for an empty `others` (indexing others[0] would raise IndexError); an
    # empty group simply yields "no".
    condition1 = reduce(lambda acc, col: acc | (F.col(col) == "yes"), others, F.lit(False))
    pivoted_dfs[table_key] = pivoted_dfs[table_key].withColumn(others_column, F.when(condition1, "yes").otherwise("no"))



In [42]:
# Distinct, non-null projectId values present in `benchmark`, pulled to the driver as a list.
unique_values = [
    row[0]
    for row in (
        benchmark.select('projectId')
        .where(F.col('projectId').isNotNull())
        .distinct()
        .collect()
    )
]

# Number of columns left in the projectId pivot once the raw per-project columns are
# dropped and the first 10 columns are skipped.
len(pivoted_dfs['projectId'].drop(*unique_values).columns[10:])

                                                                                

45

In [45]:
# Names of the columns left in the projectId pivot after dropping the raw per-project
# columns (`unique_values`) and skipping the first 10 columns.
pivoted_dfs['projectId'].drop(*unique_values).columns[10:]

['HipSci_only',
 'van_de_Bunt_2015_only',
 'GTEx_only',
 'Lepik_2017_only',
 'Bossini-Castillo_2019_only',
 'ROSMAP_only',
 'BLUEPRINT_only',
 'TwinsUK_only',
 'FUSION_only',
 'Cytoimmgen_only',
 'Gilchrist_2021_only',
 'PhLiPS_only',
 'BrainSeq_only',
 'Schmiedel_2018_only',
 'Jerber_2021_only',
 'CEDAR_only',
 'Alasoo_2018_only',
 'Perez_2022_only',
 'CommonMind_only',
 'CAP_only',
 'Nedelec_2016_only',
 'Steinberg_2020_only',
 'GEUVADIS_only',
 'OneK1K_only',
 'Nathan_2022_only',
 'Quach_2016_only',
 'Fairfax_2014_only',
 'Fairfax_2012_only',
 'Peng_2018_only',
 'Aygun_2021_only',
 'GENCORD_only',
 'Walker_2019_only',
 'Kasela_2017_only',
 'iPSCORE_only',
 'Young_2019_only',
 'Schwartzentruber_2018_only',
 'Kim-Hellmuth_2017_only',
 'Naranbhai_2015_only',
 'UKB_PPP_EUR_only',
 'PISA_only',
 'Randolph_2021_only',
 'Sun_2018_only',
 'Braineac2_only',
 'others_only',
 'othersProjectId_only']

In [21]:
# Keep colocalisations that pass at least one threshold: clpp >= 0.01 or h4 >= 0.8.
resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))

benchmark = (
    resolvedColocFiltered
    # Negative GWAS beta only (the original note calls this "GWAS giving protection").
    .filter(F.col("betaGwas") < 0)
    # Remove COVID-19 associations.
    .filter(F.col("name") != "COVID-19")
    # Mark every surviving genetics row before the join.
    .withColumn('hasGeneticData', F.lit('yes'))
    # Right join: keep every row of analysis_chembl_indication, with or without genetics.
    .join(
        analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"
    )
    # "yes" when the drug's direction flag is present and matches the coloc direction.
    .withColumn(
        "AgreeDrug",
        F.when(
            (F.col("drugGoF_protect").isNotNull())
            & (F.col("colocDoE") == "GoF_protect"),
            F.lit("yes"),
        )
        .when(
            (F.col("drugLoF_protect").isNotNull())
            & (F.col("colocDoE") == "LoF_protect"),
            F.lit("yes"),
        )
        .otherwise(F.lit("no")),
    )
    # Rows that came only from the right side have hasGeneticData = NULL. A `!= 'yes'`
    # test evaluates to NULL for them and never fires, so fill with coalesce instead.
    .withColumn('hasGeneticData', F.coalesce(F.col('hasGeneticData'), F.lit('no')))
)
#).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

In [None]:
# Add an "others_only" yes/no column to each pivoted table: "yes" when any
# `<value>_only` column outside the table's main list is "yes".
project_keys=[f"{k}_only" for k,v in disdic.items() if v == 'projectId']
biosample_keys=[f"{k}_only" for k,v in disdic.items() if v == 'biosampleName']
rightStudy_keys=[f"{k}_only" for k,v in disdic.items() if v == 'rightStudyType']

# (pivoted_dfs key, candidate columns, columns reported on their own)
others_specs = [
    ('projectId', project_keys, ['GTEx_only', 'UKB_PPP_EUR_only']),
    (
        'biosampleName',
        biosample_keys,
        ['tibial nerve_only', 'upper lobe of left lung_only','blood plasma_only','lymphoblastoid cell line_only'],
    ),
    ('rightStudyType', rightStudy_keys, ['eqtl_only', 'pqtl_only']),
]

for table_key, group_keys, main in others_specs:
    # Never fold the roll-up column into its own definition.
    others=[item for item in group_keys if item not in main and item != "others_only"]

    # Seeding the fold with a literal False keeps it valid for an empty `others`
    # (indexing others[0] would raise IndexError); an empty group yields "no".
    condition1 = reduce(lambda acc, col: acc | (F.col(col) == "yes"), others, F.lit(False))
    pivoted_dfs[table_key] = pivoted_dfs[table_key].withColumn("others_only", F.when(condition1, "yes").otherwise("no"))

In [None]:
# Biosample columns kept as "main" (left out of the "others" roll-up):
#   tibial nerve_only
#   upper lobe of left lung_only
#   blood plasma_only
#   lymphoblastoid cell line_only

In [None]:
# Flag-column names for projects: each disdic key mapped to 'projectId', suffixed "_only".
project_keys = ["{}_only".format(name) for name, source in disdic.items() if source == 'projectId']

In [None]:
# Flag-column names for biosamples: each disdic key mapped to 'biosampleName', suffixed "_only".
biosample_keys = ["{}_only".format(name) for name, source in disdic.items() if source == 'biosampleName']

In [None]:
# Flag-column names for study types: each disdic key mapped to 'rightStudyType', suffixed "_only".
rightStudyType_keys = ["{}_only".format(name) for name, source in disdic.items() if source == 'rightStudyType']

In [30]:
# Inspect the project flag-column names.
# NOTE(review): the saved output lists names without the "_only" suffix, so it was
# presumably produced by an earlier definition of project_keys — re-run to refresh.
project_keys

['Fairfax_2014',
 'van_de_Bunt_2015',
 'HipSci',
 'BrainSeq',
 'OneK1K',
 'GTEx',
 'Schmiedel_2018',
 'Lepik_2017',
 'Peng_2018',
 'Nathan_2022',
 'TwinsUK',
 'Alasoo_2018',
 'Cytoimmgen',
 'Quach_2016',
 'CommonMind',
 'Nedelec_2016',
 'Kim-Hellmuth_2017',
 'BLUEPRINT',
 'UKB_PPP_EUR',
 'FUSION',
 'Naranbhai_2015',
 'Bossini-Castillo_2019',
 'ROSMAP',
 'Randolph_2021',
 'Aygun_2021',
 'PISA',
 'CEDAR',
 'Steinberg_2020',
 'Jerber_2021',
 'Kasela_2017',
 'iPSCORE',
 'GENCORD',
 'GEUVADIS',
 'PhLiPS',
 'CAP',
 'Schwartzentruber_2018',
 'Gilchrist_2021',
 'Perez_2022',
 'Fairfax_2012',
 'Walker_2019',
 'Braineac2',
 'Sun_2018',
 'Young_2019']

In [None]:
from pyspark.sql import functions as F
from functools import reduce

# Add "others_only" to the projectId pivot: "yes" when any project other than the two
# main ones has its `<project>_only` column set to "yes".
project_keys=[f"{k}_only" for k,v in disdic.items() if v == 'projectId']
# project_keys carry the "_only" suffix, so the main list must carry it too; without
# the suffix nothing is excluded and the main projects end up counted as "others".
main=['GTEx_only', 'UKB_PPP_EUR_only']
others=[item for item in project_keys if item not in main and item != "others_only"]

# Seeding the fold with a literal False keeps it valid for an empty `others`
# (indexing others[0] would raise IndexError); an empty group yields "no".
condition1 = reduce(lambda acc, col: acc | (F.col(col) == "yes"), others, F.lit(False))

pivoted_dfs['projectId'] = pivoted_dfs['projectId'].withColumn("others_only", F.when(condition1, "yes").otherwise("no")) 


In [34]:
# Row count per rightStudyType (NULL = rows without a rightStudyType value).
benchmark.groupBy('rightStudyType').count().show()



+--------------+-----+
|rightStudyType|count|
+--------------+-----+
|          NULL|73880|
|          eqtl| 8812|
|         tuqtl| 6341|
|          sqtl| 1805|
|        sceqtl|  279|
|          pqtl|  601|
+--------------+-----+



                                                                                

In [None]:
# NOTE(review): unfinished statement — the trailing '.' makes this cell a SyntaxError.
# Presumably meant to be completed with an aggregation after the pivot; finish or delete.
benchmark.groupBy('targetId','diseaseId').pivot('rightStudyType').

In [37]:
# Row count for each (rightStudyType, AgreeDrug, maxClinPhase) combination.
benchmark.groupBy('rightStudyType','AgreeDrug', 'maxClinPhase').count().show()



+--------------+---------+------------+-----+
|rightStudyType|AgreeDrug|maxClinPhase|count|
+--------------+---------+------------+-----+
|          NULL|       no|         3.0|19342|
|          NULL|       no|         1.0|14684|
|          NULL|       no|         2.0|32282|
|          NULL|       no|         4.0| 6155|
|          NULL|       no|         0.5| 1417|
|          eqtl|      yes|         3.0|  618|
|          sqtl|       no|         4.0|  691|
|          eqtl|       no|         3.0|  439|
|          eqtl|      yes|         2.0|  682|
|          eqtl|       no|         2.0| 1343|
|         tuqtl|       no|         2.0|  724|
|          sqtl|       no|         2.0|  433|
|         tuqtl|      yes|         2.0| 1078|
|          eqtl|      yes|         4.0| 2352|
|          eqtl|       no|         4.0| 2065|
|         tuqtl|      yes|         4.0| 2336|
|         tuqtl|       no|         4.0|  615|
|          eqtl|       no|         1.0|  810|
|          pqtl|      yes|        

                                                                                

In [None]:
# NOTE(review): orphan method-call fragment — not valid Python on its own (SyntaxError).
# Presumably a scratch copy of a pivot aggregation; attach it to an expression or delete.
.agg(F.collect_set("resolvedAgreeDrug"))

In [32]:
# Preview the pivoted projectId table (show() prints the first rows as a text table).
# NOTE(review): the table is very wide, so the printed header is hard to read.
pivoted_dfs['projectId'].show()



+---------------+-------------+------------+----+-----------+----------+---------+---------------------+--------+---------+---+-----+----------+----------+------+------------+------------+-------+--------+----+--------------+------+-----------+-----------+-----------------+----------+--------------+-----------+------------+------+----+---------+----------+------+----------+------+-------------+--------------+---------------------+--------------+--------+-------+-----------+-----------+----------+-------+----------------+----------+--------+--------+--------+--------+------+-----------------+---------------------+-----------+-------------+-----------+---------+-------------------+---------------+--------------+----------------+------------+----------------+---------------+---------------+---------------+-----------------+----------------------+--------------+----------------+-----------+-------------------+--------------------------+-----------+------------------+---------------+-------

                                                                                