In [None]:
#### 11.12.2024
#######
#######     ATENTION
#### change code to work with generated dataframe instead of reading the parquet

"""
This scripts run Odds ratio analysis for DoE and 
genetic information on drug clinical success

"""
from functions import discrepancifier
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    ArrayType,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
today_date = str(date.today())
print("spark session created at", c)

print("Analysis started on " + today_date + " at ", c)
"""
#coloc = spark.read.parquet(
#    "gs://genetics-portal-dev-data/22.09.1/outputs/v2d_coloc"
#).filter(F.col("right_type") != "gwas")
"""

#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

evidences = (
    spark.read.parquet(f"{path}evidence")
    .filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
                "gene_burden",
                "eva",
                "eva_somatic",
                "gene2phenotype",
                "orphanet",
                "cancer_gene_census",
                "intogen",
                "impc",
                "chembl",
            ]
        )
    )
    .persist()
)
ot_genetics = evidences.filter(F.col("datasourceId") == "ot_genetics_portal")

#### Now load sources of data to generate credible_set_OT_genetics evidences and associations.

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence")

credible = spark.read.parquet(f"{path}credibleSet")

index = spark.read.parquet(f"{path}gwasIndex")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)

### bring directionality from QTL

gwasResolvedColoc = (
    (
        newColoc.filter(F.col("rightStudyType") != "gwas")
        .withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="right",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

#### take the direction from the lowest p value
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    gwasResolvedColoc.withColumn("homogenized", F.first("colocDoE").over(window_spec))
    .select("targetId", "diseaseId", "homogenized")
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

# 1# Make a list of variant of interest (Sequence ontology terms) to subset data of interest.

### Bear in mind that SO works with ontology structure as: SO:XXXXXX, but databases has the SO as: SO_XXXXXX

var_filter_lof = [
    ### High impact variants https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
    "SO_0001589",  ## frameshit_variant
    "SO_0001587",  ## stop_gained
    "SO_0001574",  ## splice_acceptor_variant
    "SO_0001575",  ## splice_donor_variant
    "SO_0002012",  ## start_lost
    "SO_0001578",  ## stop_lost
    "SO_0001893",  ## transcript_ablation
    # "SO:0001889", ## transcript_amplification ## the Only HIGH impact that increase protein.
]

gof = ["SO_0002053"]
lof = ["SO_0002054"]

print("loading sources")

## Building Sequence Ontology
so_path = "gs://ot-team/jroldan/sequenceOntology_20221118.csv"
so_ontology = spark.read.csv(so_path, header=True)
building = (
    so_ontology.select(F.col("Accession"), F.col("Parents"))
    .withColumn("Parentalind", F.split(F.col("Parents"), ","))
    .withColumn("Parentalind", F.explode_outer("Parentalind"))
    .groupBy("Parentalind")
    .agg(F.collect_list(F.col("Accession")).alias("childrens"))
    .join(so_ontology, F.col("Parentalind") == so_ontology.Accession, "right")
)

## others
target_path = f"{path}targets/"
target = spark.read.parquet(target_path)
disease_path = f"{path}diseases/"
diseases = spark.read.parquet(disease_path)
dis_name = diseases.select("id", "name")
indication_path = f"{path}indication/"
indication = spark.read.parquet(indication_path)
mecact_path = f"{path}mechanismOfAction/"
mecact = spark.read.parquet(mecact_path)

## annotate TSG/oncogene/bivalent using 'hallmarks.attributes'
oncotsg_list = [
    "TSG",
    "oncogene",
    "Oncogene",
    "oncogene",
    "oncogene,TSG",
    "TSG,oncogene",
    "fusion,oncogene",
    "oncogene,fusion",
]

#### rlike('('+Keywords+')(\s|$)'
### on 03.07.2023 we add the categories:
# DISRUPTING AGENT - inhibitor
# STABILISER - activator

### Hacer el join del actionType con el chembl para sacar los mecanismos de accion.
inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",  ## added new on 03.07.2023
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",
    "STABILISER",  ## added new on 03.07.2023
]

columnas = ["activator", "inhibitor"]
both = activators + inhibitors

actionType = (
    mecact.select(
        F.explode_outer("chemblIds").alias("drugId2"),
        "actionType",
        "mechanismOfAction",
        "targets",
    )
    .select(
        F.explode_outer("targets").alias("targetId2"),
        "drugId2",
        "actionType",
        "mechanismOfAction",
    )
    .groupBy("targetId2", "drugId2")
    .agg(
        F.collect_set("actionType").alias("actionType"),
    )
)

oncolabel = (
    target.select(
        "id", "approvedSymbol", F.explode_outer(F.col("hallmarks.attributes"))
    )
    .select("id", "approvedSymbol", "col.description")
    .filter(F.col("description").isin(oncotsg_list))
    .groupBy("id", "approvedSymbol")
    .agg(F.collect_set("description").alias("description"))
    .withColumn("description_splited", F.concat_ws(",", F.col("description")))
    .withColumn(
        "TSorOncogene",
        F.when(
            (
                F.col("description_splited").rlike("ncogene")
                & F.col("description_splited").rlike("TSG")
            ),
            F.lit("bivalent"),
        )
        .when(F.col("description_splited").rlike("ncogene(\s|$)"), F.lit("oncogene"))
        .when(F.col("description_splited").rlike("TSG(\s|$)"), F.lit("TSG"))
        .otherwise(F.lit("noEvaluable")),
    )
    .withColumnRenamed("id", "target_id")
)

# 2# run the transformation of the evidences datasets used.
all = evidences.filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)

windowSpec = Window.partitionBy("targetId", "diseaseId")

#### version all gene burden
prueba_assessment = (
    all.withColumn("beta", F.col("beta").cast("double"))  ## ot genetics & gene burden
    .withColumn(
        "OddsRatio", F.col("OddsRatio").cast("double")
    )  ## ot genetics & gene burden
    .withColumn(
        "clinicalSignificances", F.concat_ws(",", F.col("clinicalSignificances"))
    )  ### eva
    .join(oncolabel, oncolabel.target_id == F.col("targetId"), "left")  ###  cgc
    .join(
        actionType,  ## chembl
        (actionType.drugId2 == F.col("drugId"))
        & (actionType.targetId2 == F.col("targetId")),
        "left",
    )
    .withColumn("inhibitors_list", F.array([F.lit(i) for i in inhibitors]))
    .withColumn("activators_list", F.array([F.lit(i) for i in activators]))
    .withColumn(
        "intogen_function",
        F.when(
            F.arrays_overlap(
                F.col("mutatedSamples.functionalConsequenceId"),
                F.array([F.lit(i) for i in (gof)]),
            ),
            F.lit("GoF"),
        ).when(
            F.arrays_overlap(
                F.col("mutatedSamples.functionalConsequenceId"),
                F.array([F.lit(i) for i in (lof)]),
            ),
            F.lit("LoF"),
        ),
        # .otherwise("nodata"),
    )
    .withColumn(
        "intogenAnnot",
        F.size(F.collect_set(F.col("intogen_function")).over(windowSpec)),
    )
    ### variant Effect Column
    .withColumn(
        "variantEffect",
        F.when(
            F.col("datasourceId") == "ot_genetics_portal",
            F.when(
                F.col("variantFunctionalConsequenceId").isNotNull(),
                F.when(
                    F.col("variantFunctionalConsequenceFromQtlId").isNull(),
                    F.when(
                        F.col("variantFunctionalConsequenceId").isin(var_filter_lof),
                        F.lit("LoF"),
                    )
                    .when(
                        F.col("variantFunctionalConsequenceId").isin(gof),
                        F.lit("GoF"),
                    )
                    .otherwise(F.lit("noEvaluable")),
                )
                ### variantFunctionalConsequenceFromQtlId
                .when(
                    F.col("variantFunctionalConsequenceFromQtlId").isNotNull(),
                    F.when(
                        F.col("variantFunctionalConsequenceId").isin(
                            var_filter_lof
                        ),  ## when is a LoF variant
                        F.when(
                            F.col("variantFunctionalConsequenceFromQtlId")
                            == "SO_0002316",
                            F.lit("LoF"),
                        )
                        .when(
                            F.col("variantFunctionalConsequenceFromQtlId")
                            == "SO_0002315",
                            F.lit("conflict/noEvaluable"),
                        )
                        .otherwise(F.lit("LoF")),
                    ).when(
                        F.col("variantFunctionalConsequenceId").isin(var_filter_lof)
                        == False,  ## when is not a LoF, still can be a GoF
                        F.when(
                            F.col("variantFunctionalConsequenceId").isin(gof)
                            == False,  ##if not GoF
                            F.when(
                                F.col("variantFunctionalConsequenceFromQtlId")
                                == "SO_0002316",
                                F.lit("LoF"),
                            )
                            .when(
                                F.col("variantFunctionalConsequenceFromQtlId")
                                == "SO_0002315",
                                F.lit("GoF"),
                            )
                            .otherwise(F.lit("noEvaluable")),
                        ).when(
                            F.col("variantFunctionalConsequenceId").isin(
                                gof
                            ),  ##if is GoF
                            F.when(
                                F.col("variantFunctionalConsequenceFromQtlId")
                                == "SO_0002316",
                                F.lit("conflict/noEvaluable"),
                            ).when(
                                F.col("variantFunctionalConsequenceFromQtlId")
                                == "SO_0002315",
                                F.lit("GoF"),
                            ),
                        ),
                    ),
                ),
            ).when(
                F.col("variantFunctionalConsequenceId").isNull(),
                F.when(
                    F.col("variantFunctionalConsequenceFromQtlId") == "SO_0002316",
                    F.lit("LoF"),
                )
                .when(
                    F.col("variantFunctionalConsequenceFromQtlId") == "SO_0002315",
                    F.lit("GoF"),
                )
                .otherwise(F.lit("noEvaluable")),
            ),
        ).when(
            F.col("datasourceId") == "gene_burden",
            F.when(F.col("targetId").isNotNull(), F.lit("LoF")).otherwise(
                F.lit("noEvaluable")
            ),  ### son tambien no data las que tiene riesgo pero no se ensayan LoF o PT
        )
        #### Eva_germline
        .when(
            F.col("datasourceId") == "eva",
            #### .filter(F.col('variantFunctionalConsequenceId').isin(var_filter_lof))
            F.when(
                F.col("variantFunctionalConsequenceId").isin(var_filter_lof),
                F.lit("LoF"),
            ).otherwise(
                F.lit("noEvaluable")
            ),  ### Son todas aquellas que tenen info pero no son LoF
        )
        #### Eva_somatic
        .when(
            F.col("datasourceId") == "eva_somatic",
            F.when(
                F.col("variantFunctionalConsequenceId").isin(var_filter_lof),
                F.lit("LoF"),
            ).otherwise(
                F.lit("noEvaluable")
            ),  ### Son todas aquellas que tenen info pero no son patogenicas/protective  + LoF
        )
        #### G2P
        .when(
            F.col("datasourceId")
            == "gene2phenotype",  ### 6 types of variants [SO_0002318, SO_0002317, SO_0001622, SO_0002315, SO_0001566, SO_0002220]
            F.when(
                F.col("variantFunctionalConsequenceId") == "SO_0002317",
                F.lit("LoF"),
            )  ### absent gene product
            .when(
                F.col("variantFunctionalConsequenceId") == "SO_0002315",
                F.lit("GoF"),
            )  ### increased gene product level
            .otherwise(F.lit("noEvaluable")),
        )
        #### Orphanet
        .when(
            F.col("datasourceId") == "orphanet",
            F.when(
                F.col("variantFunctionalConsequenceId") == "SO_0002054",
                F.lit("LoF"),
            )  ### Loss of Function Variant
            .when(
                F.col("variantFunctionalConsequenceId") == "SO_0002053",
                F.lit("GoF"),
            )  ### Gain_of_Function Variant
            .otherwise(F.lit("noEvaluable")),
        )
        #### CGC
        .when(
            F.col("datasourceId") == "cancer_gene_census",
            F.when(F.col("TSorOncogene") == "oncogene", F.lit("GoF"))
            .when(F.col("TSorOncogene") == "TSG", F.lit("LoF"))
            .when(F.col("TSorOncogene") == "bivalent", F.lit("bivalent"))
            .otherwise("noEvaluable"),
        )
        #### intogen
        .when(
            F.col("datasourceId") == "intogen",
            F.when(
                F.col("intogenAnnot")
                == 1,  ## oncogene/tummor suppressor for a given trait
                F.when(
                    F.arrays_overlap(
                        F.col("mutatedSamples.functionalConsequenceId"),
                        F.array([F.lit(i) for i in (gof)]),
                    ),
                    F.lit("GoF"),
                ).when(
                    F.arrays_overlap(
                        F.col("mutatedSamples.functionalConsequenceId"),
                        F.array([F.lit(i) for i in (lof)]),
                    ),
                    F.lit("LoF"),
                ),
            )
            .when(
                F.col("intogenAnnot") > 1, F.lit("bivalentIntogen")
            )  ##oncogene & tumor suppressor for a given trait
            .otherwise(F.lit("noEvaluable")),
        )
        #### impc
        .when(
            F.col("datasourceId") == "impc",
            F.when(F.col("diseaseId").isNotNull(), F.lit("LoF")).otherwise(
                F.lit("noEvaluable")
            ),
        )
        ### chembl
        .when(
            F.col("datasourceId") == "chembl",
            F.when(
                F.size(F.array_intersect(F.col("actionType"), F.col("inhibitors_list")))
                >= 1,
                F.lit("LoF"),
            )
            .when(
                F.size(F.array_intersect(F.col("actionType"), F.col("activators_list")))
                >= 1,
                F.lit("GoF"),
            )
            .otherwise(F.lit("noEvaluable")),
        ),
    )
    .withColumn(
        "directionOnTrait",
        ## ot genetics portal
        F.when(
            F.col("datasourceId") == "ot_genetics_portal",  ### the same for gene_burden
            F.when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNull()),
                F.when(F.col("beta") > 0, F.lit("risk"))
                .when(F.col("beta") < 0, F.lit("protect"))
                .otherwise(F.lit("noEvaluable")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNotNull()),
                F.when(F.col("OddsRatio") > 1, F.lit("risk"))
                .when(F.col("OddsRatio") < 1, F.lit("protect"))
                .otherwise(F.lit("noEvaluable")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNull()),
                F.lit("noEvaluable"),
            )
            .when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNotNull()),
                F.lit("conflict/noEvaluable"),
            ),
        ).when(
            F.col("datasourceId") == "gene_burden",
            F.when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNull()),
                F.when(F.col("beta") > 0, F.lit("risk"))
                .when(F.col("beta") < 0, F.lit("protect"))
                .otherwise(F.lit("noEvaluable")),
            )
            .when(
                (F.col("oddsRatio").isNotNull()) & (F.col("beta").isNull()),
                F.when(F.col("oddsRatio") > 1, F.lit("risk"))
                .when(F.col("oddsRatio") < 1, F.lit("protect"))
                .otherwise(F.lit("noEvaluable")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("oddsRatio").isNull()),
                F.lit("noEvaluable"),
            )
            .when(
                (F.col("beta").isNotNull()) & (F.col("oddsRatio").isNotNull()),
                F.lit("conflict"),
            ),
        )
        ## Eva_germline
        .when(
            F.col("datasourceId") == "eva",  ### the same for eva_somatic
            F.when(F.col("clinicalSignificances").rlike("(pathogenic)$"), F.lit("risk"))
            .when(F.col("clinicalSignificances").contains("protect"), F.lit("protect"))
            .otherwise(
                F.lit("noEvaluable")
            ),  ### Son todas aquellas que tenen info pero no son patogenicas/protective  + LoF
        )
        #### Eva_somatic
        .when(
            F.col("datasourceId") == "eva_somatic",
            F.when(F.col("clinicalSignificances").rlike("(pathogenic)$"), F.lit("risk"))
            .when(F.col("clinicalSignificances").contains("protect"), F.lit("protect"))
            .otherwise(
                F.lit("noEvaluable")
            ),  ### Son todas aquellas que tenen info pero no son patogenicas/protective  + LoF
        )
        #### G2P
        .when(
            F.col("datasourceId") == "gene2phenotype",
            F.when(F.col("diseaseId").isNotNull(), F.lit("risk")).otherwise(
                F.lit("noEvaluable")
            ),
        )
        #### Orphanet
        .when(
            F.col("datasourceId") == "orphanet",
            F.when(F.col("diseaseId").isNotNull(), F.lit("risk")).otherwise(
                F.lit("noEvaluable")
            ),
        )
        #### CGC
        .when(
            F.col("datasourceId") == "cancer_gene_census",
            F.when(F.col("diseaseId").isNotNull(), F.lit("risk")).otherwise(
                F.lit("noEvaluable")
            ),
        )
        #### intogen
        .when(
            F.col("datasourceId") == "intogen",
            F.when(F.col("diseaseId").isNotNull(), F.lit("risk")).otherwise(
                F.lit("noEvaluable")
            ),
        )
        #### impc
        .when(
            F.col("datasourceId") == "impc",
            F.when(F.col("diseaseId").isNotNull(), F.lit("risk")).otherwise(
                F.lit("noEvaluable")
            ),
        )
        ### chembl
        .when(
            F.col("datasourceId") == "chembl",
            F.when(F.col("diseaseId").isNotNull(), F.lit("protect")).otherwise(
                F.lit("noEvaluable")
            ),
        ),
    )
    .withColumn(
        "homogenized",
        F.when(
            (F.col("variantEffect") == "LoF") & (F.col("directionOnTrait") == "risk"),
            F.lit("LoF_risk"),
        )
        .when(
            (F.col("variantEffect") == "LoF")
            & (F.col("directionOnTrait") == "protect"),
            F.lit("LoF_protect"),
        )
        .when(
            (F.col("variantEffect") == "GoF") & (F.col("directionOnTrait") == "risk"),
            F.lit("GoF_risk"),
        )
        .when(
            (F.col("variantEffect") == "GoF")
            & (F.col("directionOnTrait") == "protect"),
            F.lit("GoF_protect"),
        )
        .otherwise(F.lit("noEvaluable")),
    )
).persist()

print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    prueba_assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

assessment = prueba_assessment.unionByName(
    gwasCredibleAssoc.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)


def analysis_nonPropagated(assessment, analysisDatasources):
    return discrepancifier(
        assessment.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


def analysis_propagated(assessment, analysisDatasources):
    return discrepancifier(
        assessment.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


chembl_ds = ["chembl"]


def analysis_drugs(assessment, chembl_ds):
    return discrepancifier(
        assessment.filter((F.col("datasourceId").isin(chembl_ds)))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


analysis_chembl = analysis_drugs(assessment, chembl_ds)

#######
## include here the analysis
#######

analysisDatasources = []


def full_analysis_propagation(
    assessment, analysisDatasources, analysis_chembl, terminated_array, diseaseTA
):
    return (
        analysis_propagated(assessment, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        #### Should remove the coherencyDiagonal.isNotNull()
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .persist()
    )


#####
## no propag
#####
def full_analysis_noPropagation(
    assessment, analysisDatasources, analysis_chembl, terminated_array, diseaseTA
):
    return (
        analysis_nonPropagated(assessment, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .persist()
    )


print("moving to Step 3")

from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
c = datetime.now()
print("starting dictionaries at", c)

#### continue here on 10.07.2024

## 1nd dictionary
dfs_dict = {}  ### checked and changed on 01.06.2023
dfs_dict_propag = {}


wocgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    # "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
]
datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
    "WOcgc",
    "somatic",
    "germline",
]

germline_list = [
    "gene_burden",
    "eva",
    # "ot_genetics_portal",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic_list = ["intogen", "cancer_gene_census", "eva_somatic"]

# assessment = prueba_assessment.filter(F.col("datasourceId").isin(datasources_analysis))


def dataset_builder(assessment, value, analysis_chembl, terminated_array, diseaseTA):
    nonPropagated = full_analysis_noPropagation(
        assessment, value, analysis_chembl, terminated_array, diseaseTA
    )
    propagated = full_analysis_propagation(
        assessment, value, analysis_chembl, terminated_array, diseaseTA
    )
    return (
        # Non propagation
        ## All
        nonPropagated,
        ## Other
        nonPropagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        nonPropagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        nonPropagated.filter(F.col("taLabelSimple") == "Oncology"),
        # Propagation
        ## All
        propagated,
        ## Other
        propagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        propagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        propagated.filter(F.col("taLabelSimple") == "Oncology"),
    )


for value in datasource_list:
    print(value)
    if value == "WOcgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment, wocgc_list, analysis_chembl, terminated_array, diseaseTA
        )
    elif value == "germline":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment,
            germline_list,
            analysis_chembl,
            terminated_array,
            diseaseTA,
        )

    elif value == "somatic":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment,
            somatic_list,
            analysis_chembl,
            terminated_array,
            diseaseTA,
        )

    else:
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment, value, analysis_chembl, terminated_array, diseaseTA
        )


def comparisons_df() -> list:
    """Return list of all comparisons to be used in the analysis"""
    comparisons = spark.createDataFrame(
        data=[
            ("hasGeneticEvidence", "byDatatype"),
            ("diagonalYes", "byDatatype"),
            ("oneCellYes", "byDatatype"),
        ],
        schema=StructType(
            [
                StructField("comparison", StringType(), True),
                StructField("comparisonType", StringType(), True),
            ]
        ),
    )

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


result = []
result_st = []
result_ci = []
array2 = []
results = []
"""

def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):

    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)

    uniqIds = df.select("targetId", "diseaseId").distinct().count()

    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    filePath = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    listado.append(
        filePath
    )
    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)

    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    results.append(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            filePath,
        ]
    )
    return results


c = datetime.now()

print("start doing aggregations and writing")
today_date = str(date.today())
aggSetups_original = comparisons_df()
listado = []

print("starting with non-propagated aggregations at", c)

for key, df in dfs_dict.items():
    df = df.persist()
    for row in aggSetups_original:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

print("non propagated files wroten succesfully at", c)

print("starting with non-propagated aggregations at", c)
for key, df in dfs_dict_propag.items():
    df = df.persist()
    for row in aggSetups_original:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

print("propagated files wroten succesfully at", c)

##### read files and make spreadsheet

print("preparing dataframe")

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

df = spreadSheetFormatter(spark.createDataFrame(results, schema=schema))

"""

spark session created at 2024-12-16 13:53:05.366726
Analysis started on 2024-12-16 at  2024-12-16 13:53:05.366726


24/12/16 13:53:06 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:53:09 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:25 WARN CacheManager: Asked to cache already cached data.        
24/12/16 13:54:25 WARN CacheManager: Asked to cache already cached data.


loading sources


24/12/16 13:54:27 WARN CacheManager: Asked to cache already cached data.


Moving to step 2


24/12/16 13:54:28 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:28 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:28 WARN CacheManager: Asked to cache already cached data.


moving to Step 3
starting dictionaries at 2024-12-16 13:54:28.587980
gene_burden


24/12/16 13:54:29 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:29 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:29 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:30 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:30 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:30 WARN CacheManager: Asked to cache already cached data.


intogen


24/12/16 13:54:32 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:32 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:32 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:34 WARN CacheManager: Asked to cache already cached data.        
24/12/16 13:54:34 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:34 WARN CacheManager: Asked to cache already cached data.


cancer_gene_census


24/12/16 13:54:35 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:35 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:35 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:37 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:37 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:37 WARN CacheManager: Asked to cache already cached data.


eva


24/12/16 13:54:38 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:38 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:38 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:40 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:40 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:40 WARN CacheManager: Asked to cache already cached data.


eva_somatic


24/12/16 13:54:41 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:41 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:41 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:41 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:42 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:42 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:42 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:43 WARN CacheManager: Asked to cache already cached data.


ot_genetics_portal


24/12/16 13:54:44 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:54:45 WARN CacheManager: Asked to cache already cached data.


gwas_credible_set


24/12/16 13:54:53 WARN CacheManager: Asked to cache already cached data.        
24/12/16 13:54:59 WARN CacheManager: Asked to cache already cached data.        


impc


24/12/16 13:55:01 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:01 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:01 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:01 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:01 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:03 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:03 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:03 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:03 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:03 WARN CacheManager: Asked to cache already cached data.


orphanet


24/12/16 13:55:04 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:04 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:04 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:06 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:06 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:06 WARN CacheManager: Asked to cache already cached data.


gene2phenotype


24/12/16 13:55:07 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:07 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:07 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:08 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:08 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:08 WARN CacheManager: Asked to cache already cached data.


WOcgc


24/12/16 13:55:16 WARN CacheManager: Asked to cache already cached data.        
24/12/16 13:55:23 WARN CacheManager: Asked to cache already cached data.        


somatic


24/12/16 13:55:24 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:24 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:24 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:26 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:26 WARN CacheManager: Asked to cache already cached data.
24/12/16 13:55:26 WARN CacheManager: Asked to cache already cached data.


germline


24/12/16 13:55:33 WARN CacheManager: Asked to cache already cached data.        
24/12/16 13:55:40 WARN CacheManager: Asked to cache already cached data.        


'\n\ndef aggregations_original(\n    df,\n    data,\n    listado,\n    comparisonColumn,\n    comparisonType,\n    predictionColumn,\n    predictionType,\n    today_date,\n):\n\n    wComparison = Window.partitionBy(comparisonColumn)\n    wPrediction = Window.partitionBy(predictionColumn)\n    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)\n\n    uniqIds = df.select("targetId", "diseaseId").distinct().count()\n\n    out = (\n        df.withColumn("comparisonType", F.lit(comparisonType))\n        .withColumn("predictionType", F.lit(predictionType))\n        .withColumn("total", F.lit(uniqIds))\n        .withColumn("a", F.count("targetId").over(wPredictionComparison))\n        .withColumn(\n            "predictionTotal",\n            F.count("targetId").over(wPrediction),\n        )\n        .withColumn(\n            "comparisonTotal",\n            F.count("targetId").over(wComparison),\n        )\n        .select(\n            F.col(predictionColumn).alias

In [12]:
dfs_dict_propag["df_gwas_credible_set_All_propag"].groupBy("oneCellYes").pivot(
    "Phase4"
).count().show()



+----------+-----+-----+
|oneCellYes|   no|  yes|
+----------+-----+-----+
|        no|59560|21813|
|       yes|   26|   36|
+----------+-----+-----+



                                                                                

In [13]:
dfs_dict["df_gwas_credible_set_All_original"].groupBy("oneCellYes").pivot(
    "Phase4"
).count().show()

+----------+-----+-----+
|oneCellYes|   no|  yes|
+----------+-----+-----+
|        no|59575|21820|
|       yes|   11|   29|
+----------+-----+-----+



### extract numbers from coloc to compare with the ones for Gwas 

## potential fail: we are propagating gwasResolvedColoc and in the following steps is propagated again (check that)

In [None]:
gwasResolvedColoc = (
    (
        newColoc.filter(F.col("rightStudyType") != "gwas")
        .withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="right", ### left becacause is to bring data from 
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

### check numbers of datasets in germline and gwas credible set VS the ones provided by coloc. 

### Run the analysis 

### sanity checks qtls

In [1]:
index.show()

NameError: name 'index' is not defined

In [1]:

#### 11.12.2024
#######
#######     ATENTION
#### change code to work with generated dataframe instead of reading the parquet

"""
This scripts run Odds ratio analysis for DoE and 
genetic information on drug clinical success

"""
from functions import discrepancifier
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    ArrayType,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
from datetime import datetime


spark = SparkSession.builder.getOrCreate()

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"


evidences = spark.read.parquet(f"{path}evidence")

credible = spark.read.parquet(f"{path}credibleSet")

index = spark.read.parquet(f"{path}gwasIndex")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)

24/12/16 15:17:48 WARN YarnScheduler: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient resources
                                                                                

In [3]:
index.groupBy("studyType").count().show()



+---------+------+
|studyType| count|
+---------+------+
|     gwas| 90839|
|     sqtl|184609|
|     pqtl|  3757|
|    tuqtl|301140|
|     eqtl|948158|
|  sctuqtl| 63143|
|   sceqtl|335808|
|   scsqtl| 29350|
+---------+------+



                                                                                

In [8]:
newColoc.show(truncate=False)

+----------------------------------------------------------------------------------+--------------------------------+--------------------------------+----------+--------------+--------------------------+----------------------+----------------------+----------------------+---------------------+--------------------+--------------------+--------------------+-------------------------+-----------------+---------------------+------------------+----------------------+---------------+----------+--------------+-----------+--------------+
|rightStudyId                                                                      |rightStudyLocusId               |leftStudyLocusId                |chromosome|rightStudyType|numberColocalisingVariants|h0                    |h1                    |h2                    |h3                   |h4                  |colocalisationMethod|betaRatioSignAverage|leftStudyId              |leftVariantId    |credibleLeftStudyType|rightVariantId    |credibleRightStudyType|

In [13]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
import pandas as pd

pd.DataFrame.iteritems = pd.DataFrame.items

raw_studies_metadata_schema: StructType = StructType(
        [
            StructField("study_id", StringType(), True),
            StructField("dataset_id", StringType(), True),
            StructField("study_label", StringType(), True),
            StructField("sample_group", StringType(), True),
            StructField("tissue_id", StringType(), True),
            StructField("tissue_label", StringType(), True),
            StructField("condition_label", StringType(), True),
            StructField("sample_size", IntegerType(), True),
            StructField("quant_method", StringType(), True),
            StructField("pmid", StringType(), True),
            StructField("study_type", StringType(), True),
        ]
    )
raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
            pd.read_csv(raw_studies_metadata_path, sep="\t"),
            schema=raw_studies_metadata_schema,
        )

In [18]:
study_table.withColumn("extracted_column", 
    F.concat_ws("_", 
        F.col("study_label"),
        F.col("quant_method"),
        F.col("tissue_label"),
        F.col("condition_label"))
        ).select("extracted_column").show(truncate=False)

+--------------------------------------------------------+
|extracted_column                                        |
+--------------------------------------------------------+
|Alasoo_2018_ge_macrophage_naive                         |
|Alasoo_2018_exon_macrophage_naive                       |
|Alasoo_2018_tx_macrophage_naive                         |
|Alasoo_2018_txrev_macrophage_naive                      |
|Alasoo_2018_leafcutter_macrophage_naive                 |
|Alasoo_2018_ge_macrophage_IFNg_18h                      |
|Alasoo_2018_exon_macrophage_IFNg_18h                    |
|Alasoo_2018_tx_macrophage_IFNg_18h                      |
|Alasoo_2018_txrev_macrophage_IFNg_18h                   |
|Alasoo_2018_leafcutter_macrophage_IFNg_18h              |
|Alasoo_2018_ge_macrophage_Salmonella_5h                 |
|Alasoo_2018_exon_macrophage_Salmonella_5h               |
|Alasoo_2018_tx_macrophage_Salmonella_5h                 |
|Alasoo_2018_txrev_macrophage_Salmonella_5h             

In [23]:
index.select("studyId").show(50,truncate=False)

+------------------------------------------------------------------------------------+
|studyId                                                                             |
+------------------------------------------------------------------------------------+
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000015532.10_17_50354854_50355056 |
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000106771.13_9_109108453_109108488|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000135218.19_7_80546191_80546267  |
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000135899.19_2_230177366_230177537|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000138600.10_15_50748113_50748202 |
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000141524.17_17_78119062_78119296 |
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000168385.18_2_241324663_241325244|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000168994.14_6_3739058_3739161    |
|Alasoo_2018_exon_macrophage_IFNg+Salmonell

In [25]:
index.withColumn("extracted_column", F.split(F.col("studyId"), "_ENS")[0]).select("extracted_column").show(50,truncate=False)



+-------------------------------------------+
|extracted_column                           |
+-------------------------------------------+
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg           |
|Alasoo_2018_exon_macrophage_IFNg 

                                                                                

In [29]:
study_table.show()

+---------+----------+-----------+--------------------+----------+------------+--------------------+-----------+------------+--------+----------+
| study_id|dataset_id|study_label|        sample_group| tissue_id|tissue_label|     condition_label|sample_size|quant_method|    pmid|study_type|
+---------+----------+-----------+--------------------+----------+------------+--------------------+-----------+------------+--------+----------+
|QTS000001| QTD000001|Alasoo_2018|    macrophage_naive|CL_0000235|  macrophage|               naive|         84|          ge|29379200|      bulk|
|QTS000001| QTD000002|Alasoo_2018|    macrophage_naive|CL_0000235|  macrophage|               naive|         84|        exon|29379200|      bulk|
|QTS000001| QTD000003|Alasoo_2018|    macrophage_naive|CL_0000235|  macrophage|               naive|         84|          tx|29379200|      bulk|
|QTS000001| QTD000004|Alasoo_2018|    macrophage_naive|CL_0000235|  macrophage|               naive|         84|       txrev

In [None]:
study_table.select(
    F.concat_ws(
        "_",
        F.col("study_label"),
        F.col("quant_method"),
        F.col("sample_group"),
    ).alias("extracted_column"),
    "study_type",
).show(200,truncate=False)

+--------------------------------------------------------+----------+
|extracted_column                                        |study_type|
+--------------------------------------------------------+----------+
|Alasoo_2018_ge_macrophage_naive                         |bulk      |
|Alasoo_2018_exon_macrophage_naive                       |bulk      |
|Alasoo_2018_tx_macrophage_naive                         |bulk      |
|Alasoo_2018_txrev_macrophage_naive                      |bulk      |
|Alasoo_2018_leafcutter_macrophage_naive                 |bulk      |
|Alasoo_2018_ge_macrophage_IFNg_18h                      |bulk      |
|Alasoo_2018_exon_macrophage_IFNg_18h                    |bulk      |
|Alasoo_2018_tx_macrophage_IFNg_18h                      |bulk      |
|Alasoo_2018_txrev_macrophage_IFNg_18h                   |bulk      |
|Alasoo_2018_leafcutter_macrophage_IFNg_18h              |bulk      |
|Alasoo_2018_ge_macrophage_Salmonella_5h                 |bulk      |
|Alasoo_2018_exon_ma

In [33]:
saved=study_table.select(
    F.concat_ws(
        "_",
        F.col("study_label"),
        F.col("quant_method"),
        F.col("sample_group"),
    ).alias("extracted_column"),
    "study_type",
).join(
    index.withColumn("extracted_column", F.split(F.col("studyId"), "_ENS")[0]),
    on="extracted_column",
    how="right",
).persist()

In [None]:
saved

In [35]:
saved.filter(F.col("study_type").isNull()).show()

+--------------------+----------+--------------------+---------------+-----------+---------+--------------------+------------------------+---------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+-----------------+------+---------+--------+-------+---------------------+----------------+------------------+---------------+-------------+--------------------+-----------+--------------------+---------------+----------+--------------------+-----------+
|    extracted_column|study_type|             studyId|         geneId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|biosampleFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|initialSampleSize|nCases|nControls|nSamples|cohorts|ldPopulationStructure|discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|           con

In [None]:
index.withColumn("extracted_column", 
            F.split(F.col("studyId"), "_ENS")[0]).select("extra")

In [16]:
study_table.withColumn("extracted_column", 
    F.concat_ws("_", 
        F.col("study_label"),
        F.col("quant_method"),
        F.col("tissue_label"),
        F.col("condition_label"))
        ).join(index.withColumn("extracted_column", 
            F.split(F.col("studyId"), "_ENS")[0]
), on="extracted_column", how="right").show()



+--------------------+--------+----------+-----------+------------+---------+------------+---------------+-----------+------------+----+----------+--------------------+---------------+-----------+---------+--------------------+------------------------+---------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+-----------------+------+---------+--------+-------+---------------------+----------------+------------------+---------------+-------------+--------------------+-----------+--------------------+---------------+----------+--------------------+-----------+
|    extracted_column|study_id|dataset_id|study_label|sample_group|tissue_id|tissue_label|condition_label|sample_size|quant_method|pmid|study_type|             studyId|         geneId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|biosampleFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJo

                                                                                

In [12]:
#### read metadata table

raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
            pd.read_csv(raw_studies_metadata_path, sep="\t"),
            schema=raw_studies_metadata_schema,
        )

NameError: name 'pd' is not defined

In [11]:
index.withColumn("extracted_column", F.split(F.col("studyId"), "_ENS")[0]
).select("studyId","extracted_column").show(truncate=False)


+------------------------------------------------------------------------------------+-------------------------------------------+
|studyId                                                                             |extracted_column                           |
+------------------------------------------------------------------------------------+-------------------------------------------+
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000015532.10_17_50354854_50355056 |Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000106771.13_9_109108453_109108488|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000135218.19_7_80546191_80546267  |Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000135899.19_2_230177366_230177537|Alasoo_2018_exon_macrophage_IFNg+Salmonella|
|Alasoo_2018_exon_macrophage_IFNg+Salmonella_ENSG00000138600.10_15_50748113_5074820

In [None]:
study_label + sample_group

In [2]:
index.show()

24/12/16 15:18:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+--------------------+---------------+-----------+---------+--------------------+------------------------+---------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+-----------------+------+---------+--------+-------+---------------------+----------------+------------------+---------------+-------------+--------------------+-----------+--------------------+---------------+----------+--------------------+-----------+
|             studyId|         geneId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|biosampleFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|initialSampleSize|nCases|nControls|nSamples|cohorts|ldPopulationStructure|discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|           condition|sumstatQCValues|diseaseIds|backgroundDiseaseIds|biosample

                                                                                

In [None]:
##### make the contradictions and improve them to run the analysis

In [None]:
import time
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd


spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

print("loaded files")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
    "gwas_credible_set",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")

window_spec = Window.partitionBy("targetId", "diseaseId",'leftStudyId').orderBy( ### include gwas study
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    resolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId",'leftStudyId', "homogenized")
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

assessment_all = assessment.unionByName(
    gwasCredibleAssoc.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)

print("defining non propagated,propagated and analysis_drugs functions")

def analysis_nonPropagated(assessment_all, analysisDatasources):
    return discrepancifier(
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )


def analysis_propagated(assessment_all, analysisDatasources):
    return discrepancifier(
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )

chembl_ds = ["chembl"]

def analysis_drugs(assessment_all, chembl_ds):
    return discrepancifier(
        assessment_all.filter((F.col("datasourceId").isin(chembl_ds))
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


analysis_chembl = analysis_drugs(assessment_all, chembl_ds)

#######
## include here the analysis
#######

analysisDatasources = []

print("defining full_analysis_propagation")

doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

def full_analysis_propagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]
    
    return (
        analysis_propagated(assessment_all, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        #### Should remove the coherencyDiagonal.isNotNull()
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        ).withColumn(
            "arrayN", F.array(*[F.col(c) for c in doe_columns])
        ).withColumn(
            "maxDoE", F.array_max(F.col("arrayN"))
        ).withColumn("maxDoE_names", F.array(*conditions)
        ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when(F.col("GoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        )
        # .persist()
    )


#####
## no propag
#####
print("defining full analysis no propagation")


def full_analysis_noPropagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    return (
        analysis_nonPropagated(assessment_all, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        ).withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when(F.col("GoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        )
        # .persist()
    )

print("moving to Step 3")

from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
c = datetime.now()
print("starting dictionaries at", c)

#### continue here on 10.07.2024

## 1nd dictionary
dfs_dict = {}  ### checked and changed on 01.06.2023
dfs_dict_propag = {}


wocgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
]
wCgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
    "cancer_gene_census",
]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
    "WOcgc",
    "wCgc",
    "somatic",
    "germline",
]

germline_list = [
    "gene_burden",
    "eva",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic_list = ["intogen", "cancer_gene_census", "eva_somatic"]


# assessment = prueba_assessment.filter(F.col("datasourceId").isin(datasources_analysis))
def dataset_builder(assessment_all, value, analysis_chembl, negativeTD, diseaseTA):
    nonPropagated = full_analysis_noPropagation(
        doe_columns,assessment_all, value, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
    )
    propagated = full_analysis_propagation(
        doe_columns,assessment_all, value, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
    )
    return (
        # Non propagation
        ## All
        nonPropagated,
        ## Other
        nonPropagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        nonPropagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        nonPropagated.filter(F.col("taLabelSimple") == "Oncology"),
        # Propagation
        ## All
        propagated,
        ## Other
        propagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        propagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        propagated.filter(F.col("taLabelSimple") == "Oncology"),
    )


for value in datasource_list:
    print(value)
    if value == "WOcgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all, wocgc_list, analysis_chembl, negativeTD, diseaseTA
        )
    elif value == "wCgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all, wCgc_list, analysis_chembl, negativeTD, diseaseTA
        )
    elif value == "germline":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all,
            germline_list,
            analysis_chembl,
            negativeTD,
            diseaseTA,
        )

    elif value == "somatic":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all,
            somatic_list,
            analysis_chembl,
            negativeTD,
            diseaseTA,
        )

    else:
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"]
        ) = dataset_builder(
            assessment_all, value, analysis_chembl, negativeTD, diseaseTA
        )


def comparisons_df() -> list:
    """Return list of all comparisons to be used in the analysis"""
    comparisons = spark.createDataFrame(
        data=[
            ("hasGeneticEvidence", "byDatatype"),
            ("diagonalYes", "byDatatype"),
            ("oneCellYes", "byDatatype"),
            ("NdiagonalYes", "byDatatype"),
            ("NoneCellYes", "byDatatype"),
        ],
        schema=StructType(
            [
                StructField("comparison", StringType(), True),
                StructField("comparisonType", StringType(), True),
            ]
        ),
    )

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc


25/04/24 00:54:45 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:46 WARN CacheManager: Asked to cache already cached data.


run temporary direction of effect
Moving to step 2
defining non propagated,propagated and analysis_drugs functions


25/04/24 00:54:47 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:47 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:47 WARN CacheManager: Asked to cache already cached data.


defining full_analysis_propagation
defining full analysis no propagation
moving to Step 3
starting dictionaries at 2025-04-24 00:54:47.098964
gene_burden


25/04/24 00:54:47 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:47 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:48 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:48 WARN CacheManager: Asked to cache already cached data.


intogen


25/04/24 00:54:49 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:49 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:50 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:50 WARN CacheManager: Asked to cache already cached data.


cancer_gene_census


25/04/24 00:54:51 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:51 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:52 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:52 WARN CacheManager: Asked to cache already cached data.


eva


25/04/24 00:54:53 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:53 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:54 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:54 WARN CacheManager: Asked to cache already cached data.


eva_somatic


25/04/24 00:54:55 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:55 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:55 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:56 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:56 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:54:56 WARN CacheManager: Asked to cache already cached data.


gwas_credible_set


25/04/24 00:55:14 WARN CacheManager: Asked to cache already cached data.        ]
25/04/24 00:55:30 WARN CacheManager: Asked to cache already cached data.        ]


impc


25/04/24 00:55:31 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:31 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:31 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:31 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:32 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:32 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:32 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:32 WARN CacheManager: Asked to cache already cached data.


orphanet


25/04/24 00:55:33 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:33 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:34 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:34 WARN CacheManager: Asked to cache already cached data.


gene2phenotype


25/04/24 00:55:35 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:35 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:36 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:55:36 WARN CacheManager: Asked to cache already cached data.


WOcgc


                                                                                ]

wCgc


                                                                                ]

somatic


25/04/24 00:56:43 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:56:43 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:56:44 WARN CacheManager: Asked to cache already cached data.
25/04/24 00:56:44 WARN CacheManager: Asked to cache already cached data.


germline


                                                                                ]

In [142]:
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

dfs_dict_propag[f"df_gwas_credible_set_All_propag"].filter(F.col('coherencyDiagonal')=='dispar').withColumn(
    "N_oneCellAgreeDrug",
    F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
          ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
).withColumn(
    "N_diagonalAgreeDrug",
    F.when(F.col("LoF_protect_ch").isNotNull() & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when(F.col("GoF_protect_ch").isNotNull() & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).groupBy('diagonalYes','N_oneCellAgreeDrug','N_diagonalAgreeDrug').count().show()

+-----------+------------------+-------------------+-----+
|diagonalYes|N_oneCellAgreeDrug|N_diagonalAgreeDrug|count|
+-----------+------------------+-------------------+-----+
|         no|                no|                 no|   42|
|         no|                no|                yes|   28|
|         no|               yes|                yes|   35|
+-----------+------------------+-------------------+-----+



In [155]:
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']




dfs_dict_propag[f"df_gwas_credible_set_All_propag"].withColumn(
    "maxDoEArrayN",
    F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
).withColumn(
    "N_oneCellAgreeDrug",
    F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
    .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
          ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
).withColumn(
    "N_diagonalAgreeDrug",
    F.when(F.col("LoF_protect_ch").isNotNull() & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
        F.lit("yes")
    ).when(F.col("GoF_protect_ch").isNotNull() & 
        (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
        F.lit("yes")
    ).otherwise(F.lit('no'))
).groupBy('diagonalYes','N_diagonalAgreeDrug','oneCellYes','N_oneCellAgreeDrug').count().show()

#).filter(F.col('maxDoEArrayN')==2).count()

#.show(200,truncate=False)
#).groupBy('maxDoEArrayN','N_diagonalAgreeDrug').count().show()
#).filter((F.col('diagonalYes')=='no') & (F.col('N_diagonalAgreeDrug')=='yes')).show()

+-----------+-------------------+----------+------------------+-----+
|diagonalYes|N_diagonalAgreeDrug|oneCellYes|N_oneCellAgreeDrug|count|
+-----------+-------------------+----------+------------------+-----+
|        yes|                yes|        no|                no|  111|
|         no|                 no|        no|                no|73934|
|        yes|                yes|        no|               yes|    5|
|         no|                yes|        no|                no|   37|
|        yes|                yes|       yes|               yes|   72|
|         no|                yes|        no|               yes|   38|
+-----------+-------------------+----------+------------------+-----+



In [None]:
#### Solo necesito meter las columnas y correr 

In [None]:
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']
F.size(F.array_intersect(F.col("your_array_column"), F.array([F.lit(x) for x in my_list]))) > 0
)

In [122]:
dfs_dict_propag[f"df_gwas_credible_set_All_propag"].groupBy('diagonalYes_number','diagonalYes').count().show()

+------------------+-----------+-----+
|diagonalYes_number|diagonalYes|count|
+------------------+-----------+-----+
|                no|         no|74009|
|                no|        yes|  188|
+------------------+-----------+-----+



In [None]:
dfs_dict_propag[f"df_gwas_credible_set_All_propag"].

In [5]:
dfs_dict[f"df_gwas_credible_set_All_original"].printSchema()

root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- GoF_protect: long (nullable = true)
 |-- GoF_risk: long (nullable = true)
 |-- LoF_protect: long (nullable = true)
 |-- LoF_risk: long (nullable = true)
 |-- noEvaluable: void (nullable = true)
 |-- coherencyDiagonal: string (nullable = true)
 |-- coherencyOneCell: string (nullable = true)
 |-- maxClinPhase: double (nullable = true)
 |-- coherencyDiagonal_ch: string (nullable = true)
 |-- coherencyOneCell_ch: string (nullable = true)
 |-- LoF_protect_ch: long (nullable = true)
 |-- GoF_protect_ch: long (nullable = true)
 |-- geneticEvidence: string (nullable = false)
 |-- diagonalAgreeWithDrugs: string (nullable = true)
 |-- oneCellAgreeWithDrugs: string (nullable = true)
 |-- Phase4: string (nullable = false)
 |-- Phase>=3: string (nullable = false)
 |-- Phase>=2: string (nullable = false)
 |-- Phase>=1: string (nullable = false)
 |-- stopReason: string (nullable = true)
 |-- PhaseT: string (nul

In [None]:
dfs_dict[f"df_gwas_credible_set_All_original"].filter(
    F.col("coherencyDiagonal").isNotNull()
).groupBy("coherencyDiagonal", "coherencyOneCell", "hasGeneticEvidence").count().show()

+-----------------+----------------+------------------+-----+
|coherencyDiagonal|coherencyOneCell|hasGeneticEvidence|count|
+-----------------+----------------+------------------+-----+
|         coherent|        coherent|               yes|  274|
|         coherent|          dispar|               yes|   15|
|           dispar|          dispar|               yes|   53|
+-----------------+----------------+------------------+-----+



In [12]:
dfs_dict[f"df_gwas_credible_set_All_original"].filter(
    F.col("coherencyDiagonal").isNotNull()).groupBy('maxClinPhase','coherencyDiagonal').count().sort(F.col('maxClinPhase')).show()

+------------+-----------------+-----+
|maxClinPhase|coherencyDiagonal|count|
+------------+-----------------+-----+
|         0.5|         coherent|    1|
|         1.0|           dispar|    1|
|         1.0|         coherent|   25|
|         2.0|           dispar|   21|
|         2.0|         coherent|   96|
|         3.0|         coherent|   75|
|         3.0|           dispar|   11|
|         4.0|           dispar|   20|
|         4.0|         coherent|   92|
+------------+-----------------+-----+



In [14]:
dfs_dict[f"df_gwas_credible_set_All_original"].groupBy('Phase4','coherencyDiagonal').count().sort(F.col('Phase4')).show()

+------+-----------------+-----+
|Phase4|coherencyDiagonal|count|
+------+-----------------+-----+
|    no|             NULL|67713|
|    no|         coherent|  197|
|    no|           dispar|   33|
|   yes|             NULL| 6142|
|   yes|           dispar|   20|
|   yes|         coherent|   92|
+------+-----------------+-----+



In [None]:
dfs_dict[f"df_gwas_credible_set_All_original"].filter(
    F.col("coherencyDiagonal").isNotNull()).groupBy('maxClinPhase','coherencyDiagonal').count().sort(F.col('maxClinPhase')).show()

In [9]:
dfs_dict_propag[f"df_gwas_credible_set_All_propag"].filter(
    F.col("coherencyDiagonal").isNotNull()
).groupBy("coherencyDiagonal", "coherencyOneCell", "hasGeneticEvidence").count().show()

                                                                                

+-----------------+----------------+------------------+-----+
|coherencyDiagonal|coherencyOneCell|hasGeneticEvidence|count|
+-----------------+----------------+------------------+-----+
|         coherent|        coherent|               yes|  303|
|         coherent|          dispar|               yes|   15|
|           dispar|          dispar|               yes|  105|
+-----------------+----------------+------------------+-----+



In [None]:
dfs_dict[f"df_gwas_credible_set_All_original"].withColumn(
        "numbers_coherencyDiagonal",
        F.when(
            (F.col("LoF_risk").isNull())
            & (F.col("LoF_protect").isNull())
            & (F.col("GoF_risk").isNull())
            & (F.col("GoF_protect").isNull())
            & (F.col("noEvaluable").isNull()),
            F.lit("noEvid"),
        )
        .when(
            (F.col("LoF_risk").isNull())
            & (F.col("LoF_protect").isNull())
            & (F.col("GoF_risk").isNull())
            & (F.col("GoF_protect").isNull())
            & (F.col("noEvaluable").isNotNull()),
            F.lit("EvidNotDoE"),
        )
        .when(
            ((F.col("LoF_risk")) | (F.col("GoF_protect").isNotNull())) > 
            (F.col("LoF_protect").isNotNull()) (F.col("GoF_risk").isNotNull())
            ,
            F.when(
                ((F.col("GoF_risk").isNotNull()) & (F.col("LoF_risk").isNotNull())),
                F.lit("dispar"),
            )
            .when(
                ((F.col("LoF_protect").isNotNull()) & (F.col("LoF_risk").isNotNull())),
                F.lit("dispar"),
            )
            .when(
                ((F.col("GoF_protect").isNotNull()) & (F.col("GoF_risk").isNotNull())),
                F.lit("dispar"),
            )
            .when(
                (
                    (F.col("GoF_protect").isNotNull())
                    & (F.col("LoF_protect").isNotNull())
                ),
                F.lit("dispar"),
            )
            .otherwise(F.lit("coherent")),
        ),
    ).withColumn(
        "coherencyOneCell",
        F.when(
            (F.col("LoF_risk").isNull())
            & (F.col("LoF_protect").isNull())
            & (F.col("GoF_risk").isNull())
            & (F.col("GoF_protect").isNull())
            & (F.col("noEvaluable").isNull()),
            F.lit("noEvid"),
        )
        .when(
            (F.col("LoF_risk").isNull())
            & (F.col("LoF_protect").isNull())
            & (F.col("GoF_risk").isNull())
            & (F.col("GoF_protect").isNull())
            & (F.col("noEvaluable").isNotNull()),
            F.lit("EvidNotDoE"),
        )
        .when(
            (F.col("LoF_risk").isNotNull())
            | (F.col("LoF_protect").isNotNull())
            | (F.col("GoF_risk").isNotNull())
            | (F.col("GoF_protect").isNotNull()),
            F.when(
                F.col("LoF_risk").isNotNull()
                & (
                    (F.col("LoF_protect").isNull())
                    & (F.col("GoF_risk").isNull())
                    & (F.col("GoF_protect").isNull())
                ),
                F.lit("coherent"),
            )
            .when(
                F.col("GoF_risk").isNotNull()
                & (
                    (F.col("LoF_protect").isNull())
                    & (F.col("LoF_risk").isNull())
                    & (F.col("GoF_protect").isNull())
                ),
                F.lit("coherent"),
            )
            .when(
                F.col("LoF_protect").isNotNull()
                & (
                    (F.col("LoF_risk").isNull())
                    & (F.col("GoF_risk").isNull())
                    & (F.col("GoF_protect").isNull())
                ),
                F.lit("coherent"),
            )
            .when(
                F.col("GoF_protect").isNotNull()
                & (
                    (F.col("LoF_protect").isNull())
                    & (F.col("GoF_risk").isNull())
                    & (F.col("LoF_risk").isNull())
                ),
                F.lit("coherent"),
            )
            .otherwise(F.lit("dispar")),
        ),
    )

In [15]:
assessment_all.show()

                                                                                ]

+------------------+---------------+-------------+-------------------+--------+----------+----+---------------------------+---------------------------+---------------------------------+--------------------------------+-----------------+-------------+----------+--------------------+--------+-------------+---------------------+--------------+-----------------+--------+----------------+---------------+----------+--------+-------------------+----------------+----------------+--------------------+-------------------+-------------------------+-------------------------------------+-------------------------------------+--------------+------+------------+-----------------+--------------------+----------------------------+-------------------+--------------------+---------+--------------------------------+--------------------------------+--------------+--------------+--------+---------+----------+------------+-----------+--------------+-------------+----+------------------------+-----------------

In [17]:
analysisDatasources

[]

In [19]:
assessment_all.filter(F.col("datasourceId").isin(wCgc_list)).groupBy('homogenized').count().show()



+-----------+-------+
|homogenized|  count|
+-----------+-------+
|noEvaluable|2891236|
|   GoF_risk|3993490|
|LoF_protect|3535651|
|   LoF_risk|5214499|
|GoF_protect|3956367|
+-----------+-------+



                                                                                

In [22]:
w = Window.partitionBy("targetId", "diseaseId","homogenized")

df_with_count = assessment_all.filter(F.col("datasourceId").isin(wCgc_list)).withColumn(
    "homogenized_count", F.count("homogenized").over(w)
)

# Then, compute the max of those counts over the same window
df_with_max = df_with_count.withColumn(
    "maxDoE", F.max("homogenized_count").over(w)
)

In [84]:
discrepancifier(df_with_max.filter((F.col('targetId')=='ENSG00000198911') & (F.col('diseaseId')=='EFO_0006843')).groupBy(
    "targetId", "diseaseId", "homogenized", "maxDoE"
).pivot('homogenized').count()).show()

[Stage 3508:>                                                       (0 + 1) / 1]

+---------------+-----------+-----------+------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|  diseaseId|homogenized|maxDoE|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-----------+-----------+------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000198911|EFO_0006843|GoF_protect|    14|         14|    NULL|       NULL|    NULL|       NULL|         coherent|        coherent|
+---------------+-----------+-----------+------+-----------+--------+-----------+--------+-----------+-----------------+----------------+



                                                                                

In [85]:
test=assessment_all.filter((F.col('targetId')=='ENSG00000198911') & (F.col('diseaseId')=='EFO_0006843')).filter(F.col("datasourceId").isin(wCgc_list)).withColumn(
    "homogenized_count", F.count("homogenized").over(w)
).persist()

In [92]:
w = Window.partitionBy("targetId", "diseaseId","homogenized")
discrepancifier(test.withColumn(
    "homogenized_count", F.count("homogenized").over(w)
).groupBy('targetId','diseaseId','homogenized_count').pivot('homogenized').count()).show()

+---------------+-----------+-----------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|  diseaseId|homogenized_count|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-----------+-----------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000198911|EFO_0006843|               14|         14|    NULL|       NULL|    NULL|       NULL|         coherent|        coherent|
+---------------+-----------+-----------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+



In [88]:
discrepancifier(test.groupBy('targetId','diseaseId').pivot('homogenized').count()).show()

+---------------+-----------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|  diseaseId|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-----------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000198911|EFO_0006843|         14|    NULL|       NULL|    NULL|       NULL|         coherent|        coherent|
+---------------+-----------+-----------+--------+-----------+--------+-----------+-----------------+----------------+



In [97]:
test2=assessment_all.filter((F.col('targetId')=='ENSG00000085721') & (F.col('diseaseId')=='EFO_0005110')).filter(F.col("datasourceId").isin(wCgc_list)).withColumn(
    "homogenized_count", F.count("homogenized").over(w)
).persist()

In [106]:
columnas=['GoF_risk','GoF_protect','LoF_protect','LoF_risk']

In [None]:
discrepancifier(
    test2.groupBy("targetId", "diseaseId").pivot("homogenized").count()
).withColumn("arrayN", F.array(*columnas)).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
).withColumn(
    "neutralDiagonal",
    F.when(
        (F.col("coherencyDiagonal") == "dispar")
        & ((F.col("LoF_protect") == F.col('maxDoE')) | (F.col("GoF_risk") == F.col('maxDoE')))
        & ((F.col("LoF_risk") == F.col('maxDoE')) | (F.col("GoF_protect") == F.col('maxDoE'))),
        F.lit("noResol"),
    ).otherwise(F.lit('this'))
).show()

25/04/23 23:39:55 WARN CacheManager: Asked to cache already cached data.
25/04/23 23:39:55 WARN CacheManager: Asked to cache already cached data.
25/04/23 23:39:55 WARN CacheManager: Asked to cache already cached data.


+---------------+-----------+-----------+--------+--------+-----------+-----------+-----------------+----------------+-------------------+------+---------------+
|       targetId|  diseaseId|LoF_protect|LoF_risk|GoF_risk|GoF_protect|noEvaluable|coherencyDiagonal|coherencyOneCell|             arrayN|maxDoE|neutralDiagonal|
+---------------+-----------+-----------+--------+--------+-----------+-----------+-----------------+----------------+-------------------+------+---------------+
|ENSG00000085721|EFO_0005110|         12|       6|    NULL|       NULL|       NULL|           dispar|          dispar|[NULL, NULL, 12, 6]|    12|           this|
+---------------+-----------+-----------+--------+--------+-----------+-----------+-----------------+----------------+-------------------+------+---------------+



In [116]:
from pyspark.sql import functions as F

columnas = ["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]

# Add array of values
df = discrepancifier(
    test2.groupBy("targetId", "diseaseId").pivot("homogenized").count()
).withColumn(
    "arrayN", F.array(*[F.col(c) for c in columnas])
).withColumn(
    "maxDoE", F.array_max(F.col("arrayN"))
)

# Create array of column names where value == maxDoE
conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in columnas
]

df = df.withColumn("maxDoE_names", F.array(*conditions))
df = df.withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)"))


25/04/23 23:44:12 WARN CacheManager: Asked to cache already cached data.
25/04/23 23:44:12 WARN CacheManager: Asked to cache already cached data.
25/04/23 23:44:12 WARN CacheManager: Asked to cache already cached data.


In [None]:
df.withColumn('diagonalYes', F.when)

+---------------+-----------+-----------+--------+--------+-----------+-----------+-----------------+----------------+-------------------+------+-------------+
|       targetId|  diseaseId|LoF_protect|LoF_risk|GoF_risk|GoF_protect|noEvaluable|coherencyDiagonal|coherencyOneCell|             arrayN|maxDoE| maxDoE_names|
+---------------+-----------+-----------+--------+--------+-----------+-----------+-----------------+----------------+-------------------+------+-------------+
|ENSG00000085721|EFO_0005110|         12|       6|    NULL|       NULL|       NULL|           dispar|          dispar|[12, NULL, 6, NULL]|    12|[LoF_protect]|
+---------------+-----------+-----------+--------+--------+-----------+-----------+-----------------+----------------+-------------------+------+-------------+



In [95]:
discrepancifier(assessment_all.groupBy('targetId','diseaseId').pivot('homogenized').count()).filter(F.col('coherencyDiagonal')=='dispar').show()



+---------------+-------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|    diseaseId|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000085721|  EFO_0005110|       NULL|    NULL|         12|       6|       NULL|           dispar|          dispar|
|ENSG00000135409|  EFO_0004732|       NULL|      12|       NULL|       6|       NULL|           dispar|          dispar|
|ENSG00000186787|  EFO_0004324|       NULL|      57|         57|       6|       NULL|           dispar|          dispar|
|ENSG00000176014|  EFO_1000357|          1|    NULL|         11|    NULL|       NULL|           dispar|          dispar|
|ENSG00000183624|  EFO_0004730|          6|    NULL|          6|    NULL|       NULL|           dispar|          dispar|
|ENSG00000180210|MONDO_0004634| 

                                                                                

In [66]:
discrepancifier(df_with_max2).filter(F.col('coherencyDiagonal')=='dispar').show()



+---------------+-------------+-----------------+------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|    diseaseId|homogenized_count|maxDoE|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+-----------------+------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000110492|  EFO_0008354|                5|     5|       NULL|    NULL|          5|       5|       NULL|           dispar|          dispar|
|ENSG00000133030|  EFO_0004587|               21|    21|         21|    NULL|         21|    NULL|       NULL|           dispar|          dispar|
|ENSG00000133606|  EFO_0004742|                2|     2|       NULL|    NULL|          2|       2|       NULL|           dispar|          dispar|
|ENSG00000178573|  EFO_0000508|                1|     1|       NULL|    NULL|          1|       1|       NULL|           dis

                                                                                

In [30]:
from pyspark.sql import functions as F

# Get the list of homogenized categories (the pivoted column names)
homogenized_cols = [col for col in df_with_max2.columns if col not in {"targetId", "diseaseId", "homogenized_count", "maxDoE"}]

# Create a new column to capture the column name where value == maxDoE
df_result = df_with_max2.withColumn(
    "maxHomogenizedLabel",
    F.expr(
        "CASE " +
        " ".join(
            [f"WHEN `{col}` = maxDoE THEN '{col}'" for col in homogenized_cols]
        ) +
        " ELSE NULL END"
    )
)


In [None]:
### evaluate this only in the ones that are uncoherent, and different from 1 vs 1. 

In [38]:
discrepancifier(df_result).filter(F.col('coherencyDiagonal')=='dispar').show()



+---------------+-------------+-----------------+------+-----------+--------+-----------+--------+-----------+-------------------+-----------------+----------------+
|       targetId|    diseaseId|homogenized_count|maxDoE|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|maxHomogenizedLabel|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+-----------------+------+-----------+--------+-----------+--------+-----------+-------------------+-----------------+----------------+
|ENSG00000037280|  EFO_0000349|                1|     1|       NULL|       1|       NULL|       1|       NULL|           GoF_risk|           dispar|          dispar|
|ENSG00000113083|  EFO_0006841|                2|     2|          2|       2|       NULL|    NULL|       NULL|        GoF_protect|           dispar|          dispar|
|ENSG00000179889|  EFO_0008592|                3|     3|       NULL|       3|       NULL|       3|       NULL|           GoF_risk|           dispar|          dispar|
|ENS

                                                                                

In [42]:
df_with_max.show()



+-----------------+---------------+-------------+-------------------+--------+----------+----+---------------------------+---------------------------+---------------------------------+--------------------------------+-----------------+-------------+----------+--------------------+--------+-------------+---------------------+--------------+-----------------+--------+--------------------+---------------+--------------------+--------+-------------------+-------------------+----------------+--------------------+-------------------+-------------------------+-------------------------------------+-------------------------------------+--------------+------+------------+-----------------+--------------------+----------------------------+-------------------+--------------+---------+--------------------------------+--------------------------------+--------------+--------------+--------+---------+----------+------------+-----------+--------------+-------------+----+------------------------+-------

                                                                                

In [65]:
discrepancifier(df_result).withColumn(
    "neutralDiagonal",
    F.when(
        (F.col("coherencyDiagonal") == "dispar")
        & ((F.col("LoF_protect") == F.col('maxDoE')) | (F.col("GoF_risk") == F.col('maxDoE')))
        & ((F.col("LoF_risk") == F.col('maxDoE')) | (F.col("GoF_protect") == F.col('maxDoE'))),
        F.lit("neutral"),
    ).otherwise(F.col("maxHomogenizedLabel")),
).groupBy('neutralDiagonal','coherencyDiagonal').count().show()



+---------------+-----------------+------+
|neutralDiagonal|coherencyDiagonal| count|
+---------------+-----------------+------+
|    GoF_protect|         coherent| 68353|
|       LoF_risk|         coherent|760316|
|    LoF_protect|         coherent| 68390|
|    noEvaluable|       EvidNotDoE| 84646|
|        neutral|           dispar|  6510|
|       GoF_risk|         coherent| 86618|
+---------------+-----------------+------+



                                                                                

In [64]:
discrepancifier(df_result).withColumn(
    "neutralDiagonal",
    F.when(
        (F.col("coherencyDiagonal") == "dispar")
        & ((F.col("LoF_protect") == F.col('maxDoE')) | (F.col("GoF_risk") == F.col('maxDoE')))
        & ((F.col("LoF_risk") == F.col('maxDoE')) | (F.col("GoF_protect") == F.col('maxDoE'))),
        F.lit("neutral"),
    ).otherwise(F.col("maxHomogenizedLabel")),
).filter(F.col('neutralDiagonal')=='neutral').show()



+---------------+-----------+-----------------+------+-----------+--------+-----------+--------+-----------+-------------------+-----------------+----------------+---------------+
|       targetId|  diseaseId|homogenized_count|maxDoE|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|maxHomogenizedLabel|coherencyDiagonal|coherencyOneCell|neutralDiagonal|
+---------------+-----------+-----------------+------+-----------+--------+-----------+--------+-----------+-------------------+-----------------+----------------+---------------+
|ENSG00000153815|EFO_0004529|              305|   305|       NULL|    NULL|        305|     305|       NULL|        LoF_protect|           dispar|          dispar|        neutral|
|ENSG00000198099|EFO_0004530|               28|    28|       NULL|      28|       NULL|      28|       NULL|           GoF_risk|           dispar|          dispar|        neutral|
|ENSG00000197653|EFO_0803546|               44|    44|         44|      44|       NULL|      44|    

                                                                                

In [None]:
'from pyspark.sql.functions import col, array, aggregate, lit

# Identify the pivot columns only (exclude non-pivot ones)
non_pivot_cols = {"targetId", "diseaseId", "homogenized_count", "maxDoE",'maxHomogenizedLabel','noEvaluable'}
pivot_cols = [c for c in df_result.columns if c not in non_pivot_cols]

# Create array of comparisons (col == maxDoE)
comparisons = array(*[(col(c) == col("maxDoE")).cast("int") for c in pivot_cols])

# Use `aggregate` to count how many columns equal the max
df_with_discrepancy = df_result.withColumn(
    "max_matches",
    aggregate(comparisons, lit(0), lambda acc, x: acc + x)
).withColumn(
    "hasDiscrepancy", F.when(
    F.col("max_matches") >= 2, F.lit('dispar')
).otherwise(F.lit('coherent')))

In [56]:
from pyspark.sql.functions import col, array, aggregate, lit

# List the columns you want to compare
cols_to_check = ["GoF_protect", "GoF_risk", "LoF_protect", "LoF_risk"]

# Create array of Boolean comparisons, cast to int (1 if match, else 0)
comparison_array = array(*[(col(c) == col("maxDoE")).cast("int") for c in cols_to_check])

# Count how many columns matched the maxDoE
df_with_check = df_result.withColumn(
    "num_max_matches",
    aggregate(comparison_array, lit(0), lambda acc, x: acc + x)
).withColumn(
    "has_discrepancy",
    col("num_max_matches") >= 2
)


In [57]:

discrepancifier(df_with_check).filter(
    F.col("coherencyDiagonal") == "dispar"
).groupBy('hasDiscrepancy').count().show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `hasDiscrepancy` cannot be resolved. Did you mean one of the following? [`has_discrepancy`, `diseaseId`, `GoF_protect`, `GoF_risk`, `LoF_protect`].;
'Aggregate ['hasDiscrepancy], ['hasDiscrepancy, count(1) AS count#227838L]
+- Filter (coherencyDiagonal#227794 = dispar)
   +- Project [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, GoF_protect#190391L, GoF_risk#190392L, LoF_protect#190393L, LoF_risk#190394L, noEvaluable#190395L, maxHomogenizedLabel#190426, num_max_matches#227766, has_discrepancy#227781, coherencyDiagonal#227794, CASE WHEN ((((isnull(LoF_risk#190394L) AND isnull(LoF_protect#190393L)) AND isnull(GoF_risk#190392L)) AND isnull(GoF_protect#190391L)) AND isnull(noEvaluable#190395L)) THEN noEvid WHEN ((((isnull(LoF_risk#190394L) AND isnull(LoF_protect#190393L)) AND isnull(GoF_risk#190392L)) AND isnull(GoF_protect#190391L)) AND isnotnull(noEvaluable#190395L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#190394L) OR isnotnull(LoF_protect#190393L)) OR isnotnull(GoF_risk#190392L)) OR isnotnull(GoF_protect#190391L)) THEN CASE WHEN (isnotnull(LoF_risk#190394L) AND ((isnull(LoF_protect#190393L) AND isnull(GoF_risk#190392L)) AND isnull(GoF_protect#190391L))) THEN coherent WHEN (isnotnull(GoF_risk#190392L) AND ((isnull(LoF_protect#190393L) AND isnull(LoF_risk#190394L)) AND isnull(GoF_protect#190391L))) THEN coherent WHEN (isnotnull(LoF_protect#190393L) AND ((isnull(LoF_risk#190394L) AND isnull(GoF_risk#190392L)) AND isnull(GoF_protect#190391L))) THEN coherent WHEN (isnotnull(GoF_protect#190391L) AND ((isnull(LoF_protect#190393L) AND isnull(GoF_risk#190392L)) AND isnull(LoF_risk#190394L))) THEN coherent ELSE dispar END END AS coherencyOneCell#227808]
      +- Project [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, GoF_protect#190391L, GoF_risk#190392L, LoF_protect#190393L, LoF_risk#190394L, noEvaluable#190395L, maxHomogenizedLabel#190426, num_max_matches#227766, has_discrepancy#227781, CASE WHEN ((((isnull(LoF_risk#190394L) AND isnull(LoF_protect#190393L)) AND isnull(GoF_risk#190392L)) AND isnull(GoF_protect#190391L)) AND isnull(noEvaluable#190395L)) THEN noEvid WHEN ((((isnull(LoF_risk#190394L) AND isnull(LoF_protect#190393L)) AND isnull(GoF_risk#190392L)) AND isnull(GoF_protect#190391L)) AND isnotnull(noEvaluable#190395L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#190394L) OR isnotnull(LoF_protect#190393L)) OR isnotnull(GoF_risk#190392L)) OR isnotnull(GoF_protect#190391L)) THEN CASE WHEN (isnotnull(GoF_risk#190392L) AND isnotnull(LoF_risk#190394L)) THEN dispar WHEN (isnotnull(LoF_protect#190393L) AND isnotnull(LoF_risk#190394L)) THEN dispar WHEN (isnotnull(GoF_protect#190391L) AND isnotnull(GoF_risk#190392L)) THEN dispar WHEN (isnotnull(GoF_protect#190391L) AND isnotnull(LoF_protect#190393L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#227794]
         +- Project [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, GoF_protect#190391L, GoF_risk#190392L, LoF_protect#190393L, LoF_risk#190394L, noEvaluable#190395L, maxHomogenizedLabel#190426, num_max_matches#227766, (num_max_matches#227766 >= 2) AS has_discrepancy#227781]
            +- Project [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, GoF_protect#190391L, GoF_risk#190392L, LoF_protect#190393L, LoF_risk#190394L, noEvaluable#190395L, maxHomogenizedLabel#190426, aggregate(array(cast((GoF_protect#190391L = maxDoE#176188L) as int), cast((GoF_risk#190392L = maxDoE#176188L) as int), cast((LoF_protect#190393L = maxDoE#176188L) as int), cast((LoF_risk#190394L = maxDoE#176188L) as int)), 0, lambdafunction((lambda x_8#227767 + lambda y_9#227768), lambda x_8#227767, lambda y_9#227768, false), lambdafunction(lambda id#227769, lambda id#227769, false)) AS num_max_matches#227766]
               +- Project [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, GoF_protect#190391L, GoF_risk#190392L, LoF_protect#190393L, LoF_risk#190394L, noEvaluable#190395L, CASE WHEN (GoF_protect#190391L = maxDoE#176188L) THEN GoF_protect WHEN (GoF_risk#190392L = maxDoE#176188L) THEN GoF_risk WHEN (LoF_protect#190393L = maxDoE#176188L) THEN LoF_protect WHEN (LoF_risk#190394L = maxDoE#176188L) THEN LoF_risk WHEN (noEvaluable#190395L = maxDoE#176188L) THEN noEvaluable ELSE cast(null as string) END AS maxHomogenizedLabel#190426]
                  +- Project [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, __pivot_count(1) AS count AS `count(1) AS count`#190390[0] AS GoF_protect#190391L, __pivot_count(1) AS count AS `count(1) AS count`#190390[1] AS GoF_risk#190392L, __pivot_count(1) AS count AS `count(1) AS count`#190390[2] AS LoF_protect#190393L, __pivot_count(1) AS count AS `count(1) AS count`#190390[3] AS LoF_risk#190394L, __pivot_count(1) AS count AS `count(1) AS count`#190390[4] AS noEvaluable#190395L]
                     +- Aggregate [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L], [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, pivotfirst(homogenized#4153, count(1) AS count#190378L, GoF_protect, GoF_risk, LoF_protect, LoF_risk, noEvaluable, 0, 0) AS __pivot_count(1) AS count AS `count(1) AS count`#190390]
                        +- Aggregate [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, homogenized#4153], [targetId#2033, diseaseId#2115, homogenized_count#176081L, maxDoE#176188L, homogenized#4153, count(1) AS count(1) AS count#190378L]
                           +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 82 more fields]
                              +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 83 more fields]
                                 +- Window [max(homogenized_count#176081L) windowspecdefinition(targetId#2033, diseaseId#2115, homogenized#4153, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxDoE#176188L], [targetId#2033, diseaseId#2115, homogenized#4153]
                                    +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 81 more fields]
                                       +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 81 more fields]
                                          +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 82 more fields]
                                             +- Window [count(homogenized#4153) windowspecdefinition(targetId#2033, diseaseId#2115, homogenized#4153, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS homogenized_count#176081L], [targetId#2033, diseaseId#2115, homogenized#4153]
                                                +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 80 more fields]
                                                   +- Filter datasourceId#2032 IN (gene_burden,intogen,eva,eva_somatic,impc,orphanet,gene2phenotype,gwas_credible_set,cancer_gene_census)
                                                      +- Union false, false
                                                         :- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 80 more fields]
                                                         :  +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                                                         :     +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                                                         :        +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                                                         :           +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 78 more fields]
                                                         :              +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 79 more fields]
                                                         :                 +- Window [collect_set(intogen_function#3736, 0, 0) windowspecdefinition(targetId#2033, diseaseId#2115, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#3842], [targetId#2033, diseaseId#2115]
                                                         :                    +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 77 more fields]
                                                         :                       +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 77 more fields]
                                                         :                          +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 76 more fields]
                                                         :                             +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 75 more fields]
                                                         :                                +- Join LeftOuter, ((drugId2#2820 = drugId#2066) AND (targetId2#2827 = targetId#2033))
                                                         :                                   :- Join LeftOuter, (target_id#2870 = targetId#2033)
                                                         :                                   :  :- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, concat_ws(,, clinicalSignificances#2049) AS clinicalSignificances#3058, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                         :                                   :  :  +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                         :                                   :  :     +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, cast(beta#2038 as double) AS beta#2876, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                         :                                   :  :        +- Project [datasourceId#2032, targetId#2033, alleleOrigins#2034, allelicRequirements#2035, ancestry#2036, ancestryId#2037, beta#2038, betaConfidenceIntervalLower#2039, betaConfidenceIntervalUpper#2040, biologicalModelAllelicComposition#2041, biologicalModelGeneticBackground#2042, biologicalModelId#2043, biomarkerName#2044, biomarkers#2045, biosamplesFromSource#2046, cellType#2047, clinicalPhase#2048, clinicalSignificances#2049, clinicalStatus#2050, cohortDescription#2051, cohortId#2052, cohortPhenotypes#2053, cohortShortName#2054, confidence#2055, ... 66 more fields]
                                                         :                                   :  :           +- Filter datasourceId#2032 IN (gwas_credible_set,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                         :                                   :  :              +- Relation [datasourceId#2032,targetId#2033,alleleOrigins#2034,allelicRequirements#2035,ancestry#2036,ancestryId#2037,beta#2038,betaConfidenceIntervalLower#2039,betaConfidenceIntervalUpper#2040,biologicalModelAllelicComposition#2041,biologicalModelGeneticBackground#2042,biologicalModelId#2043,biomarkerName#2044,biomarkers#2045,biosamplesFromSource#2046,cellType#2047,clinicalPhase#2048,clinicalSignificances#2049,clinicalStatus#2050,cohortDescription#2051,cohortId#2052,cohortPhenotypes#2053,cohortShortName#2054,confidence#2055,... 65 more fields] parquet
                                                         :                                   :  +- Project [id#2747 AS target_id#2870, approvedSymbol#2748, description#2855, description_splited#2859, TSorOncogene#2864]
                                                         :                                   :     +- Project [id#2747, approvedSymbol#2748, description#2855, description_splited#2859, CASE WHEN (RLIKE(description_splited#2859, ncogene) AND RLIKE(description_splited#2859, TSG)) THEN bivalent WHEN RLIKE(description_splited#2859, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#2859, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#2864]
                                                         :                                   :        +- Project [id#2747, approvedSymbol#2748, description#2855, concat_ws(,, description#2855) AS description_splited#2859]
                                                         :                                   :           +- Aggregate [id#2747, approvedSymbol#2748], [id#2747, approvedSymbol#2748, collect_set(description#2847, 0, 0) AS description#2855]
                                                         :                                   :              +- Filter description#2847 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                         :                                   :                 +- Project [id#2747, approvedSymbol#2748, col#2842.description AS description#2847]
                                                         :                                   :                    +- Project [id#2747, approvedSymbol#2748, col#2842]
                                                         :                                   :                       +- Generate explode(hallmarks#2757.attributes), true, [col#2842]
                                                         :                                   :                          +- Relation [id#2747,approvedSymbol#2748,biotype#2749,transcriptIds#2750,canonicalTranscript#2751,canonicalExons#2752,genomicLocation#2753,alternativeGenes#2754,approvedName#2755,go#2756,hallmarks#2757,synonyms#2758,symbolSynonyms#2759,nameSynonyms#2760,functionDescriptions#2761,subcellularLocations#2762,targetClass#2763,obsoleteSymbols#2764,obsoleteNames#2765,constraint#2766,tep#2767,proteinIds#2768,dbXrefs#2769,chemicalProbes#2770,... 5 more fields] parquet
                                                         :                                   +- Aggregate [targetId2#2827, drugId2#2820], [targetId2#2827, drugId2#2820, collect_set(actionType#2805, 0, 0) AS actionType#2837]
                                                         :                                      +- Project [targetId2#2827, drugId2#2820, actionType#2805, mechanismOfAction#2806]
                                                         :                                         +- Generate explode(targets#2810), true, [targetId2#2827]
                                                         :                                            +- Project [drugId2#2820, actionType#2805, mechanismOfAction#2806, targets#2810]
                                                         :                                               +- Generate explode(chemblIds#2807), true, [drugId2#2820]
                                                         :                                                  +- Relation [actionType#2805,mechanismOfAction#2806,chemblIds#2807,targetName#2808,targetType#2809,targets#2810,references#2811] parquet
                                                         +- Project [datasourceId#6381, targetId#1758, null AS alleleOrigins#6387, null AS allelicRequirements#6388, null AS ancestry#6389, null AS ancestryId#6390, null AS beta#6391, null AS betaConfidenceIntervalLower#6392, null AS betaConfidenceIntervalUpper#6393, null AS biologicalModelAllelicComposition#6394, null AS biologicalModelGeneticBackground#6395, null AS biologicalModelId#6396, null AS biomarkerName#6397, null AS biomarkers#6398, null AS biosamplesFromSource#6399, null AS cellType#6400, null AS clinicalPhase#6401, null AS clinicalSignificances#6402, null AS clinicalStatus#6403, null AS cohortDescription#6404, null AS cohortId#6405, null AS cohortPhenotypes#6406, null AS cohortShortName#6407, null AS confidence#6408, ... 80 more fields]
                                                            +- Project [targetId#1758, diseaseId#1911, leftStudyId#1078, homogenized#6188, gwas_credible_set AS datasourceId#6381]
                                                               +- Project [targetId#1758, diseaseId#1911, leftStudyId#1078, CASE WHEN isnull(homogenized#6142) THEN noEvaluable ELSE homogenized#6142 END AS homogenized#6188]
                                                                  +- Project [targetId#1758, diseaseId#1911, leftStudyId#1078, homogenized#6142]
                                                                     +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 17 more fields]
                                                                        +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 18 more fields]
                                                                           +- Window [first(colocDoE#1991, true) windowspecdefinition(targetId#1758, diseaseId#1911, leftStudyId#1078, pValueExponent#1808 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS homogenized#6142], [targetId#1758, diseaseId#1911, leftStudyId#1078], [pValueExponent#1808 ASC NULLS FIRST]
                                                                              +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                                                 +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                                                    +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 15 more fields]
                                                                                       +- Project [diseaseId#1911, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                                                          +- Generate explode(concat(array(diseaseId#800), parents#694)), true, [diseaseId#1911]
                                                                                             +- Project [diseaseId#800, leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, ... 16 more fields]
                                                                                                +- Join LeftOuter, (diseaseId#800 = diseaseId#1864)
                                                                                                   :- Project [leftStudyLocusId#949, targetId#1758, rightStudyId#1103, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002, ... 13 more fields]
                                                                                                   :  +- Join Inner, ((leftStudyLocusId#949 = leftStudyLocusId#1783) AND (targetId#1758 = targetId#718))
                                                                                                   :     :- Project [rightStudyId#1103, rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, geneId#974 AS targetId#1758, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002]
                                                                                                   :     :  +- Project [rightStudyId#1103, rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137, geneId#974, projectId#975, indexStudyType#1159, condition#998, biosampleId#1002]
                                                                                                   :     :     +- Join LeftOuter, (rightStudyId#1103 = rightStudyId#1158)
                                                                                                   :     :        :- Project [rightStudyLocusId#950, leftStudyLocusId#949, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080, rightStudyId#1103, rightVariantId#1104, credibleRightStudyType#1105, isTransQtl#1137]
                                                                                                   :     :        :  +- Join LeftOuter, (rightStudyLocusId#950 = rightStudyLocusId#1102)
                                                                                                   :     :        :     :- Project [leftStudyLocusId#949, rightStudyLocusId#950, chromosome#951, rightStudyType#952, numberColocalisingVariants#953L, h0#954, h1#955, h2#956, h3#957, h4#958, colocalisationMethod#959, betaRatioSignAverage#960, leftStudyId#1078, leftVariantId#1079, credibleLeftStudyType#1080]
                                                                                                   :     :        :     :  +- Join LeftOuter, (leftStudyLocusId#949 = leftStudyLocusId#1077)
                                                                                                   :     :        :     :     :- Relation [leftStudyLocusId#949,rightStudyLocusId#950,chromosome#951,rightStudyType#952,numberColocalisingVariants#953L,h0#954,h1#955,h2#956,h3#957,h4#958,colocalisationMethod#959,betaRatioSignAverage#960] parquet
                                                                                                   :     :        :     :     +- Project [studyLocusId#895 AS leftStudyLocusId#1077, StudyId#896 AS leftStudyId#1078, variantId#897 AS leftVariantId#1079, studyType#920 AS credibleLeftStudyType#1080]
                                                                                                   :     :        :     :        +- Relation [studyLocusId#895,studyId#896,variantId#897,chromosome#898,position#899,region#900,beta#901,zScore#902,pValueMantissa#903,pValueExponent#904,effectAlleleFrequencyFromSource#905,standardError#906,subStudyDescription#907,qualityControls#908,finemappingMethod#909,credibleSetIndex#910,credibleSetlog10BF#911,purityMeanR2#912,purityMinR2#913,locusStart#914,locusEnd#915,sampleSize#916,ldSet#917,locus#918,... 3 more fields] parquet
                                                                                                   :     :        :     +- Project [studyLocusId#1111 AS rightStudyLocusId#1102, studyId#1112 AS rightStudyId#1103, variantId#1113 AS rightVariantId#1104, studyType#1136 AS credibleRightStudyType#1105, isTransQtl#1137]
                                                                                                   :     :        :        +- Relation [studyLocusId#1111,studyId#1112,variantId#1113,chromosome#1114,position#1115,region#1116,beta#1117,zScore#1118,pValueMantissa#1119,pValueExponent#1120,effectAlleleFrequencyFromSource#1121,standardError#1122,subStudyDescription#1123,qualityControls#1124,finemappingMethod#1125,credibleSetIndex#1126,credibleSetlog10BF#1127,purityMeanR2#1128,purityMinR2#1129,locusStart#1130,locusEnd#1131,sampleSize#1132,ldSet#1133,locus#1134,... 3 more fields] parquet
                                                                                                   :     :        +- Project [studyId#973 AS rightStudyId#1158, geneId#974, projectId#975, studyType#976 AS indexStudyType#1159, condition#998, biosampleId#1002]
                                                                                                   :     :           +- Relation [studyId#973,geneId#974,projectId#975,studyType#976,traitFromSource#977,traitFromSourceMappedIds#978,biosampleFromSourceId#979,pubmedId#980,publicationTitle#981,publicationFirstAuthor#982,publicationDate#983,publicationJournal#984,backgroundTraitFromSourceMappedIds#985,initialSampleSize#986,nCases#987,nControls#988,nSamples#989,cohorts#990,ldPopulationStructure#991,discoverySamples#992,replicationSamples#993,qualityControls#994,analysisFlags#995,summarystatsLocation#996,... 6 more fields] parquet
                                                                                                   :     +- Project [studyLocusId#798 AS leftStudyLocusId#1783, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1800, variantId#1801, betaGwas#1737, pValueExponent#1808]
                                                                                                   :        +- Project [studyLocusId#798, datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, diseaseId#800, id#801, score#802, sourceId#805, studyId#1800, variantId#1801, betaGwas#1737, pValueExponent#1808]
                                                                                                   :           +- Join LeftOuter, (studyLocusId#798 = studyLocusId#1799)
                                                                                                   :              :- Project [datasourceId#717, targetId#718, datatypeId#743, diseaseFromSourceMappedId#747, resourceScore#769, targetFromSourceId#784, studyLocusId#798, diseaseId#800, id#801, score#802, sourceId#805]
                                                                                                   :              :  +- Filter (datasourceId#717 = gwas_credible_sets)
                                                                                                   :              :     +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,beta#723,betaConfidenceIntervalLower#724,betaConfidenceIntervalUpper#725,biologicalModelAllelicComposition#726,biologicalModelGeneticBackground#727,biologicalModelId#728,biomarkerName#729,biomarkers#730,biosamplesFromSource#731,cellType#732,clinicalPhase#733,clinicalSignificances#734,clinicalStatus#735,cohortDescription#736,cohortId#737,cohortPhenotypes#738,cohortShortName#739,confidence#740,... 65 more fields] parquet
                                                                                                   :              +- Project [studyLocusId#1799, studyId#1800, variantId#1801, beta#1805 AS betaGwas#1737, pValueExponent#1808]
                                                                                                   :                 +- Relation [studyLocusId#1799,studyId#1800,variantId#1801,chromosome#1802,position#1803,region#1804,beta#1805,zScore#1806,pValueMantissa#1807,pValueExponent#1808,effectAlleleFrequencyFromSource#1809,standardError#1810,subStudyDescription#1811,qualityControls#1812,finemappingMethod#1813,credibleSetIndex#1814,credibleSetlog10BF#1815,purityMeanR2#1816,purityMinR2#1817,locusStart#1818,locusEnd#1819,sampleSize#1820,ldSet#1821,locus#1822,... 3 more fields] parquet
                                                                                                   +- Project [id#689 AS diseaseId#1864, name#691, parents#694, therapeuticAreas#700]
                                                                                                      +- Relation [id#689,code#690,name#691,description#692,dbXRefs#693,parents#694,synonyms#695,obsoleteTerms#696,obsoleteXRefs#697,children#698,ancestors#699,therapeuticAreas#700,descendants#701,ontology#702] parquet


In [50]:
df_result.columns

['targetId',
 'diseaseId',
 'homogenized_count',
 'maxDoE',
 'GoF_protect',
 'GoF_risk',
 'LoF_protect',
 'LoF_risk',
 'noEvaluable',
 'maxHomogenizedLabel']

In [52]:
discrepancifier(df_with_discrepancy).filter(
    F.col("coherencyDiagonal") == "dispar"
).groupBy('hasDiscrepancy').count().show()



+--------------+-----+
|hasDiscrepancy|count|
+--------------+-----+
|      coherent| 6430|
|        dispar|   11|
+--------------+-----+



                                                                                

In [53]:
discrepancifier(df_with_discrepancy).filter(F.col('hasDiscrepancy')=='dispar').show()

                                                                                ]

+---------------+-------------+-----------------+------+-----------+--------+-----------+--------+-----------+-------------------+-----------+--------------+-----------------+----------------+
|       targetId|    diseaseId|homogenized_count|maxDoE|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|maxHomogenizedLabel|max_matches|hasDiscrepancy|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+-----------------+------+-----------+--------+-----------+--------+-----------+-------------------+-----------+--------------+-----------------+----------------+
|ENSG00000158987|  EFO_0004742|                1|     1|          1|       1|          1|       1|       NULL|        GoF_protect|          4|        dispar|           dispar|          dispar|
|ENSG00000135845|  EFO_0010700|               12|    12|         12|      12|         12|      12|       NULL|        GoF_protect|          4|        dispar|           dispar|          dispar|
|ENSG00000109180|  EFO_0004747|    

In [49]:
discrepancifier(df_with_discrepancy).filter(
    F.col("coherencyDiagonal") == "dispar"
).groupBy('max_matches','hasDiscrepancy').count().show()



+-----------+--------------+-----+
|max_matches|hasDiscrepancy|count|
+-----------+--------------+-----+
|       NULL|      coherent| 6531|
+-----------+--------------+-----+



                                                                                

In [18]:
(assessment_all.filter(F.col("datasourceId").isin(wCgc_list))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))).show()



+---------------+--------------+-----------+--------+-----------+--------+-----------+
|       targetId|     diseaseId|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|
+---------------+--------------+-----------+--------+-----------+--------+-----------+
|ENSG00000151617|   EFO_0000612|       NULL|       3|       NULL|    NULL|       NULL|
|ENSG00000100583|   EFO_0004586|       NULL|      30|       NULL|    NULL|       NULL|
|ENSG00000176909|   EFO_0003779|       NULL|      11|       NULL|    NULL|       NULL|
|ENSG00000000938| MONDO_0019156|       NULL|    NULL|       NULL|       1|       NULL|
|ENSG00000060138| MONDO_0033671|       NULL|    NULL|       NULL|       1|       NULL|
|ENSG00000077782| MONDO_0007227|       NULL|    NULL|       NULL|       4|       NULL|
|ENSG00000077935| MONDO_0054728|       NULL|    NULL|       NULL|       3|       NULL|
|ENSG00000148584| MONDO_0007152|       NULL|    NULL|       NULL|       1|       NULL|
|ENSG00000157087| MONDO_0012116|       NULL

                                                                                

In [3]:
import time
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd


spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

print("loaded files")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
    "gwas_credible_set",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")

window_spec = Window.partitionBy("targetId", "diseaseId",'leftStudyId').orderBy( ### include gwas study
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    resolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId",'leftStudyId', "homogenized")
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

assessment_all = assessment.unionByName(
    gwasCredibleAssoc.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)

print("defining non propagated,propagated and analysis_drugs functions")

def analysis_nonPropagated(assessment_all, analysisDatasources):
    return discrepancifier(
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )


def analysis_propagated(assessment_all, analysisDatasources):
    return discrepancifier(
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )

chembl_ds = ["chembl"]

def analysis_drugs(assessment_all, chembl_ds):
    return discrepancifier(
        assessment_all.filter((F.col("datasourceId").isin(chembl_ds))
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


analysis_chembl = analysis_drugs(assessment_all, chembl_ds)

#######
## include here the analysis
#######

analysisDatasources = []

print("defining full_analysis_propagation")

doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

def full_analysis_propagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]
    
    return (
        analysis_propagated(assessment_all, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        #### Should remove the coherencyDiagonal.isNotNull()
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        ).withColumn(
            "arrayN", F.array(*[F.col(c) for c in doe_columns])
        ).withColumn(
            "maxDoE", F.array_max(F.col("arrayN"))
        ).withColumn("maxDoE_names", F.array(*conditions)
        ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when(F.col("GoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        )
        # .persist()
    )


#####
## no propag
#####
print("defining full analysis no propagation")


def full_analysis_noPropagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]
    return (
        analysis_nonPropagated(assessment_all, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        ).withColumn(
            "arrayN", F.array(*[F.col(c) for c in doe_columns])
        ).withColumn(
            "maxDoE", F.array_max(F.col("arrayN"))
        ).withColumn("maxDoE_names", F.array(*conditions)
        ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        ).withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when(F.col("GoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        )
        # .persist()
    )

print("moving to Step 3")

from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
c = datetime.now()
print("starting dictionaries at", c)

#### continue here on 10.07.2024

## 1nd dictionary
dfs_dict = {}  ### checked and changed on 01.06.2023
dfs_dict_propag = {}


wocgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
]
wCgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
    "cancer_gene_census",
]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
    "WOcgc",
    "wCgc",
    "somatic",
    "germline",
]

germline_list = [
    "gene_burden",
    "eva",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic_list = ["intogen", "cancer_gene_census", "eva_somatic"]


# assessment = prueba_assessment.filter(F.col("datasourceId").isin(datasources_analysis))
def dataset_builder(assessment_all, value, analysis_chembl, negativeTD, diseaseTA):
    nonPropagated = full_analysis_noPropagation(
        doe_columns,assessment_all, value, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
    )
    propagated = full_analysis_propagation(
        doe_columns,assessment_all, value, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
    )
    return (
        # Non propagation
        ## All
        nonPropagated,
        ## Other
        nonPropagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        nonPropagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        nonPropagated.filter(F.col("taLabelSimple") == "Oncology"),
        # Propagation
        ## All
        propagated,
        ## Other
        propagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        propagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        propagated.filter(F.col("taLabelSimple") == "Oncology"),
    )


for value in datasource_list:
    print(value)
    if value == "WOcgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all, wocgc_list, analysis_chembl, negativeTD, diseaseTA
        )
    elif value == "wCgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all, wCgc_list, analysis_chembl, negativeTD, diseaseTA
        )
    elif value == "germline":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all,
            germline_list,
            analysis_chembl,
            negativeTD,
            diseaseTA,
        )

    elif value == "somatic":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all,
            somatic_list,
            analysis_chembl,
            negativeTD,
            diseaseTA,
        )

    else:
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"]
        ) = dataset_builder(
            assessment_all, value, analysis_chembl, negativeTD, diseaseTA
        )


def comparisons_df() -> list:
    """Return list of all comparisons to be used in the analysis"""
    comparisons = spark.createDataFrame(
        data=[
            ("hasGeneticEvidence", "byDatatype"),
            ("diagonalYes", "byDatatype"),
            ("oneCellYes", "byDatatype"),
            ("NdiagonalYes", "byDatatype"),
            ("NoneCellYes", "byDatatype"),
        ],
        schema=StructType(
            [
                StructField("comparison", StringType(), True),
                StructField("comparisonType", StringType(), True),
            ]
        ),
    )

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc


25/04/24 01:18:31 WARN CacheManager: Asked to cache already cached data.
25/04/24 01:18:32 WARN CacheManager: Asked to cache already cached data.


run temporary direction of effect
Moving to step 2
defining non propagated,propagated and analysis_drugs functions


25/04/24 01:18:33 WARN CacheManager: Asked to cache already cached data.
25/04/24 01:18:33 WARN CacheManager: Asked to cache already cached data.
25/04/24 01:18:33 WARN CacheManager: Asked to cache already cached data.


defining full_analysis_propagation
defining full analysis no propagation
moving to Step 3
starting dictionaries at 2025-04-24 01:18:33.412624
gene_burden


25/04/24 01:18:34 WARN CacheManager: Asked to cache already cached data.
25/04/24 01:18:34 WARN CacheManager: Asked to cache already cached data.


intogen
cancer_gene_census
eva
eva_somatic
gwas_credible_set


                                                                                

impc
orphanet
gene2phenotype
WOcgc


                                                                                

wCgc


                                                                                

somatic
germline


                                                                                

In [None]:
### see the error: 

In [1]:

from itertools import islice
import time
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd


spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.03/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

print("loaded files")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
    "gwas_credible_set",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")

window_spec = Window.partitionBy("targetId", "diseaseId",'leftStudyId').orderBy( ### include gwas study
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    resolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId",'leftStudyId', "homogenized")
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

negativeTD = (
    evidences.filter(F.col("datasourceId") == "chembl")
    .select("targetId", "diseaseId", "studyStopReason", "studyStopReasonCategories")
    .filter(F.array_contains(F.col("studyStopReasonCategories"), "Negative"))
    .groupBy("targetId", "diseaseId")
    .count()
    .withColumn("stopReason", F.lit("Negative"))
    .drop("count")
)

assessment_all = assessment.unionByName(
    gwasCredibleAssoc.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)

print("defining non propagated,propagated and analysis_drugs functions")

def analysis_nonPropagated(assessment_all, analysisDatasources):
    return discrepancifier(
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )


def analysis_propagated(assessment_all, analysisDatasources):
    return discrepancifier(
        assessment_all.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        # .persist()
    )

chembl_ds = ["chembl"]

def analysis_drugs(assessment_all, chembl_ds):
    return discrepancifier(
        assessment_all.filter((F.col("datasourceId").isin(chembl_ds))
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


analysis_chembl = analysis_drugs(assessment_all, chembl_ds)

#######
## include here the analysis
#######

analysisDatasources = []

print("defining full_analysis_propagation")

doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']

def full_analysis_propagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]
    
    return (
        analysis_propagated(assessment_all, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        #### Should remove the coherencyDiagonal.isNotNull()
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        ).withColumn(
            "arrayN", F.array(*[F.col(c) for c in doe_columns])
        ).withColumn(
            "maxDoE", F.array_max(F.col("arrayN"))
        ).withColumn("maxDoE_names", F.array(*conditions)
        ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        ).withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when(F.col("GoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        )
        # .persist()
    )


#####
## no propag
#####
print("defining full analysis no propagation")


def full_analysis_noPropagation(
    doe_columns,assessment_all, analysisDatasources, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
):
    conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]
    return (
        analysis_nonPropagated(assessment_all, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        ).withColumn(
            "arrayN", F.array(*[F.col(c) for c in doe_columns])
        ).withColumn(
            "maxDoE", F.array_max(F.col("arrayN"))
        ).withColumn("maxDoE_names", F.array(*conditions)
        ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(negativeTD, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("stopReason") == "Negative", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        ).withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when(F.col("GoF_protect_ch").isNotNull() & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when(F.col("LoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when(F.col("GoF_protect_ch").isNotNull() & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        )
        # .persist()
    )

print("moving to Step 3")

from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
c = datetime.now()
print("starting dictionaries at", c)

#### continue here on 10.07.2024

## 1nd dictionary
dfs_dict = {}  ### checked and changed on 01.06.2023
dfs_dict_propag = {}


wocgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
]
wCgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
    "cancer_gene_census",
]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
    "WOcgc",
    "wCgc",
    "somatic",
    "germline",
]

germline_list = [
    "gene_burden",
    "eva",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic_list = ["intogen", "cancer_gene_census", "eva_somatic"]


# assessment = prueba_assessment.filter(F.col("datasourceId").isin(datasources_analysis))
def dataset_builder(assessment_all, value, analysis_chembl, negativeTD, diseaseTA):
    nonPropagated = full_analysis_noPropagation(
        doe_columns,assessment_all, value, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
    )
    propagated = full_analysis_propagation(
        doe_columns,assessment_all, value, analysis_chembl, negativeTD, diseaseTA,diagonal_lof,diagonal_gof
    )
    return (
        # Non propagation
        ## All
        nonPropagated,
        ## Other
        nonPropagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        nonPropagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        nonPropagated.filter(F.col("taLabelSimple") == "Oncology"),
        # Propagation
        ## All
        propagated,
        ## Other
        propagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        propagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        propagated.filter(F.col("taLabelSimple") == "Oncology"),
    )


for value in datasource_list:
    print(value)
    if value == "WOcgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all, wocgc_list, analysis_chembl, negativeTD, diseaseTA
        )
    elif value == "wCgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all, wCgc_list, analysis_chembl, negativeTD, diseaseTA
        )
    elif value == "germline":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all,
            germline_list,
            analysis_chembl,
            negativeTD,
            diseaseTA,
        )

    elif value == "somatic":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment_all,
            somatic_list,
            analysis_chembl,
            negativeTD,
            diseaseTA,
        )

    else:
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"]
        ) = dataset_builder(
            assessment_all, value, analysis_chembl, negativeTD, diseaseTA
        )


def comparisons_df() -> list:
    """Return list of all comparisons to be used in the analysis"""
    comparisons = spark.createDataFrame(
        data=[
            ("hasGeneticEvidence", "byDatatype"),
            ("diagonalYes", "byDatatype"),
            ("oneCellYes", "byDatatype"),
            ("NdiagonalYes", "byDatatype"),
            ("NoneCellYes", "byDatatype"),
        ],
        schema=StructType(
            [
                StructField("comparison", StringType(), True),
                StructField("comparisonType", StringType(), True),
            ]
        ),
    )

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()



result = []
result_st = []
result_ci = []
array2 = []
results = []


def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):

    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)

    uniqIds = df.select("targetId", "diseaseId").distinct().count()

    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    filePath = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)

    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    results.append(
        [
            data,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            filePath,
        ]
    )
    return results




spark session created at 2025-04-24 06:17:17.038629
Analysis started on 2025-04-24 at  2025-04-24 06:17:17.038629


25/04/24 06:17:21 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc
run temporary direction of effect
Moving to step 2
defining non propagated,propagated and analysis_drugs functions


                                                                                

defining full_analysis_propagation
defining full analysis no propagation
moving to Step 3
starting dictionaries at 2025-04-24 06:18:22.479136
gene_burden
intogen
cancer_gene_census
eva
eva_somatic
gwas_credible_set


                                                                                

impc
orphanet
gene2phenotype
WOcgc


                                                                                

wCgc


                                                                                

somatic
germline


                                                                                

In [2]:
c = datetime.now()
print("start doing aggregations and writing")
today_date = str(date.today())
aggSetups_original = comparisons_df()
listado = []
results = []

print("starting with non-propagated aggregations at", c)
for key, df in islice(dfs_dict.items(),2):
    df = df.persist()
    for row in aggSetups_original[:1]:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

print("non propagated files wroten succesfully at", c)


print("starting with propagated aggregations at", c)
for key, df in islice(dfs_dict_propag.items(),2):
    df = df.persist()
    for row in aggSetups_original[:1]:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

print("propagated files wroten succesfully at", c)

print("creating pandas dataframe with resulting rows")
df_results = pd.DataFrame(
    results,
    columns=[
        "group",
        "comparison",
        "phase",
        "OR",
        "pValue",
        "LowCI",
        "HighCI",
        "total",
        "array",
        "rs",
        "lowRs",
        "HighRs",
        "path",
    ],
)
print("created pandas dataframe")
print("converting to spark dataframe")
print("preparing dataframe")

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

print("read pattern variables")
df = spreadSheetFormatter(spark.createDataFrame(df_results, schema=schema))
print("processed spreadsheet")
print("writting the dataframe")

# Convert list of lists to DataFrame
# Regular expressions
'''
value_pattern = r"df_([^_]+)_"  # Extracts {value}
middle_pattern = r"df_[^_]+_([^_]+)_"  # Extracts middle part (All, Other, etc.)
suffix_pattern = r"(original|propag)$"  # Extracts suffix (original or propag)
'''

df.withColumn(
    "datasource",
    F.regexp_extract(F.col("group"), r"df_(.*?)_(All|Other|OtherNull|Oncology)_(propag|original)", 1)
).withColumn(
    "therArea",
    F.regexp_extract(F.col("group"), r"_(All|Other|OtherNull|Oncology)_", 1)
).withColumn(
    "type",
    F.regexp_extract(F.col("group"), r"_(propag|original)$", 1)
).toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_genEvidAnalysis_new.csv"
)

print("dataframe written \n Analysis finished")

start doing aggregations and writing


                                                                                

starting with non-propagated aggregations at 2025-04-24 06:21:33.611622


                                                                                

2025-04-24_analysis/df_gene_burden_All_original/hasGeneticEvidence_Phase4.parquet
2025-04-24 06:21:42.256155
df_gene_burden_All_original df unpersisted


                                                                                

2025-04-24_analysis/df_gene_burden_Other_original/hasGeneticEvidence_Phase4.parquet
2025-04-24 06:21:53.047763
df_gene_burden_Other_original df unpersisted
non propagated files wroten succesfully at 2025-04-24 06:21:33.611622
starting with propagated aggregations at 2025-04-24 06:21:33.611622


                                                                                

2025-04-24_analysis/df_gene_burden_All_propag/hasGeneticEvidence_Phase4.parquet
2025-04-24 06:22:04.494690
df_gene_burden_All_propag df unpersisted


                                                                                

2025-04-24_analysis/df_gene_burden_Other_propag/hasGeneticEvidence_Phase4.parquet
2025-04-24 06:22:14.903707
df_gene_burden_Other_propag df unpersisted
propagated files wroten succesfully at 2025-04-24 06:21:33.611622
creating pandas dataframe with resulting rows
created pandas dataframe
converting to spark dataframe
preparing dataframe
read pattern variables
importing functions
imported functions
processed spreadsheet
writting the dataframe
dataframe written 
 Analysis finished


In [None]:

from itertools import islice
import time
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd


spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
) 

from google.cloud import storage

bucket_name = "ot-team"
prefix = "jroldan/2025-04-24_analysis/"

client = storage.Client()
bucket = client.get_bucket(bucket_name)
blobs = bucket.list_blobs(prefix=prefix)

# List all .parquet files with full GCS paths
parquet_files = [
    f"gs://{bucket_name}/{blob.name}"
    for blob in blobs
    if blob.name.endswith(".parquet")
]

print(f"Found {len(parquet_files)} parquet files")

spark session created at 2025-04-24 09:20:11.034967
Analysis started on 2025-04-24 at  2025-04-24 09:20:11.034967


25/04/24 09:20:16 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [4]:
from google.cloud import storage

bucket_name = "ot-team"
prefix = "jroldan/2025-04-24_analysis/"

client = storage.Client()
bucket = client.get_bucket(bucket_name)
blobs = bucket.list_blobs(prefix=prefix)

# List all .parquet files with full GCS paths
parquet_files = [
    f"gs://{bucket_name}/{blob.name}"
    for blob in blobs
    if blob.name.endswith(".parquet")
]

print(f"Found {len(parquet_files)} parquet files")


Found 1175 parquet files


In [9]:
from itertools import islice
import time
from functions import relative_success, spreadSheetFormatter, convertTuple
#from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
#from itertools import islice
from datetime import datetime
from datetime import date
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    StringType,
    IntegerType,
    ArrayType
)
import pandas as pd

from pyspark.sql import SparkSession
import numpy as np
spark = SparkSession.builder.getOrCreate()

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
results=[]
result = []
result_st = []
result_ci = []
array2 = []
results = []
for path in parquet_files:
    print(f"Reading {path}")
    df = spark.read.parquet(path)
    array1 = np.delete(
        df.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    import re

    # 1. Extract the element between 'analysis/' and the next '/'
    match1 = re.search(r'analysis/([^/]+)/', path)
    part1 = match1.group(1) if match1 else None

    # 2. Extract the element after the second '/' until '_Phase'
    match2 = re.search(r'[^/]+/([^/_]+)_Phase', path)
    part2 = match2.group(1) if match2 else None

    # 3. Extract the element from 'Phase' until '.parquet'
    match3 = re.search(r'(Phase[^.]+)\.parquet', path)
    part3 = match3.group(1) if match3 else None

    print("Part 1:", part1)
    print("Part 2:", part2)
    print("Part 3:", part3)

    results.append(
        [
            part1, #data,
            part2, #comparisonColumn,
            part3, #predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            path,
        ]
    )
    print('finished')

Reading gs://ot-team/jroldan/2025-04-24_analysis/df_WOcgc_All_original/NdiagonalYes_Phase4.parquet/part-00000-b3d4c9e7-1032-4b19-ab35-d2d5122cbc7f-c000.snappy.parquet
Part 1: df_WOcgc_All_original
Part 2: NdiagonalYes
Part 3: Phase4
finished
Reading gs://ot-team/jroldan/2025-04-24_analysis/df_WOcgc_All_original/NdiagonalYes_Phase>=1.parquet/part-00000-2a4fb3d9-1b5a-4f12-8c1f-5ff8f1c5b0d6-c000.snappy.parquet
Part 1: df_WOcgc_All_original
Part 2: NdiagonalYes
Part 3: Phase>=1
finished
Reading gs://ot-team/jroldan/2025-04-24_analysis/df_WOcgc_All_original/NdiagonalYes_Phase>=2.parquet/part-00000-3072ccae-482c-41f2-830e-5bb3e55c8137-c000.snappy.parquet
Part 1: df_WOcgc_All_original
Part 2: NdiagonalYes
Part 3: Phase>=2
finished
Reading gs://ot-team/jroldan/2025-04-24_analysis/df_WOcgc_All_original/NdiagonalYes_Phase>=3.parquet/part-00000-71d6a4c7-d2e7-4450-99a4-43ec3b22efba-c000.snappy.parquet
Part 1: df_WOcgc_All_original
Part 2: NdiagonalYes
Part 3: Phase>=3
finished
Reading gs://ot-team

In [11]:
today_date = str(date.today())

df_results = pd.DataFrame(
    results,
    columns=[
        "group",
        "comparison",
        "phase",
        "OR",
        "pValue",
        "LowCI",
        "HighCI",
        "total",
        "array",
        "rs",
        "lowRs",
        "HighRs",
        "path",
    ],
)
print("created pandas dataframe")
print("converting to spark dataframe")
print("preparing dataframe")

schema = StructType(
    [
        StructField("group", StringType(), True),
        StructField("comparison", StringType(), True),
        StructField("phase", StringType(), True),
        StructField("oddsRatio", DoubleType(), True),
        StructField("pValue", DoubleType(), True),
        StructField("lowerInterval", DoubleType(), True),
        StructField("upperInterval", DoubleType(), True),
        StructField("total", StringType(), True),
        StructField("values", ArrayType(ArrayType(IntegerType())), True),
        StructField("relSuccess", DoubleType(), True),
        StructField("rsLower", DoubleType(), True),
        StructField("rsUpper", DoubleType(), True),
        StructField("path", StringType(), True),
    ]
)

print("read pattern variables")
df = spreadSheetFormatter(spark.createDataFrame(df_results, schema=schema))
print("processed spreadsheet")
print("writting the dataframe")

# Convert list of lists to DataFrame
# Regular expressions
'''
value_pattern = r"df_([^_]+)_"  # Extracts {value}
middle_pattern = r"df_[^_]+_([^_]+)_"  # Extracts middle part (All, Other, etc.)
suffix_pattern = r"(original|propag)$"  # Extracts suffix (original or propag)
'''

df.withColumn(
    "datasource",
    F.regexp_extract(F.col("group"), r"df_(.*?)_(All|Other|OtherNull|Oncology)_(propag|original)", 1)
).withColumn(
    "therArea",
    F.regexp_extract(F.col("group"), r"_(All|Other|OtherNull|Oncology)_", 1)
).withColumn(
    "type",
    F.regexp_extract(F.col("group"), r"_(propag|original)$", 1)
).toPandas().to_csv(
    f"gs://ot-team/jroldan/analysis/{today_date}_genEvidAnalysis_new_resolvedDiscr.csv"
)

print("dataframe written \n Analysis finished")

created pandas dataframe
converting to spark dataframe
preparing dataframe
read pattern variables
importing functions
imported functions
processed spreadsheet
writting the dataframe


dataframe written 
 Analysis finished


In [32]:
################# INCLUDE DIAGONAL 

import time
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
# from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd
from functools import reduce

spark = SparkSession.builder.getOrCreate()
spark.conf.set(
    "spark.sql.shuffle.partitions", "400"
)  # Default is 200, increase if needed


path_n='gs://open-targets-data-releases/25.06/output/'

target = spark.read.parquet(f"{path_n}target/")

diseases = spark.read.parquet(f"{path_n}disease/")

evidences = spark.read.parquet(f"{path_n}evidence")

credible = spark.read.parquet(f"{path_n}credible_set")

new = spark.read.parquet(f"{path_n}colocalisation_coloc") 

index=spark.read.parquet(f"{path_n}study/")

variantIndex = spark.read.parquet(f"{path_n}variant")

biosample = spark.read.parquet(f"{path_n}biosample")

ecaviar=spark.read.parquet(f"{path_n}colocalisation_ecaviar")

all_coloc=ecaviar.unionByName(new, allowMissingColumns=True)

print("loaded files")

newColoc = (
    all_coloc.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
            "pValueExponent as qtlPValueExponent",
            'isTransQtl'
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
)
    # .persist()
)

print("loaded newColoc")

# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns)  # .persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)  # .persist()

print("loaded gwasComplete")

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    ).withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    # .persist()
)
print("loaded resolvedColloc")

datasource_filter = [
#   "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path_n, datasource_filter
)

print("run temporary direction of effect")


print("built drugApproved dataset")

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", 
        "coherencyOneCell", 
        "noEvaluable", 
        "GoF_risk", 
        "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    # .persist()
)

chemblAssoc = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .count()
    )
    #.filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

loaded files
loaded newColoc


                                                                                

loaded gwasComplete
loaded resolvedColloc


25/07/10 21:42:12 WARN CacheManager: Asked to cache already cached data.
25/07/10 21:42:13 WARN CacheManager: Asked to cache already cached data.


run temporary direction of effect
built drugApproved dataset


25/07/10 21:42:14 WARN CacheManager: Asked to cache already cached data.
25/07/10 21:42:14 WARN CacheManager: Asked to cache already cached data.
25/07/10 21:42:15 WARN CacheManager: Asked to cache already cached data.
25/07/10 21:42:15 WARN CacheManager: Asked to cache already cached data.
25/07/10 21:42:15 WARN CacheManager: Asked to cache already cached data.


In [33]:

resolvedColocFiltered = resolvedColoc.filter((F.col('clpp')>=0.01) | (F.col('h4')>=0.8))
benchmark = (
    (
        resolvedColocFiltered
        #.filter(F.col("betaGwas") < 0)
        .filter(
        F.col("name") != "COVID-19"
    )
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="right"  ### RIGHT SIDE
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        ).withColumn(
            "AgreeDrugDiagonal",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & ((F.col("colocDoE") == "GoF_protect") | (F.col("colocDoE") == "LoF_risk")),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & ((F.col("colocDoE") == "LoF_protect") | (F.col("colocDoE") == "GoF_risk")),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left").persist()

In [34]:
benchmark.show()



+-----------+---------------+-------------+----------------+------------+-----------------+----------+--------------+--------------------------+----+--------------------+--------------------+----+----+----+----+----+-----------+-------------+---------------------+--------------+----------------------+-----------------+----------+---------+--------------+---------+------------+----------+-------------------------+-------------+------------------+----+-----+--------+-------+---------+--------+--------------+----+----------------+--------+------------+---------------+---------------+---------+-----------------+-------------+
|biosampleId|       targetId|    diseaseId|leftStudyLocusId|rightStudyId|rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|clpp|colocalisationMethod|betaRatioSignAverage|  h0|  h1|  h2|  h3|  h4|leftStudyId|leftVariantId|credibleLeftStudyType|rightVariantId|credibleRightStudyType|qtlPValueExponent|isTransQtl|projectId|indexStudyType|condition|data

                                                                                

In [12]:
benchmark.count()

114325

In [16]:
benchmark.groupBy('targetId','diseaseId','drugGoF_protect','drugLoF_protect').count().count()

                                                                                

74187

In [20]:
discrepancifier(benchmark.groupBy('targetId','diseaseId','drugGoF_protect','drugLoF_protect').pivot('colocDoE').count()).groupBy('coherencyDiagonal','coherencyOneCell').count().show()

25/07/10 21:23:50 WARN CacheManager: Asked to cache already cached data.


+-----------------+----------------+-----+
|coherencyDiagonal|coherencyOneCell|count|
+-----------------+----------------+-----+
|           noEvid|          noEvid|73699|
|         coherent|        coherent|  240|
|           dispar|          dispar|  225|
|         coherent|          dispar|   23|
+-----------------+----------------+-----+



In [25]:
doe_columns=["LoF_protect", "GoF_risk", "LoF_risk", "GoF_protect"]
diagonal_lof=['LoF_protect','GoF_risk']
diagonal_gof=['LoF_risk','GoF_protect']
conditions = [
    F.when(F.col(c) == F.col("maxDoE"), F.lit(c)).otherwise(F.lit(None)) for c in doe_columns
    ]

In [None]:
test=discrepancifier(benchmark.groupBy('targetId','diseaseId','drugGoF_protect','drugLoF_protect').pivot('colocDoE').count()
        ).withColumnRenamed('drugGoF_protect','GoF_protect_ch').withColumnRenamed('drugLoF_protect','LoF_protect_ch').withColumn(
            "arrayN", F.array(*[F.col(c) for c in doe_columns])
        ).withColumn(
            "maxDoE", F.array_max(F.col("arrayN"))
        ).withColumn("maxDoE_names", F.array(*conditions)
        ).withColumn("maxDoE_names", F.expr("filter(maxDoE_names, x -> x is not null)")
        ).withColumn(
            "maxDoEArrayN",
            F.expr("aggregate(arrayN, 0, (acc, x) -> acc + IF(x = maxDoE, 1, 0))")
        ).withColumn(
            "NoneCellYes",
            F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("LoF_protect")))==True, F.lit('yes'))
            .when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & (F.array_contains(F.col("maxDoE_names"), F.lit("GoF_protect")))==True, F.lit('yes')
                ).otherwise(F.lit('no'))  # If the value is null, return null # Otherwise, check if name is in array
        ).withColumn(
            "NdiagonalYes",
            F.when((F.col("LoF_protect_ch").isNotNull() & (F.col('GoF_protect_ch').isNull())) & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_lof]))) > 0),
                F.lit("yes")
            ).when((F.col("GoF_protect_ch").isNotNull() & (F.col('LoF_protect_ch').isNull())) & 
                (F.size(F.array_intersect(F.col("maxDoE_names"), F.array([F.lit(x) for x in diagonal_gof]))) > 0),
                F.lit("yes")
            ).otherwise(F.lit('no'))
        ).withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )

25/07/10 21:30:25 WARN CacheManager: Asked to cache already cached data.


In [30]:
test.groupBy('coherencyDiagonal','NdiagonalYes').count().show()



+-----------------+------------+-----+
|coherencyDiagonal|NdiagonalYes|count|
+-----------------+------------+-----+
|           noEvid|          no|73699|
|         coherent|         yes|  146|
|           dispar|         yes|  125|
|           dispar|          no|  100|
|         coherent|          no|  117|
+-----------------+------------+-----+



                                                                                

In [31]:
test.filter((F.col('coherencyDiagonal')=='coherent') & (F.col('NdiagonalYes')=='no')).show()

+---------------+-------------+--------------+--------------+----+-----------+--------+-----------+--------+-----------+-----------------+----------------+--------------------+------+-------------+------------+-----------+------------+
|       targetId|    diseaseId|GoF_protect_ch|LoF_protect_ch|null|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|              arrayN|maxDoE| maxDoE_names|maxDoEArrayN|NoneCellYes|NdiagonalYes|
+---------------+-------------+--------------+--------------+----+-----------+--------+-----------+--------+-----------+-----------------+----------------+--------------------+------+-------------+------------+-----------+------------+
|ENSG00000169083|  EFO_0001663|             4|            44|NULL|       NULL|       2|       NULL|    NULL|       NULL|         coherent|        coherent|[NULL, 2, NULL, N...|     2|   [GoF_risk]|           1|         no|          no|
|ENSG00000079999|MONDO_0005301|          NULL|          

In [8]:
benchmark.groupBy('AgreeDrugDiagonal','AgreeDrug').count().show()



+-----------------+---------+-----+
|AgreeDrugDiagonal|AgreeDrug|count|
+-----------------+---------+-----+
|               no|       no|96389|
|              yes|      yes| 9563|
|              yes|       no| 8373|
+-----------------+---------+-----+



                                                                                