In [1]:
#####
#######
## ANALYSIS FOR L2G Scores, genetic evidence and Direction of Effect
## Original, propagated and Other Vs Oncology
#######
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
print("spark session created at", c)


#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

### Now , filter by rank, and join with the info from Ot genetics and run the DoE.
ranking = Window.partitionBy("studyId", "variantId")
### union with the other datasources
platform_v = "24.06"

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
diseases = spark.read.parquet(disease_path)
mecact_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/mechanismOfAction/"
mecact = spark.read.parquet(mecact_path)
evidences = spark.read.parquet(
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
).filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)
# 1# Make a list of variant of interest (Sequence ontology terms) to subset data of interest.
### Bear in mind that SO works with ontology structure as: SO:XXXXXX, but databases has the SO as: SO_XXXXXX
var_filter_lof = [
    ### High impact variants https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
    "SO_0001589",  ## frameshit_variant
    "SO_0001587",  ## stop_gained
    "SO_0001574",  ## splice_acceptor_variant
    "SO_0001575",  ## splice_donor_variant
    "SO_0002012",  ## start_lost
    "SO_0001578",  ## stop_lost
    "SO_0001893",  ## transcript_ablation
    # "SO:0001889", ## transcript_amplification ## the Only HIGH impact that increase protein.
]

gof = ["SO_0002053"]
lof = ["SO_0002054"]

print("loading sources")

## Building Sequence Ontology
so_path = "gs://ot-team/jroldan/sequenceOntology_20221118.csv"
so_ontology = spark.read.csv(so_path, header=True)
building = (
    so_ontology.select(F.col("Accession"), F.col("Parents"))
    .withColumn("Parentalind", F.split(F.col("Parents"), ","))
    .withColumn("Parentalind", F.explode_outer("Parentalind"))
    .groupBy("Parentalind")
    .agg(F.collect_list(F.col("Accession")).alias("childrens"))
    .join(so_ontology, F.col("Parentalind") == so_ontology.Accession, "right")
)
## annotate TSG/oncogene/bivalent using 'hallmarks.attributes'
oncotsg_list = [
    "TSG",
    "oncogene",
    "Oncogene",
    "oncogene",
    "oncogene,TSG",
    "TSG,oncogene",
    "fusion,oncogene",
    "oncogene,fusion",
]

#### rlike('('+Keywords+')(\s|$)'
### on 03.07.2023 we add the categories:
# DISRUPTING AGENT - inhibitor
# STABILISER - activator

### Hacer el join del actionType con el chembl para sacar los mecanismos de accion.
inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",  ## added new on 03.07.2023
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",
    "STABILISER",  ## added new on 03.07.2023
]

columnas = ["activator", "inhibitor"]
both = activators + inhibitors

actionType = (
    mecact.select(
        F.explode_outer("chemblIds").alias("drugId2"),
        "actionType",
        "mechanismOfAction",
        "targets",
    )
    .select(
        F.explode_outer("targets").alias("targetId2"),
        "drugId2",
        "actionType",
        "mechanismOfAction",
    )
    .groupBy("targetId2", "drugId2")
    .agg(
        F.collect_set("actionType").alias("actionType"),
    )
)

oncolabel = (
    target.select(
        "id", "approvedSymbol", F.explode_outer(F.col("hallmarks.attributes"))
    )
    .select("id", "approvedSymbol", "col.description")
    .filter(F.col("description").isin(oncotsg_list))
    .groupBy("id", "approvedSymbol")
    .agg(F.collect_set("description").alias("description"))
    .withColumn("description_splited", F.concat_ws(",", F.col("description")))
    .withColumn(
        "TSorOncogene",
        F.when(
            (
                F.col("description_splited").rlike("ncogene")
                & F.col("description_splited").rlike("TSG")
            ),
            F.lit("bivalent"),
        )
        .when(F.col("description_splited").rlike("ncogene(\s|$)"), F.lit("oncogene"))
        .when(F.col("description_splited").rlike("TSG(\s|$)"), F.lit("TSG"))
        .otherwise(F.lit("noEvaluable")),  ####
    )
    .withColumnRenamed("id", "target_id")
)

# 2# run the transformation of the evidences datasets used.

windowSpec = Window.partitionBy("targetId", "diseaseId")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

sincgc = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

germline = [
    "gene_burden",
    "eva",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic = ["intogen", "cancer_gene_census", "eva_somatic"]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
    "chembl",
    "WOcgc",
    "somatic",
    "germline",
]
#### version all gene burden
prueba_assessment = (
    directionOfEffect(evidences, platform_v)
    .withColumn(
        "rank",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "average",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .persist()
)

genEvidDataset = (
    prueba_assessment.filter(F.col("datasourceId") != "chembl")  #### checked 31.05.2023
    .groupBy("targetId", "diseaseId")
    .agg(F.count("targetId").alias("Nr_evidences"))
    .select("targetId", "diseaseId", "Nr_evidences")
    .withColumn("geneticEvidence", F.lit("hasGeneticEvidence"))
)

coherency_toAssess_others_datasource = (  #### checked 31.05.2023
    prueba_assessment.filter(
        (F.col("homogenized").isin(columns)) & (F.col("datasourceId") != "chembl")
    )
    .groupBy("targetId", "diseaseId")
    .agg(F.collect_set("datasourceId").alias("datasourceIds"))
)

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

v2g = spark.read.parquet("gs://genetics-portal-dev-data/22.09.1/outputs/v2g")
varDistToGene = v2g.select(
    F.concat_ws("_", "chr_id", "position", "ref_allele", "alt_allele").alias(
        "variantId"
    ),
    F.col("gene_id").alias("targetId"),
    "source_id",
    "d",
    "distance_score",
).filter(F.col("source_id") == "canonical_tss")

ranking = Window.partitionBy("studyId", "variantId")


#######
# Build Ot genetics dataset as supporting evidence
#######
otGenetics = (
    prueba_assessment.filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
            ]
        )
    )
    # .filter((F.col("homogenized") != "noEvaluable"))
    .join(varDistToGene, on=["variantId", "targetId"], how="left")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "L2G_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageL2G",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageCanonicalTSSDistance",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("d").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "datasources",
        F.when(
            F.col("rank").isNull(),
            F.array_remove(F.col("datasources"), "ot_genetics_portal"),
        ).otherwise(F.col("datasources")),
    )
    .withColumn(
        "distance_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("d").asc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "ChemblL2gRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("L2G_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "chemblDistanceRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("distance_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "frontierValue",
        ## ot genetics portal
        F.when(
            F.col("datasourceId") == "ot_genetics_portal",  ### the same for gene_burden
            F.when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNull()),
                F.when(
                    (F.col("beta") <= 0.1) & (F.col("beta") >= -0.1),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNotNull()),
                F.when(
                    (F.col("OddsRatio") <= 1.1) & (F.col("OddsRatio") >= 0.9),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNull()),
                F.lit("noValue"),
            ),
        ),
    )
).persist()

#####
# function for interpreting DoE and coherencies/discrepancies
#####

diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

analysis_chembl = discrepancifier(
    prueba_assessment.filter((F.col("datasourceId") == "chembl"))
    .withColumn(
        "maxClinPhase",
        F.max(F.col("clinicalPhase")).over(Window.partitionBy("targetId", "diseaseId")),
    )
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("homogenized")
    .agg(F.count("targetId"))
    .persist()
)

#### propag OtGenetics:
otGenetics_propag = (
    otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
    .join(
        diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
        on="diseaseId",
        how="left",
    )
    .withColumnRenamed("diseaseId", "oldDiseaseId")
    .withColumnRenamed("diseaseIdPropagated", "diseaseId")
).persist()


#### include dictionary for calling dataframes:
# max_L2GScore
# min_distance_ranking


def benchmarkOT(discrepancifier, otGenetics, metric):
    dict_comb = {}
    dict_comb = {
        "hasDirectionOfEffect": f"{metric}",
        "diagonalYes": f"{metric}",
        "oneCellYes": f"{metric}",
    }
    list_l2g = [
        0.1,
        0.15,
        0.2,
        0.25,
        0.3,
        0.35,
        0.4,
        0.45,
        0.5,
        0.55,
        0.6,
        0.65,
        0.7,
        0.75,
        0.8,
        0.85,
        0.9,
        0.95,
    ]
    list_dist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    return (
        discrepancifier(
            otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
            .withColumn(
                "min_distance_ranking",
                F.min("distance_ranking").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(  ### take maximum L2G score per T-D
                "max_L2GScore",
                F.max("resourceScore").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .groupBy(
                "targetId",
                "diseaseId",
                f"{value}",
            )  ##### modifications here to include the groups of ranking/distances to TSS
            .pivot("homogenized")
            .agg(F.count("targetId"))
        )
        .selectExpr(
            "targetId",
            "diseaseId",
            f"{metric}",
            "coherencyDiagonal as coherencyDiagonal",
            "coherencyOneCell as coherencyOneCell",
            "LoF_protect",
            "GoF_protect",
            "LoF_risk",
            "GoF_risk",
        )
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col(f"{metric}").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(  #### new column to modify it
            "hasdirectionOfEffect",
            F.when(F.col("coherencyDiagonal").isNotNull(), F.lit("yes")).otherwise(
                F.lit("no")
            ),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.col(f"{metric}") >= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}>={str(n).replace('.', '_')}")
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"  # Adjust this condition as needed
                else [
                    F.when(F.col(f"{metric}") <= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}<={n}")
                    for n in list_dist
                ]
            )
            + (
                [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                    F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"
                else [
                    F.when((F.col(a) == "yes") & (F.col(x) <= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}<={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_dist
                ]
            )
        )
        .persist()
    )


metric_list = ["max_L2GScore", "min_distance_ranking"]
datasetDict = {}
for value in metric_list:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_l2g_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )
    elif value == "min_distance_ranking":
        datasetDict[f"df_distance_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_distance_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )


def comparisons_df(test_propag) -> list:
    """Return list of all comparisons to be used in the analysis"""
    toAnalysis = test_propag.drop(
        "Phase4",
        "Phase>=3",
        "Phase>=2",
        "Phase>=1",
        "Phase0",
        "clinicalStatus",
        "prediction",
        "count",
        "PhaseT",
        "taLabelSimple",
    ).columns[17:]
    dataType = ["byDatatype"] * len(toAnalysis)
    l_studies = []
    l_studies.extend([list(a) for a in zip(toAnalysis, dataType)])

    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(l_studies, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)


def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(F.col(comparisonColumn))
    wPrediction = Window.partitionBy(F.col(predictionColumn))
    wPredictionComparison = Window.partitionBy(
        F.col(comparisonColumn), F.col(predictionColumn)
    )
    """
    uniqIds = df.select("targetId", "diseaseId").distinct().count()
    
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    """
    """
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """
    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """
    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    """
    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)


c = datetime.now()

print("start doing aggregations and writing")
today_date = str(date.today())
listado = []
#### Run analysis using loop
for key, df in datasetDict.items():
    print(key)
    df = df.persist()
    aggSetups_original = comparisons_df(df)
    for row in aggSetups_original:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

##### read files and make spreadsheet
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
#### update the dictionary dfs with other columns included in the analysis
key_list = [
    "hasGeneticEvidence",
    "oneCell",
    "diagonal",
    "hasDirectionOfEffect",
    "max_L2GScore",
    "min_distance_ranking",
]
value_list = [
    "geneticEvidence",
    "oneCellDoE",
    "diagonalDoE",
    "hasDirectionOfEffect",
    "L2GScore",
    "TSSDistance",
]

dfs = {}


def create_dict_column(dfs, key_list, value_list):
    if len(key_list) != len(value_list):
        raise ValueError("lists of different length")
    dfs.update(zip(key_list, value_list))
    return dfs


dfs = create_dict_column(dfs, key_list, value_list)

# Define the lists of possible substrings
phase_opt = [
    "Phase4",
    "Phase>=3",
    "Phase>=2",
    "Phase>=1",
    "PhaseT",
]

result = []
result_st = []
result_ci = []
array2 = []

# Initialize an empty list to store the results
results = []

# Iterate over the sample strings and extract the desired substrings
for path in listado:
    array1 = np.delete(
        (
            spark.read.parquet(path)
            .join(full_data, on=["prediction", "comparison"], how="outer")
            .groupBy("comparison")
            .pivot("prediction")
            .agg(F.first("a"))
            .sort(F.col("comparison").desc())
            .select("comparison", "yes", "no")
            .fillna(0)
            .toPandas()
            .to_numpy()
        ),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    # Check for comparison options
    for key, value in dfs.items():
        if key in path and "combined" in path:
            if "oneCell" in path:
                comparison = dfs[key]
                group = "combinedOneCell"
            elif "diagonal" in path:
                comparison = dfs[key]
                group = "combinedDiagonal"
            elif "Direction" in path:
                comparison = dfs[key]
                group = "combinedHasDoE"
        elif key in path:
            comparison = dfs[key]
            group = key

    # Check for phase options
    for substr in phase_opt:
        if substr in path:
            phase = substr

    if "original" in path:
        dimension = "original"
    elif "propag" in path:
        dimension = "propagated"
    if "l2g" in path:
        dataset = "l2gScore"
    elif "distance" in path:
        dataset = "TSSdistance"

    results.append(
        [
            group,
            dataset,
            comparison,
            dimension,
            phase,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            res_npPhaseX,
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            path,
        ]
    )
    print(path)
# Convert the results to a pandas DataFrame
df = pd.DataFrame(
    results,
    columns=[
        "group",
        "dataset",
        "comparison",
        "dimension",
        "phase",
        "oddsRatio",
        "pValue",
        "lowerInterval",
        "upperInterval",
        "total",
        "values",
        "relSuccess",
        "rsLower",
        "rsUpper",
        "path",
    ],
)
df = pd.DataFrame(
    results,
    columns=[
        "group",
        "dataset",
        "comparison",
        "dimension",
        "phase",
        "oddsRatio",
        "pValue",
        "lowerInterval",
        "upperInterval",
        "total",
        "values",
        "relSuccess",
        "rsLower",
        "rsUpper",
        "path",
    ],
)

spark session created at 2024-08-13 20:47:46.557220
Analysis started on 2024-08-13 at  2024-08-13 20:47:46.557220


24/08/13 20:47:53 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


spark session created at 2024-08-13 20:47:53.623034


                                                                                

loading sources


ERROR:root:KeyboardInterrupt while sending command.                (0 + 8) / 98]
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/lib/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/miniconda3/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [2]:
#####
#######
## ANALYSIS FOR L2G Scores, genetic evidence and Direction of Effect
## Original, propagated and Other Vs Oncology
#######
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
print("spark session created at", c)


#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

spark session created at 2024-08-13 20:52:22.664204


                                                                                

In [6]:
###### combined list:

list_l2g = [
    0.1,
    0.15,
    0.2,
    0.25,
    0.3,
    0.35,
    0.4,
    0.45,
    0.5,
    0.55,
    0.6,
    0.65,
    0.7,
    0.75,
    0.8,
    0.85,
    0.9,
    0.95,
]
list_dist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [9]:
for l in list_l2g:
    print(l)

0.1
0.15
0.2
0.25
0.3
0.35
0.4
0.45
0.5
0.55
0.6
0.65
0.7
0.75
0.8
0.85
0.9
0.95


                                                                                

In [23]:
def comparisons_df(test_propag) -> list:
    """Return list of all comparisons to be used in the analysis"""
    toAnalysis = test_propag.drop(
        "Phase4",
        "Phase>=3",
        "Phase>=2",
        "Phase>=1",
        "Phase0",
        "clinicalStatus",
        "prediction",
        "count",
        "PhaseT",
        "taLabelSimple",
    ).columns[17:]
    dataType = ["byDatatype"] * len(toAnalysis)
    l_studies = []
    l_studies.extend([list(a) for a in zip(toAnalysis, dataType)])

    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(l_studies, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)


def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(F.col(comparisonColumn))
    wPrediction = Window.partitionBy(F.col(predictionColumn))
    wPredictionComparison = Window.partitionBy(
        F.col(comparisonColumn), F.col(predictionColumn)
    )
    """
    uniqIds = df.select("targetId", "diseaseId").distinct().count()
    
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    """
    """
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """
    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """
    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    """
    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)


c = datetime.now()

print("start doing aggregations and writing")
today_date = str(date.today())
listado = []
#### Run analysis using loop
for key, df in datasetDict.items():
    print(key)
    df = df.persist()
    aggSetups_original = comparisons_df(df)
    for row in aggSetups_original:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

##### read files and make spreadsheet
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
#### update the dictionary dfs with other columns included in the analysis
key_list = [
    "hasGeneticEvidence",
    "oneCell",
    "diagonal",
    "hasDirectionOfEffect",
    "max_L2GScore",
    "min_distance_ranking",
]
value_list = [
    "geneticEvidence",
    "oneCellDoE",
    "diagonalDoE",
    "hasDirectionOfEffect",
    "L2GScore",
    "TSSDistance",
]

dfs = {}


def create_dict_column(dfs, key_list, value_list):
    if len(key_list) != len(value_list):
        raise ValueError("lists of different length")
    dfs.update(zip(key_list, value_list))
    return dfs


dfs = create_dict_column(dfs, key_list, value_list)

# Define the lists of possible substrings
phase_opt = [
    "Phase4",
    "Phase>=3",
    "Phase>=2",
    "Phase>=1",
    "PhaseT",
]

result = []
result_st = []
result_ci = []
array2 = []

# Initialize an empty list to store the results
results = []

# Iterate over the sample strings and extract the desired substrings
for path in listado:
    array1 = np.delete(
        (
            spark.read.parquet(path)
            .join(full_data, on=["prediction", "comparison"], how="outer")
            .groupBy("comparison")
            .pivot("prediction")
            .agg(F.first("a"))
            .sort(F.col("comparison").desc())
            .select("comparison", "yes", "no")
            .fillna(0)
            .toPandas()
            .to_numpy()
        ),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    # Check for comparison options
    for key, value in dfs.items():
        if key in path and "combined" in path:
            if "oneCell" in path:
                comparison = dfs[key]
                group = "combinedOneCell"
            elif "diagonal" in path:
                comparison = dfs[key]
                group = "combinedDiagonal"
            elif "Direction" in path:
                comparison = dfs[key]
                group = "combinedHasDoE"
        elif key in path:
            comparison = dfs[key]
            group = key

    # Check for phase options
    for substr in phase_opt:
        if substr in path:
            phase = substr

    if "original" in path:
        dimension = "original"
    elif "propag" in path:
        dimension = "propagated"
    if "l2g" in path:
        dataset = "l2gScore"
    elif "distance" in path:
        dataset = "TSSdistance"

    results.append(
        [
            group,
            dataset,
            comparison,
            dimension,
            phase,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            res_npPhaseX,
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            path,
        ]
    )
    print(path)
# Convert the results to a pandas DataFrame
df = pd.DataFrame(
    results,
    columns=[
        "group",
        "dataset",
        "comparison",
        "dimension",
        "phase",
        "oddsRatio",
        "pValue",
        "lowerInterval",
        "upperInterval",
        "total",
        "values",
        "relSuccess",
        "rsLower",
        "rsUpper",
        "path",
    ],
)
df = pd.DataFrame(
    results,
    columns=[
        "group",
        "dataset",
        "comparison",
        "dimension",
        "phase",
        "oddsRatio",
        "pValue",
        "lowerInterval",
        "upperInterval",
        "total",
        "values",
        "relSuccess",
        "rsLower",
        "rsUpper",
        "path",
    ],
)

start doing aggregations and writing
df_l2g_original


24/08/09 12:33:24 WARN CacheManager: Asked to cache already cached data.


2024-08-09 12:33:24.590569
2024-08-09 12:33:24.600035
2024-08-09 12:33:24.608987
2024-08-09 12:33:24.618684
2024-08-09 12:33:24.628586
2024-08-09 12:33:24.636323
2024-08-09 12:33:24.644112
2024-08-09 12:33:24.652456
2024-08-09 12:33:24.661620
2024-08-09 12:33:24.669845
2024-08-09 12:33:24.678966
2024-08-09 12:33:24.687293
2024-08-09 12:33:24.694914
2024-08-09 12:33:24.703302
2024-08-09 12:33:24.711203
2024-08-09 12:33:24.718911
2024-08-09 12:33:24.726355
2024-08-09 12:33:24.733893
2024-08-09 12:33:24.742051
2024-08-09 12:33:24.754343
2024-08-09 12:33:24.762562
2024-08-09 12:33:24.770371
2024-08-09 12:33:24.779529
2024-08-09 12:33:24.789073
2024-08-09 12:33:24.798847
2024-08-09 12:33:24.807340
2024-08-09 12:33:24.814584
2024-08-09 12:33:24.821711
2024-08-09 12:33:24.828721
2024-08-09 12:33:24.835681
2024-08-09 12:33:24.842667
2024-08-09 12:33:24.850163
2024-08-09 12:33:24.857628
2024-08-09 12:33:24.865090
2024-08-09 12:33:24.872801
2024-08-09 12:33:24.880072
2024-08-09 12:33:24.888156
2

24/08/09 12:33:27 WARN CacheManager: Asked to cache already cached data.


2024-08-09 12:33:28.273044
2024-08-09 12:33:28.282922
2024-08-09 12:33:28.291608
2024-08-09 12:33:28.299506
2024-08-09 12:33:28.307860
2024-08-09 12:33:28.316238
2024-08-09 12:33:28.324657
2024-08-09 12:33:28.332700
2024-08-09 12:33:28.341796
2024-08-09 12:33:28.349845
2024-08-09 12:33:28.358681
2024-08-09 12:33:28.366449
2024-08-09 12:33:28.374013
2024-08-09 12:33:28.381620
2024-08-09 12:33:28.389001
2024-08-09 12:33:28.395976
2024-08-09 12:33:28.403716
2024-08-09 12:33:28.411079
2024-08-09 12:33:28.417874
2024-08-09 12:33:28.426193
2024-08-09 12:33:28.434824
2024-08-09 12:33:28.444227
2024-08-09 12:33:28.452889
2024-08-09 12:33:28.462659
2024-08-09 12:33:28.473108
2024-08-09 12:33:28.481591
2024-08-09 12:33:28.489437
2024-08-09 12:33:28.497218
2024-08-09 12:33:28.504568
2024-08-09 12:33:28.512139
2024-08-09 12:33:28.520219
2024-08-09 12:33:28.527461
2024-08-09 12:33:28.535625
2024-08-09 12:33:28.542778
2024-08-09 12:33:28.550495
2024-08-09 12:33:28.558552
2024-08-09 12:33:28.567835
2

24/08/09 12:33:31 WARN CacheManager: Asked to cache already cached data.


2024-08-09 12:33:31.825454
2024-08-09 12:33:31.834162
2024-08-09 12:33:31.842037
2024-08-09 12:33:31.849578
2024-08-09 12:33:31.860280
2024-08-09 12:33:31.871016
2024-08-09 12:33:31.880868
2024-08-09 12:33:31.888208
2024-08-09 12:33:31.895682
2024-08-09 12:33:31.904884
2024-08-09 12:33:31.912843
2024-08-09 12:33:31.921091
2024-08-09 12:33:31.930211
2024-08-09 12:33:31.937657
2024-08-09 12:33:31.944840
2024-08-09 12:33:31.952132
2024-08-09 12:33:31.958923
2024-08-09 12:33:31.966736
2024-08-09 12:33:31.974401
2024-08-09 12:33:31.982122
2024-08-09 12:33:31.990485
2024-08-09 12:33:32.000044
2024-08-09 12:33:32.008071
2024-08-09 12:33:32.015681
2024-08-09 12:33:32.023646
2024-08-09 12:33:32.032921
2024-08-09 12:33:32.041494
2024-08-09 12:33:32.050181
2024-08-09 12:33:32.058692
2024-08-09 12:33:32.069904
2024-08-09 12:33:32.078995
2024-08-09 12:33:32.087122
2024-08-09 12:33:32.094096
2024-08-09 12:33:32.103027
2024-08-09 12:33:32.110447
2024-08-09 12:33:32.118839
2024-08-09 12:33:32.127041
2

24/08/09 12:33:33 WARN CacheManager: Asked to cache already cached data.


2024-08-09 12:33:34.045640
2024-08-09 12:33:34.058181
2024-08-09 12:33:34.066922
2024-08-09 12:33:34.077316
2024-08-09 12:33:34.086147
2024-08-09 12:33:34.095247
2024-08-09 12:33:34.103705
2024-08-09 12:33:34.116731
2024-08-09 12:33:34.129532
2024-08-09 12:33:34.141066
2024-08-09 12:33:34.151217
2024-08-09 12:33:34.158535
2024-08-09 12:33:34.165414
2024-08-09 12:33:34.172313
2024-08-09 12:33:34.179431
2024-08-09 12:33:34.186486
2024-08-09 12:33:34.194727
2024-08-09 12:33:34.202559
2024-08-09 12:33:34.210778
2024-08-09 12:33:34.218617
2024-08-09 12:33:34.226690
2024-08-09 12:33:34.236517
2024-08-09 12:33:34.245726
2024-08-09 12:33:34.254151
2024-08-09 12:33:34.261525
2024-08-09 12:33:34.269795
2024-08-09 12:33:34.279101
2024-08-09 12:33:34.286995
2024-08-09 12:33:34.295307
2024-08-09 12:33:34.305636
2024-08-09 12:33:34.315286
2024-08-09 12:33:34.323886
2024-08-09 12:33:34.333009
2024-08-09 12:33:34.341362
2024-08-09 12:33:34.350162
2024-08-09 12:33:34.357920
2024-08-09 12:33:34.366009
2

AnalysisException: Path does not exist: gs://ot-team/jroldan/2024-08-09_analysis/df_l2g_original/hasGeneticEvidence_Phase4.parquet

In [20]:
datasetDict[f"df_l2g_original"].groupBy(
    "max_L2GScore>=0_5",
    "max_L2GScore>=0_5&diagonalAgreeWithDrugs_combined",
    "max_L2GScore>=0_5&diagonalYes_combined",
    "max_L2GScore>=0_5&oneCellAgreeWithDrugs_combined",
    "max_L2GScore>=0_5&oneCellYes_combined",
    "max_L2GScore>=0_5&hasDirectionOfEffect_combined",
).count().show()

+-----------------+-------------------------------------------------+--------------------------------------+------------------------------------------------+-------------------------------------+-----------------------------------------------+-----+
|max_L2GScore>=0_5|max_L2GScore>=0_5&diagonalAgreeWithDrugs_combined|max_L2GScore>=0_5&diagonalYes_combined|max_L2GScore>=0_5&oneCellAgreeWithDrugs_combined|max_L2GScore>=0_5&oneCellYes_combined|max_L2GScore>=0_5&hasDirectionOfEffect_combined|count|
+-----------------+-------------------------------------------------+--------------------------------------+------------------------------------------------+-------------------------------------+-----------------------------------------------+-----+
|               no|                                               no|                                    no|                                              no|                                   no|                                             no|80961|


In [22]:
datasetDict[f"df_l2g_original"].drop(
    "Phase4",
    "Phase>=3",
    "Phase>=2",
    "Phase>=1",
    "Phase0",
    "clinicalStatus",
    "prediction",
    "count",
    "PhaseT",
    "taLabelSimple",
).columns[17:]

['hasGeneticEvidence',
 'hasdirectionOfEffect',
 'diagonalYes',
 'oneCellYes',
 'max_L2GScore>=0_1',
 'max_L2GScore>=0_15',
 'max_L2GScore>=0_2',
 'max_L2GScore>=0_25',
 'max_L2GScore>=0_3',
 'max_L2GScore>=0_35',
 'max_L2GScore>=0_4',
 'max_L2GScore>=0_45',
 'max_L2GScore>=0_5',
 'max_L2GScore>=0_55',
 'max_L2GScore>=0_6',
 'max_L2GScore>=0_65',
 'max_L2GScore>=0_7',
 'max_L2GScore>=0_75',
 'max_L2GScore>=0_8',
 'max_L2GScore>=0_85',
 'max_L2GScore>=0_9',
 'max_L2GScore>=0_95',
 'max_L2GScore>=0_1&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_15&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_2&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_25&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_3&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_35&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_4&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_45&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_5&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_55&hasDirectionOfEffect_co

In [14]:
datasetDict[f"df_l2g_original"].columns

['diseaseId',
 'targetId',
 'max_L2GScore',
 'coherencyDiagonal',
 'coherencyOneCell',
 'LoF_protect',
 'GoF_protect',
 'LoF_risk',
 'GoF_risk',
 'maxClinPhase',
 'coherencyDiagonal_ch',
 'coherencyOneCell_ch',
 'LoF_protect_ch',
 'GoF_protect_ch',
 'geneticEvidence',
 'diagonalAgreeWithDrugs',
 'oneCellAgreeWithDrugs',
 'Phase4',
 'Phase>=3',
 'Phase>=2',
 'Phase>=1',
 'Phase0',
 'clinicalStatus',
 'prediction',
 'PhaseT',
 'taLabelSimple',
 'hasGeneticEvidence',
 'hasdirectionOfEffect',
 'diagonalYes',
 'oneCellYes',
 'max_L2GScore>=0_1',
 'max_L2GScore>=0_15',
 'max_L2GScore>=0_2',
 'max_L2GScore>=0_25',
 'max_L2GScore>=0_3',
 'max_L2GScore>=0_35',
 'max_L2GScore>=0_4',
 'max_L2GScore>=0_45',
 'max_L2GScore>=0_5',
 'max_L2GScore>=0_55',
 'max_L2GScore>=0_6',
 'max_L2GScore>=0_65',
 'max_L2GScore>=0_7',
 'max_L2GScore>=0_75',
 'max_L2GScore>=0_8',
 'max_L2GScore>=0_85',
 'max_L2GScore>=0_9',
 'max_L2GScore>=0_95',
 'max_L2GScore>=0_1&hasDirectionOfEffect_combined',
 'max_L2GScore>=0_

In [6]:
for a, x in dict_comb.items():
    print(a, x)

NameError: name 'dict_comb' is not defined

In [8]:
#### compare whether the column of direction of effect has the right information

datasetDict[f"df_l2g_original"].groupBy(
    "hasGeneticEvidence", "hasdirectionOfEffect", "diagonalYes", "oneCellYes"
).count().show()

+------------------+--------------------+-----------+----------+-----+
|hasGeneticEvidence|hasdirectionOfEffect|diagonalYes|oneCellYes|count|
+------------------+--------------------+-----------+----------+-----+
|                no|                  no|         no|        no|80653|
|               yes|                 yes|         no|        no|  575|
|               yes|                 yes|        yes|        no|   63|
|               yes|                 yes|        yes|       yes|   40|
+------------------+--------------------+-----------+----------+-----+



In [5]:
#####
#######
## ANALYSIS FOR L2G Scores, genetic evidence and Direction of Effect
## Original, propagated and Other Vs Oncology
#######
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
print("spark session created at", c)


#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

### Now , filter by rank, and join with the info from Ot genetics and run the DoE.
ranking = Window.partitionBy("studyId", "variantId")
### union with the other datasources
platform_v = "24.06"

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
diseases = spark.read.parquet(disease_path)
mecact_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/mechanismOfAction/"
mecact = spark.read.parquet(mecact_path)
evidences = spark.read.parquet(
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
).filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)
# 1# Make a list of variant of interest (Sequence ontology terms) to subset data of interest.
### Bear in mind that SO works with ontology structure as: SO:XXXXXX, but databases has the SO as: SO_XXXXXX
var_filter_lof = [
    ### High impact variants https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
    "SO_0001589",  ## frameshit_variant
    "SO_0001587",  ## stop_gained
    "SO_0001574",  ## splice_acceptor_variant
    "SO_0001575",  ## splice_donor_variant
    "SO_0002012",  ## start_lost
    "SO_0001578",  ## stop_lost
    "SO_0001893",  ## transcript_ablation
    # "SO:0001889", ## transcript_amplification ## the Only HIGH impact that increase protein.
]

gof = ["SO_0002053"]
lof = ["SO_0002054"]

print("loading sources")

## Building Sequence Ontology
so_path = "gs://ot-team/jroldan/sequenceOntology_20221118.csv"
so_ontology = spark.read.csv(so_path, header=True)
building = (
    so_ontology.select(F.col("Accession"), F.col("Parents"))
    .withColumn("Parentalind", F.split(F.col("Parents"), ","))
    .withColumn("Parentalind", F.explode_outer("Parentalind"))
    .groupBy("Parentalind")
    .agg(F.collect_list(F.col("Accession")).alias("childrens"))
    .join(so_ontology, F.col("Parentalind") == so_ontology.Accession, "right")
)
## annotate TSG/oncogene/bivalent using 'hallmarks.attributes'
oncotsg_list = [
    "TSG",
    "oncogene",
    "Oncogene",
    "oncogene",
    "oncogene,TSG",
    "TSG,oncogene",
    "fusion,oncogene",
    "oncogene,fusion",
]

#### rlike('('+Keywords+')(\s|$)'
### on 03.07.2023 we add the categories:
# DISRUPTING AGENT - inhibitor
# STABILISER - activator

### Hacer el join del actionType con el chembl para sacar los mecanismos de accion.
inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",  ## added new on 03.07.2023
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",
    "STABILISER",  ## added new on 03.07.2023
]

columnas = ["activator", "inhibitor"]
both = activators + inhibitors

actionType = (
    mecact.select(
        F.explode_outer("chemblIds").alias("drugId2"),
        "actionType",
        "mechanismOfAction",
        "targets",
    )
    .select(
        F.explode_outer("targets").alias("targetId2"),
        "drugId2",
        "actionType",
        "mechanismOfAction",
    )
    .groupBy("targetId2", "drugId2")
    .agg(
        F.collect_set("actionType").alias("actionType"),
    )
)

oncolabel = (
    target.select(
        "id", "approvedSymbol", F.explode_outer(F.col("hallmarks.attributes"))
    )
    .select("id", "approvedSymbol", "col.description")
    .filter(F.col("description").isin(oncotsg_list))
    .groupBy("id", "approvedSymbol")
    .agg(F.collect_set("description").alias("description"))
    .withColumn("description_splited", F.concat_ws(",", F.col("description")))
    .withColumn(
        "TSorOncogene",
        F.when(
            (
                F.col("description_splited").rlike("ncogene")
                & F.col("description_splited").rlike("TSG")
            ),
            F.lit("bivalent"),
        )
        .when(F.col("description_splited").rlike("ncogene(\s|$)"), F.lit("oncogene"))
        .when(F.col("description_splited").rlike("TSG(\s|$)"), F.lit("TSG"))
        .otherwise(F.lit("noEvaluable")),  ####
    )
    .withColumnRenamed("id", "target_id")
)

# 2# run the transformation of the evidences datasets used.

windowSpec = Window.partitionBy("targetId", "diseaseId")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

sincgc = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

germline = [
    "gene_burden",
    "eva",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic = ["intogen", "cancer_gene_census", "eva_somatic"]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
    "chembl",
    "WOcgc",
    "somatic",
    "germline",
]
#### version all gene burden
prueba_assessment = (
    directionOfEffect(evidences, platform_v)
    .withColumn(
        "rank",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "average",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .persist()
)

genEvidDataset = (
    prueba_assessment.filter(F.col("datasourceId") != "chembl")  #### checked 31.05.2023
    .groupBy("targetId", "diseaseId")
    .agg(F.count("targetId").alias("Nr_evidences"))
    .select("targetId", "diseaseId", "Nr_evidences")
    .withColumn("geneticEvidence", F.lit("hasGeneticEvidence"))
    .drop("Nr_evidences")
)

coherency_toAssess_others_datasource = (  #### checked 31.05.2023
    prueba_assessment.filter(
        (F.col("homogenized").isin(columns)) & (F.col("datasourceId") != "chembl")
    )
    .groupBy("targetId", "diseaseId")
    .agg(F.collect_set("datasourceId").alias("datasourceIds"))
)

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

v2g = spark.read.parquet("gs://genetics-portal-dev-data/22.09.1/outputs/v2g")
varDistToGene = v2g.select(
    F.concat_ws("_", "chr_id", "position", "ref_allele", "alt_allele").alias(
        "variantId"
    ),
    F.col("gene_id").alias("targetId"),
    "source_id",
    "d",
    "distance_score",
).filter(F.col("source_id") == "canonical_tss")

ranking = Window.partitionBy("studyId", "variantId")


#######
# Build Ot genetics dataset as supporting evidence
#######
otGenetics = (
    prueba_assessment.filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
            ]
        )
    )
    # .filter((F.col("homogenized") != "noEvaluable"))
    .join(varDistToGene, on=["variantId", "targetId"], how="left")
    .join(genEvidDataset, on=["targetId", "diseaseId"], how="left")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "L2G_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageL2G",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageCanonicalTSSDistance",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("d").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "datasources",
        F.when(
            F.col("rank").isNull(),
            F.array_remove(F.col("datasources"), "ot_genetics_portal"),
        ).otherwise(F.col("datasources")),
    )
    .withColumn(
        "distance_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("d").asc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "ChemblL2gRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("L2G_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "chemblDistanceRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("distance_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "frontierValue",
        ## ot genetics portal
        F.when(
            F.col("datasourceId") == "ot_genetics_portal",  ### the same for gene_burden
            F.when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNull()),
                F.when(
                    (F.col("beta") <= 0.1) & (F.col("beta") >= -0.1),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNotNull()),
                F.when(
                    (F.col("OddsRatio") <= 1.1) & (F.col("OddsRatio") >= 0.9),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNull()),
                F.lit("noValue"),
            ),
        ),
    )
).persist()

#####
# function for interpreting DoE and coherencies/discrepancies
#####

diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

analysis_chembl = discrepancifier(
    prueba_assessment.filter((F.col("datasourceId") == "chembl"))
    .withColumn(
        "maxClinPhase",
        F.max(F.col("clinicalPhase")).over(Window.partitionBy("targetId", "diseaseId")),
    )
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("homogenized")
    .agg(F.count("targetId"))
    .persist()
)

#### propag OtGenetics:
otGenetics_propag = (
    otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
    .join(
        diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
        on="diseaseId",
        how="left",
    )
    .withColumnRenamed("diseaseId", "oldDiseaseId")
    .withColumnRenamed("diseaseIdPropagated", "diseaseId")
).persist()


#### include dictionary for calling dataframes:
# max_L2GScore
# min_distance_ranking


def benchmarkOT(discrepancifier, otGenetics, metric):
    dict_comb = {}
    dict_comb = {
        "hasDirectionOfEffect": f"{metric}",
        "diagonalYes": f"{metric}",
        "oneCellYes": f"{metric}",
    }
    list_l2g = [
        0.1,
        0.15,
        0.2,
        0.25,
        0.3,
        0.35,
        0.4,
        0.45,
        0.5,
        0.55,
        0.6,
        0.65,
        0.7,
        0.75,
        0.8,
        0.85,
        0.9,
        0.95,
    ]
    list_dist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    return (
        discrepancifier(
            otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
            .withColumn(
                "min_distance_ranking",
                F.min("distance_ranking").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(  ### take maximum L2G score per T-D
                "max_L2GScore",
                F.max("resourceScore").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .groupBy(
                "targetId",
                "diseaseId",
                "geneticEvidence",
                f"{value}",
            )  ##### modifications here to include the groups of ranking/distances to TSS
            .pivot("homogenized")
            .agg(F.count("targetId"))
        )
        .selectExpr(
            "targetId",
            "diseaseId",
            "geneticEvidence",
            f"{metric}",
            "coherencyDiagonal as coherencyDiagonal",
            "coherencyOneCell as coherencyOneCell",
            "LoF_protect",
            "GoF_protect",
            "LoF_risk",
            "GoF_risk",
        )
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        # .withColumn(
        #    "geneticEvidence",
        #    F.when(
        #        F.col(f"{metric}").isNotNull(), F.lit("hasGeneticEvidence")
        #    ).otherwise(F.lit("noGeneticEvidence")),
        # )
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(  #### new column to modify it
            "hasdirectionOfEffect",
            F.when(F.col("coherencyDiagonal").isNotNull(), F.lit("yes")).otherwise(
                F.lit("no")
            ),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.col(f"{metric}") >= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}>={str(n).replace('.', '_')}")
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"  # Adjust this condition as needed
                else [
                    F.when(F.col(f"{metric}") <= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}<={n}")
                    for n in list_dist
                ]
            )
            + (
                [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                    F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"
                else [
                    F.when((F.col(a) == "yes") & (F.col(x) <= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}<={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_dist
                ]
            )
        )
        .persist()
    )

spark session created at 2024-08-13 21:36:40.200693
loading sources


24/08/13 21:36:44 WARN CacheManager: Asked to cache already cached data.
24/08/13 21:36:44 WARN CacheManager: Asked to cache already cached data.
24/08/13 21:36:45 WARN CacheManager: Asked to cache already cached data.
24/08/13 21:36:46 WARN CacheManager: Asked to cache already cached data.
24/08/13 21:36:46 WARN CacheManager: Asked to cache already cached data.
24/08/13 21:36:46 WARN CacheManager: Asked to cache already cached data.
24/08/13 21:36:46 WARN CacheManager: Asked to cache already cached data.


In [6]:
metric_list = ["max_L2GScore", "min_distance_ranking"]

datasetDict = {}
for value in metric_list:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_l2g_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )
    elif value == "min_distance_ranking":
        datasetDict[f"df_distance_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_distance_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )

                                                                                

In [23]:
#########
#####
#######
## ANALYSIS FOR L2G Scores, genetic evidence and Direction of Effect
## Original, propagated and Other Vs Oncology
#######
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
print("spark session created at", c)


#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

### Now , filter by rank, and join with the info from Ot genetics and run the DoE.
ranking = Window.partitionBy("studyId", "variantId")
### union with the other datasources
platform_v = "24.06"

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
diseases = spark.read.parquet(disease_path)
mecact_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/mechanismOfAction/"
mecact = spark.read.parquet(mecact_path)
evidences = spark.read.parquet(
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
).filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)
# 1# Make a list of variant of interest (Sequence ontology terms) to subset data of interest.
### Bear in mind that SO works with ontology structure as: SO:XXXXXX, but databases has the SO as: SO_XXXXXX
var_filter_lof = [
    ### High impact variants https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html
    "SO_0001589",  ## frameshit_variant
    "SO_0001587",  ## stop_gained
    "SO_0001574",  ## splice_acceptor_variant
    "SO_0001575",  ## splice_donor_variant
    "SO_0002012",  ## start_lost
    "SO_0001578",  ## stop_lost
    "SO_0001893",  ## transcript_ablation
    # "SO:0001889", ## transcript_amplification ## the Only HIGH impact that increase protein.
]

gof = ["SO_0002053"]
lof = ["SO_0002054"]

print("loading sources")

## Building Sequence Ontology
so_path = "gs://ot-team/jroldan/sequenceOntology_20221118.csv"
so_ontology = spark.read.csv(so_path, header=True)
building = (
    so_ontology.select(F.col("Accession"), F.col("Parents"))
    .withColumn("Parentalind", F.split(F.col("Parents"), ","))
    .withColumn("Parentalind", F.explode_outer("Parentalind"))
    .groupBy("Parentalind")
    .agg(F.collect_list(F.col("Accession")).alias("childrens"))
    .join(so_ontology, F.col("Parentalind") == so_ontology.Accession, "right")
)
## annotate TSG/oncogene/bivalent using 'hallmarks.attributes'
oncotsg_list = [
    "TSG",
    "oncogene",
    "Oncogene",
    "oncogene",
    "oncogene,TSG",
    "TSG,oncogene",
    "fusion,oncogene",
    "oncogene,fusion",
]

#### rlike('('+Keywords+')(\s|$)'
### on 03.07.2023 we add the categories:
# DISRUPTING AGENT - inhibitor
# STABILISER - activator

### Hacer el join del actionType con el chembl para sacar los mecanismos de accion.
inhibitors = [
    "RNAI INHIBITOR",
    "NEGATIVE MODULATOR",
    "NEGATIVE ALLOSTERIC MODULATOR",
    "ANTAGONIST",
    "ANTISENSE INHIBITOR",
    "BLOCKER",
    "INHIBITOR",
    "DEGRADER",
    "INVERSE AGONIST",
    "ALLOSTERIC ANTAGONIST",
    "DISRUPTING AGENT",  ## added new on 03.07.2023
]

activators = [
    "PARTIAL AGONIST",
    "ACTIVATOR",
    "POSITIVE ALLOSTERIC MODULATOR",
    "POSITIVE MODULATOR",
    "AGONIST",
    "SEQUESTERING AGENT",
    "STABILISER",  ## added new on 03.07.2023
]

columnas = ["activator", "inhibitor"]
both = activators + inhibitors

actionType = (
    mecact.select(
        F.explode_outer("chemblIds").alias("drugId2"),
        "actionType",
        "mechanismOfAction",
        "targets",
    )
    .select(
        F.explode_outer("targets").alias("targetId2"),
        "drugId2",
        "actionType",
        "mechanismOfAction",
    )
    .groupBy("targetId2", "drugId2")
    .agg(
        F.collect_set("actionType").alias("actionType"),
    )
)

oncolabel = (
    target.select(
        "id", "approvedSymbol", F.explode_outer(F.col("hallmarks.attributes"))
    )
    .select("id", "approvedSymbol", "col.description")
    .filter(F.col("description").isin(oncotsg_list))
    .groupBy("id", "approvedSymbol")
    .agg(F.collect_set("description").alias("description"))
    .withColumn("description_splited", F.concat_ws(",", F.col("description")))
    .withColumn(
        "TSorOncogene",
        F.when(
            (
                F.col("description_splited").rlike("ncogene")
                & F.col("description_splited").rlike("TSG")
            ),
            F.lit("bivalent"),
        )
        .when(F.col("description_splited").rlike("ncogene(\s|$)"), F.lit("oncogene"))
        .when(F.col("description_splited").rlike("TSG(\s|$)"), F.lit("TSG"))
        .otherwise(F.lit("noEvaluable")),  ####
    )
    .withColumnRenamed("id", "target_id")
)

# 2# run the transformation of the evidences datasets used.

windowSpec = Window.partitionBy("targetId", "diseaseId")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

sincgc = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

germline = [
    "gene_burden",
    "eva",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic = ["intogen", "cancer_gene_census", "eva_somatic"]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
    "chembl",
    "WOcgc",
    "somatic",
    "germline",
]
#### version all gene burden
prueba_assessment = (
    directionOfEffect(evidences, platform_v)
    .withColumn(
        "rank",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "average",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .persist()
)

genEvidDataset = (
    prueba_assessment.filter(F.col("datasourceId") != "chembl")  #### checked 31.05.2023
    .groupBy("targetId", "diseaseId")
    .agg(F.count("targetId").alias("Nr_evidences"))
    .select("targetId", "diseaseId", "Nr_evidences")
    .withColumn("geneticEvidence", F.lit("hasGeneticEvidence"))
    .drop("Nr_evidences")
)

coherency_toAssess_others_datasource = (  #### checked 31.05.2023
    prueba_assessment.filter(
        (F.col("homogenized").isin(columns)) & (F.col("datasourceId") != "chembl")
    )
    .groupBy("targetId", "diseaseId")
    .agg(F.collect_set("datasourceId").alias("datasourceIds"))
)

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

v2g = spark.read.parquet("gs://genetics-portal-dev-data/22.09.1/outputs/v2g")
varDistToGene = v2g.select(
    F.concat_ws("_", "chr_id", "position", "ref_allele", "alt_allele").alias(
        "variantId"
    ),
    F.col("gene_id").alias("targetId"),
    "source_id",
    "d",
    "distance_score",
).filter(F.col("source_id") == "canonical_tss")

ranking = Window.partitionBy("studyId", "variantId")


#######
# Build Ot genetics dataset as supporting evidence
#######
otGenetics = (
    prueba_assessment.filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
            ]
        )
    )
    # .filter((F.col("homogenized") != "noEvaluable"))
    .join(varDistToGene, on=["variantId", "targetId"], how="left")
    .join(genEvidDataset, on=["targetId", "diseaseId"], how="left")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "L2G_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageL2G",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageCanonicalTSSDistance",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("d").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "datasources",
        F.when(
            F.col("rank").isNull(),
            F.array_remove(F.col("datasources"), "ot_genetics_portal"),
        ).otherwise(F.col("datasources")),
    )
    .withColumn(
        "distance_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("d").asc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "ChemblL2gRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("L2G_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "chemblDistanceRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("distance_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "frontierValue",
        ## ot genetics portal
        F.when(
            F.col("datasourceId") == "ot_genetics_portal",  ### the same for gene_burden
            F.when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNull()),
                F.when(
                    (F.col("beta") <= 0.1) & (F.col("beta") >= -0.1),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNotNull()),
                F.when(
                    (F.col("OddsRatio") <= 1.1) & (F.col("OddsRatio") >= 0.9),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNull()),
                F.lit("noValue"),
            ),
        ),
    )
).persist()

#####
# function for interpreting DoE and coherencies/discrepancies
#####

diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

analysis_chembl = discrepancifier(
    prueba_assessment.filter((F.col("datasourceId") == "chembl"))
    .withColumn(
        "maxClinPhase",
        F.max(F.col("clinicalPhase")).over(Window.partitionBy("targetId", "diseaseId")),
    )
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("homogenized")
    .agg(F.count("targetId"))
    .persist()
)

#### propag OtGenetics:
otGenetics_propag = (
    otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
    .join(
        diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
        on="diseaseId",
        how="left",
    )
    .withColumnRenamed("diseaseId", "oldDiseaseId")
    .withColumnRenamed("diseaseIdPropagated", "diseaseId")
).persist()


#### include dictionary for calling dataframes:
# max_L2GScore
# min_distance_ranking


def benchmarkOT(discrepancifier, otGenetics, metric):
    dict_comb = {}
    dict_comb = {
        "hasDirectionOfEffect": f"{metric}",
        "diagonalYes": f"{metric}",
        "oneCellYes": f"{metric}",
        "L2GbutNotDoE": f"{metric}",
    }
    list_l2g = [
        0.1,
        0.15,
        0.2,
        0.25,
        0.3,
        0.35,
        0.4,
        0.45,
        0.5,
        0.55,
        0.6,
        0.65,
        0.7,
        0.75,
        0.8,
        0.85,
        0.9,
        0.95,
    ]
    list_dist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    return (
        discrepancifier(
            otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
            .withColumn(
                "min_distance_ranking",
                F.min("distance_ranking").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(  ### take maximum L2G score per T-D
                "max_L2GScore",
                F.max("resourceScore").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .groupBy(
                "targetId",
                "diseaseId",
                "geneticEvidence",
                f"{value}",
            )  ##### modifications here to include the groups of ranking/distances to TSS
            .pivot("homogenized")
            .agg(F.count("targetId"))
        )
        .selectExpr(
            "targetId",
            "diseaseId",
            "geneticEvidence",
            f"{metric}",
            "coherencyDiagonal as coherencyDiagonal",
            "coherencyOneCell as coherencyOneCell",
            "LoF_protect",
            "GoF_protect",
            "LoF_risk",
            "GoF_risk",
        )
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        # .withColumn(
        #    "geneticEvidence",
        #    F.when(
        #        F.col(f"{metric}").isNotNull(), F.lit("hasGeneticEvidence")
        #    ).otherwise(F.lit("noGeneticEvidence")),
        # )
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(  #### new column to modify it
            "hasdirectionOfEffect",
            F.when(F.col("coherencyDiagonal").isNotNull(), F.lit("yes")).otherwise(
                F.lit("no")
            ),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "L2GAndColoc",
            F.when(
                (F.col("geneticEvidence") == "hasGeneticEvidence")
                & (F.col("coherencyDiagonal").isin(["coherent", "dispar"])),
                F.lit("yes"),
            ).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.col(f"{metric}") >= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}>={str(n).replace('.', '_')}")
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"  # Adjust this condition as needed
                else [
                    F.when(F.col(f"{metric}") <= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}<={n}")
                    for n in list_dist
                ]
            )
            + (
                [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                    F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"
                else [
                    F.when((F.col(a) == "yes") & (F.col(x) <= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}<={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_dist
                ]
            )
        )
        .persist()
    )


metric_list = ["max_L2GScore", "min_distance_ranking"]
datasetDict = {}
for value in metric_list:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_l2g_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )
    elif value == "min_distance_ranking":
        datasetDict[f"df_distance_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_distance_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )

spark session created at 2024-08-14 11:36:51.363460
loading sources


24/08/14 11:36:55 WARN CacheManager: Asked to cache already cached data.
24/08/14 11:36:55 WARN CacheManager: Asked to cache already cached data.
24/08/14 11:36:56 WARN CacheManager: Asked to cache already cached data.
24/08/14 11:36:57 WARN CacheManager: Asked to cache already cached data.
24/08/14 11:36:57 WARN CacheManager: Asked to cache already cached data.
24/08/14 11:36:57 WARN CacheManager: Asked to cache already cached data.
24/08/14 11:36:57 WARN CacheManager: Asked to cache already cached data.


In [26]:
datasetDict[f"df_distance_propagated"].drop(
    "Phase4",
    "Phase>=3",
    "Phase>=2",
    "Phase>=1",
    "Phase0",
    "clinicalStatus",
    "prediction",
    "count",
    "PhaseT",
    "taLabelSimple",
).columns[17:]

['hasGeneticEvidence',
 'hasdirectionOfEffect',
 'diagonalYes',
 'oneCellYes',
 'L2GbutNotDoE',
 'min_distance_ranking<=1',
 'min_distance_ranking<=2',
 'min_distance_ranking<=3',
 'min_distance_ranking<=4',
 'min_distance_ranking<=5',
 'min_distance_ranking<=6',
 'min_distance_ranking<=7',
 'min_distance_ranking<=8',
 'min_distance_ranking<=9',
 'min_distance_ranking<=10',
 'min_distance_ranking<=1&hasDirectionOfEffect_combined',
 'min_distance_ranking<=2&hasDirectionOfEffect_combined',
 'min_distance_ranking<=3&hasDirectionOfEffect_combined',
 'min_distance_ranking<=4&hasDirectionOfEffect_combined',
 'min_distance_ranking<=5&hasDirectionOfEffect_combined',
 'min_distance_ranking<=6&hasDirectionOfEffect_combined',
 'min_distance_ranking<=7&hasDirectionOfEffect_combined',
 'min_distance_ranking<=8&hasDirectionOfEffect_combined',
 'min_distance_ranking<=9&hasDirectionOfEffect_combined',
 'min_distance_ranking<=10&hasDirectionOfEffect_combined',
 'min_distance_ranking<=1&diagonalYes_comb

In [4]:
datasetDict[f"df_distance_propagated"].withColumn(
    "L2GData",
    F.when(F.col("min_distance_ranking").isNotNull(), F.lit("hasL2G")).otherwise(
        F.lit("noL2G")
    ),
).groupBy("L2GData", "coherencyDiagonal").count().show()



+-------+-----------------+-----+
|L2GData|coherencyDiagonal|count|
+-------+-----------------+-----+
|  noL2G|             null|80304|
| hasL2G|       EvidNotDoE|  766|
| hasL2G|         coherent|  248|
| hasL2G|           dispar|   13|
+-------+-----------------+-----+



                                                                                

In [20]:
for a, x in dict_comb.items():
    print(a)

NameError: name 'dict_comb' is not defined

In [17]:
datasetDict[f"df_distance_propagated"].withColumn(
    "L2GData",
    F.when(F.col("min_distance_ranking").isNotNull(), F.lit("hasL2G")).otherwise(
        F.lit("noL2G")
    ),
).withColumn(
    "L2GbutNotDoE",
    F.when(
        (F.col("geneticEvidence") == "hasGeneticEvidence")
        & (F.col("coherencyDiagonal").isin(["coherent", "dispar"])),
        F.lit("yes"),
    ).otherwise(F.lit("no")),
).groupBy(
    "L2GData",
    "geneticEvidence",
    "L2GbutNotDoE",
    "coherencyDiagonal",
    "min_distance_ranking<=6&oneCellYes_combined",
).count().show()

+-------+------------------+------------+-----------------+-------------------------------------------+-----+
|L2GData|   geneticEvidence|L2GbutNotDoE|coherencyDiagonal|min_distance_ranking<=6&oneCellYes_combined|count|
+-------+------------------+------------+-----------------+-------------------------------------------+-----+
| hasL2G|hasGeneticEvidence|          no|       EvidNotDoE|                                         no|  766|
|  noL2G|              null|          no|             null|                                         no|80304|
| hasL2G|hasGeneticEvidence|         yes|         coherent|                                         no|  193|
| hasL2G|hasGeneticEvidence|         yes|         coherent|                                        yes|   55|
| hasL2G|hasGeneticEvidence|         yes|           dispar|                                         no|   13|
+-------+------------------+------------+-----------------+-------------------------------------------+-----+



In [13]:
evidences.filter(F.col("datasourceId") == "ot_genetics_portal").withColumn(
    "hasScore", F.when(F.col("score").isNotNull(), F.lit("yes")).otherwise(F.lit("now"))
).groupBy("hasScore").count().show()

+--------+------+
|hasScore| count|
+--------+------+
|     yes|781213|
+--------+------+



In [7]:
datasetDict[f"df_distance_propagated"].columns

['diseaseId',
 'targetId',
 'geneticEvidence',
 'min_distance_ranking',
 'coherencyDiagonal',
 'coherencyOneCell',
 'LoF_protect',
 'GoF_protect',
 'LoF_risk',
 'GoF_risk',
 'maxClinPhase',
 'coherencyDiagonal_ch',
 'coherencyOneCell_ch',
 'LoF_protect_ch',
 'GoF_protect_ch',
 'diagonalAgreeWithDrugs',
 'oneCellAgreeWithDrugs',
 'Phase4',
 'Phase>=3',
 'Phase>=2',
 'Phase>=1',
 'Phase0',
 'clinicalStatus',
 'prediction',
 'PhaseT',
 'taLabelSimple',
 'hasGeneticEvidence',
 'hasdirectionOfEffect',
 'diagonalYes',
 'oneCellYes',
 'min_distance_ranking<=1',
 'min_distance_ranking<=2',
 'min_distance_ranking<=3',
 'min_distance_ranking<=4',
 'min_distance_ranking<=5',
 'min_distance_ranking<=6',
 'min_distance_ranking<=7',
 'min_distance_ranking<=8',
 'min_distance_ranking<=9',
 'min_distance_ranking<=10',
 'min_distance_ranking<=1&hasDirectionOfEffect_combined',
 'min_distance_ranking<=2&hasDirectionOfEffect_combined',
 'min_distance_ranking<=3&hasDirectionOfEffect_combined',
 'min_distan

In [None]:
### How many T-D have L2G score?
### How many T-D overlap with L2G and Coloc/DoE?

#### modify the spreadsheet of coloc from 02.08.2024

In [1]:
#### now with propagation
""" this scripts run the analysis for comparing QTL studies, tissues together with therapy areas matched"""
from functions import relative_success
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)

spark = SparkSession.builder.getOrCreate()

spark session created at 2024-08-15 09:00:58.613710
Analysis started on 2024-08-15 at  2024-08-15 09:00:58.613710


24/08/15 09:01:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [17]:
#### now with propagation
""" this scripts run the analysis for comparing QTL studies, tissues together with therapy areas matched"""
from functions import relative_success
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)

spark = SparkSession.builder.getOrCreate()

platform_v = "24.06"

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
diseases = spark.read.parquet(disease_path)

evidences = spark.read.parquet(
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
)

coloc = spark.read.parquet(
    "gs://genetics-portal-dev-data/22.09.1/outputs/v2d_coloc"
).filter(F.col("right_type") != "gwas")


### laod sample sizes - look at the document
sampleSize = spark.read.csv("gs://ot-team/jroldan/colocSampleSize.csv", header=True)

terminated = terminated_td(
    spark, "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv"
)
### load QTL tissues mapped to therapy areas
onto_samples = spark.read.csv(
    "gs://ot-team/jroldan/20240112_mappedSQLtissuesGTP3_5.csv", header=True
)

#### GSEA annotation for hallmark inflamation targets
immflam_annot = (
    spark.read.json(
        "gs://ot-team/jroldan/analysis/HALLMARK_INFLAMMATORY_RESPONSE.v2023.2.Hs_edited.json"
    )
    .select(F.explode_outer("geneSymbols").alias("approvedSymbol"))
    .withColumn("isInflam", F.lit("yes"))
)

#### Build the uniqueBetas_analyse
targetType = (
    target.select("id", "approvedSymbol", F.explode_outer("targetClass"))
    .select("id", "approvedSymbol", "col.label")
    .groupBy("id", "approvedSymbol")
    .agg(F.collect_set("label").alias("label"))
    .filter(F.col("label").isNotNull())
    .selectExpr(
        "id as targetIdtargetType",
        "approvedSymbol as approvedSymbol",
        "label as targetType",
    )
    .join(immflam_annot, on="approvedSymbol", how="left")
)

### take ontology of samples
samplesOnto = (
    onto_samples.withColumn(
        "right_bio_feature", F.split(F.col("original"), " - ").getItem(0)
    )
    .withColumn(
        "therapyArea", F.split(F.col("20231207_curated_simplified"), " - ").getItem(2)
    )
    .withColumn("EFO", F.split(F.col("20231207_curated_simplified"), " - ").getItem(1))
    .withColumn(
        "right_bio_feature2",
        F.split(F.col("20231207_curated_simplified"), " - ").getItem(0),
    )
    .drop(
        "curated_simplified",
        "20231207_curated_simplified",
        "original",
        "curated",
        "_c3",
        "_c4",
    )
)

coloc2 = (
    coloc.select(
        F.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias(
            "left_locus_id"
        ),
        F.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias(
            "right_locus_id"
        ),
        F.col("left_study").alias("left_study_id"),
        F.col("right_study").alias("right_study_id"),
        "right_gene_id",
        "coloc_h4",
        "left_var_right_study_beta",
        "right_phenotype",
        F.col("left_type"),
        F.col("right_type"),
        F.col("right_bio_feature"),
        F.col("is_flipped"),
        "left_var_right_study_pval",
    )
    .withColumn(
        "beta_assessed",  ### diferent from sQTL and oQTL
        F.when(
            (F.col("left_var_right_study_beta") > 0)
            & (F.col("right_study_id") != "GTEx-sQTL"),
            F.lit("gof"),
        ).when(
            (F.col("left_var_right_study_beta") < 0)
            & (F.col("right_study_id") != "GTEx-sQTL"),
            F.lit("lof"),
        )
        #### for sQTL is the opposite
        .when(
            (F.col("left_var_right_study_beta") > 0)
            & (F.col("right_study_id") == "GTEx-sQTL"),
            F.lit("lof"),
        )
        .when(
            (F.col("left_var_right_study_beta") < 0)
            & (F.col("right_study_id") == "GTEx-sQTL"),
            F.lit("gof"),
        )
        .otherwise(F.lit("neutral")),
    )
    .join(samplesOnto, on=["right_bio_feature"], how="left")
    .drop("right_bio_feature")
)
### check for disparities using count of different assessment for beta for target

disparities = coloc2.groupBy("left_locus_id", "left_study_id", "right_gene_id").agg(
    F.size(F.collect_set("beta_assessed")).alias("count"),
)
### add the label of which left_locus_id,left_study_id and right_gene_id are having contradictions

coloc3 = (
    coloc2.withColumnRenamed("right_bio_feature2", "right_bio_feature")
    .join(
        disparities, on=["left_locus_id", "left_study_id", "right_gene_id"], how="left"
    )
    .persist()
)
#### Run directionOfEffect
prueba_assessment = directionOfEffect(evidences, platform_v)

## add therapeuticArea name to  diseases
taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("OTAR_0000020", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
)
#### prepare for disease propagation, but explode at the end
diseases2 = (
    diseases.withColumn("taId", F.explode_outer("therapeuticAreas"))
    .join(taDf.drop("taLabelSimple"), on="taId", how="left")
    .groupBy("id", "parents", "name")
    .agg(
        F.collect_set("taId").alias("taId"),
        F.collect_set("taLabel").alias("diseaseTherapyAreas"),
    )
)
#### load data from ot genetics with the assessments
delimiter = ","
evidence_ot_genetics = (
    (
        prueba_assessment.filter((F.col("datasourceId").isin(["ot_genetics_portal"])))
        .groupBy("targetId", "diseaseId", "variantId", "studyId")
        .pivot("directionOnTrait")
        .count()
        .drop("noEvaluable", "conflict/noEvaluable")
        .persist()
    )
    .join(diseases2, F.col("diseaseId") == diseases2.id, "left")
    .withColumnRenamed("diseaseId", "oldDiseaseId")
    .withColumn(
        "diseaseId",
        F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
    )
    .drop("id")
)

coloc_otgene = (
    coloc3.withColumnRenamed("left_study_id", "studyId")
    .withColumnRenamed("left_locus_id", "locusId")
    .withColumnRenamed("right_gene_id", "targetId")
    .join(
        evidence_ot_genetics.withColumnRenamed("variantId", "locusId"),
        on=["studyId", "locusId", "targetId"],
        how="left",
    )
    .withColumnRenamed("GoF", "GoF_OT")  # remove
    .withColumnRenamed("LoF", "LoF_OT")  # remove
).persist()

chembl_trials = (
    prueba_assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

chembl = (
    (
        prueba_assessment.filter(
            (F.col("datasourceId").isin(["chembl"]))
            & (F.col("homogenized").isin(["noEvaluable", "dispar"]) == False)
        )
    )
    .groupBy("targetId", "diseaseId")
    .pivot("variantEffect")
    .count()
    .withColumnRenamed("LoF", "LoF_Ch")
    .withColumnRenamed("GoF", "GoF_Ch")
    .join(chembl_trials, on=["targetId", "diseaseId"], how="left")
    .persist()
)

coloc_bnch = coloc_otgene.join(chembl, on=["targetId", "diseaseId"], how="inner")
coloc_bnch2 = coloc_bnch.join(
    sampleSize.select("right_study_id", "sampleSize"), on="right_study_id", how="left"
).persist()

withEvidence = (
    coloc_bnch2.withColumn(
        "ChEMBL",
        F.when(
            (F.col("GoF_Ch").isNotNull()) & (F.col("LoF_Ch").isNotNull()),
            F.lit("gof&lof"),
        )
        .when(
            (F.col("LoF_Ch").isNotNull()) & (F.col("GoF_Ch").isNull()),
            F.lit(F.lit("lof")),
        )
        .when(
            (F.col("GoF_Ch").isNotNull()) & (F.col("LoF_Ch").isNull()),
            F.lit(F.lit("gof")),
        ),
    )
    .withColumn(
        "Coherency_chembl",
        F.when(  ### there are cases of drug with gof&lof
            (F.col("protect").isNotNull()),
            F.when(
                (F.col("beta_assessed") == "gof"),
                F.when(
                    (F.col("GoF_Ch").isNotNull()) & (F.col("LoF_Ch").isNull()),
                    F.lit("coherent"),
                ).when(
                    (F.col("LoF_Ch").isNotNull()) & (F.col("GoF_Ch").isNull()),
                    F.lit("dispar"),
                ),
            ).when(
                (F.col("beta_assessed") == "lof"),
                F.when(
                    (F.col("GoF_Ch").isNotNull()) & (F.col("LoF_Ch").isNull()),
                    F.lit("dispar"),
                ).when(
                    (F.col("LoF_Ch").isNotNull()) & (F.col("GoF_Ch").isNull()),
                    F.lit("coherent"),
                ),
            ),
        ),
    )
    .join(target.select("id", "approvedSymbol"), target.id == F.col("targetId"), "left")
    .drop("id")
    .persist()
)

uniqueBetas = withEvidence.persist()

custom_schema = StructType(
    [
        StructField("_c0", StringType(), True),
        StructField("_c1", StringType(), True),
        StructField("_c2", StringType(), True),
        StructField("_c3", DecimalType(38, 37), True),
        # Add more fields as needed
    ]
)

### windows function for adjusting pvalue using benjamini-hochberg correction
window = Window.partitionBy("studyId").orderBy("pVal")
window2 = Window.partitionBy("studyId")

### read the tissue enrichent's file
tissueEnrichment = (
    spark.read.csv(
        "gs://ot-team/jroldan/Tissue_enrichment_results/",
        sep="\t",
        schema=custom_schema,
    )
    .selectExpr(
        "_c0 as studyId",
        "_c1 as tissueEnriched",
        "_c3 as pVal",
        #### instead of removing, keeping the ones with pvalues=0 to not perturb the rank
    )
    .withColumn(
        "pVal",
        F.when(F.col("pVal") == 0e-37, F.lit(1e-37)).otherwise(F.col("pVal")),
        ### ranking column
    )
    .withColumn(
        "index",
        F.row_number().over(window),
        ### total number of rows per study column
    )
    .withColumn(
        "length",
        F.last("index").over(window2),
        ### BH adjusted p values calculation
    )
    .withColumn("adjPVal", (F.col("pVal") * F.col("length")) / F.col("index"))
)

### read the file with matching tissue enriched with therapy areas
tissueEnrichTherAreas = (
    spark.read.csv(
        "gs://ot-team/jroldan/analysis/20231207_gwasTissueEnrrichedToTherapyAreas.csv/",
        sep=",",
        header=True,
    )
    .withColumnRenamed("tissue", "tissueEnriched")
    .withColumn(
        "tissueEnrichedTherapyAreas",
        F.array(F.col("TherapyArea"), F.col("Alternative")),
    )
    .persist()
)

### prepare the file of tissue enriched with their therapy areas
studyEnrichTherArea = (
    tissueEnrichment.join(
        tissueEnrichTherAreas.select("tissueEnriched", "tissueEnrichedTherapyAreas"),
        on="tissueEnriched",
        how="left",
        ## filter by p value <0.05
    )
    ### filter by adjusted p values below 0.05
    .filter(F.col("adjPVal") < 0.05)
    .groupBy("studyId")
    .agg(
        F.array_except(
            F.flatten(F.collect_set("tissueEnrichedTherapyAreas")), F.array(F.lit(None))
        ).alias("studyTherapyArea"),
        F.collect_set("tissueEnriched").alias("tissuesEnriched"),
    )
)

#### mapping right_bio_feature - disease
schema_rbf = StructType(
    [
        StructField("remove", StringType(), True),
        StructField("right_bio_feature", StringType(), True),
        StructField("name", StringType(), True),
        StructField("matchRBFToDisease", StringType(), True),
        # Add more fields as needed
    ]
)

### read the tissue enrichent's file
tissueToDisease = (
    spark.read.csv(
        "gs://ot-team/jroldan/analysis/20240801_newcurationRBFToDisease.csv/",
        header=True,
    )
    .select("name", "right_bio_feature", "matchRBFToDisease")
    .distinct()
)


uniqueBetas_location = (
    (
        target_membrane(spark, target, uniqueBetas).drop(
            "mb",
            "counted",
            "HPA_membrane",
            "HPA_secreted",
            "uniprot_membrane",
            "uniprot_secreted",
            "loc",
            "location_id",
            # "result",
            "loc_id",
        )
        # .join(targetType, on=["targetId"], how="left")
    )
    .join(tissueToDisease, on=["name", "right_bio_feature"], how="left")
    .withColumn("qtlTherapyArea", F.split(F.col("therapyArea"), ","))
    .join(studyEnrichTherArea, on="studyId", how="left")
    .withColumn(
        "taIntersectQtlDisease",
        F.array_intersect(F.col("qtlTherapyArea"), F.col("diseaseTherapyAreas")),
    )
    .withColumn(
        "taIntersectQtlTissueEnriched",
        F.array_intersect(F.col("qtlTherapyArea"), F.col("studyTherapyArea")),
    )
    .withColumn(
        "taNIntersectQtlDisease",
        F.when(
            F.size(
                F.array_intersect(F.col("qtlTherapyArea"), F.col("diseaseTherapyAreas"))
            )
            >= 1,
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "taNIntersectQtlTissueEnriched",
        F.when(
            F.size(
                F.array_intersect(F.col("qtlTherapyArea"), F.col("studyTherapyArea"))
            )
            >= 1,
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .join(targetType, on="approvedSymbol", how="left")
)


"""single column comparisons"""
dfs = {}
claves = []
columns = []
list_columns = [
    "right_bio_feature",
    "right_type",
    "beta_assessed",
    "right_study_id",
    "therapyArea",
]

for x in list_columns:
    a = [data[0] for data in uniqueBetas_location.select(x).distinct().collect()]
    for y in a:
        claves.append(y)
        columns.append(x)
#### remove the ".0" that is problematic
claves2 = [str(s).replace(".0", "") for s in claves]

for key, value in zip(claves2, columns):
    dfs[key] = value

"""combined column comparisons"""
list1 = []
list2 = []
cols = uniqueBetas_location.drop("Coherency_geneBurden").columns

### make list per column to be combined
study_column = [
    str(row.right_study_id)
    for row in uniqueBetas_location.filter(F.col("right_study_id").isNotNull())
    .select("right_study_id")
    .distinct()
    .collect()
]

qtl_type = [
    str(row.right_type)
    for row in uniqueBetas_location.filter(F.col("right_type").isNotNull())
    .select("right_type")
    .distinct()
    .collect()  ## there are "Nulls" in this column
]

biofeature_column = [
    str(row.right_bio_feature)
    for row in uniqueBetas_location.filter(F.col("right_bio_feature").isNotNull())
    .select("right_bio_feature")
    .distinct()
    .collect()
]

### make a dictionary with column combinations study & qtl
for x in study_column:
    for y in qtl_type:
        list1.append(x)
        list2.append(y)

### make a dictionary with column combinations biofeature & qtl
for x in biofeature_column:
    for y in qtl_type:
        list1.append(x)
        list2.append(y)

combined_lists = [list(pair) for pair in zip(list1, list2)]


## collect all columns in an array
### drop Coherency_geneBurden because it contains "coherent" and "dispar" words
def dict_comb_comp(combined_lists, study_column, qtl_type):
    new2 = {}
    for x, n in combined_lists:
        if x in study_column:
            if n in qtl_type:
                new2.update(
                    {f"{x}&{n}": {"right_type": {f"{n}": {"right_study_id": f"{x}"}}}}
                )
    return new2


new2 = dict_comb_comp(combined_lists, study_column, qtl_type)

df_string = (
    uniqueBetas_location.drop("Coherency_geneBurden")
    .select([F.col(col_name).cast("string").alias(col_name) for col_name in cols])
    .withColumn("array", F.array(cols))
)

"""make the dataframe with column combinations"""
df_string2 = df_string.select(
    ["*"]
    + [
        F.when(
            (F.col(i) == b)
            & (F.col(c) == t)
            & (F.col("Coherency_chembl") == "coherent"),
            F.lit("yes"),
        )
        .otherwise(F.lit("no"))
        .alias(a)
        for a, x in new2.items()
        for i, z in x.items()
        for b, y in z.items()
        for c, t in y.items()
        # print(a, i, b, c, t)
    ]
).persist()

delimiter = ","  ### to convert string of propagated traits to array

uniqueBetas_analyse = (
    df_string2.select(
        ["*"]
        + [  #### only doe
            F.when(
                ((f"{x}") == F.col(c)) & (F.col("Coherency_chembl") == "coherent"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no"))
            .alias(f"{x}_{c}")
            for x, c in dfs.items()
        ]
    )
    .join(terminated, on=["targetId", "diseaseId"], how="left")
    .withColumn(
        "Phase4",
        F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase0",
        F.when(F.col("maxClinPhase") >= 0, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "allPhases",
        F.when(F.col("maxClinPhase").isNotNull(), F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "PhaseT",
        F.when(F.col("prediction").isNotNull(), F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "coloc>_60",
        F.when(
            (F.col("coloc_h4") >= 0.60) & (F.col("Coherency_chembl") == "coherent"),
            F.lit("yes"),
        )
        .when(
            (F.col("coloc_h4") >= 0.60) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4") < 0.60) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4").isNull()) | (F.col("Coherency_chembl").isNull()),
            F.lit("no"),
        )
        .otherwise(F.lit("no")),
    )
    .withColumn(
        "coloc>_80",
        F.when(
            (F.col("coloc_h4") >= 0.80) & (F.col("Coherency_chembl") == "coherent"),
            F.lit("yes"),
        )
        .when(
            (F.col("coloc_h4") >= 0.80) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4") < 0.80) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4").isNull()) | (F.col("Coherency_chembl").isNull()),
            F.lit("no"),
        )
        .otherwise(F.lit("no")),
    )
    .withColumn(
        "coloc>_85",
        F.when(
            (F.col("coloc_h4") >= 0.85) & (F.col("Coherency_chembl") == "coherent"),
            F.lit("yes"),
        )
        .when(
            (F.col("coloc_h4") >= 0.85) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4") < 0.85) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4").isNull()) | (F.col("Coherency_chembl").isNull()),
            F.lit("no"),
        )
        .otherwise(F.lit("no")),
    )
    .withColumn(
        "coloc>_90",
        F.when(
            (F.col("coloc_h4") >= 0.90) & (F.col("Coherency_chembl") == "coherent"),
            F.lit("yes"),
        )
        .when(
            (F.col("coloc_h4") >= 0.90) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4") < 0.90) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4").isNull()) | (F.col("Coherency_chembl").isNull()),
            F.lit("no"),
        )
        .otherwise(F.lit("no")),
    )
    .withColumn(
        "coloc>_95",
        F.when(
            (F.col("coloc_h4") >= 0.95) & (F.col("Coherency_chembl") == "coherent"),
            F.lit("yes"),
        )
        .when(
            (F.col("coloc_h4") >= 0.95) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4") < 0.95) & (F.col("Coherency_chembl") == "dispar"),
            F.lit("no"),
        )
        .when(
            (F.col("coloc_h4").isNull()) | (F.col("Coherency_chembl").isNull()),
            F.lit("no"),
        )
        .otherwise(F.lit("no")),
    )
    .withColumn(
        "secreted",
        F.when(F.col("Nr_secreted") == 1, F.lit("yes"))
        .when(F.col("Nr_secreted") == 0, F.lit("no"))
        .otherwise(None),
    )
    .withColumn(
        "matchQtlTeTherArea",
        F.when(
            (F.col("taNIntersectQtlTissueEnriched") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "matchQtlDiseaseTherArea",
        F.when(
            (F.col("taNIntersectQtlDisease") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "matchQtlRBFrelevantToDisease",
        F.when(
            (F.col("matchRBFToDisease") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "doeCoherent",
        F.when(F.col("Coherency_chembl") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .withColumn(
        "doeNotCoherent",
        F.when(F.col("Coherency_chembl") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .withColumn(
        "doe&matchQtlTeTherArea",
        F.when(
            (F.col("Coherency_chembl") == "coherent")
            & (F.col("taNIntersectQtlTissueEnriched") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "doe&matchQtlDiseaseTherArea",
        F.when(
            (F.col("Coherency_chembl") == "coherent")
            & (F.col("taNIntersectQtlDisease") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "doe&matchQtlRBFrelevantToDisease",
        F.when(
            (F.col("Coherency_chembl") == "coherent")
            & (F.col("matchRBFToDisease") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "doeInflam",
        F.when(
            (F.col("coherency_chembl") == "coherent") & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "isInflam",
        F.when(F.col("isInflam") == "yes", F.lit("yes")).otherwise(F.lit("no")),
    )
    ### columns combinations for tissue enrichment and therapy areas
    .withColumn(
        "doeInflamMatchQtlTeTherArea",
        F.when(
            (F.col("Coherency_chembl") == "coherent")
            & (F.col("taNIntersectQtlTissueEnriched") == "yes")
            & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "doeInflamMatchQtlDiseaseTherArea",
        F.when(
            (F.col("Coherency_chembl") == "coherent")
            & (F.col("taNIntersectQtlDisease") == "yes")
            & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "doeInflamMatchQtlRBFrelevantToDisease",
        F.when(
            (F.col("Coherency_chembl") == "coherent")
            & (F.col("matchRBFToDisease") == "yes")
            & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    #### columns considering only inflam and tissue/therapy areas
    .withColumn(
        "inflamMatchQtlTeTherArea",
        F.when(
            (F.col("taNIntersectQtlTissueEnriched") == "yes")
            & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "inflamMatchQtlDiseaseTherArea",
        F.when(
            (F.col("taNIntersectQtlDisease") == "yes") & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .withColumn(
        "inflamMatchQtlRBFrelevantToDisease",
        F.when(
            (F.col("matchRBFToDisease") == "yes") & (F.col("isInflam") == "yes"),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .drop("targetType")  ### remove nonsense columns for this
    .filter(
        (F.col("coherency_chembl").isNotNull())
        & (F.col("name") != "COVID-19")
        # | (F.col("name").isNull())
    )
    .drop(
        "array",
        "remove",
        "Phase0",
        "allPhases",
        "Terminated",
        "propagatedTraits",
        "count",
        "clinicalStatus",
        "prediction",
        "allPhases",
        "propagatedTraits",
        "targetType",
    )
    .repartition(100)
    .persist()
)


def comparisons_df(toAnalysis_studies) -> list:
    """Return list of all comparisons to be used in the analysis"""

    dataType = ["byDatatype"] * len(toAnalysis_studies)
    l_studies = []
    l_studies.extend([list(a) for a in zip(toAnalysis_studies, dataType)])

    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(l_studies, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

toAnalysis_studies = uniqueBetas_analyse.drop(
    "Phase4",
    "Phase>=3",
    "Phase>=2",
    "Phase>=1",
    "PhaseT",
).columns[
    157:
]  ### removed column combinations

24/08/15 09:07:28 WARN CacheManager: Asked to cache already cached data.        
                                                                                

created full_data and lists


In [2]:
#### now with propagation
""" this scripts run the analysis for comparing QTL studies, tissues together with therapy areas matched"""
from functions import relative_success
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)

spark = SparkSession.builder.getOrCreate()

spark session created at 2024-08-19 08:22:07.236163
Analysis started on 2024-08-19 at  2024-08-19 08:22:07.236163


24/08/19 08:22:12 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [4]:
from pyspark.sql.functions import format_number, udf

test = (
    spark.read.option("header", "true")
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", "true")
    .csv("gs://ot-team/jroldan/analysis/2024-08-02_colocDoEanalysis.csv", header=True)
)

In [5]:
def format_pvalue(val):
    if val is not None:
        if val < 0.005:
            return "{:.2e}".format(val)  # Scientific notation
        else:
            return "{:.4f}".format(val)  # Fixed decimal notation
    else:
        return None


# Register the UDF
format_pvalue_udf = udf(format_pvalue, StringType())

regex = r"\[\[\s*(\d+)\s+(\d+)\]\s*\[\s*(\d+)\s+(\d+)\]\]"

test2 = (
    test.withColumn(
        "significant",
        F.when(F.col("pValue") < 0.0001, F.lit("****"))
        .when((F.col("pValue") >= 0.0001) & (F.col("pValue") < 0.001), F.lit("***"))
        .when((F.col("pValue") >= 0.001) & (F.col("pValue") < 0.01), F.lit("**"))
        .when((F.col("pValue") >= 0.01) & (F.col("pValue") < 0.05), F.lit("*"))
        .when(F.col("pValue") >= 0.05, F.lit("ns")),
    )
    .withColumn(
        "writeFigure",
        F.concat(
            F.round(F.col("oddsRatio"), 2),
            F.lit(" "),
            F.lit("("),
            F.round(F.col("lowerInterval"), 2),
            F.lit("-"),
            F.round(F.col("upperInterval"), 2),
            F.lit(")"),
        ),
    )
    .withColumn("val1", F.regexp_extract("values", regex, 1))
    .withColumn("val2", F.regexp_extract("values", regex, 2))
    .withColumn("val3", F.regexp_extract("values", regex, 3))
    .withColumn("val4", F.regexp_extract("values", regex, 4))
    .withColumn("numerator", (F.col("val1") + F.col("val2")).cast("int"))
    .withColumn("denominator", (F.col("val3") + F.col("val4")).cast("int"))
    .withColumn("pValue", F.col("pValue").cast(DoubleType()))
    .withColumn(
        "pValue_formatted",
        F.when(
            F.col("pValue") < 0.0001, F.col("pValue").cast("string")
        )  # Check if value matches scientific notation range
        # .when(F.col("pValue") < 0.05, F.format_number(F.col("pValue"), 2))
        .otherwise(F.format_number(F.col("pValue"), 4)),
    )
).withColumn("numDen", F.concat_ws("/", F.col("numerator"), F.col("denominator")))

In [6]:
list_mappings = {
    "biofeature": [
        "Stomach",
        "BRAIN_PUTAMEN",
        "Brain_Hypothalamus",
        "Adipose_Subcutaneous",
        "BLOOD",
        "IPSC",
        "Artery_Aorta",
        "NEUTROPHIL",
        "MONOCYTE_IAV",
        "Vagina",
        "BRAIN_NAIVE",
        "Muscle_Skeletal",
        "Lung",
        "DLPFC_NAIVE",
        "Testis",
        "CD8_T-CELL_NAIVE",
        "CD4_T-CELL_NAIVE",
        "TREG_MEMORY",
        "B-CELL_NAIVE",
        "NK-CELL_NAIVE",
        "BRAIN_SPINAL_CORD",
        "BRAIN_FRONTAL_CORTEX",
        "LCL_STATIN",
        "MUSCLE",
        "PANCREATIC_ISLET",
        "Cells_EBV-transformed_lymphocytes",
        "CD8_T-CELL_ANTI-CD3-CD28",
        "ADIPOSE_NAIVE",
        "MONOCYTE_NAIVE",
        "Adrenal_Gland",
        "FAT",
        "Minor_Salivary_Gland",
        "Pituitary",
        "MONOCYTE_CD14",
        "PLACENTA_NAIVE",
        "ILEUM",
        "Colon_Transverse",
        "Pancreas",
        "Esophagus_Muscularis",
        "SMALL_INTESTINE",
        "MONOCYTE_LPS2",
        "Nerve_Tibial",
        "Artery_Tibial",
        "MACROPHAGE_IFNG",
        "BLOOD PLASMA",
        "Brain_Nucleus_accumbens_basal_ganglia",
        "Brain_Substantia_nigra",
        "MACROPHAGE_IFNG+SALMONELLA",
        "LCL",
        "Brain_Spinal_cord_cervical_c-1",
        "Cells_Cultured_fibroblasts",
        "Prostate",
        "FIBROBLAST",
        "SKIN_SUN_EXPOSED",
        "MONOCYTE_LPS",
        "Brain_Cerebellum",
        "Brain_Cortex",
        "Brain_Cerebellar_Hemisphere",
        "ADIPOSE_VISCERAL",
        "Thyroid",
        "Skin_Not_Sun_Exposed_Suprapubic",
        "Brain_Amygdala",
        "Esophagus_Gastroesophageal_Junction",
        "Heart_Left_Ventricle",
        "SKIN_NOT_SUN_EXPOSED",
        "MONOCYTE_IFN24",
        "Spleen",
        "Whole_Blood",
        "TH2_MEMORY",
        "Skin_Sun_Exposed_Lower_leg",
        "T-CELL_CD4",
        "MONOCYTE_LPS24",
        "T-CELL",
        "B-CELL_CD19",
        "HLC",
        "Breast_Mammary_Tissue",
        "Adipose_Visceral_Omentum",
        "TH17_MEMORY",
        "SKIN",
        "MONOCYTE",
        "PLATELET",
        "Small_Intestine_Terminal_Ileum",
        "Kidney_Cortex",
        "Brain_Anterior_cingulate_cortex_BA24",
        "MONOCYTE_CD16_NAIVE",
        "MACROPHAGE_NAIVE",
        "NEUTROPHIL_CD16",
        "BREAST",
        "Liver",
        "MACROPHAGE_SALMONELLA",
        "MONOCYTE_R848",
        "Colon_Sigmoid",
        "Brain_Hippocampus",
        "Brain_Caudate_basal_ganglia",
        "Heart_Atrial_Appendage",
        "Artery_Coronary",
        "TH1_MEMORY",
        "Esophagus_Mucosa",
        "BRAIN_NUCLEUS_ACCUMBENS",
        "MACROPHAGE_LISTERIA",
        "MUSCLE_NAIVE",
        "BRAIN",
        "Ovary",
        "MONOCYTE_PAM3CSK4",
        "BRAIN_CAUDATE",
        "SUBSTANTIA_NIGRA",
        "Uterus",
        "LOW_GRADE_CARTILAGE_NAIVE",
        "T-CELL_CD8",
        "TFH_MEMORY",
        "TREG_NAIVE",
        "CD4_T-CELL_ANTI-CD3-CD28",
        "Brain_Frontal_Cortex_BA9",
        "Brain_Putamen_basal_ganglia",
        "RECTUM",
        "NEUTROPHIL_CD15",
        "TH1-17_MEMORY",
        "HIGH_GRADE_CARTILAGE_NAIVE",
        "MICROGLIA_NAIVE",
        "LCL_NAIVE",
    ],
    "qtl_type": ["sqtl", "eqtl", "pqtl"],
    "right_study_id": [
        "Fairfax_2014",
        "CEDAR",
        "ROSMAP",
        "GTEx-eQTL",
        "BLUEPRINT",
        "TwinsUK",
        "Schmiedel_2018",
        "Lepik_2017",
        "GTEx-sQTL",
        "Quach_2016",
        "eQTLGen",
        "Nedelec_2016",
        "FUSION",
        "Steinberg_2020",
        "PhLiPS",
        "GENCORD",
        "CommonMind",
        "OLLI_2016",
        "FOLKERSEN_2020",
        "GEUVADIS",
        "CAP",
        "SUHRE_2017",
        "PIETZNER_2020",
        "HipSci",
        "Alasoo_2018",
        "Fairfax_2012",
        "Kasela_2017",
        "Peng_2018",
        "SUN2018",
        "Braineac2",
        "HILLARY_2019",
        "BrainSeq",
        "van_de_Bunt_2015",
        "FOLKERSEN_2017",
        "Young_2019",
        "iPSCORE",
        "Naranbhai_2015",
    ],
}

In [8]:
from pyspark.sql.functions import coalesce

conditions = []
conditions2 = []

for list_name, substrings in list_mappings.items():
    for substr in substrings:
        # Create a condition column for each substring
        condition_column = F.when(F.col("path").contains(substr), F.lit(f"{substr}"))
        condition_column2 = F.when(
            F.col("path").contains(substr), F.lit(f"{list_name}")
        )
        conditions.append(condition_column)
        conditions2.append(condition_column2)

# Combine all conditions using coalesce to get the first non-null result
final_condition_column = coalesce(*conditions)
final_condition_column2 = coalesce(*conditions2)

df = test2.withColumn("name", final_condition_column).withColumn(
    "data_type", final_condition_column2
)

In [9]:
### extract the right name from the right_bio_feature one

# Extract the part of the string between '/propagated/' and '_right_bio_feature'
df = df.withColumn(
    "name_rbf",
    F.regexp_extract(F.col("path"), r"/propagated/(.+)_right_bio_feature", 1),
)

# Show result
df.select("name_rbf").show(truncate=False)

+---------------------------------+
|name_rbf                         |
+---------------------------------+
|                                 |
|                                 |
|                                 |
|                                 |
|                                 |
|Stomach                          |
|Stomach                          |
|Stomach                          |
|Stomach                          |
|Stomach                          |
|Cells_EBV-transformed_lymphocytes|
|Cells_EBV-transformed_lymphocytes|
|Cells_EBV-transformed_lymphocytes|
|Cells_EBV-transformed_lymphocytes|
|Cells_EBV-transformed_lymphocytes|
|Nerve_Tibial                     |
|Nerve_Tibial                     |
|Nerve_Tibial                     |
|Nerve_Tibial                     |
|Nerve_Tibial                     |
+---------------------------------+
only showing top 20 rows



                                                                                

In [9]:
df.toPandas().to_csv(
    "gs://ot-team/jroldan/analysis/2024-08-02_colocDoEanalysis_processed_rbfCorrected.csv"
)

                                                                                

In [None]:
### dataframe of L2G coherency

In [3]:
l2g = (
    spark.read.option("header", "true")
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", "true")
    .csv(
        "gs://ot-team/jroldan/analysis/2024-08-14_analysis_coherencyL2G.csv",
        header=True,
    )
)

In [5]:
from pyspark.sql.functions import format_number, udf


def format_pvalue(val):
    if val is not None:
        if val < 0.005:
            return "{:.2e}".format(val)  # Scientific notation
        else:
            return "{:.4f}".format(val)  # Fixed decimal notation
    else:
        return None


# Register the UDF
format_pvalue_udf = udf(format_pvalue, StringType())

regex = r"\[\[\s*(\d+)\s+(\d+)\]\s*\[\s*(\d+)\s+(\d+)\]\]"

l2g2 = (
    l2g.withColumn(
        "significant",
        F.when(F.col("pValue") < 0.0001, F.lit("****"))
        .when((F.col("pValue") >= 0.0001) & (F.col("pValue") < 0.001), F.lit("***"))
        .when((F.col("pValue") >= 0.001) & (F.col("pValue") < 0.01), F.lit("**"))
        .when((F.col("pValue") >= 0.01) & (F.col("pValue") < 0.05), F.lit("*"))
        .when(F.col("pValue") >= 0.05, F.lit("ns")),
    )
    .withColumn(
        "writeFigure",
        F.concat(
            F.round(F.col("oddsRatio"), 2),
            F.lit(" "),
            F.lit("("),
            F.round(F.col("lowerInterval"), 2),
            F.lit("-"),
            F.round(F.col("upperInterval"), 2),
            F.lit(")"),
        ),
    )
    .withColumn("val1", F.regexp_extract("values", regex, 1))
    .withColumn("val2", F.regexp_extract("values", regex, 2))
    .withColumn("val3", F.regexp_extract("values", regex, 3))
    .withColumn("val4", F.regexp_extract("values", regex, 4))
    .withColumn("numerator", (F.col("val1") + F.col("val2")).cast("int"))
    .withColumn("denominator", (F.col("val3") + F.col("val4")).cast("int"))
    .withColumn("pValue", F.col("pValue").cast(DoubleType()))
    .withColumn(
        "pValue_formatted",
        F.when(
            F.col("pValue") < 0.0001, F.col("pValue").cast("string")
        )  # Check if value matches scientific notation range
        # .when(F.col("pValue") < 0.05, F.format_number(F.col("pValue"), 2))
        .otherwise(F.format_number(F.col("pValue"), 4)),
    )
).withColumn("numDen", F.concat_ws("/", F.col("numerator"), F.col("denominator")))

In [6]:
from pyspark.sql.functions import regexp_extract

In [7]:
mapl2g = {
    "l2gScore": [
        "0_1",
        "0_15",
        "0_2",
        "0_25",
        "0_3",
        "0_35",
        "0_4",
        "0_45",
        "0_5",
        "0_55",
        "0_6",
        "0_65",
        "0_7",
        "0_75",
        "0_8",
        "0_85",
        "0_9",
        "0_95",
    ],
    "distanceRank": [
        "<=1",
        "<=2",
        "<=3",
        "<=4",
        "<=5",
        "<=6",
        "<=7",
        "<=8",
        "<=9",
        "<=10",
    ],
}
from pyspark.sql.functions import coalesce

conditions = []
conditions2 = []

for list_name, substrings in mapl2g.items():
    for substr in substrings:
        # Create a condition column for each substring
        condition_column = F.when(F.col("path").contains(substr), F.lit(f"{substr}"))
        condition_column2 = F.when(
            F.col("path").contains(substr), F.lit(f"{list_name}")
        )
        conditions.append(condition_column)
        conditions2.append(condition_column2)

# Combine all conditions using coalesce to get the first non-null result
final_condition_column = coalesce(*conditions)
final_condition_column2 = coalesce(*conditions2)

l2g3 = (
    l2g2.withColumn("name2", final_condition_column).withColumn(
        "data_type", final_condition_column2
    )
    #    .filter(F.col("rank") == 0.1)
    # .show(truncate=False)
)

In [8]:
### modify the 0_1 numbers.
# Start with a default value for the match column
match_column = F.lit(None)

# Iterate over all substrings in the dictionary
for key, substrings in mapl2g.items():
    for substring in substrings:
        pattern = f"{substring}[_&]"
        match_column = F.when(F.col("path").rlike(pattern), substring).otherwise(
            match_column
        )

# Add the new column to the DataFrame
l2g4 = l2g3.withColumn("rankTrue", F.regexp_replace(match_column, "_", "."))

# df_with_match.withColumn(
#    "matched_substring2", F.regexp_replace(F.col("matched_substring"), "_", ".")
# ).show(1000)

In [9]:
# Start with a default value for the match column
match_column = F.lit(None)

# Iterate over all substrings in the dictionary
for key, substrings in mapl2g.items():
    for substring in substrings:
        pattern = f"{substring}[_&]"
        # Build the match column with the correct substring if found
        match_column = F.when(F.col("path").rlike(pattern), substring).otherwise(
            match_column
        )

# Extract the number from patterns like "<=n"
number_column = F.regexp_extract(match_column, r"<=([\d]+)", 2)

# Replace underscores with periods for patterns like "0_1"
processed_match_column = F.when(
    F.col("path").rlike(r"[_&]"), F.regexp_replace(match_column, "_", ".")
).otherwise(number_column)

# Add the new column to the DataFrame
l2g4 = l2g3.withColumn("rankTrue", processed_match_column).withColumn(
    "fixed",  ### process the strings <=n, to take only n in the distance rank cases
    F.when(
        F.col("rankTrue").contains("<="),
        F.expr("substring(rankTrue, 3, length(rankTrue) - 2)"),
    ).otherwise(F.col("rankTrue")),
)

In [44]:
l2g4.toPandas().to_csv(
    "gs://ot-team/jroldan/analysis/2024-08-14_processed_analysis_coherencyL2G.csv"
)

#### fix the group for the L2G score and Coloc

In [11]:
# List of substrings of interest
group_mapping = ["hasGeneticEvidence", "L2GAndColoc", "diagonalYes", "oneCellYes"]

# Initialize a default column with None
match_column = F.lit(None)

# Iterate over the substrings and update the match_column when a substring is found
for substring in group_mapping:
    match_column = F.when(F.col("path").rlike(substring), substring).otherwise(
        match_column
    )

# Add the new column with the matched substring
l2g5 = l2g4.withColumn("group_fixed", match_column)

In [12]:
l2g5.groupBy("group_fixed").count().show()

[Stage 2:>                                                          (0 + 1) / 1]

+------------------+-----+
|       group_fixed|count|
+------------------+-----+
|              null|  280|
|       diagonalYes|  300|
|       L2GAndColoc|  300|
|hasGeneticEvidence|  300|
|        oneCellYes|  300|
+------------------+-----+



                                                                                

In [13]:
l2g5.filter(F.col("group_fixed").isNull()).select("path").show(truncate=False)

+--------------------------------------------------------------------------------------------+
|path                                                                                        |
+--------------------------------------------------------------------------------------------+
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_1_Phase4.parquet   |
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_1_Phase>=3.parquet |
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_1_Phase>=2.parquet |
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_1_Phase>=1.parquet |
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_1_PhaseT.parquet   |
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_15_Phase4.parquet  |
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_original/max_L2GScore>=0_15_Phase>=3.parquet|
|gs://ot-team/jroldan/2024-08-14_analysis/df_l2g_o

In [14]:
l2g5.toPandas().to_csv(
    "gs://ot-team/jroldan/analysis/2024-08-14_processedLatest_analysis_coherencyL2G_v19Aug.csv"
)

#### Trying to see whether the analysis can be done as the code it is in whe notebook

In [None]:
#########
#####
#######
## ANALYSIS FOR L2G Scores, genetic evidence and Direction of Effect
## Original, propagated and Other Vs Oncology
#######
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
print("spark session created at", c)


#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

### New coloc evidences == leftVariantId
ranking = Window.partitionBy("studyId", "leftVariantId")

### union with the other datasources
platform_v = "24.09"

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
diseases = spark.read.parquet(disease_path)
mecact_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/mechanismOfAction/"
mecact = spark.read.parquet(mecact_path)
evidences = spark.read.parquet(
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
).filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)

# 2# run the transformation of the evidences datasets used.

windowSpec = Window.partitionBy("targetId", "diseaseId")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

sincgc = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

germline = [
    "gene_burden",
    "eva",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic = ["intogen", "cancer_gene_census", "eva_somatic"]

datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
    "chembl",
    "WOcgc",
    "somatic",
    "germline",
]
#### Build the evidences from new coloc to be able to do the ranking.


#### version all gene burden
prueba_assessment = (
    directionOfEffect(evidences, platform_v)
    .withColumn(
        "rank",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "average",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .persist()
)

spark session created at 2025-01-23 12:10:51.668473
Analysis started on 2025-01-23 at  2025-01-23 12:10:51.668473


25/01/23 12:10:58 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


spark session created at 2025-01-23 12:10:58.122712


AnalysisException: Column 'leftVariantId' does not exist. Did you mean one of the following? [variantId, targetId, variantRsId, reactionId, targetId2, cohortId, contrast, datatypeId, literature, target_id, variantHgvsId, ancestryId, beta, datasourceId, description, diseaseId, drugId, projectId, releaseDate, sourceId, studyId, OddsRatio, actionType, alleleOrigins, ancestry, biomarkers, clinicalStatus, drugId2, homogenized, id, intogenAnnot, pathways, reactionName, releaseVersion, studyCases, variantEffect, biomarkerName, cellType, clinicalPhase, confidence, drugResponse, pValueMantissa, pmcIds, publicationYear, resourceScore, score, sex, statisticalMethod, studyStartDate, targetInModel, urls, TSorOncogene, activators_list, biologicalModelId, geneticBackground, mutatedSamples, pValueExponent, studyOverview, approvedSymbol, cohortDescription, cohortPhenotypes, cohortShortName, description_splited, directionOnTrait, drugFromSource, inhibitors_list, intogen_function, studySampleSize, studyStopReason, targetModulation, textMiningSentences, allelicRequirements, diseaseCellLines, diseaseFromSource, diseaseFromSourceId, statisticalTestTail, targetFromSource, targetFromSourceId, targetInModelMgiId, clinicalSignificances, log2FoldChangeValue, crisprScreenLibrary, biosamplesFromSource, publicationFirstAuthor, significantDriverMethods, targetInModelEnsemblId, diseaseFromSourceMappedId, statisticalMethodOverview, studyStopReasonCategories, betaConfidenceIntervalLower, betaConfidenceIntervalUpper, log2FoldChangePercentileRank, studyCasesWithQualifyingVariants, variantAminoacidDescriptions, variantFunctionalConsequenceId, biologicalModelGeneticBackground, oddsRatioConfidenceIntervalLower, oddsRatioConfidenceIntervalUpper, biologicalModelAllelicComposition, variantFunctionalConsequenceFromQtlId, diseaseModelAssociatedHumanPhenotypes, diseaseModelAssociatedModelPhenotypes];
'Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 79 more fields]
+- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 78 more fields]
   +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 77 more fields]
      +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 77 more fields]
         +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 77 more fields]
            +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 78 more fields]
               +- Window [collect_set(intogen_function#1938, 0, 0) windowspecdefinition(targetId#782, diseaseId#864, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#2043], [targetId#782, diseaseId#864]
                  +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 76 more fields]
                     +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 76 more fields]
                        +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 75 more fields]
                           +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 74 more fields]
                              +- Join LeftOuter, ((drugId2#1031 = drugId#815) AND (targetId2#1038 = targetId#782))
                                 :- Join LeftOuter, (target_id#1081 = targetId#782)
                                 :  :- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, concat_ws(,, clinicalSignificances#798) AS clinicalSignificances#1267, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 65 more fields]
                                 :  :  +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#798, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 65 more fields]
                                 :  :     +- Project [datasourceId#781, targetId#782, alleleOrigins#783, allelicRequirements#784, ancestry#785, ancestryId#786, cast(beta#787 as double) AS beta#1087, betaConfidenceIntervalLower#788, betaConfidenceIntervalUpper#789, biologicalModelAllelicComposition#790, biologicalModelGeneticBackground#791, biologicalModelId#792, biomarkerName#793, biomarkers#794, biosamplesFromSource#795, cellType#796, clinicalPhase#797, clinicalSignificances#798, clinicalStatus#799, cohortDescription#800, cohortId#801, cohortPhenotypes#802, cohortShortName#803, confidence#804, ... 65 more fields]
                                 :  :        +- Filter datasourceId#781 IN (ot_genetics_portal,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                 :  :           +- Filter datasourceId#781 IN (ot_genetics_portal,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                 :  :              +- Relation [datasourceId#781,targetId#782,alleleOrigins#783,allelicRequirements#784,ancestry#785,ancestryId#786,beta#787,betaConfidenceIntervalLower#788,betaConfidenceIntervalUpper#789,biologicalModelAllelicComposition#790,biologicalModelGeneticBackground#791,biologicalModelId#792,biomarkerName#793,biomarkers#794,biosamplesFromSource#795,cellType#796,clinicalPhase#797,clinicalSignificances#798,clinicalStatus#799,cohortDescription#800,cohortId#801,cohortPhenotypes#802,cohortShortName#803,confidence#804,... 65 more fields] parquet
                                 :  +- Project [id#960 AS target_id#1081, approvedSymbol#961, description#1066, description_splited#1070, TSorOncogene#1075]
                                 :     +- Project [id#960, approvedSymbol#961, description#1066, description_splited#1070, CASE WHEN (RLIKE(description_splited#1070, ncogene) AND RLIKE(description_splited#1070, TSG)) THEN bivalent WHEN RLIKE(description_splited#1070, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#1070, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#1075]
                                 :        +- Project [id#960, approvedSymbol#961, description#1066, concat_ws(,, description#1066) AS description_splited#1070]
                                 :           +- Aggregate [id#960, approvedSymbol#961], [id#960, approvedSymbol#961, collect_set(description#1058, 0, 0) AS description#1066]
                                 :              +- Filter description#1058 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                 :                 +- Project [id#960, approvedSymbol#961, col#1053.description AS description#1058]
                                 :                    +- Project [id#960, approvedSymbol#961, col#1053]
                                 :                       +- Generate explode(hallmarks#970.attributes), true, [col#1053]
                                 :                          +- Relation [id#960,approvedSymbol#961,biotype#962,transcriptIds#963,canonicalTranscript#964,canonicalExons#965,genomicLocation#966,alternativeGenes#967,approvedName#968,go#969,hallmarks#970,synonyms#971,symbolSynonyms#972,nameSynonyms#973,functionDescriptions#974,subcellularLocations#975,targetClass#976,obsoleteSymbols#977,obsoleteNames#978,constraint#979,tep#980,proteinIds#981,dbXrefs#982,chemicalProbes#983,... 4 more fields] parquet
                                 +- Aggregate [targetId2#1038, drugId2#1031], [targetId2#1038, drugId2#1031, collect_set(actionType#1016, 0, 0) AS actionType#1048]
                                    +- Project [targetId2#1038, drugId2#1031, actionType#1016, mechanismOfAction#1017]
                                       +- Generate explode(targets#1021), true, [targetId2#1038]
                                          +- Project [drugId2#1031, actionType#1016, mechanismOfAction#1017, targets#1021]
                                             +- Generate explode(chemblIds#1018), true, [drugId2#1031]
                                                +- Relation [actionType#1016,mechanismOfAction#1017,chemblIds#1018,targetName#1019,targetType#1020,targets#1021,references#1022] parquet


In [3]:
#### BUILDING THE NEW GWAS GENETIC EVIDENCE FROM COLOC
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from datetime import datetime

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"


#### Now load sources of data to generate credible_set_OT_genetics evidences and associations.

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence").filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)
ot_genetics = evidences.filter(F.col("datasourceId") == "ot_genetics_portal")

credibleEvidence = spark.read.parquet(f"{path}evidence").filter(
    F.col("datasourceId").isin(["gwas_credible_sets"])
)
credible = spark.read.parquet(f"{path}credibleSet")

index = spark.read.parquet(f"{path}gwasIndex")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        index.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = credibleEvidence.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)

### bring directionality from QTL

gwasResolvedColoc = (
    (
        newColoc.filter(F.col("rightStudyType") != "gwas")
        .withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="right",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

#### take the direction from the lowest p value
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
    F.col("pValueExponent").asc()
)
### modify to include more information
gwasCredibleAssoc = (
    gwasResolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )
    .select(
        "targetId",
        "diseaseId",
        "homogenized",
        "leftStudyLocusId",
        "h4",
        "datasourceId",
        "resourceScore",
        "leftVariantId",
        "credibleLeftStudyType",
    )
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)  ### there will be duplicates TargetId-DiseaseId because we are taking the most significant DoE

### Now I have the DoE

spark session created at 2025-01-24 11:28:26.526966
Analysis started on 2025-01-24 at  2025-01-24 11:28:26.526966


25/01/24 11:28:28 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [4]:
#### LOAD STUDYLOCUSID AND VARIANT DISTANCES
l2gPred = spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/locusToGenePredictions"
)
l2gTable = (
    l2gPred.select("studyLocusId", "geneId", F.explode_outer("locusToGeneFeatures"))
    .filter(F.col("key").isin(["distanceFootprintMean", "distanceTssMean"]))
    .groupBy("studyLocusId", "geneId")
    .pivot("key")
    .agg(F.first("value"))
)

gwasCredibleAssocDistances = gwasCredibleAssoc.join(
    l2gTable.withColumnRenamed("studyLocusId", "leftStudyLocusId").withColumnRenamed(
        "geneId", "targetId"
    ),
    on=["leftStudyLocusId", "targetId"],
    how="left",
)

                                                                                

In [5]:
gwasCredibleAssocDistances.show()

[Stage 49:>                                                         (0 + 1) / 1]

+--------------------+---------------+-----------+-----------+------------------+------------------+-------------------+---------------+---------------------+---------------------+---------------+
|    leftStudyLocusId|       targetId|  diseaseId|homogenized|                h4|      datasourceId|      resourceScore|  leftVariantId|credibleLeftStudyType|distanceFootprintMean|distanceTssMean|
+--------------------+---------------+-----------+-----------+------------------+------------------+-------------------+---------------+---------------------+---------------------+---------------+
|9b68d48251bfb71e9...|ENSG00000000003|EFO_0004529|noEvaluable|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|           0.99999464|     0.99975055|
|cb3db2374fb1fbeda...|ENSG00000000003|EFO_0004529|noEvaluable|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|            0.9999886|      0.9997445|
|b9e47164bf645b

                                                                                

In [101]:
gwasCredibleAssocDistances.columns

['leftStudyLocusId',
 'targetId',
 'diseaseId',
 'homogenized',
 'h4',
 'datasourceId',
 'resourceScore',
 'leftVariantId',
 'credibleLeftStudyType',
 'geneId',
 'distanceFootprintMean',
 'distanceTssMean']

In [None]:
### For a given T-D there could be multiples studylocusId
gwasCredibleAssocDistances.groupBy("targetId", "diseaseId").agg(
    F.size(F.collect_set("leftStudyLocusId")).alias("counts")
).sort(F.col("counts").desc()).show()

[Stage 195:>                                                      (0 + 16) / 17]

+---------------+-----------+------+
|       targetId|  diseaseId|counts|
+---------------+-----------+------+
|ENSG00000166035|EFO_0004529|  2634|
|ENSG00000175445|EFO_0004529|  2074|
|ENSG00000134824|EFO_0004529|  1914|
|ENSG00000149485|EFO_0004529|  1891|
|ENSG00000087237|EFO_0004529|  1699|
|ENSG00000084674|EFO_0004529|  1641|
|ENSG00000218819|EFO_0004529|  1598|
|ENSG00000166035|EFO_0004732|  1577|
|ENSG00000038427|EFO_0004346|  1562|
|ENSG00000038427|EFO_0001444|  1555|
|ENSG00000198670|EFO_0004529|  1463|
|ENSG00000109917|EFO_0004529|  1441|
|ENSG00000122194|EFO_0004529|  1437|
|ENSG00000134825|EFO_0004529|  1418|
|ENSG00000130164|EFO_0004529|  1403|
|ENSG00000000971|EFO_0004747|  1382|
|ENSG00000110243|EFO_0004529|  1362|
|ENSG00000116785|EFO_0004747|  1295|
|ENSG00000130203|EFO_0004529|  1278|
|ENSG00000069399|EFO_0004529|  1260|
+---------------+-----------+------+
only showing top 20 rows



                                                                                

In [8]:
gwasCredibleAssocDistances.filter(
    (F.col("targetId") == "ENSG00000001617") & (F.col("diseaseId") == "EFO_0005856")
).show(truncate=False)

                                                                                

+--------------------------------+---------------+-----------+-----------+--------------------+------------------+-------------------+--------------+---------------------+---------------------+---------------+
|leftStudyLocusId                |targetId       |diseaseId  |homogenized|h4                  |datasourceId      |resourceScore      |leftVariantId |credibleLeftStudyType|distanceFootprintMean|distanceTssMean|
+--------------------------------+---------------+-----------+-----------+--------------------+------------------+-------------------+--------------+---------------------+---------------------+---------------+
|74da780a06c89e32672ccfe39eafdb41|ENSG00000001617|EFO_0005856|GoF_protect|0.8813741513582225  |gwas_credible_sets|0.09022955545837465|3_50132282_C_T|gwas                 |0.9783658            |0.9783658      |
|74da780a06c89e32672ccfe39eafdb41|ENSG00000001617|EFO_0005856|GoF_protect|0.027484611508886925|gwas_credible_sets|0.09022955545837465|3_50132282_C_T|gwas       

In [13]:
analysis_chembl = (
    discrepancifier(
        directionOfEffect(
            evidences.filter((F.col("datasourceId") == "chembl")), "24.09"
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )
    .filter(  ### ensure drug has annotated MoA and is coherent per Target-Disease
        ((F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull())
        & (F.col("coherencyDiagonal") == "coherent")
    )
    .selectExpr(
        "targetId",
        "diseaseId",
        "maxClinPhase",
        "coherencyDiagonal as coherencyDiagonal_ch",
        "coherencyOneCell as coherencyOneCell_ch",
        "LoF_protect as LoF_protect_ch",
        "GoF_protect as GoF_protect_ch",
    )
)

                                                                                

In [40]:
### pivot colocdoE grouping by T-D-studyLocusId-distances

values = ["max_L2GScore", "min_footPrintDistance_rank", "min_tssDistance_rank"]
value = "max_L2GScore"
dict_comb = {
    "hasGeneticEvidence": f"{value}",
    "diagonalYes": f"{value}",
    "oneCellYes": f"{value}",
    "L2GAndColoc": f"{value}",
}
list_l2g = [
    0.10,
    0.15,
    0.20,
    0.25,
    0.30,
    0.35,
    0.40,
    0.45,
    0.50,
    0.55,
    0.60,
    0.65,
    0.70,
    0.75,
    0.80,
    0.85,
    0.90,
    0.95,
]

dataset = (
    discrepancifier(
        gwasCredibleAssocDistances
        # .filter(F.col("h4").isNotNull()) #### not filter by this because we want to include the L2G AND Coloc question
        .withColumn(  ### take maximum L2G score per T-D
            "max_L2GScore",
            F.max("resourceScore").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .withColumn(
            "min_footPrintDistance_rank",
            F.min("distanceFootprintMean").over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .withColumn(
            "min_tssDistance_rank",
            F.min("distanceTssMean").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy(
            "targetId",
            "diseaseId",
            f"{value}",
            # "leftStudyLocusId",
        )
        .pivot("homogenized")
        .count()
    )
    .join(analysis_chembl, on=["targetId", "diseaseId"], how="right")
    .withColumn(
        "diagonalAgreeWithDrugs",
        F.when(
            (
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal").isNotNull())
            )
            # & (F.col("coherencyDiagonal") == "coherent")
            ,
            F.when(
                (F.col("LoF_protect_ch").isNotNull())
                & (F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()),
                F.lit("coherent"),
            )
            .when(
                F.col("GoF_protect_ch").isNotNull()
                & (F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()),
                F.lit("coherent"),
            )
            .otherwise(F.lit("dispar")),
        ),
    )
    .withColumn(
        "oneCellAgreeWithDrugs",
        F.when(
            (
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyDiagonal").isNotNull())
            )
            # & (F.col("coherencyOneCell") == "coherent")
            ,
            F.when(
                (F.col("LoF_protect_ch").isNotNull())
                & (
                    (F.col("LoF_protect").isNotNull())
                    & (F.col("LoF_risk").isNull())
                    & (F.col("GoF_protect").isNull())
                    & (F.col("GoF_risk").isNull())
                ),
                F.lit("coherent"),
            )
            .when(
                (F.col("GoF_protect_ch").isNotNull())
                & (
                    (F.col("GoF_protect").isNotNull())
                    & (F.col("LoF_risk").isNull())
                    & (F.col("LoF_protect").isNull())
                    & (F.col("GoF_risk").isNull())
                ),
                F.lit("coherent"),
            )
            .otherwise(F.lit("dispar")),
        ),
        # ).filter(
        #    F.col("diagonalAgreeWithDrugs").isNotNull()
    )
    .withColumn(
        "Phase4",
        F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase0",
        F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "diagonalYes",
        F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .withColumn(
        "oneCellYes",
        F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .withColumn(
        "hasGeneticEvidence",
        F.when(F.col("max_L2GScore").isNotNull(), F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "L2GAndColoc",
        F.when(
            (F.col(f"{value}").isNotNull())
            & (F.col("coherencyDiagonal").isin(["coherent", "dispar"])),
            F.lit("yes"),
        ).otherwise(F.lit("no")),
    )
    .select(
        ["*"]
        + (
            [  ### single columns
                F.when(F.col(f"{value}") >= n, F.lit("yes"))
                .otherwise(F.lit("no"))
                .alias(f"{value}>={str(n).replace('.', '_')}")
                for n in list_l2g
            ]
        )
        + (
            [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                .otherwise(F.lit("no"))
                .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                for a, x in dict_comb.items()
                for n in list_l2g
            ]
        )
    )
)

### HERE

                                                                                

In [None]:
#### Make the loop and see if that make sense

In [None]:
### Make all the datasets
values = ["max_L2GScore", "min_footPrintDistance_rank", "min_tssDistance_rank"]

datasetDict = {}
for value in values:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_l2g_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )
    elif value == "min_distance_ranking":
        datasetDict[f"df_distance_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_distance_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )

In [59]:
def comparisons_df(test_propag) -> list:
    """Return list of all comparisons to be used in the analysis"""
    toAnalysis = dataset.columns[22:]
    dataType = ["byDatatype"] * len(toAnalysis)
    l_studies = []
    l_studies.extend([list(a) for a in zip(toAnalysis, dataType)])

    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(l_studies, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            # ("Phase>=3", "clinical"),
            # ("Phase>=2", "clinical"),
            # ("Phase>=1", "clinical"),
            # ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)

In [60]:
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):

    wComparison = Window.partitionBy(F.col(comparisonColumn))
    wPrediction = Window.partitionBy(F.col(predictionColumn))
    wPredictionComparison = Window.partitionBy(
        F.col(comparisonColumn), F.col(predictionColumn)
    )

    uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    """
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """
    filePath = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )

    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    results.append(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            filePath,
        ]
    )
    return results

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio

# Initialize an empty list to store the results
result_st = []
result_ci = []
results = []


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


aggSetups_original = comparisons_df(df)
listado = []
today_date = str(date.today())

for row in aggSetups_original:
    aggregations_original(dataset, value, listado, *row, today_date)

2025-01-24_analysis/max_L2GScore/diagonalYes_Phase4.parquet
2025-01-24 13:48:02.567385


Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.base/java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.base/java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:474)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:565)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:533)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
                                                                                

2025-01-24_analysis/max_L2GScore/oneCellYes_Phase4.parquet
2025-01-24 13:48:29.285365


                                                                                

2025-01-24_analysis/max_L2GScore/hasGeneticEvidence_Phase4.parquet
2025-01-24 13:48:55.759779


                                                                                

2025-01-24_analysis/max_L2GScore/L2GAndColoc_Phase4.parquet
2025-01-24 13:49:19.984382


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_1_Phase4.parquet
2025-01-24 13:49:46.456749


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_15_Phase4.parquet
2025-01-24 13:50:13.758986


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_2_Phase4.parquet
2025-01-24 13:50:38.469266


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_25_Phase4.parquet
2025-01-24 13:51:02.176871


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_3_Phase4.parquet
2025-01-24 13:51:25.474986


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_35_Phase4.parquet
2025-01-24 13:51:50.037965


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_4_Phase4.parquet
2025-01-24 13:52:14.255274


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_45_Phase4.parquet
2025-01-24 13:52:37.702930


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_5_Phase4.parquet
2025-01-24 13:53:02.433922


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_55_Phase4.parquet
2025-01-24 13:53:25.855291


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_6_Phase4.parquet
2025-01-24 13:53:49.078757


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_65_Phase4.parquet
2025-01-24 13:54:13.704120


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_7_Phase4.parquet
2025-01-24 13:54:36.906718


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_75_Phase4.parquet
2025-01-24 13:54:59.822479


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_8_Phase4.parquet
2025-01-24 13:55:26.063421


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_85_Phase4.parquet
2025-01-24 13:55:49.409704


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_9_Phase4.parquet
2025-01-24 13:56:13.114374


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_95_Phase4.parquet
2025-01-24 13:56:38.148650


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_1&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:57:03.434358


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_15&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:57:26.207100


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_2&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:57:50.347361


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_25&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:58:14.525416


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_3&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:58:37.009774


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_35&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:59:00.995739


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_4&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:59:24.655329


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_45&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 13:59:48.331202


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_5&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:00:13.933699


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_55&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:00:38.613016


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_6&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:01:01.386085


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_65&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:01:24.964907


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_7&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:01:53.121509


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_75&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:02:16.785295


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_8&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:02:39.551917


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_85&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:03:03.986713


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_9&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:03:26.651291


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_95&hasGeneticEvidence_combined_Phase4.parquet
2025-01-24 14:03:50.173742


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_1&diagonalYes_combined_Phase4.parquet
2025-01-24 14:04:14.760426


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_15&diagonalYes_combined_Phase4.parquet
2025-01-24 14:04:41.902378


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_2&diagonalYes_combined_Phase4.parquet
2025-01-24 14:05:06.772834


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_25&diagonalYes_combined_Phase4.parquet
2025-01-24 14:05:32.394684


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_3&diagonalYes_combined_Phase4.parquet
2025-01-24 14:05:58.777523


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_35&diagonalYes_combined_Phase4.parquet
2025-01-24 14:06:24.689999


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_4&diagonalYes_combined_Phase4.parquet
2025-01-24 14:06:49.924568


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_45&diagonalYes_combined_Phase4.parquet
2025-01-24 14:07:15.333694


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_5&diagonalYes_combined_Phase4.parquet
2025-01-24 14:07:41.036188


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_55&diagonalYes_combined_Phase4.parquet
2025-01-24 14:08:07.142632


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_6&diagonalYes_combined_Phase4.parquet
2025-01-24 14:08:32.785918


                                                                                ]]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_65&diagonalYes_combined_Phase4.parquet
2025-01-24 14:08:58.365627


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_7&diagonalYes_combined_Phase4.parquet
2025-01-24 14:09:25.263246


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_75&diagonalYes_combined_Phase4.parquet
2025-01-24 14:09:51.059274


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_8&diagonalYes_combined_Phase4.parquet
2025-01-24 14:10:18.460608


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_85&diagonalYes_combined_Phase4.parquet
2025-01-24 14:10:44.923492


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_9&diagonalYes_combined_Phase4.parquet
2025-01-24 14:11:10.748078


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_95&diagonalYes_combined_Phase4.parquet
2025-01-24 14:11:36.316350


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_1&oneCellYes_combined_Phase4.parquet
2025-01-24 14:12:01.436673


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_15&oneCellYes_combined_Phase4.parquet
2025-01-24 14:12:28.952148


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_2&oneCellYes_combined_Phase4.parquet
2025-01-24 14:12:55.746970


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_25&oneCellYes_combined_Phase4.parquet
2025-01-24 14:13:20.667611


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_3&oneCellYes_combined_Phase4.parquet
2025-01-24 14:13:45.942858


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_35&oneCellYes_combined_Phase4.parquet
2025-01-24 14:14:11.690364


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_4&oneCellYes_combined_Phase4.parquet
2025-01-24 14:14:38.478899


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_45&oneCellYes_combined_Phase4.parquet
2025-01-24 14:15:05.488663


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_5&oneCellYes_combined_Phase4.parquet
2025-01-24 14:15:30.864056


                                                                                6]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_55&oneCellYes_combined_Phase4.parquet
2025-01-24 14:15:56.497687


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_6&oneCellYes_combined_Phase4.parquet
2025-01-24 14:16:21.694651


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_65&oneCellYes_combined_Phase4.parquet
2025-01-24 14:16:48.401367


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_7&oneCellYes_combined_Phase4.parquet
2025-01-24 14:17:14.524949


                                                                                6]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_75&oneCellYes_combined_Phase4.parquet
2025-01-24 14:17:39.106764


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_8&oneCellYes_combined_Phase4.parquet
2025-01-24 14:18:04.167550


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_85&oneCellYes_combined_Phase4.parquet
2025-01-24 14:18:29.769816


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_9&oneCellYes_combined_Phase4.parquet
2025-01-24 14:18:55.840624


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_95&oneCellYes_combined_Phase4.parquet
2025-01-24 14:19:20.762628


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_1&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:19:46.075609


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_15&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:20:11.583557


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_2&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:20:37.562144


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_25&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:21:03.195342


                                                                                ]]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_3&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:21:27.993809


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_35&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:21:53.874462


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_4&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:22:20.350230


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_45&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:22:47.253558


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_5&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:23:12.846449


                                                                                ]6]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_55&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:23:38.345145


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_6&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:24:04.600772


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_65&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:24:31.039574


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_7&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:24:57.199812


                                                                                6]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_75&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:25:21.516819


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_8&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:25:47.292305


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_85&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:26:13.126445


                                                                                

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_9&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:26:39.312637


                                                                                ]

2025-01-24_analysis/max_L2GScore/max_L2GScore>=0_95&L2GAndColoc_combined_Phase4.parquet
2025-01-24 14:27:04.090736


                                                                                ]

In [66]:
df = pd.DataFrame(
    results,
    columns=[
        "type",
        "criteria",
        "phase",
        "OR",
        "pValue",
        "LowCI",
        "HighCI",
        "total",
        "array",
        "rs",
        "lowRs",
        "HighRs",
        "path",
    ],
)

In [67]:
df

Unnamed: 0,type,criteria,phase,OR,pValue,LowCI,HighCI,total,array,rs,lowRs,HighRs,path
0,byDatatype,diagonalYes,Phase4,2.90,2.282597e-13,2.17,3.88,75278,"[[100, 98], [19534, 55546]]",1.94,1.69,2.23,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
1,byDatatype,oneCellYes,Phase4,3.96,1.302329e-10,2.56,6.18,75278,"[[53, 38], [19581, 55606]]",2.24,1.88,2.66,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
2,byDatatype,hasGeneticEvidence,Phase4,1.98,5.966624e-27,1.75,2.23,75278,"[[453, 657], [19181, 54987]]",1.58,1.47,1.70,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
3,byDatatype,L2GAndColoc,Phase4,2.60,6.797493e-17,2.07,3.25,75278,"[[155, 170], [19479, 55474]]",1.84,1.64,2.06,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
4,byDatatype,max_L2GScore>=0_1,Phase4,2.01,4.654861e-24,1.76,2.29,75278,"[[385, 549], [19249, 55095]]",1.59,1.47,1.72,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,byDatatype,max_L2GScore>=0_75&L2GAndColoc_combined,Phase4,2.78,2.032015e-11,2.05,3.77,75278,"[[89, 91], [19545, 55553]]",1.90,1.64,2.20,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
90,byDatatype,max_L2GScore>=0_8&L2GAndColoc_combined,Phase4,2.84,3.538646e-10,2.04,3.96,75278,"[[76, 76], [19558, 55568]]",1.92,1.64,2.25,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
91,byDatatype,max_L2GScore>=0_85&L2GAndColoc_combined,Phase4,3.24,2.767114e-10,2.23,4.71,75278,"[[65, 57], [19569, 55587]]",2.05,1.73,2.42,gs://ot-team/jroldan/2025-01-24_analysis/max_L...
92,byDatatype,max_L2GScore>=0_9&L2GAndColoc_combined,Phase4,3.85,6.480108e-08,2.30,6.52,75278,"[[38, 28], [19596, 55616]]",2.21,1.80,2.72,gs://ot-team/jroldan/2025-01-24_analysis/max_L...


In [62]:
results

[['byDatatype',
  'diagonalYes',
  'Phase4',
  2.9,
  2.2825971081982413e-13,
  2.17,
  3.88,
  '75278',
  [[100, 98], [19534, 55546]],
  1.94,
  1.69,
  2.23,
  'gs://ot-team/jroldan/2025-01-24_analysis/max_L2GScore/diagonalYes_Phase4.parquet'],
 ['byDatatype',
  'oneCellYes',
  'Phase4',
  3.96,
  1.3023288254998088e-10,
  2.56,
  6.18,
  '75278',
  [[53, 38], [19581, 55606]],
  2.24,
  1.88,
  2.66,
  'gs://ot-team/jroldan/2025-01-24_analysis/max_L2GScore/oneCellYes_Phase4.parquet'],
 ['byDatatype',
  'hasGeneticEvidence',
  'Phase4',
  1.98,
  5.966623858885305e-27,
  1.75,
  2.23,
  '75278',
  [[453, 657], [19181, 54987]],
  1.58,
  1.47,
  1.7,
  'gs://ot-team/jroldan/2025-01-24_analysis/max_L2GScore/hasGeneticEvidence_Phase4.parquet'],
 ['byDatatype',
  'L2GAndColoc',
  'Phase4',
  2.6,
  6.797492523446592e-17,
  2.07,
  3.25,
  '75278',
  [[155, 170], [19479, 55474]],
  1.84,
  1.64,
  2.06,
  'gs://ot-team/jroldan/2025-01-24_analysis/max_L2GScore/L2GAndColoc_Phase4.parquet'],


In [44]:
dataset.columns[22:]

['diagonalYes',
 'oneCellYes',
 'hasGeneticEvidence',
 'L2GAndColoc',
 'max_L2GScore>=0_1',
 'max_L2GScore>=0_15',
 'max_L2GScore>=0_2',
 'max_L2GScore>=0_25',
 'max_L2GScore>=0_3',
 'max_L2GScore>=0_35',
 'max_L2GScore>=0_4',
 'max_L2GScore>=0_45',
 'max_L2GScore>=0_5',
 'max_L2GScore>=0_55',
 'max_L2GScore>=0_6',
 'max_L2GScore>=0_65',
 'max_L2GScore>=0_7',
 'max_L2GScore>=0_75',
 'max_L2GScore>=0_8',
 'max_L2GScore>=0_85',
 'max_L2GScore>=0_9',
 'max_L2GScore>=0_95',
 'max_L2GScore>=0_1&hasGeneticEvidence_combined',
 'max_L2GScore>=0_15&hasGeneticEvidence_combined',
 'max_L2GScore>=0_2&hasGeneticEvidence_combined',
 'max_L2GScore>=0_25&hasGeneticEvidence_combined',
 'max_L2GScore>=0_3&hasGeneticEvidence_combined',
 'max_L2GScore>=0_35&hasGeneticEvidence_combined',
 'max_L2GScore>=0_4&hasGeneticEvidence_combined',
 'max_L2GScore>=0_45&hasGeneticEvidence_combined',
 'max_L2GScore>=0_5&hasGeneticEvidence_combined',
 'max_L2GScore>=0_55&hasGeneticEvidence_combined',
 'max_L2GScore>=0_6&

In [110]:
discrepancifier(
    gwasCredibleAssocDistances.groupBy(
        "targetId",
        "diseaseId",
        # "leftStudyLocusId",
        "distanceFootprintMean",
        "distanceTssMean",
    )
    .pivot("homogenized")
    .count()
).groupBy("coherencyDiagonal").count().show()



+-----------------+-------+
|coherencyDiagonal|  count|
+-----------------+-------+
|       EvidNotDoE|1359051|
|         coherent|1036658|
+-----------------+-------+



                                                                                

In [69]:
from DoEAssessment import directionOfEffect

In [None]:
gwasCredibleAssocDistances.groupBy()

                                                                                

+---------------+-------------+----------------+-----------+----+------------+-------------+-------------+---------------------+------+---------------------+---------------+------------+--------------------+-------------------+--------------+--------------+
|       targetId|    diseaseId|leftStudyLocusId|homogenized|  h4|datasourceId|resourceScore|leftVariantId|credibleLeftStudyType|geneId|distanceFootprintMean|distanceTssMean|maxClinPhase|coherencyDiagonal_ch|coherencyOneCell_ch|LoF_protect_ch|GoF_protect_ch|
+---------------+-------------+----------------+-----------+----+------------+-------------+-------------+---------------------+------+---------------------+---------------+------------+--------------------+-------------------+--------------+--------------+
|ENSG00000007314|  EFO_0000555|            null|       null|null|        null|         null|         null|                 null|  null|                 null|           null|         2.0|            coherent|           coherent

                                                                                

In [95]:
analysis_chembl = (
    discrepancifier(
        directionOfEffect(
            evidences.filter((F.col("datasourceId") == "chembl")), "24.09"
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )
    .filter(  ### ensure drug has annotated MoA and is coherent per Target-Disease
        ((F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull())
        & (F.col("coherencyDiagonal") == "coherent")
    )
    .selectExpr(
        "targetId",
        "diseaseId",
        "maxClinPhase",
        "coherencyDiagonal as coherencyDiagonal_ch",
        "coherencyOneCell as coherencyOneCell_ch",
        "LoF_protect as LoF_protect_ch",
        "GoF_protect as GoF_protect_ch",
    )
)
drugsCredibleDistance = (
    gwasCredibleAssocDistances.join(
        analysis_chembl, on=["targetId", "diseaseId"], how="right"
    )
    .withColumn(
        "geneticEvidence", F.when(F.col("resourceScore").isNotNull(), F.lit("yes"))
    )
    .withColumn(
        "diagonalAgreeWithDrugs",
        F.when(
            (F.col("coherencyDiagonal_ch") == "coherent")
            # & (F.col("coherencyDiagonal") == "coherent")
            ,
            F.when(
                (F.col("LoF_protect_ch").isNotNull())
                & (F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()),
                F.lit("coherent"),
            )
            .when(
                F.col("GoF_protect_ch").isNotNull()
                & (F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()),
                F.lit("coherent"),
            )
            .otherwise(F.lit("dispar")),
        ),
    )
    .withColumn(
        "oneCellAgreeWithDrugs",
        F.when(
            (F.col("coherencyOneCell_ch") == "coherent")
            # & (F.col("coherencyOneCell") == "coherent")
            ,
            F.when(
                (F.col("LoF_protect_ch").isNotNull())
                & (
                    (F.col("LoF_protect").isNotNull())
                    & (F.col("LoF_risk").isNull())
                    & (F.col("GoF_protect").isNull())
                    & (F.col("GoF_risk").isNull())
                ),
                F.lit("coherent"),
            )
            .when(
                (F.col("GoF_protect_ch").isNotNull())
                & (
                    (F.col("GoF_protect").isNotNull())
                    & (F.col("LoF_risk").isNull())
                    & (F.col("LoF_protect").isNull())
                    & (F.col("GoF_risk").isNull())
                ),
                F.lit("coherent"),
            )
            .otherwise(F.lit("dispar")),
        ),
    )
    .withColumn(
        "Phase4",
        F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=3",
        F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=2",
        F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "Phase>=1",
        F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
    )
)

25/01/23 17:24:59 WARN CacheManager: Asked to cache already cached data.
25/01/23 17:25:00 WARN CacheManager: Asked to cache already cached data.
25/01/23 17:25:00 WARN CacheManager: Asked to cache already cached data.
25/01/23 17:25:00 WARN CacheManager: Asked to cache already cached data.


AnalysisException: Column 'GoF_risk' does not exist. Did you mean one of the following? [geneId, h4, targetId, GoF_protect_ch, diseaseId, homogenized, LoF_protect_ch, maxClinPhase, datasourceId, leftVariantId, resourceScore, distanceTssMean, geneticEvidence, leftStudyLocusId, coherencyDiagonal_ch, coherencyOneCell_ch, distanceFootprintMean, credibleLeftStudyType];
'Project [targetId#47413, diseaseId#47513, leftStudyLocusId#48830, homogenized#49124, h4#47963, datasourceId#47627, resourceScore#47697, leftVariantId#48024, credibleLeftStudyType#48025, geneId#59270, distanceFootprintMean#59300, distanceTssMean#59301, maxClinPhase#113689, coherencyDiagonal_ch#117011, coherencyOneCell_ch#117012, LoF_protect_ch#117013L, GoF_protect_ch#117014L, geneticEvidence#117039, CASE WHEN (coherencyDiagonal_ch#117011 = coherent) THEN CASE WHEN (isnotnull(LoF_protect_ch#117013L) AND (isnotnull('GoF_risk) OR isnotnull('LoF_protect))) THEN coherent WHEN (isnotnull(GoF_protect_ch#117014L) AND (isnotnull('LoF_risk) OR isnotnull('GoF_protect))) THEN coherent ELSE dispar END END AS diagonalAgreeWithDrugs#117058]
+- Project [targetId#47413, diseaseId#47513, leftStudyLocusId#48830, homogenized#49124, h4#47963, datasourceId#47627, resourceScore#47697, leftVariantId#48024, credibleLeftStudyType#48025, geneId#59270, distanceFootprintMean#59300, distanceTssMean#59301, maxClinPhase#113689, coherencyDiagonal_ch#117011, coherencyOneCell_ch#117012, LoF_protect_ch#117013L, GoF_protect_ch#117014L, CASE WHEN isnotnull(resourceScore#47697) THEN yes END AS geneticEvidence#117039]
   +- Project [targetId#47413, diseaseId#47513, leftStudyLocusId#48830, homogenized#49124, h4#47963, datasourceId#47627, resourceScore#47697, leftVariantId#48024, credibleLeftStudyType#48025, geneId#59270, distanceFootprintMean#59300, distanceTssMean#59301, maxClinPhase#113689, coherencyDiagonal_ch#117011, coherencyOneCell_ch#117012, LoF_protect_ch#117013L, GoF_protect_ch#117014L]
      +- Join RightOuter, ((targetId#47628 = targetId#47413) AND (diseaseId#48955 = diseaseId#47513))
         :- Project [leftStudyLocusId#48830, targetId#47628, diseaseId#48955, homogenized#49124, h4#47963, datasourceId#47627, resourceScore#47697, leftVariantId#48024, credibleLeftStudyType#48025, geneId#59270, distanceFootprintMean#59300, distanceTssMean#59301]
         :  +- Join LeftOuter, (leftStudyLocusId#48830 = leftStudyLocusId#59310)
         :     :- Project [targetId#47628, diseaseId#48955, CASE WHEN isnull(homogenized#49074) THEN noEvaluable ELSE homogenized#49074 END AS homogenized#49124, leftStudyLocusId#48830, h4#47963, datasourceId#47627, resourceScore#47697, leftVariantId#48024, credibleLeftStudyType#48025]
         :     :  +- Project [targetId#47628, diseaseId#48955, homogenized#49074, leftStudyLocusId#48830, h4#47963, datasourceId#47627, resourceScore#47697, leftVariantId#48024, credibleLeftStudyType#48025]
         :     :     +- Project [diseaseId#48955, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 16 more fields]
         :     :        +- Project [diseaseId#48955, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 17 more fields]
         :     :           +- Window [first(colocDoE#49033, true) windowspecdefinition(targetId#47628, diseaseId#48955, pValueExponent#48855 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS homogenized#49074], [targetId#47628, diseaseId#48955], [pValueExponent#48855 ASC NULLS FIRST]
         :     :              +- Project [diseaseId#48955, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 15 more fields]
         :     :                 +- Project [diseaseId#48955, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 15 more fields]
         :     :                    +- Project [diseaseId#48955, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 14 more fields]
         :     :                       +- Project [diseaseId#48955, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 15 more fields]
         :     :                          +- Generate explode(concat(array(diseaseId#47728), parents#47389)), true, [diseaseId#48955]
         :     :                             +- Project [diseaseId#47728, leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, ... 15 more fields]
         :     :                                +- Join LeftOuter, (diseaseId#47728 = diseaseId#48909)
         :     :                                   :- Project [leftStudyLocusId#48830, targetId#47628, rightStudyId#48048, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923, datasourceId#47627, ... 12 more fields]
         :     :                                   :  +- Join RightOuter, ((leftStudyLocusId#47954 = leftStudyLocusId#48830) AND (targetId#48806 = targetId#47628))
         :     :                                   :     :- Project [rightStudyId#48048, rightStudyLocusId#47955, leftStudyLocusId#47954, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, geneId#47895 AS targetId#48806, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923]
         :     :                                   :     :  +- Filter NOT (rightStudyType#47957 = gwas)
         :     :                                   :     :     +- Project [rightStudyId#48048, rightStudyLocusId#47955, leftStudyLocusId#47954, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightVariantId#48049, credibleRightStudyType#48050, geneId#47895, projectId#47896, indexStudyType#48101, condition#47919, biosampleId#47923]
         :     :                                   :     :        +- Join LeftOuter, (rightStudyId#48048 = rightStudyId#48100)
         :     :                                   :     :           :- Project [rightStudyLocusId#47955, leftStudyLocusId#47954, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025, rightStudyId#48048, rightVariantId#48049, credibleRightStudyType#48050]
         :     :                                   :     :           :  +- Join LeftOuter, (rightStudyLocusId#47955 = rightStudyLocusId#48047)
         :     :                                   :     :           :     :- Project [leftStudyLocusId#47954, rightStudyLocusId#47955, chromosome#47956, rightStudyType#47957, numberColocalisingVariants#47958L, h0#47959, h1#47960, h2#47961, h3#47962, h4#47963, colocalisationMethod#47964, betaRatioSignAverage#47965, leftStudyId#48023, leftVariantId#48024, credibleLeftStudyType#48025]
         :     :                                   :     :           :     :  +- Join LeftOuter, (leftStudyLocusId#47954 = leftStudyLocusId#48022)
         :     :                                   :     :           :     :     :- Relation [leftStudyLocusId#47954,rightStudyLocusId#47955,chromosome#47956,rightStudyType#47957,numberColocalisingVariants#47958L,h0#47959,h1#47960,h2#47961,h3#47962,h4#47963,colocalisationMethod#47964,betaRatioSignAverage#47965] parquet
         :     :                                   :     :           :     :     +- Project [studyLocusId#47842 AS leftStudyLocusId#48022, StudyId#47843 AS leftStudyId#48023, variantId#47844 AS leftVariantId#48024, studyType#47867 AS credibleLeftStudyType#48025]
         :     :                                   :     :           :     :        +- Relation [studyLocusId#47842,studyId#47843,variantId#47844,chromosome#47845,position#47846,region#47847,beta#47848,zScore#47849,pValueMantissa#47850,pValueExponent#47851,effectAlleleFrequencyFromSource#47852,standardError#47853,subStudyDescription#47854,qualityControls#47855,finemappingMethod#47856,credibleSetIndex#47857,credibleSetlog10BF#47858,purityMeanR2#47859,purityMinR2#47860,locusStart#47861,locusEnd#47862,sampleSize#47863,ldSet#47864,locus#47865,... 2 more fields] parquet
         :     :                                   :     :           :     +- Project [studyLocusId#48055 AS rightStudyLocusId#48047, studyId#48056 AS rightStudyId#48048, variantId#48057 AS rightVariantId#48049, studyType#48080 AS credibleRightStudyType#48050]
         :     :                                   :     :           :        +- Relation [studyLocusId#48055,studyId#48056,variantId#48057,chromosome#48058,position#48059,region#48060,beta#48061,zScore#48062,pValueMantissa#48063,pValueExponent#48064,effectAlleleFrequencyFromSource#48065,standardError#48066,subStudyDescription#48067,qualityControls#48068,finemappingMethod#48069,credibleSetIndex#48070,credibleSetlog10BF#48071,purityMeanR2#48072,purityMinR2#48073,locusStart#48074,locusEnd#48075,sampleSize#48076,ldSet#48077,locus#48078,... 2 more fields] parquet
         :     :                                   :     :           +- Project [studyId#47894 AS rightStudyId#48100, geneId#47895, projectId#47896, studyType#47897 AS indexStudyType#48101, condition#47919, biosampleId#47923]
         :     :                                   :     :              +- Relation [studyId#47894,geneId#47895,projectId#47896,studyType#47897,traitFromSource#47898,traitFromSourceMappedIds#47899,biosampleFromSourceId#47900,pubmedId#47901,publicationTitle#47902,publicationFirstAuthor#47903,publicationDate#47904,publicationJournal#47905,backgroundTraitFromSourceMappedIds#47906,initialSampleSize#47907,nCases#47908,nControls#47909,nSamples#47910,cohorts#47911,ldPopulationStructure#47912,discoverySamples#47913,replicationSamples#47914,qualityControls#47915,analysisFlags#47916,summarystatsLocation#47917,... 6 more fields] parquet
         :     :                                   :     +- Project [studyLocusId#47706 AS leftStudyLocusId#48830, datasourceId#47627, targetId#47628, datatypeId#47657, diseaseFromSourceMappedId#47661, resourceScore#47697, targetFromSourceId#47713, diseaseId#47728, id#47729, score#47730, sourceId#47733, studyId#48847, variantId#48848, betaGwas#48785, pValueExponent#48855]
         :     :                                   :        +- Project [studyLocusId#47706, datasourceId#47627, targetId#47628, datatypeId#47657, diseaseFromSourceMappedId#47661, resourceScore#47697, targetFromSourceId#47713, diseaseId#47728, id#47729, score#47730, sourceId#47733, studyId#48847, variantId#48848, betaGwas#48785, pValueExponent#48855]
         :     :                                   :           +- Join LeftOuter, (studyLocusId#47706 = studyLocusId#48846)
         :     :                                   :              :- Project [datasourceId#47627, targetId#47628, datatypeId#47657, diseaseFromSourceMappedId#47661, resourceScore#47697, studyLocusId#47706, targetFromSourceId#47713, diseaseId#47728, id#47729, score#47730, sourceId#47733]
         :     :                                   :              :  +- Filter (datasourceId#47627 = gwas_credible_sets)
         :     :                                   :              :     +- Filter datasourceId#47627 IN (gwas_credible_sets)
         :     :                                   :              :        +- Relation [datasourceId#47627,targetId#47628,alleleOrigins#47629,allelicRequirements#47630,ancestry#47631,ancestryId#47632,assays#47633,assessments#47634,beta#47635,betaConfidenceIntervalLower#47636,betaConfidenceIntervalUpper#47637,biologicalModelAllelicComposition#47638,biologicalModelGeneticBackground#47639,biologicalModelId#47640,biomarkerList#47641,biomarkerName#47642,biomarkers#47643,biosamplesFromSource#47644,cellLineBackground#47645,cellType#47646,clinicalPhase#47647,clinicalSignificances#47648,clinicalStatus#47649,cohortDescription#47650,... 83 more fields] parquet
         :     :                                   :              +- Project [studyLocusId#48846, studyId#48847, variantId#48848, beta#48852 AS betaGwas#48785, pValueExponent#48855]
         :     :                                   :                 +- Relation [studyLocusId#48846,studyId#48847,variantId#48848,chromosome#48849,position#48850,region#48851,beta#48852,zScore#48853,pValueMantissa#48854,pValueExponent#48855,effectAlleleFrequencyFromSource#48856,standardError#48857,subStudyDescription#48858,qualityControls#48859,finemappingMethod#48860,credibleSetIndex#48861,credibleSetlog10BF#48862,purityMeanR2#48863,purityMinR2#48864,locusStart#48865,locusEnd#48866,sampleSize#48867,ldSet#48868,locus#48869,... 2 more fields] parquet
         :     :                                   +- Project [id#47382 AS diseaseId#48909, name#47386, parents#47389, therapeuticAreas#47394]
         :     :                                      +- Relation [id#47382,code#47383,dbXRefs#47384,description#47385,name#47386,directLocationIds#47387,obsoleteTerms#47388,parents#47389,synonyms#47390,ancestors#47391,descendants#47392,children#47393,therapeuticAreas#47394,indirectLocationIds#47395,ontology#47396] parquet
         :     +- Project [studyLocusId#59269 AS leftStudyLocusId#59310, geneId#59270, distanceFootprintMean#59300, distanceTssMean#59301]
         :        +- Project [studyLocusId#59269, geneId#59270, __pivot_first(value) AS `first(value)`#59299[0] AS distanceFootprintMean#59300, __pivot_first(value) AS `first(value)`#59299[1] AS distanceTssMean#59301]
         :           +- Aggregate [studyLocusId#59269, geneId#59270], [studyLocusId#59269, geneId#59270, pivotfirst(key#59277, first(value)#59293, distanceFootprintMean, distanceTssMean, 0, 0) AS __pivot_first(value) AS `first(value)`#59299]
         :              +- Aggregate [studyLocusId#59269, geneId#59270, key#59277], [studyLocusId#59269, geneId#59270, key#59277, first(value#59278, false) AS first(value)#59293]
         :                 +- Filter key#59277 IN (distanceFootprintMean,distanceTssMean)
         :                    +- Project [studyLocusId#59269, geneId#59270, key#59277, value#59278]
         :                       +- Generate explode(locusToGeneFeatures#59272), true, [key#59277, value#59278]
         :                          +- Relation [studyLocusId#59269,geneId#59270,score#59271,locusToGeneFeatures#59272] parquet
         +- Project [targetId#47413, diseaseId#47513, maxClinPhase#113689, coherencyDiagonal#116990 AS coherencyDiagonal_ch#117011, coherencyOneCell#117000 AS coherencyOneCell_ch#117012, LoF_protect#116947L AS LoF_protect_ch#117013L, GoF_protect#116946L AS GoF_protect_ch#117014L]
            +- Filter ((isnotnull(GoF_protect#116946L) OR isnotnull(LoF_protect#116947L)) AND (coherencyDiagonal#116990 = coherent))
               +- Project [targetId#47413, diseaseId#47513, maxClinPhase#113689, GoF_protect#116946L, LoF_protect#116947L, noEvaluable#116948L, GoF_risk#116973, LoF_risk#116981, coherencyDiagonal#116990, CASE WHEN ((((isnull(LoF_risk#116981) AND isnull(LoF_protect#116947L)) AND isnull(GoF_risk#116973)) AND isnull(GoF_protect#116946L)) AND isnull(noEvaluable#116948L)) THEN noEvid WHEN ((((isnull(LoF_risk#116981) AND isnull(LoF_protect#116947L)) AND isnull(GoF_risk#116973)) AND isnull(GoF_protect#116946L)) AND isnotnull(noEvaluable#116948L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#116981) OR isnotnull(LoF_protect#116947L)) OR isnotnull(GoF_risk#116973)) OR isnotnull(GoF_protect#116946L)) THEN CASE WHEN (isnotnull(LoF_risk#116981) AND ((isnull(LoF_protect#116947L) AND isnull(GoF_risk#116973)) AND isnull(GoF_protect#116946L))) THEN coherent WHEN (isnotnull(GoF_risk#116973) AND ((isnull(LoF_protect#116947L) AND isnull(LoF_risk#116981)) AND isnull(GoF_protect#116946L))) THEN coherent WHEN (isnotnull(LoF_protect#116947L) AND ((isnull(LoF_risk#116981) AND isnull(GoF_risk#116973)) AND isnull(GoF_protect#116946L))) THEN coherent WHEN (isnotnull(GoF_protect#116946L) AND ((isnull(LoF_protect#116947L) AND isnull(GoF_risk#116973)) AND isnull(LoF_risk#116981))) THEN coherent ELSE dispar END END AS coherencyOneCell#117000]
                  +- Project [targetId#47413, diseaseId#47513, maxClinPhase#113689, GoF_protect#116946L, LoF_protect#116947L, noEvaluable#116948L, GoF_risk#116973, LoF_risk#116981, CASE WHEN ((((isnull(LoF_risk#116981) AND isnull(LoF_protect#116947L)) AND isnull(GoF_risk#116973)) AND isnull(GoF_protect#116946L)) AND isnull(noEvaluable#116948L)) THEN noEvid WHEN ((((isnull(LoF_risk#116981) AND isnull(LoF_protect#116947L)) AND isnull(GoF_risk#116973)) AND isnull(GoF_protect#116946L)) AND isnotnull(noEvaluable#116948L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#116981) OR isnotnull(LoF_protect#116947L)) OR isnotnull(GoF_risk#116973)) OR isnotnull(GoF_protect#116946L)) THEN CASE WHEN (isnotnull(GoF_risk#116973) AND isnotnull(LoF_risk#116981)) THEN dispar WHEN (isnotnull(LoF_protect#116947L) AND isnotnull(LoF_risk#116981)) THEN dispar WHEN (isnotnull(GoF_protect#116946L) AND isnotnull(GoF_risk#116973)) THEN dispar WHEN (isnotnull(GoF_protect#116946L) AND isnotnull(LoF_protect#116947L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#116990]
                     +- Project [targetId#47413, diseaseId#47513, maxClinPhase#113689, GoF_protect#116946L, LoF_protect#116947L, noEvaluable#116948L, GoF_risk#116973, null AS LoF_risk#116981]
                        +- Project [targetId#47413, diseaseId#47513, maxClinPhase#113689, GoF_protect#116946L, LoF_protect#116947L, noEvaluable#116948L, null AS GoF_risk#116973]
                           +- Project [targetId#47413, diseaseId#47513, maxClinPhase#113689, __pivot_count(targetId) AS `count(targetId)`#116945[0] AS GoF_protect#116946L, __pivot_count(targetId) AS `count(targetId)`#116945[1] AS LoF_protect#116947L, __pivot_count(targetId) AS `count(targetId)`#116945[2] AS noEvaluable#116948L]
                              +- Aggregate [targetId#47413, diseaseId#47513, maxClinPhase#113689], [targetId#47413, diseaseId#47513, maxClinPhase#113689, pivotfirst(homogenized#113567, count(targetId)#116937L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#116945]
                                 +- Aggregate [targetId#47413, diseaseId#47513, maxClinPhase#113689, homogenized#113567], [targetId#47413, diseaseId#47513, maxClinPhase#113689, homogenized#113567, count(targetId#47413) AS count(targetId)#116937L]
                                    +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 97 more fields]
                                       +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 98 more fields]
                                          +- Window [max(clinicalPhase#47432) windowspecdefinition(targetId#47413, diseaseId#47513, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#113689], [targetId#47413, diseaseId#47513]
                                             +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 96 more fields]
                                                +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 96 more fields]
                                                   +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 95 more fields]
                                                      +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 95 more fields]
                                                         +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 95 more fields]
                                                            +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 96 more fields]
                                                               +- Window [collect_set(intogen_function#113082, 0, 0) windowspecdefinition(targetId#47413, diseaseId#47513, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#113205], [targetId#47413, diseaseId#47513]
                                                                  +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 94 more fields]
                                                                     +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 94 more fields]
                                                                        +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 93 more fields]
                                                                           +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 92 more fields]
                                                                              +- Join LeftOuter, ((drugId2#112013 = drugId#47450) AND (targetId2#112020 = targetId#47413))
                                                                                 :- Join LeftOuter, (target_id#112063 = targetId#47413)
                                                                                 :  :- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, concat_ws(,, clinicalSignificances#47433) AS clinicalSignificances#112285, clinicalStatus#47434, cohortDescription#47435, ... 83 more fields]
                                                                                 :  :  +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#47433, clinicalStatus#47434, cohortDescription#47435, ... 83 more fields]
                                                                                 :  :     +- Project [datasourceId#47412, targetId#47413, alleleOrigins#47414, allelicRequirements#47415, ancestry#47416, ancestryId#47417, assays#47418, assessments#47419, cast(beta#47420 as double) AS beta#112069, betaConfidenceIntervalLower#47421, betaConfidenceIntervalUpper#47422, biologicalModelAllelicComposition#47423, biologicalModelGeneticBackground#47424, biologicalModelId#47425, biomarkerList#47426, biomarkerName#47427, biomarkers#47428, biosamplesFromSource#47429, cellLineBackground#47430, cellType#47431, clinicalPhase#47432, clinicalSignificances#47433, clinicalStatus#47434, cohortDescription#47435, ... 83 more fields]
                                                                                 :  :        +- Filter datasourceId#47412 IN (ot_genetics_portal,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                                                 :  :           +- Filter (datasourceId#47412 = chembl)
                                                                                 :  :              +- Filter datasourceId#47412 IN (ot_genetics_portal,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                                                 :  :                 +- Relation [datasourceId#47412,targetId#47413,alleleOrigins#47414,allelicRequirements#47415,ancestry#47416,ancestryId#47417,assays#47418,assessments#47419,beta#47420,betaConfidenceIntervalLower#47421,betaConfidenceIntervalUpper#47422,biologicalModelAllelicComposition#47423,biologicalModelGeneticBackground#47424,biologicalModelId#47425,biomarkerList#47426,biomarkerName#47427,biomarkers#47428,biosamplesFromSource#47429,cellLineBackground#47430,cellType#47431,clinicalPhase#47432,clinicalSignificances#47433,clinicalStatus#47434,cohortDescription#47435,... 83 more fields] parquet
                                                                                 :  +- Project [id#111942 AS target_id#112063, approvedSymbol#111943, description#112048, description_splited#112052, TSorOncogene#112057]
                                                                                 :     +- Project [id#111942, approvedSymbol#111943, description#112048, description_splited#112052, CASE WHEN (RLIKE(description_splited#112052, ncogene) AND RLIKE(description_splited#112052, TSG)) THEN bivalent WHEN RLIKE(description_splited#112052, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#112052, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#112057]
                                                                                 :        +- Project [id#111942, approvedSymbol#111943, description#112048, concat_ws(,, description#112048) AS description_splited#112052]
                                                                                 :           +- Aggregate [id#111942, approvedSymbol#111943], [id#111942, approvedSymbol#111943, collect_set(description#112040, 0, 0) AS description#112048]
                                                                                 :              +- Filter description#112040 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                                                 :                 +- Project [id#111942, approvedSymbol#111943, col#112035.description AS description#112040]
                                                                                 :                    +- Project [id#111942, approvedSymbol#111943, col#112035]
                                                                                 :                       +- Generate explode(hallmarks#111952.attributes), true, [col#112035]
                                                                                 :                          +- Relation [id#111942,approvedSymbol#111943,biotype#111944,transcriptIds#111945,canonicalTranscript#111946,canonicalExons#111947,genomicLocation#111948,alternativeGenes#111949,approvedName#111950,go#111951,hallmarks#111952,synonyms#111953,symbolSynonyms#111954,nameSynonyms#111955,functionDescriptions#111956,subcellularLocations#111957,targetClass#111958,obsoleteSymbols#111959,obsoleteNames#111960,constraint#111961,tep#111962,proteinIds#111963,dbXrefs#111964,chemicalProbes#111965,... 4 more fields] parquet
                                                                                 +- Aggregate [targetId2#112020, drugId2#112013], [targetId2#112020, drugId2#112013, collect_set(actionType#111998, 0, 0) AS actionType#112030]
                                                                                    +- Project [targetId2#112020, drugId2#112013, actionType#111998, mechanismOfAction#111999]
                                                                                       +- Generate explode(targets#112003), true, [targetId2#112020]
                                                                                          +- Project [drugId2#112013, actionType#111998, mechanismOfAction#111999, targets#112003]
                                                                                             +- Generate explode(chemblIds#112000), true, [drugId2#112013]
                                                                                                +- Relation [actionType#111998,mechanismOfAction#111999,chemblIds#112000,targetName#112001,targetType#112002,targets#112003,references#112004] parquet


In [None]:
#### Join the gwasCredibleAssocDistances with the drugs

In [78]:
drugsCredibleDistance.show()

                                                                                

+---------------+-------------+----------------+-----------+----+------------+-------------+-------------+---------------------+------+---------------------+---------------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|       targetId|    diseaseId|leftStudyLocusId|homogenized|  h4|datasourceId|resourceScore|leftVariantId|credibleLeftStudyType|geneId|distanceFootprintMean|distanceTssMean|maxClinPhase|GoF_protect|LoF_protect|noEvaluable|GoF_risk|LoF_risk|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+----------------+-----------+----+------------+-------------+-------------+---------------------+------+---------------------+---------------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|ENSG00000007314|  EFO_0000555|            null|       null|null|        null|         null|         null|                 null|  null|                 null|           null

                                                                                

In [84]:
drugsCredibleDistance.withColumn(
    "min_tssDistance_ranking",
    F.min(
        "distanceTssMean",
    ).over(Window.partitionBy("targetId", "diseaseId")),
).withColumn(
    "min_footprintDistance_ranking",
    F.min(
        "distanceFootprintMean",
    ).over(Window.partitionBy("targetId", "diseaseId")),
).withColumn(  ### take maximum L2G score per T-D
    "max_L2GScore",
    F.max(
        "resourceScore",
    ).over(Window.partitionBy("targetId", "diseaseId")),
).filter(
    F.col("homogenized").isNotNull()
).show()

                                                                                

+---------------+-----------+--------------------+-----------+------------------+------------------+-------------------+--------------+---------------------+---------------+---------------------+---------------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+-----------------------+-----------------------------+------------------+
|       targetId|  diseaseId|    leftStudyLocusId|homogenized|                h4|      datasourceId|      resourceScore| leftVariantId|credibleLeftStudyType|         geneId|distanceFootprintMean|distanceTssMean|maxClinPhase|GoF_protect|LoF_protect|noEvaluable|GoF_risk|LoF_risk|coherencyDiagonal|coherencyOneCell|min_tssDistance_ranking|min_footprintDistance_ranking|      max_L2GScore|
+---------------+-----------+--------------------+-----------+------------------+------------------+-------------------+--------------+---------------------+---------------+---------------------+---------------+------------+--

                                                                                

In [None]:
analysis_chembl.filter(
    (F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull()
).filter(F.col())

In [None]:
analysis_chembl.filter(
    (F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull()
).groupBy("coherencyDiagonal").count().grupByshow()

In [None]:
analysis_chembl.filter(
    (F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull()
).show()

+---------------+-------------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|       targetId|    diseaseId|maxClinPhase|GoF_protect|LoF_protect|noEvaluable|GoF_risk|LoF_risk|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+------------+-----------+-----------+-----------+--------+--------+-----------------+----------------+
|ENSG00000007314|  EFO_0000555|         2.0|       null|          3|       null|    null|    null|         coherent|        coherent|
|ENSG00000007314|  EFO_0003102|         0.5|       null|          1|       null|    null|    null|         coherent|        coherent|
|ENSG00000007314|  EFO_0003894|         4.0|       null|          1|       null|    null|    null|         coherent|        coherent|
|ENSG00000007314|  EFO_0004699|         3.0|       null|          3|       null|    null|    null|         coherent|        coherent|
|ENSG00000007314|  EFO_0801084|         2.0|       null|      

In [67]:
gwasCredibleAssocDistances.show()

[Stage 1026:>                                                       (0 + 1) / 1]

+--------------------+---------------+-----------+-----------+------------------+------------------+-------------------+---------------+---------------------+---------------+---------------------+---------------+
|    leftStudyLocusId|       targetId|  diseaseId|homogenized|                h4|      datasourceId|      resourceScore|  leftVariantId|credibleLeftStudyType|         geneId|distanceFootprintMean|distanceTssMean|
+--------------------+---------------+-----------+-----------+------------------+------------------+-------------------+---------------+---------------------+---------------+---------------------+---------------+
|9b68d48251bfb71e9...|ENSG00000000003|EFO_0004529|noEvaluable|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|ENSG00000000003|           0.99999464|     0.99975055|
|cb3db2374fb1fbeda...|ENSG00000000003|EFO_0004529|noEvaluable|              null|gwas_credible_sets| 0.9525927165569602|           null|            

                                                                                

In [65]:
gwasCredibleAssoc.join(
    l2gTable.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
    on="leftStudyLocusId",
    how="left",
).show()

[Stage 994:>                                                        (0 + 1) / 1]

+--------------------+---------------+-----------+-----------+------------------+------------------+-------------------+---------------+---------------------+---------------+---------------------+---------------+
|    leftStudyLocusId|       targetId|  diseaseId|homogenized|                h4|      datasourceId|      resourceScore|  leftVariantId|credibleLeftStudyType|         geneId|distanceFootprintMean|distanceTssMean|
+--------------------+---------------+-----------+-----------+------------------+------------------+-------------------+---------------+---------------------+---------------+---------------------+---------------+
|9b68d48251bfb71e9...|ENSG00000000003|EFO_0004529|noEvaluable|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|ENSG00000000003|           0.99999464|     0.99975055|
|cb3db2374fb1fbeda...|ENSG00000000003|EFO_0004529|noEvaluable|              null|gwas_credible_sets| 0.9525927165569602|           null|            

                                                                                

In [56]:
gwasResolvedColoc.count()

18333585

In [55]:
gwasCredibleAssoc.distinct().count()

                                                                                

17244441

In [57]:
gwasCredibleAssoc.filter(F.col("colocDoE").isNotNull()).count()

16378337

In [60]:
gwasCredibleAssoc.show()

[Stage 881:>                                                        (0 + 1) / 1]

+---------------+-----------+-----------+--------------------+------------------+------------------+-------------------+---------------+---------------------+
|       targetId|  diseaseId|homogenized|    leftStudyLocusId|                h4|      datasourceId|      resourceScore|  leftVariantId|credibleLeftStudyType|
+---------------+-----------+-----------+--------------------+------------------+------------------+-------------------+---------------+---------------------+
|ENSG00000000003|EFO_0004529|noEvaluable|9b68d48251bfb71e9...|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|
|ENSG00000000003|EFO_0004529|noEvaluable|cb3db2374fb1fbeda...|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|
|ENSG00000000003|EFO_0004529|noEvaluable|b9e47164bf645be64...|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|
|ENSG00000000003|EFO_0004611|noEvaluable|9b68d

                                                                                

In [63]:
gwasCredibleAssoc.filter(F.col("homogenized") != "noEvaluable").groupBy(
    "targetId", "diseaseId", "homogenized"
).count().count()

                                                                                

205167

In [None]:
gwasCredibleAssoc.filter(F.col("homogenized").isNotNull()).groupBy(
    "targetId", "diseaseId",""homogenized""
).count().count()

                                                                                

716633

In [54]:
gwasCredibleAssoc.count()

18333585

In [23]:
gwasResolvedColoc.show()

+-----------+--------------------+---------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+------------+---------------+---------------------+-------------------+----------------------+---------+--------------+---------+--------------+------------------+-------------------+-------------------------+-------------------+------------------+--------------------+-------------------+------------------+------------+---------------+--------------------+--------------+--------------------+----------------+-----------+
|  diseaseId|    leftStudyLocusId|       targetId|        rightStudyId|   rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|                  h0|                  h1|                  h2|                h3|                 h4|colocalisationMethod|betaRatioSignAverag

In [35]:
gwasResolvedColoc.filter(
    (F.col("targetId") == "ENSG00000000971") & (F.col("diseaseId") == "EFO_0001365")
).sort(F.col("pValueExponent").desc()).show()

                                                                                

+-----------+--------------------+---------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------------+--------------------+-------------------+---------------+---------------------+----------------+----------------------+-----------+--------------+---------+--------------+------------------+-------------------+-------------------------+-------------------+------------------+--------------------+-------------------+------------------+-------------------+---------------+------------------+--------------+--------------------+--------------------+--------+
|  diseaseId|    leftStudyLocusId|       targetId|        rightStudyId|   rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|                  h0|                  h1|                  h2|                  h3|                 h4|colocalisationMethod|bet

In [36]:
gwasResolvedColoc.withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(), F.first("colocDoE").over(window_spec)
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().show()

[Stage 414:>                                                        (0 + 1) / 1]

+---------------+-----------+-----------+--------------+-----+
|       targetId|  diseaseId|   colocDoE|homogenizedRaw|count|
+---------------+-----------+-----------+--------------+-----+
|ENSG00000000003|EFO_0004529|       null|          null|    3|
|ENSG00000000003|EFO_0004611|       null|          null|    2|
|ENSG00000000003|EFO_0004732|       null|          null|    3|
|ENSG00000000419|EFO_0000313|       null|          null|    1|
|ENSG00000000419|EFO_0004339|       null|          null|    1|
|ENSG00000000419|EFO_0004503|       null|          null|    1|
|ENSG00000000457|EFO_0003872|GoF_protect|   GoF_protect|    1|
|ENSG00000000457|EFO_0004286|       null|          null|    1|
|ENSG00000000460|EFO_0003911|       null|          null|    1|
|ENSG00000000460|EFO_0004269|       null|          null|    1|
|ENSG00000000460|EFO_0004339|       null|          null|    1|
|ENSG00000000938|EFO_0004742|       null|          null|    1|
|ENSG00000000971|EFO_0001365|       null|          null

                                                                                

In [43]:
gwasResolvedColoc.filter(F.col("colocDoE").isNotNull()).withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(), F.first("colocDoE").over(window_spec)
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().filter(
    F.col("homogenizedRaw").isNotNull()
).count()

                                                                                

375518

In [None]:
gwasResolvedColoc.filter(F.col("colocDoE").isNotNull()).withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(), F.first("colocDoE").over(window_spec)
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().filter(
    F.col("homogenizedRaw").isNotNull()
).count()

In [52]:
# ignorenulls=True
gwasResolvedColoc.withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(),
        F.first("colocDoE", ignorenulls=True).over(window_spec),
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().filter(
    F.col("homogenizedRaw").isNull()
).groupBy(
    "colocDoE"
).count().show()



+--------+------+
|colocDoE| count|
+--------+------+
|    null|622321|
+--------+------+



                                                                                

In [40]:
gwasResolvedColoc.withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(), F.first("colocDoE").over(window_spec)
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().count()

                                                                                

997839

In [44]:
gwasResolvedColoc.withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(), F.first("colocDoE").over(window_spec)
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().filter(
    F.col("homogenizedRaw").isNull()
).count()

                                                                                

742682

In [None]:
gwasResolvedColoc.withColumn(
    "homogenizedRaw",
    F.when(
        F.col("colocDoE").isNotNull(), F.first("colocDoE").over(window_spec)
    ).otherwise(F.lit(None)),
).groupBy("targetId", "diseaseId", "colocDoE", "homogenizedRaw").count().show()

[Stage 373:>                                                        (0 + 1) / 1]

+---------------+-----------+-----------+--------------+-----+
|       targetId|  diseaseId|   colocDoE|homogenizedRaw|count|
+---------------+-----------+-----------+--------------+-----+
|ENSG00000000003|EFO_0004529|       null|          null|    3|
|ENSG00000000003|EFO_0004611|       null|          null|    2|
|ENSG00000000003|EFO_0004732|       null|          null|    3|
|ENSG00000000419|EFO_0000313|       null|          null|    1|
|ENSG00000000419|EFO_0004339|       null|          null|    1|
|ENSG00000000419|EFO_0004503|       null|          null|    1|
|ENSG00000000457|EFO_0003872|GoF_protect|   GoF_protect|    1|
|ENSG00000000457|EFO_0004286|       null|          null|    1|
|ENSG00000000460|EFO_0003911|       null|          null|    1|
|ENSG00000000460|EFO_0004269|       null|          null|    1|
|ENSG00000000460|EFO_0004339|       null|          null|    1|
|ENSG00000000938|EFO_0004742|       null|          null|    1|
|ENSG00000000971|EFO_0001365|       null|          null

                                                                                

In [27]:
gwasResolvedColoc.groupBy(
    "targetId", "diseaseId", "colocDoE", "pValueExponent", "betaRatioSignAverage"
).count().withColumn("homogenized", F.first("colocDoE").over(window_spec)).withColumn(
    "allDoE", F.size(F.collect_set("homogenized").over(window_spec))
).filter(
    F.col("allDoE") > 1
).show()

                                                                                

+--------+---------+--------+--------------+--------------------+-----+-----------+------+
|targetId|diseaseId|colocDoE|pValueExponent|betaRatioSignAverage|count|homogenized|allDoE|
+--------+---------+--------+--------------+--------------------+-----+-----------+------+
+--------+---------+--------+--------------+--------------------+-----+-----------+------+



In [19]:
gwasCredibleAssoc.show()

[Stage 164:>                                                        (0 + 1) / 1]

+---------------+-----------+-----------+--------------------+------------------+------------------+-------------------+---------------+---------------------+
|       targetId|  diseaseId|homogenized|    leftStudyLocusId|                h4|      datasourceId|      resourceScore|  leftVariantId|credibleLeftStudyType|
+---------------+-----------+-----------+--------------------+------------------+------------------+-------------------+---------------+---------------------+
|ENSG00000000003|EFO_0004529|noEvaluable|9b68d48251bfb71e9...|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|
|ENSG00000000003|EFO_0004529|noEvaluable|cb3db2374fb1fbeda...|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|
|ENSG00000000003|EFO_0004529|noEvaluable|b9e47164bf645be64...|              null|gwas_credible_sets| 0.9525927165569602|           null|                 null|
|ENSG00000000003|EFO_0004611|noEvaluable|9b68d

                                                                                

In [12]:
gwasResolvedColoc.show()

+-----------+--------------------+---------------+--------------------+--------------------+----------+--------------+--------------------------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+--------------------+------------+---------------+---------------------+-------------------+----------------------+---------+--------------+---------+--------------+------------------+-------------------+-------------------------+-------------------+------------------+--------------------+-------------------+------------------+------------+---------------+--------------------+--------------+--------------------+----------------+-----------+
|  diseaseId|    leftStudyLocusId|       targetId|        rightStudyId|   rightStudyLocusId|chromosome|rightStudyType|numberColocalisingVariants|                  h0|                  h1|                  h2|                h3|                 h4|colocalisationMethod|betaRatioSignAverag

In [None]:
### part 2

In [None]:
genEvidDataset = (
    prueba_assessment.filter(F.col("datasourceId") != "chembl")  #### checked 31.05.2023
    .groupBy("targetId", "diseaseId")
    .agg(F.count("targetId").alias("Nr_evidences"))
    .select("targetId", "diseaseId", "Nr_evidences")
    .withColumn("geneticEvidence", F.lit("hasGeneticEvidence"))
    .drop("Nr_evidences")
)

coherency_toAssess_others_datasource = (  #### checked 31.05.2023
    prueba_assessment.filter(
        (F.col("homogenized").isin(columns)) & (F.col("datasourceId") != "chembl")
    )
    .groupBy("targetId", "diseaseId")
    .agg(F.collect_set("datasourceId").alias("datasourceIds"))
)

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)
l2gPred = spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/locusToGenePredictions"
)
l2gTable = (
    l2gPred.select("studyLocusId", "geneId", F.explode_outer("locusToGeneFeatures"))
    .filter(F.col("key").isin(["distanceFootprintMean", "distanceTssMean"]))
    .groupBy("studyLocusId", "geneId")
    .pivot("key")
    .agg(F.first("value"))
)

"""
v2g = spark.read.parquet("gs://genetics-portal-dev-data/22.09.1/outputs/v2g")
varDistToGene = v2g.select(
    F.concat_ws("_", "chr_id", "position", "ref_allele", "alt_allele").alias(
        "variantId"
    ),
    F.col("gene_id").alias("targetId"),
    "source_id",
    "d",
    "distance_score",
).filter(F.col("source_id") == "canonical_tss")
"""

#######
# Build Ot genetics dataset as supporting evidence
#######
otGenetics = (
    prueba_assessment.filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
            ]
        )
    )
    # .filter((F.col("homogenized") != "noEvaluable"))
    .join(
        varDistToGene, on=["variantId", "targetId"], how="left"
    )  ### substitute by l2gTable
    .join(genEvidDataset, on=["targetId", "diseaseId"], how="left")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "L2G_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageL2G",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("resourceScore").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "averageCanonicalTSSDistance",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.avg("d").over(ranking.orderBy(F.col("resourceScore").desc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "datasources",
        F.when(
            F.col("rank").isNull(),
            F.array_remove(F.col("datasources"), "ot_genetics_portal"),
        ).otherwise(F.col("datasources")),
    )
    .withColumn(
        "distance_ranking",
        F.when(
            (F.col("datasourceId") == "ot_genetics_portal"),
            F.row_number().over(ranking.orderBy(F.col("d").asc())),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "ChemblL2gRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("L2G_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "chemblDistanceRanking",
        F.when(
            (F.array_contains(F.col("datasources"), "chembl"))
            & (F.array_contains(F.col("datasources"), "ot_genetics_portal")),
            F.lit(F.col("distance_ranking")),
        ).otherwise(F.lit(None)),
    )
    .withColumn(
        "frontierValue",
        ## ot genetics portal
        F.when(
            F.col("datasourceId") == "ot_genetics_portal",  ### the same for gene_burden
            F.when(
                (F.col("beta").isNotNull()) & (F.col("OddsRatio").isNull()),
                F.when(
                    (F.col("beta") <= 0.1) & (F.col("beta") >= -0.1),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNotNull()),
                F.when(
                    (F.col("OddsRatio") <= 1.1) & (F.col("OddsRatio") >= 0.9),
                    F.lit("limitValue"),
                ).otherwise(F.lit("noLimitValue")),
            )
            .when(
                (F.col("beta").isNull()) & (F.col("OddsRatio").isNull()),
                F.lit("noValue"),
            ),
        ),
    )
).persist()

#####
# function for interpreting DoE and coherencies/discrepancies
#####

analysis_chembl = discrepancifier(
    prueba_assessment.filter((F.col("datasourceId") == "chembl"))
    .withColumn(
        "maxClinPhase",
        F.max(F.col("clinicalPhase")).over(Window.partitionBy("targetId", "diseaseId")),
    )
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("homogenized")
    .agg(F.count("targetId"))
    .persist()
)

#### propag OtGenetics:
otGenetics_propag = (
    otGenetics.join(  ### propagated using parent terms
        diseases.selectExpr(
            "id as diseaseId",
            "parents",
        ),
        on="diseaseId",
        how="left",
    ).withColumn(  ### propagating ###
        "diseaseId",
        F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
    )
).persist()


#### include dictionary for calling dataframes:
# max_L2GScore
# min_distance_ranking


def benchmarkOT(discrepancifier, otGenetics, metric):
    dict_comb = {}
    dict_comb = {
        "hasGeneticEvidence": f"{metric}",
        "diagonalYes": f"{metric}",
        "oneCellYes": f"{metric}",
        "L2GAndColoc": f"{metric}",
    }
    list_l2g = [
        0.10,
        0.15,
        0.20,
        0.25,
        0.30,
        0.35,
        0.40,
        0.45,
        0.50,
        0.55,
        0.60,
        0.65,
        0.70,
        0.75,
        0.80,
        0.85,
        0.90,
        0.95,
    ]
    list_dist = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    return (
        discrepancifier(
            otGenetics.filter((F.col("datasourceId") == "ot_genetics_portal"))
            .withColumn(
                "min_distance_ranking",
                F.min("distance_ranking").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(  ### take maximum L2G score per T-D
                "max_L2GScore",
                F.max("resourceScore").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .groupBy(
                "targetId",
                "diseaseId",
                "geneticEvidence",
                f"{value}",
            )  ##### modifications here to include the groups of ranking/distances to TSS
            .pivot("homogenized")
            .agg(F.count("targetId"))
        )
        .selectExpr(
            "targetId",
            "diseaseId",
            "geneticEvidence",
            f"{metric}",
            "coherencyDiagonal as coherencyDiagonal",
            "coherencyOneCell as coherencyOneCell",
            "LoF_protect",
            "GoF_protect",
            "LoF_risk",
            "GoF_risk",
        )
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col(f"{metric}").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "L2GAndColoc",
            F.when(
                (F.col("geneticEvidence") == "hasGeneticEvidence")
                & (F.col("coherencyDiagonal").isin(["coherent", "dispar"])),
                F.lit("yes"),
            ).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.col(f"{metric}") >= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}>={str(n).replace('.', '_')}")
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"  # Adjust this condition as needed
                else [
                    F.when(F.col(f"{metric}") <= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{metric}<={n}")
                    for n in list_dist
                ]
            )
            + (
                [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                    F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_l2g
                ]
                if metric == "max_L2GScore"
                else [
                    F.when((F.col(a) == "yes") & (F.col(x) <= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}<={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_dist
                ]
            )
        )
        .persist()
    )


metric_list = ["max_L2GScore", "min_distance_ranking"]
datasetDict = {}
for value in metric_list:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_l2g_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )
    elif value == "min_distance_ranking":
        datasetDict[f"df_distance_original"] = benchmarkOT(
            discrepancifier, otGenetics, value
        )
        datasetDict[f"df_distance_propagated"] = benchmarkOT(
            discrepancifier, otGenetics_propag, value
        )

In [None]:
#### BUILDING THE NEW GWAS GENETIC EVIDENCE FROM COLOC
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date, datetime
from pyspark.sql.types import (
    StructType,
    StructField,
    ArrayType,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
    IntegerType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

#### Now load sources of data to generate credible_set_OT_genetics evidences and associations.

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence").filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)
ot_genetics = evidences.filter(F.col("datasourceId") == "ot_genetics_portal")

credibleEvidence = spark.read.parquet(f"{path}evidence").filter(
    F.col("datasourceId").isin(["gwas_credible_sets"])
)
credible = spark.read.parquet(f"{path}credibleSet")

index = spark.read.parquet(f"{path}gwasIndex")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")

print("read spark files")

print("fixing scXQTL and XQTL studies")
#### Fixing scXQTL as XQTLs:
## code provided by @ireneisdoomed
pd.DataFrame.iteritems = pd.DataFrame.items

raw_studies_metadata_schema: StructType = StructType(
    [
        StructField("study_id", StringType(), True),
        StructField("dataset_id", StringType(), True),
        StructField("study_label", StringType(), True),
        StructField("sample_group", StringType(), True),
        StructField("tissue_id", StringType(), True),
        StructField("tissue_label", StringType(), True),
        StructField("condition_label", StringType(), True),
        StructField("sample_size", IntegerType(), True),
        StructField("quant_method", StringType(), True),
        StructField("pmid", StringType(), True),
        StructField("study_type", StringType(), True),
    ]
)
raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
    pd.read_csv(raw_studies_metadata_path, sep="\t"),
    schema=raw_studies_metadata_schema,
)

# index = spark.read.parquet("gs://open-targets-pre-data-releases/24.12-uo_test-3/output/genetics/parquet/study_index")

study_index_w_correct_type = (
    study_table.select(
        F.concat_ws(
            "_",
            F.col("study_label"),
            F.col("quant_method"),
            F.col("sample_group"),
        ).alias("extracted_column"),
        "study_type",
    )
    .join(
        index
        # Get eQTL Catalogue studies
        .filter(F.col("studyType") != "gwas").filter(
            ~F.col("studyId").startswith("UKB_PPP")
        )
        # Remove measured trait
        .withColumn(
            "extracted_column",
            F.regexp_replace(F.col("studyId"), r"(_ENS.*|_ILMN.*|_X.*|_[0-9]+:.*)", ""),
        ).withColumn(
            "extracted_column",
            # After the previous cleanup, there are multiple traits from the same publication starting with the gene symbol that need to be removed (e.g. `Sun_2018_aptamer_plasma_ANXA2.4961.17.1..1`)
            F.when(
                F.col("extracted_column").startswith("Sun_2018_aptamer_plasma"),
                F.lit("Sun_2018_aptamer_plasma"),
            ).otherwise(F.col("extracted_column")),
        ),
        on="extracted_column",
        how="right",
    )
    .persist()
)

fixed = (
    study_index_w_correct_type.withColumn(
        "toFix",
        F.when(
            (F.col("study_type") != "single-cell")
            & (F.col("studyType").startswith("sc")),
            F.lit(True),
        ).otherwise(F.lit(False)),
    )
    # Remove the substring "sc" from the studyType column
    .withColumn(
        "newStudyType",
        F.when(
            F.col("toFix"), F.regexp_replace(F.col("studyType"), r"sc", "")
        ).otherwise(F.col("studyType")),
    ).drop("toFix", "extracted_column", "study_type")
).persist()
all_studies = index.join(
    fixed.selectExpr("studyId", "newStudyType"), on="studyId", how="left"
).persist()
fixedIndex = all_studies.withColumn(
    "studyType",
    F.when(F.col("newStudyType").isNotNull(), F.col("newStudyType")).otherwise(
        F.col("studyType")
    ),
).drop("newStudyType")

print("fixed scXQTL and XQTL studies")

print("creating new coloc")

#### fixed
newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        fixedIndex.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = credibleEvidence.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)
print("creating new gwasResolvedColoc")

### bring directionality from QTL

gwasResolvedColoc = (
    (
        newColoc.filter(F.col("rightStudyType") != "gwas")
        .withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="right",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

#### take the direction from the lowest p value
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
    F.col("pValueExponent").asc()
)

print("creating new gwasCredibleAssoc")

### modify to include more information
gwasCredibleAssoc = (
    gwasResolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )
    .select(
        "targetId",
        "diseaseId",
        "homogenized",
        "leftStudyLocusId",
        "h4",
        "datasourceId",
        "resourceScore",
        "leftVariantId",
        "credibleLeftStudyType",
    )
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)  ### there will be duplicates TargetId-DiseaseId because we are taking the most significant DoE

#### LOAD STUDYLOCUSID AND VARIANT DISTANCES
l2gPred = spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/locusToGenePredictions"
)
l2gTable = (
    l2gPred.select("studyLocusId", "geneId", F.explode_outer("locusToGeneFeatures"))
    .filter(F.col("key").isin(["distanceFootprintMean", "distanceTssMean"]))
    .groupBy("studyLocusId", "geneId")
    .pivot("key")
    .agg(F.first("value"))
)
print("creating gwasCredibleAssocDistances")
gwasCredibleAssocDistances = gwasCredibleAssoc.join(
    l2gTable.withColumnRenamed("studyLocusId", "leftStudyLocusId").withColumnRenamed(
        "geneId", "targetId"
    ),
    on=["leftStudyLocusId", "targetId"],
    how="left",
)

print("creating analysis_chembl")
analysis_chembl = (
    discrepancifier(
        directionOfEffect(
            evidences.filter((F.col("datasourceId") == "chembl")), "24.09"
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )
    .filter(  ### ensure drug has annotated MoA and is coherent per Target-Disease
        ((F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull())
        & (F.col("coherencyDiagonal") == "coherent")
    )
    .selectExpr(
        "targetId",
        "diseaseId",
        "maxClinPhase",
        "coherencyDiagonal as coherencyDiagonal_ch",
        "coherencyOneCell as coherencyOneCell_ch",
        "LoF_protect as LoF_protect_ch",
        "GoF_protect as GoF_protect_ch",
    )
)

### pivot colocdoE grouping by T-D-studyLocusId-distances

values = ["max_L2GScore", "min_footPrintDistance_rank", "min_tssDistance_rank"]
value = "max_L2GScore"
dict_comb = {
    "hasGeneticEvidence": f"{value}",
    "diagonalYes": f"{value}",
    "oneCellYes": f"{value}",
    "L2GAndColoc": f"{value}",
}
list_l2g = [
    0.10,
    0.15,
    0.20,
    0.25,
    0.30,
    0.35,
    0.40,
    0.45,
    0.50,
    0.55,
    0.60,
    0.65,
    0.70,
    0.75,
    0.80,
    0.85,
    0.90,
    0.95,
]
print("creating benchmarkOT function")

spark session created at 2025-01-27 15:17:49.358444
Analysis started on 2025-01-27 at  2025-01-27 15:17:49.358444


25/01/27 15:17:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


read spark files
fixing scXQTL and XQTL studies
fixed scXQTL and XQTL studies
creating new coloc


                                                                                

creating new gwasResolvedColoc
creating new gwasCredibleAssoc


                                                                                

creating gwasCredibleAssocDistances
creating analysis_chembl


                                                                                

creating benchmarkOT function
creating dataframes in loop


AttributeError: 'function' object has no attribute 'withColumn'

In [None]:
def benchmarkOT(
    gwasCredibleAssocDistances, value, analysis_chembl, list_l2g, dict_comb
):
    return (
        discrepancifier(
            gwasCredibleAssocDistances
            # .filter(F.col("h4").isNotNull()) #### not filter by this because we want to include the L2G AND Coloc question
            .withColumn(  ### take maximum L2G score per T-D
                "max_L2GScore",
                F.max("resourceScore").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(
                "min_footPrintDistance_rank",
                F.min("distanceFootprintMean").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(
                "min_tssDistance_rank",
                F.min("distanceTssMean").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .groupBy(
                "targetId",
                "diseaseId",
                f"{value}",
                # "leftStudyLocusId",
            )
            .pivot("homogenized")
            .count()
        )
        .join(analysis_chembl, on=["targetId", "diseaseId"], how="right")
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (
                    (F.col("coherencyDiagonal_ch") == "coherent")
                    & (F.col("coherencyDiagonal").isNotNull())
                )
                # & (F.col("coherencyDiagonal") == "coherent")
                ,
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (
                    (F.col("coherencyOneCell_ch") == "coherent")
                    & (F.col("coherencyDiagonal").isNotNull())
                )
                # & (F.col("coherencyOneCell") == "coherent")
                ,
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
            # ).filter(
            #    F.col("diagonalAgreeWithDrugs").isNotNull()
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(F.col("max_L2GScore").isNotNull(), F.lit("yes")).otherwise(
                F.lit("no")
            ),
        )
        .withColumn(
            "L2GAndColoc",
            F.when(
                (F.col(f"{value}").isNotNull())
                & (F.col("coherencyDiagonal").isin(["coherent", "dispar"])),
                F.lit("yes"),
            ).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.col(f"{value}") >= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{value}>={str(n).replace('.', '_')}")
                    for n in list_l2g
                ]
            )
            + (
                [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                    F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_l2g
                ]
            )
        )
        .persist()
    )


### HERE

### Make all the datasets
values = ["max_L2GScore", "min_footPrintDistance_rank", "min_tssDistance_rank"]

print("creating dataframes in loop")

datasetDict = {}
for value in values:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            discrepancifier,
            gwasCredibleAssocDistances,
            value,
            analysis_chembl,
        )
    else:
        datasetDict[f"{value}"] = benchmarkOT(
            discrepancifier, gwasCredibleAssocDistances, value, analysis_chembl
        )

In [None]:
#### BUILDING THE NEW GWAS GENETIC EVIDENCE FROM COLOC
from functions import discrepancifier
from DoEAssessment import directionOfEffect
from functions import relative_success
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
from datetime import date, datetime
from pyspark.sql.types import (
    StructType,
    StructField,
    ArrayType,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
    IntegerType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

#### Now load sources of data to generate credible_set_OT_genetics evidences and associations.

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence").filter(
    F.col("datasourceId").isin(
        [
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        ]
    )
)
ot_genetics = evidences.filter(F.col("datasourceId") == "ot_genetics_portal")

credibleEvidence = spark.read.parquet(f"{path}evidence").filter(
    F.col("datasourceId").isin(["gwas_credible_sets"])
)
credible = spark.read.parquet(f"{path}credibleSet")

index = spark.read.parquet(f"{path}gwasIndex")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")

print("read spark files")

print("fixing scXQTL and XQTL studies")
#### Fixing scXQTL as XQTLs:
## code provided by @ireneisdoomed
pd.DataFrame.iteritems = pd.DataFrame.items

raw_studies_metadata_schema: StructType = StructType(
    [
        StructField("study_id", StringType(), True),
        StructField("dataset_id", StringType(), True),
        StructField("study_label", StringType(), True),
        StructField("sample_group", StringType(), True),
        StructField("tissue_id", StringType(), True),
        StructField("tissue_label", StringType(), True),
        StructField("condition_label", StringType(), True),
        StructField("sample_size", IntegerType(), True),
        StructField("quant_method", StringType(), True),
        StructField("pmid", StringType(), True),
        StructField("study_type", StringType(), True),
    ]
)
raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
    pd.read_csv(raw_studies_metadata_path, sep="\t"),
    schema=raw_studies_metadata_schema,
)

# index = spark.read.parquet("gs://open-targets-pre-data-releases/24.12-uo_test-3/output/genetics/parquet/study_index")

study_index_w_correct_type = (
    study_table.select(
        F.concat_ws(
            "_",
            F.col("study_label"),
            F.col("quant_method"),
            F.col("sample_group"),
        ).alias("extracted_column"),
        "study_type",
    )
    .join(
        index
        # Get eQTL Catalogue studies
        .filter(F.col("studyType") != "gwas").filter(
            ~F.col("studyId").startswith("UKB_PPP")
        )
        # Remove measured trait
        .withColumn(
            "extracted_column",
            F.regexp_replace(F.col("studyId"), r"(_ENS.*|_ILMN.*|_X.*|_[0-9]+:.*)", ""),
        ).withColumn(
            "extracted_column",
            # After the previous cleanup, there are multiple traits from the same publication starting with the gene symbol that need to be removed (e.g. `Sun_2018_aptamer_plasma_ANXA2.4961.17.1..1`)
            F.when(
                F.col("extracted_column").startswith("Sun_2018_aptamer_plasma"),
                F.lit("Sun_2018_aptamer_plasma"),
            ).otherwise(F.col("extracted_column")),
        ),
        on="extracted_column",
        how="right",
    )
    .persist()
)

fixed = (
    study_index_w_correct_type.withColumn(
        "toFix",
        F.when(
            (F.col("study_type") != "single-cell")
            & (F.col("studyType").startswith("sc")),
            F.lit(True),
        ).otherwise(F.lit(False)),
    )
    # Remove the substring "sc" from the studyType column
    .withColumn(
        "newStudyType",
        F.when(
            F.col("toFix"), F.regexp_replace(F.col("studyType"), r"sc", "")
        ).otherwise(F.col("studyType")),
    ).drop("toFix", "extracted_column", "study_type")
).persist()
all_studies = index.join(
    fixed.selectExpr("studyId", "newStudyType"), on="studyId", how="left"
).persist()
fixedIndex = all_studies.withColumn(
    "studyType",
    F.when(F.col("newStudyType").isNotNull(), F.col("newStudyType")).otherwise(
        F.col("studyType")
    ),
).drop("newStudyType")

print("fixed scXQTL and XQTL studies")

print("creating new coloc")

#### fixed
newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        fixedIndex.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = credibleEvidence.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
)
print("creating new gwasResolvedColoc")

### bring directionality from QTL

gwasResolvedColoc = (
    (
        newColoc.filter(F.col("rightStudyType") != "gwas")
        .withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="right",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

#### take the direction from the lowest p value
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
    F.col("pValueExponent").asc()
)

print("creating new gwasCredibleAssoc")

### modify to include more information
gwasCredibleAssoc = (
    gwasResolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )
    .select(
        "targetId",
        "diseaseId",
        "homogenized",
        "leftStudyLocusId",
        "h4",
        "datasourceId",
        "resourceScore",
        "leftVariantId",
        "credibleLeftStudyType",
    )
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)  ### there will be duplicates TargetId-DiseaseId because we are taking the most significant DoE

#### LOAD STUDYLOCUSID AND VARIANT DISTANCES
l2gPred = spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/locusToGenePredictions"
)
l2gTable = (
    l2gPred.select("studyLocusId", "geneId", F.explode_outer("locusToGeneFeatures"))
    .filter(F.col("key").isin(["distanceFootprintMean", "distanceTssMean"]))
    .groupBy("studyLocusId", "geneId")
    .pivot("key")
    .agg(F.first("value"))
)
print("creating gwasCredibleAssocDistances")
gwasCredibleAssocDistances = gwasCredibleAssoc.join(
    l2gTable.withColumnRenamed("studyLocusId", "leftStudyLocusId").withColumnRenamed(
        "geneId", "targetId"
    ),
    on=["leftStudyLocusId", "targetId"],
    how="left",
)

print("creating analysis_chembl")
analysis_chembl = (
    discrepancifier(
        directionOfEffect(
            evidences.filter((F.col("datasourceId") == "chembl")), "24.09"
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )
    .filter(  ### ensure drug has annotated MoA and is coherent per Target-Disease
        ((F.col("GoF_protect").isNotNull()) | F.col("LoF_protect").isNotNull())
        & (F.col("coherencyDiagonal") == "coherent")
    )
    .selectExpr(
        "targetId",
        "diseaseId",
        "maxClinPhase",
        "coherencyDiagonal as coherencyDiagonal_ch",
        "coherencyOneCell as coherencyOneCell_ch",
        "LoF_protect as LoF_protect_ch",
        "GoF_protect as GoF_protect_ch",
    )
)

### pivot colocdoE grouping by T-D-studyLocusId-distances

values = ["max_L2GScore", "min_footPrintDistance_rank", "min_tssDistance_rank"]
value = "max_L2GScore"

list_l2g = [
    0.10,
    0.15,
    0.20,
    0.25,
    0.30,
    0.35,
    0.40,
    0.45,
    0.50,
    0.55,
    0.60,
    0.65,
    0.70,
    0.75,
    0.80,
    0.85,
    0.90,
    0.95,
]
print("creating benchmarkOT function")

dict_comb = {}


def benchmarkOT(
    dict_comb, value, gwasCredibleAssocDistances, analysis_chembl, list_l2g
):

    dict_comb = {
        "hasGeneticEvidence": f"{value}",
        "diagonalYes": f"{value}",
        "oneCellYes": f"{value}",
        "L2GAndColoc": f"{value}",
    }
    return (
        discrepancifier(
            gwasCredibleAssocDistances
            # .filter(F.col("h4").isNotNull()) #### not filter by this because we want to include the L2G AND Coloc question
            .withColumn(  ### take maximum L2G score per T-D
                "max_L2GScore",
                F.max("resourceScore").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(
                "min_footPrintDistance_rank",
                F.min("distanceFootprintMean").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .withColumn(
                "min_tssDistance_rank",
                F.min("distanceTssMean").over(
                    Window.partitionBy("targetId", "diseaseId")
                ),
            )
            .groupBy(
                "targetId",
                "diseaseId",
                f"{value}",
                # "leftStudyLocusId",
            )
            .pivot("homogenized")
            .count()
        )
        .join(analysis_chembl, on=["targetId", "diseaseId"], how="right")
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (
                    (F.col("coherencyDiagonal_ch") == "coherent")
                    & (F.col("coherencyDiagonal").isNotNull())
                )
                # & (F.col("coherencyDiagonal") == "coherent")
                ,
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (
                    (F.col("coherencyOneCell_ch") == "coherent")
                    & (F.col("coherencyDiagonal").isNotNull())
                )
                # & (F.col("coherencyOneCell") == "coherent")
                ,
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
            # ).filter(
            #    F.col("diagonalAgreeWithDrugs").isNotNull()
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(F.col(f"{value}").isNotNull(), F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "L2GAndColoc",
            F.when(
                (F.col(f"{value}").isNotNull())
                & (F.col("coherencyDiagonal").isin(["coherent", "dispar"])),
                F.lit("yes"),
            ).otherwise(F.lit("no")),
        )
        .select(
            ["*"]
            + (
                [  ### single columns
                    F.when(F.col(f"{value}") >= n, F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{value}>={str(n).replace('.', '_')}")
                    for n in list_l2g
                ]
            )
            + (
                [  ### column combinations for Yes/No colums Plus has DoE (any agreement)
                    F.when((F.col(a) == "yes") & (F.col(x) >= n), F.lit("yes"))
                    .otherwise(F.lit("no"))
                    .alias(f"{x}>={str(n).replace('.', '_')}&{a}_combined")
                    for a, x in dict_comb.items()
                    for n in list_l2g
                ]
            )
        )
        .persist()
    )


### HERE

### Make all the datasets
values = ["max_L2GScore", "min_footPrintDistance_rank", "min_tssDistance_rank"]

print("creating dataframes in loop")

datasetDict = {}
for value in values:
    if value == "max_L2GScore":
        datasetDict[f"df_l2g_original"] = benchmarkOT(
            dict_comb, value, gwasCredibleAssocDistances, analysis_chembl, list_l2g
        )
    else:
        datasetDict[f"{value}"] = benchmarkOT(
            dict_comb, value, gwasCredibleAssocDistances, analysis_chembl, list_l2g
        )


def comparisons_df(dataset) -> list:
    """Return list of all comparisons to be used in the analysis"""
    toAnalysis = dataset.columns[22:]
    dataType = ["byDatatype"] * len(toAnalysis)
    l_studies = []
    l_studies.extend([list(a) for a in zip(toAnalysis, dataType)])

    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(l_studies, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            # ("Phase>=3", "clinical"),
            # ("Phase>=2", "clinical"),
            # ("Phase>=1", "clinical"),
            # ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


from functions import relative_success, spreadSheetFormatter, convertTuple

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)


def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):

    wComparison = Window.partitionBy(F.col(comparisonColumn))
    wPrediction = Window.partitionBy(F.col(predictionColumn))
    wPredictionComparison = Window.partitionBy(
        F.col(comparisonColumn), F.col(predictionColumn)
    )

    uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    """
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """
    filePath = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )

    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    results.append(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            filePath,
        ]
    )
    return results


import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio

# Initialize an empty list to store the results
result_st = []
result_ci = []
results = []


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


print("launched function to run analysis")


listado = []
today_date = str(date.today())
for key, df_analysis in datasetDict.items():
    aggSetups_original = comparisons_df(df_analysis)
    print("corresponding dataframe key: ", key)
    df_analysis.persist()
    for row in aggSetups_original:
        print(key, value)
        aggregations_original(df_analysis, key, listado, *row, today_date)

print("finished analysis")

spark session created at 2025-01-27 22:16:36.625418
Analysis started on 2025-01-27 at  2025-01-27 22:16:36.625418


25/01/27 22:16:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

read spark files
fixing scXQTL and XQTL studies
fixed scXQTL and XQTL studies
creating new coloc


                                                                                

creating new gwasResolvedColoc
creating new gwasCredibleAssoc


                                                                                

creating gwasCredibleAssocDistances
creating analysis_chembl


                                                                                

creating benchmarkOT function
creating dataframes in loop


                                                                                

launched function to run analysis
corresponding dataframe key:  df_l2g_original
df_l2g_original min_tssDistance_rank


25/01/27 22:21:03 WARN CacheManager: Asked to cache already cached data.
                                                                                

AnalysisException: Column 'clinicalStatus' does not exist. Did you mean one of the following? [diagonalYes, noEvaluable, oneCellYes, diseaseId, predictionType, targetId, total, GoF_protect, L2GAndColoc, LoF_protect, Phase0, Phase4, Phase>=1, Phase>=2, Phase>=3, comparisonType, maxClinPhase, max_L2GScore, GoF_protect_ch, GoF_risk, LoF_protect_ch, LoF_risk, coherencyOneCell, coherencyDiagonal, hasGeneticEvidence, coherencyDiagonal_ch, max_L2GScore>=0_1, max_L2GScore>=0_2, max_L2GScore>=0_3, max_L2GScore>=0_4, max_L2GScore>=0_5, max_L2GScore>=0_6, max_L2GScore>=0_7, max_L2GScore>=0_8, max_L2GScore>=0_9, coherencyOneCell_ch, diagonalAgreeWithDrugs, max_L2GScore>=0_15, max_L2GScore>=0_25, max_L2GScore>=0_35, max_L2GScore>=0_45, max_L2GScore>=0_55, max_L2GScore>=0_65, max_L2GScore>=0_75, max_L2GScore>=0_85, max_L2GScore>=0_95, oneCellAgreeWithDrugs, max_L2GScore>=0_1&diagonalYes_combined, max_L2GScore>=0_1&oneCellYes_combined, max_L2GScore>=0_2&diagonalYes_combined, max_L2GScore>=0_2&oneCellYes_combined, max_L2GScore>=0_3&diagonalYes_combined, max_L2GScore>=0_3&oneCellYes_combined, max_L2GScore>=0_4&diagonalYes_combined, max_L2GScore>=0_4&oneCellYes_combined, max_L2GScore>=0_5&diagonalYes_combined, max_L2GScore>=0_5&oneCellYes_combined, max_L2GScore>=0_6&diagonalYes_combined, max_L2GScore>=0_6&oneCellYes_combined, max_L2GScore>=0_7&diagonalYes_combined, max_L2GScore>=0_7&oneCellYes_combined, max_L2GScore>=0_8&diagonalYes_combined, max_L2GScore>=0_8&oneCellYes_combined, max_L2GScore>=0_9&diagonalYes_combined, max_L2GScore>=0_9&oneCellYes_combined, max_L2GScore>=0_1&L2GAndColoc_combined, max_L2GScore>=0_15&diagonalYes_combined, max_L2GScore>=0_15&oneCellYes_combined, max_L2GScore>=0_2&L2GAndColoc_combined, max_L2GScore>=0_25&diagonalYes_combined, max_L2GScore>=0_25&oneCellYes_combined, max_L2GScore>=0_3&L2GAndColoc_combined, max_L2GScore>=0_35&diagonalYes_combined, max_L2GScore>=0_35&oneCellYes_combined, max_L2GScore>=0_4&L2GAndColoc_combined, max_L2GScore>=0_45&diagonalYes_combined, max_L2GScore>=0_45&oneCellYes_combined, max_L2GScore>=0_5&L2GAndColoc_combined, max_L2GScore>=0_55&diagonalYes_combined, max_L2GScore>=0_55&oneCellYes_combined, max_L2GScore>=0_6&L2GAndColoc_combined, max_L2GScore>=0_65&diagonalYes_combined, max_L2GScore>=0_65&oneCellYes_combined, max_L2GScore>=0_7&L2GAndColoc_combined, max_L2GScore>=0_75&diagonalYes_combined, max_L2GScore>=0_75&oneCellYes_combined, max_L2GScore>=0_8&L2GAndColoc_combined, max_L2GScore>=0_85&diagonalYes_combined, max_L2GScore>=0_85&oneCellYes_combined, max_L2GScore>=0_9&L2GAndColoc_combined, max_L2GScore>=0_95&diagonalYes_combined, max_L2GScore>=0_95&oneCellYes_combined, max_L2GScore>=0_15&L2GAndColoc_combined, max_L2GScore>=0_25&L2GAndColoc_combined, max_L2GScore>=0_35&L2GAndColoc_combined, max_L2GScore>=0_45&L2GAndColoc_combined, max_L2GScore>=0_55&L2GAndColoc_combined, max_L2GScore>=0_65&L2GAndColoc_combined, max_L2GScore>=0_75&L2GAndColoc_combined, max_L2GScore>=0_85&L2GAndColoc_combined, max_L2GScore>=0_95&L2GAndColoc_combined, max_L2GScore>=0_1&hasGeneticEvidence_combined, max_L2GScore>=0_2&hasGeneticEvidence_combined, max_L2GScore>=0_3&hasGeneticEvidence_combined, max_L2GScore>=0_4&hasGeneticEvidence_combined, max_L2GScore>=0_5&hasGeneticEvidence_combined, max_L2GScore>=0_6&hasGeneticEvidence_combined, max_L2GScore>=0_7&hasGeneticEvidence_combined, max_L2GScore>=0_8&hasGeneticEvidence_combined, max_L2GScore>=0_9&hasGeneticEvidence_combined, max_L2GScore>=0_15&hasGeneticEvidence_combined, max_L2GScore>=0_25&hasGeneticEvidence_combined, max_L2GScore>=0_35&hasGeneticEvidence_combined, max_L2GScore>=0_45&hasGeneticEvidence_combined, max_L2GScore>=0_55&hasGeneticEvidence_combined, max_L2GScore>=0_65&hasGeneticEvidence_combined, max_L2GScore>=0_75&hasGeneticEvidence_combined, max_L2GScore>=0_85&hasGeneticEvidence_combined, max_L2GScore>=0_95&hasGeneticEvidence_combined];
'Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, ... 96 more fields]
+- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, ... 95 more fields]
   +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, ... 94 more fields]
      +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, ... 93 more fields]
         +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, ... 92 more fields]
            +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, ... 2 more fields]
               +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, oneCellYes#15708, CASE WHEN isnotnull(max_L2GScore#13786) THEN yes ELSE no END AS hasGeneticEvidence#15733]
                  +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, diagonalYes#15684, CASE WHEN (oneCellAgreeWithDrugs#15561 = coherent) THEN yes ELSE no END AS oneCellYes#15708]
                     +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, Phase0#15661, CASE WHEN (diagonalAgreeWithDrugs#15544 = coherent) THEN yes ELSE no END AS diagonalYes#15684]
                        +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, Phase>=1#15639, CASE WHEN (maxClinPhase#8314 = cast(0 as double)) THEN yes ELSE no END AS Phase0#15661]
                           +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, Phase>=2#15618, CASE WHEN (maxClinPhase#8314 >= cast(1 as double)) THEN yes ELSE no END AS Phase>=1#15639]
                              +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, Phase>=3#15598, CASE WHEN (maxClinPhase#8314 >= cast(2 as double)) THEN yes ELSE no END AS Phase>=2#15618]
                                 +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, Phase4#15579, CASE WHEN (maxClinPhase#8314 >= cast(3 as double)) THEN yes ELSE no END AS Phase>=3#15598]
                                    +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, oneCellAgreeWithDrugs#15561, CASE WHEN (maxClinPhase#8314 = cast(4 as double)) THEN yes ELSE no END AS Phase4#15579]
                                       +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, diagonalAgreeWithDrugs#15544, CASE WHEN ((coherencyOneCell_ch#13775 = coherent) AND isnotnull(coherencyDiagonal#15508)) THEN CASE WHEN (isnotnull(LoF_protect_ch#13776L) AND (((isnotnull(LoF_protect#15487L) AND isnull(LoF_risk#15488L)) AND isnull(GoF_protect#15485L)) AND isnull(GoF_risk#15486L))) THEN coherent WHEN (isnotnull(GoF_protect_ch#13777L) AND (((isnotnull(GoF_protect#15485L) AND isnull(LoF_risk#15488L)) AND isnull(LoF_protect#15487L)) AND isnull(GoF_risk#15486L))) THEN coherent ELSE dispar END END AS oneCellAgreeWithDrugs#15561]
                                          +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L, CASE WHEN ((coherencyDiagonal_ch#13774 = coherent) AND isnotnull(coherencyDiagonal#15508)) THEN CASE WHEN (isnotnull(LoF_protect_ch#13776L) AND (isnotnull(GoF_risk#15486L) OR isnotnull(LoF_protect#15487L))) THEN coherent WHEN (isnotnull(GoF_protect_ch#13777L) AND (isnotnull(LoF_risk#15488L) OR isnotnull(GoF_protect#15485L))) THEN coherent ELSE dispar END END AS diagonalAgreeWithDrugs#15544]
                                             +- Project [targetId#718, diseaseId#818, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, coherencyOneCell#15518, maxClinPhase#8314, coherencyDiagonal_ch#13774, coherencyOneCell_ch#13775, LoF_protect_ch#13776L, GoF_protect_ch#13777L]
                                                +- Join RightOuter, ((targetId#933 = targetId#718) AND (diseaseId#4812 = diseaseId#818))
                                                   :- Project [targetId#933, diseaseId#4812, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, coherencyDiagonal#15508, CASE WHEN ((((isnull(LoF_risk#15488L) AND isnull(LoF_protect#15487L)) AND isnull(GoF_risk#15486L)) AND isnull(GoF_protect#15485L)) AND isnull(noEvaluable#15489L)) THEN noEvid WHEN ((((isnull(LoF_risk#15488L) AND isnull(LoF_protect#15487L)) AND isnull(GoF_risk#15486L)) AND isnull(GoF_protect#15485L)) AND isnotnull(noEvaluable#15489L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#15488L) OR isnotnull(LoF_protect#15487L)) OR isnotnull(GoF_risk#15486L)) OR isnotnull(GoF_protect#15485L)) THEN CASE WHEN (isnotnull(LoF_risk#15488L) AND ((isnull(LoF_protect#15487L) AND isnull(GoF_risk#15486L)) AND isnull(GoF_protect#15485L))) THEN coherent WHEN (isnotnull(GoF_risk#15486L) AND ((isnull(LoF_protect#15487L) AND isnull(LoF_risk#15488L)) AND isnull(GoF_protect#15485L))) THEN coherent WHEN (isnotnull(LoF_protect#15487L) AND ((isnull(LoF_risk#15488L) AND isnull(GoF_risk#15486L)) AND isnull(GoF_protect#15485L))) THEN coherent WHEN (isnotnull(GoF_protect#15485L) AND ((isnull(LoF_protect#15487L) AND isnull(GoF_risk#15486L)) AND isnull(LoF_risk#15488L))) THEN coherent ELSE dispar END END AS coherencyOneCell#15518]
                                                   :  +- Project [targetId#933, diseaseId#4812, max_L2GScore#13786, GoF_protect#15485L, GoF_risk#15486L, LoF_protect#15487L, LoF_risk#15488L, noEvaluable#15489L, CASE WHEN ((((isnull(LoF_risk#15488L) AND isnull(LoF_protect#15487L)) AND isnull(GoF_risk#15486L)) AND isnull(GoF_protect#15485L)) AND isnull(noEvaluable#15489L)) THEN noEvid WHEN ((((isnull(LoF_risk#15488L) AND isnull(LoF_protect#15487L)) AND isnull(GoF_risk#15486L)) AND isnull(GoF_protect#15485L)) AND isnotnull(noEvaluable#15489L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#15488L) OR isnotnull(LoF_protect#15487L)) OR isnotnull(GoF_risk#15486L)) OR isnotnull(GoF_protect#15485L)) THEN CASE WHEN (isnotnull(GoF_risk#15486L) AND isnotnull(LoF_risk#15488L)) THEN dispar WHEN (isnotnull(LoF_protect#15487L) AND isnotnull(LoF_risk#15488L)) THEN dispar WHEN (isnotnull(GoF_protect#15485L) AND isnotnull(GoF_risk#15486L)) THEN dispar WHEN (isnotnull(GoF_protect#15485L) AND isnotnull(LoF_protect#15487L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#15508]
                                                   :     +- Project [targetId#933, diseaseId#4812, max_L2GScore#13786, __pivot_count(1) AS count AS `count(1) AS count`#15484[0] AS GoF_protect#15485L, __pivot_count(1) AS count AS `count(1) AS count`#15484[1] AS GoF_risk#15486L, __pivot_count(1) AS count AS `count(1) AS count`#15484[2] AS LoF_protect#15487L, __pivot_count(1) AS count AS `count(1) AS count`#15484[3] AS LoF_risk#15488L, __pivot_count(1) AS count AS `count(1) AS count`#15484[4] AS noEvaluable#15489L]
                                                   :        +- Aggregate [targetId#933, diseaseId#4812, max_L2GScore#13786], [targetId#933, diseaseId#4812, max_L2GScore#13786, pivotfirst(homogenized#5856, count(1) AS count#15472L, GoF_protect, GoF_risk, LoF_protect, LoF_risk, noEvaluable, 0, 0) AS __pivot_count(1) AS count AS `count(1) AS count`#15484]
                                                   :           +- Aggregate [targetId#933, diseaseId#4812, max_L2GScore#13786, homogenized#5856], [targetId#933, diseaseId#4812, max_L2GScore#13786, homogenized#5856, count(1) AS count(1) AS count#15472L]
                                                   :              +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786, min_footPrintDistance_rank#13800, min_tssDistance_rank#13815]
                                                   :                 +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786, min_footPrintDistance_rank#13800, min_tssDistance_rank#13815, min_tssDistance_rank#13815]
                                                   :                    +- Window [min(distanceTssMean#5898) windowspecdefinition(targetId#933, diseaseId#4812, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS min_tssDistance_rank#13815], [targetId#933, diseaseId#4812]
                                                   :                       +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786, min_footPrintDistance_rank#13800]
                                                   :                          +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786, min_footPrintDistance_rank#13800]
                                                   :                             +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786, min_footPrintDistance_rank#13800, min_footPrintDistance_rank#13800]
                                                   :                                +- Window [min(distanceFootprintMean#5897) windowspecdefinition(targetId#933, diseaseId#4812, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS min_footPrintDistance_rank#13800], [targetId#933, diseaseId#4812]
                                                   :                                   +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786]
                                                   :                                      +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786]
                                                   :                                         +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898, max_L2GScore#13786, max_L2GScore#13786]
                                                   :                                            +- Window [max(resourceScore#1002) windowspecdefinition(targetId#933, diseaseId#4812, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS max_L2GScore#13786], [targetId#933, diseaseId#4812]
                                                   :                                               +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898]
                                                   :                                                  +- Project [leftStudyLocusId#4687, targetId#933, diseaseId#4812, homogenized#5856, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248, distanceFootprintMean#5897, distanceTssMean#5898]
                                                   :                                                     +- Join LeftOuter, ((leftStudyLocusId#4687 = leftStudyLocusId#5907) AND (targetId#933 = targetId#5912))
                                                   :                                                        :- Project [targetId#933, diseaseId#4812, CASE WHEN isnull(homogenized#5806) THEN noEvaluable ELSE homogenized#5806 END AS homogenized#5856, leftStudyLocusId#4687, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248]
                                                   :                                                        :  +- Project [targetId#933, diseaseId#4812, homogenized#5806, leftStudyLocusId#4687, h4#1268, datasourceId#932, resourceScore#1002, leftVariantId#3247, credibleLeftStudyType#3248]
                                                   :                                                        :     +- Project [diseaseId#4812, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 16 more fields]
                                                   :                                                        :        +- Project [diseaseId#4812, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 17 more fields]
                                                   :                                                        :           +- Window [first(colocDoE#4890, true) windowspecdefinition(targetId#933, diseaseId#4812, pValueExponent#4712 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS homogenized#5806], [targetId#933, diseaseId#4812], [pValueExponent#4712 ASC NULLS FIRST]
                                                   :                                                        :              +- Project [diseaseId#4812, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 15 more fields]
                                                   :                                                        :                 +- Project [diseaseId#4812, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 15 more fields]
                                                   :                                                        :                    +- Project [diseaseId#4812, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 14 more fields]
                                                   :                                                        :                       +- Project [diseaseId#4812, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 15 more fields]
                                                   :                                                        :                          +- Generate explode(concat(array(diseaseId#1033), parents#694)), true, [diseaseId#4812]
                                                   :                                                        :                             +- Project [diseaseId#1033, leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, ... 15 more fields]
                                                   :                                                        :                                +- Join LeftOuter, (diseaseId#1033 = diseaseId#4766)
                                                   :                                                        :                                   :- Project [leftStudyLocusId#4687, targetId#933, rightStudyId#3271, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228, datasourceId#932, ... 12 more fields]
                                                   :                                                        :                                   :  +- Join RightOuter, ((leftStudyLocusId#1259 = leftStudyLocusId#4687) AND (targetId#4663 = targetId#933))
                                                   :                                                        :                                   :     :- Project [rightStudyId#3271, rightStudyLocusId#1260, leftStudyLocusId#1259, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, geneId#1200 AS targetId#4663, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228]
                                                   :                                                        :                                   :     :  +- Filter NOT (rightStudyType#1262 = gwas)
                                                   :                                                        :                                   :     :     +- Project [rightStudyId#3271, rightStudyLocusId#1260, leftStudyLocusId#1259, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightVariantId#3272, credibleRightStudyType#3273, geneId#1200, projectId#1201, indexStudyType#3324, condition#1224, biosampleId#1228]
                                                   :                                                        :                                   :     :        +- Join LeftOuter, (rightStudyId#3271 = rightStudyId#3323)
                                                   :                                                        :                                   :     :           :- Project [rightStudyLocusId#1260, leftStudyLocusId#1259, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248, rightStudyId#3271, rightVariantId#3272, credibleRightStudyType#3273]
                                                   :                                                        :                                   :     :           :  +- Join LeftOuter, (rightStudyLocusId#1260 = rightStudyLocusId#3270)
                                                   :                                                        :                                   :     :           :     :- Project [leftStudyLocusId#1259, rightStudyLocusId#1260, chromosome#1261, rightStudyType#1262, numberColocalisingVariants#1263L, h0#1264, h1#1265, h2#1266, h3#1267, h4#1268, colocalisationMethod#1269, betaRatioSignAverage#1270, leftStudyId#3246, leftVariantId#3247, credibleLeftStudyType#3248]
                                                   :                                                        :                                   :     :           :     :  +- Join LeftOuter, (leftStudyLocusId#1259 = leftStudyLocusId#3245)
                                                   :                                                        :                                   :     :           :     :     :- Relation [leftStudyLocusId#1259,rightStudyLocusId#1260,chromosome#1261,rightStudyType#1262,numberColocalisingVariants#1263L,h0#1264,h1#1265,h2#1266,h3#1267,h4#1268,colocalisationMethod#1269,betaRatioSignAverage#1270] parquet
                                                   :                                                        :                                   :     :           :     :     +- Project [studyLocusId#1147 AS leftStudyLocusId#3245, StudyId#1148 AS leftStudyId#3246, variantId#1149 AS leftVariantId#3247, studyType#1172 AS credibleLeftStudyType#3248]
                                                   :                                                        :                                   :     :           :     :        +- Relation [studyLocusId#1147,studyId#1148,variantId#1149,chromosome#1150,position#1151,region#1152,beta#1153,zScore#1154,pValueMantissa#1155,pValueExponent#1156,effectAlleleFrequencyFromSource#1157,standardError#1158,subStudyDescription#1159,qualityControls#1160,finemappingMethod#1161,credibleSetIndex#1162,credibleSetlog10BF#1163,purityMeanR2#1164,purityMinR2#1165,locusStart#1166,locusEnd#1167,sampleSize#1168,ldSet#1169,locus#1170,... 2 more fields] parquet
                                                   :                                                        :                                   :     :           :     +- Project [studyLocusId#3278 AS rightStudyLocusId#3270, studyId#3279 AS rightStudyId#3271, variantId#3280 AS rightVariantId#3272, studyType#3303 AS credibleRightStudyType#3273]
                                                   :                                                        :                                   :     :           :        +- Relation [studyLocusId#3278,studyId#3279,variantId#3280,chromosome#3281,position#3282,region#3283,beta#3284,zScore#3285,pValueMantissa#3286,pValueExponent#3287,effectAlleleFrequencyFromSource#3288,standardError#3289,subStudyDescription#3290,qualityControls#3291,finemappingMethod#3292,credibleSetIndex#3293,credibleSetlog10BF#3294,purityMeanR2#3295,purityMinR2#3296,locusStart#3297,locusEnd#3298,sampleSize#3299,ldSet#3300,locus#3301,... 2 more fields] parquet
                                                   :                                                        :                                   :     :           +- Project [studyId#1199 AS rightStudyId#3323, geneId#1200, projectId#1201, studyType#3183 AS indexStudyType#3324, condition#1224, biosampleId#1228]
                                                   :                                                        :                                   :     :              +- Project [studyId#1199, geneId#1200, projectId#1201, studyType#3183, traitFromSource#1203, traitFromSourceMappedIds#1204, biosampleFromSourceId#1205, pubmedId#1206, publicationTitle#1207, publicationFirstAuthor#1208, publicationDate#1209, publicationJournal#1210, backgroundTraitFromSourceMappedIds#1211, initialSampleSize#1212, nCases#1213, nControls#1214, nSamples#1215, cohorts#1216, ldPopulationStructure#1217, discoverySamples#1218, replicationSamples#1219, qualityControls#1220, analysisFlags#1221, summarystatsLocation#1222, ... 6 more fields]
                                                   :                                                        :                                   :     :                 +- Project [studyId#1199, geneId#1200, projectId#1201, CASE WHEN isnotnull(newStudyType#1643) THEN newStudyType#1643 ELSE studyType#1202 END AS studyType#3183, traitFromSource#1203, traitFromSourceMappedIds#1204, biosampleFromSourceId#1205, pubmedId#1206, publicationTitle#1207, publicationFirstAuthor#1208, publicationDate#1209, publicationJournal#1210, backgroundTraitFromSourceMappedIds#1211, initialSampleSize#1212, nCases#1213, nControls#1214, nSamples#1215, cohorts#1216, ldPopulationStructure#1217, discoverySamples#1218, replicationSamples#1219, qualityControls#1220, analysisFlags#1221, summarystatsLocation#1222, ... 7 more fields]
                                                   :                                                        :                                   :     :                    +- Project [studyId#1199, geneId#1200, projectId#1201, studyType#1202, traitFromSource#1203, traitFromSourceMappedIds#1204, biosampleFromSourceId#1205, pubmedId#1206, publicationTitle#1207, publicationFirstAuthor#1208, publicationDate#1209, publicationJournal#1210, backgroundTraitFromSourceMappedIds#1211, initialSampleSize#1212, nCases#1213, nControls#1214, nSamples#1215, cohorts#1216, ldPopulationStructure#1217, discoverySamples#1218, replicationSamples#1219, qualityControls#1220, analysisFlags#1221, summarystatsLocation#1222, ... 7 more fields]
                                                   :                                                        :                                   :     :                       +- Join LeftOuter, (studyId#1199 = studyId#2346)
                                                   :                                                        :                                   :     :                          :- Relation [studyId#1199,geneId#1200,projectId#1201,studyType#1202,traitFromSource#1203,traitFromSourceMappedIds#1204,biosampleFromSourceId#1205,pubmedId#1206,publicationTitle#1207,publicationFirstAuthor#1208,publicationDate#1209,publicationJournal#1210,backgroundTraitFromSourceMappedIds#1211,initialSampleSize#1212,nCases#1213,nControls#1214,nSamples#1215,cohorts#1216,ldPopulationStructure#1217,discoverySamples#1218,replicationSamples#1219,qualityControls#1220,analysisFlags#1221,summarystatsLocation#1222,... 6 more fields] parquet
                                                   :                                                        :                                   :     :                          +- Project [studyId#2346, newStudyType#1643]
                                                   :                                                        :                                   :     :                             +- Project [studyId#2346, geneId#2347, projectId#2348, studyType#2349, traitFromSource#2350, traitFromSourceMappedIds#2351, biosampleFromSourceId#2352, pubmedId#2353, publicationTitle#2354, publicationFirstAuthor#2355, publicationDate#2356, publicationJournal#2357, backgroundTraitFromSourceMappedIds#2358, initialSampleSize#2359, nCases#2360, nControls#2361, nSamples#2362, cohorts#2363, ldPopulationStructure#2364, discoverySamples#2365, replicationSamples#2366, qualityControls#2367, analysisFlags#2368, summarystatsLocation#2369, ... 7 more fields]
                                                   :                                                        :                                   :     :                                +- Project [extracted_column#1385, study_type#1337, studyId#2346, geneId#2347, projectId#2348, studyType#2349, traitFromSource#2350, traitFromSourceMappedIds#2351, biosampleFromSourceId#2352, pubmedId#2353, publicationTitle#2354, publicationFirstAuthor#2355, publicationDate#2356, publicationJournal#2357, backgroundTraitFromSourceMappedIds#2358, initialSampleSize#2359, nCases#2360, nControls#2361, nSamples#2362, cohorts#2363, ldPopulationStructure#2364, discoverySamples#2365, replicationSamples#2366, qualityControls#2367, ... 10 more fields]
                                                   :                                                        :                                   :     :                                   +- Project [extracted_column#1385, study_type#1337, studyId#2346, geneId#2347, projectId#2348, studyType#2349, traitFromSource#2350, traitFromSourceMappedIds#2351, biosampleFromSourceId#2352, pubmedId#2353, publicationTitle#2354, publicationFirstAuthor#2355, publicationDate#2356, publicationJournal#2357, backgroundTraitFromSourceMappedIds#2358, initialSampleSize#2359, nCases#2360, nControls#2361, nSamples#2362, cohorts#2363, ldPopulationStructure#2364, discoverySamples#2365, replicationSamples#2366, qualityControls#2367, ... 9 more fields]
                                                   :                                                        :                                   :     :                                      +- Project [extracted_column#1385, study_type#1337, studyId#2346, geneId#2347, projectId#2348, studyType#2349, traitFromSource#2350, traitFromSourceMappedIds#2351, biosampleFromSourceId#2352, pubmedId#2353, publicationTitle#2354, publicationFirstAuthor#2355, publicationDate#2356, publicationJournal#2357, backgroundTraitFromSourceMappedIds#2358, initialSampleSize#2359, nCases#2360, nControls#2361, nSamples#2362, cohorts#2363, ldPopulationStructure#2364, discoverySamples#2365, replicationSamples#2366, qualityControls#2367, ... 8 more fields]
                                                   :                                                        :                                   :     :                                         +- Join RightOuter, (extracted_column#1349 = extracted_column#1385)
                                                   :                                                        :                                   :     :                                            :- Project [concat_ws(_, study_label#1329, quant_method#1335, sample_group#1330) AS extracted_column#1349, study_type#1337]
                                                   :                                                        :                                   :     :                                            :  +- LogicalRDD [study_id#1327, dataset_id#1328, study_label#1329, sample_group#1330, tissue_id#1331, tissue_label#1332, condition_label#1333, sample_size#1334, quant_method#1335, pmid#1336, study_type#1337], false
                                                   :                                                        :                                   :     :                                            +- Project [studyId#2346, geneId#2347, projectId#2348, studyType#2349, traitFromSource#2350, traitFromSourceMappedIds#2351, biosampleFromSourceId#2352, pubmedId#2353, publicationTitle#2354, publicationFirstAuthor#2355, publicationDate#2356, publicationJournal#2357, backgroundTraitFromSourceMappedIds#2358, initialSampleSize#2359, nCases#2360, nControls#2361, nSamples#2362, cohorts#2363, ldPopulationStructure#2364, discoverySamples#2365, replicationSamples#2366, qualityControls#2367, analysisFlags#2368, summarystatsLocation#2369, ... 7 more fields]
                                                   :                                                        :                                   :     :                                               +- Project [studyId#2346, geneId#2347, projectId#2348, studyType#2349, traitFromSource#2350, traitFromSourceMappedIds#2351, biosampleFromSourceId#2352, pubmedId#2353, publicationTitle#2354, publicationFirstAuthor#2355, publicationDate#2356, publicationJournal#2357, backgroundTraitFromSourceMappedIds#2358, initialSampleSize#2359, nCases#2360, nControls#2361, nSamples#2362, cohorts#2363, ldPopulationStructure#2364, discoverySamples#2365, replicationSamples#2366, qualityControls#2367, analysisFlags#2368, summarystatsLocation#2369, ... 7 more fields]
                                                   :                                                        :                                   :     :                                                  +- Filter NOT StartsWith(studyId#2346, UKB_PPP)
                                                   :                                                        :                                   :     :                                                     +- Filter NOT (studyType#2349 = gwas)
                                                   :                                                        :                                   :     :                                                        +- Relation [studyId#2346,geneId#2347,projectId#2348,studyType#2349,traitFromSource#2350,traitFromSourceMappedIds#2351,biosampleFromSourceId#2352,pubmedId#2353,publicationTitle#2354,publicationFirstAuthor#2355,publicationDate#2356,publicationJournal#2357,backgroundTraitFromSourceMappedIds#2358,initialSampleSize#2359,nCases#2360,nControls#2361,nSamples#2362,cohorts#2363,ldPopulationStructure#2364,discoverySamples#2365,replicationSamples#2366,qualityControls#2367,analysisFlags#2368,summarystatsLocation#2369,... 6 more fields] parquet
                                                   :                                                        :                                   :     +- Project [studyLocusId#1011 AS leftStudyLocusId#4687, datasourceId#932, targetId#933, datatypeId#962, diseaseFromSourceMappedId#966, resourceScore#1002, targetFromSourceId#1018, diseaseId#1033, id#1034, score#1035, sourceId#1038, studyId#4704, variantId#4705, betaGwas#4642, pValueExponent#4712]
                                                   :                                                        :                                   :        +- Project [studyLocusId#1011, datasourceId#932, targetId#933, datatypeId#962, diseaseFromSourceMappedId#966, resourceScore#1002, targetFromSourceId#1018, diseaseId#1033, id#1034, score#1035, sourceId#1038, studyId#4704, variantId#4705, betaGwas#4642, pValueExponent#4712]
                                                   :                                                        :                                   :           +- Join LeftOuter, (studyLocusId#1011 = studyLocusId#4703)
                                                   :                                                        :                                   :              :- Project [datasourceId#932, targetId#933, datatypeId#962, diseaseFromSourceMappedId#966, resourceScore#1002, studyLocusId#1011, targetFromSourceId#1018, diseaseId#1033, id#1034, score#1035, sourceId#1038]
                                                   :                                                        :                                   :              :  +- Filter (datasourceId#932 = gwas_credible_sets)
                                                   :                                                        :                                   :              :     +- Filter datasourceId#932 IN (gwas_credible_sets)
                                                   :                                                        :                                   :              :        +- Relation [datasourceId#932,targetId#933,alleleOrigins#934,allelicRequirements#935,ancestry#936,ancestryId#937,assays#938,assessments#939,beta#940,betaConfidenceIntervalLower#941,betaConfidenceIntervalUpper#942,biologicalModelAllelicComposition#943,biologicalModelGeneticBackground#944,biologicalModelId#945,biomarkerList#946,biomarkerName#947,biomarkers#948,biosamplesFromSource#949,cellLineBackground#950,cellType#951,clinicalPhase#952,clinicalSignificances#953,clinicalStatus#954,cohortDescription#955,... 83 more fields] parquet
                                                   :                                                        :                                   :              +- Project [studyLocusId#4703, studyId#4704, variantId#4705, beta#4709 AS betaGwas#4642, pValueExponent#4712]
                                                   :                                                        :                                   :                 +- Relation [studyLocusId#4703,studyId#4704,variantId#4705,chromosome#4706,position#4707,region#4708,beta#4709,zScore#4710,pValueMantissa#4711,pValueExponent#4712,effectAlleleFrequencyFromSource#4713,standardError#4714,subStudyDescription#4715,qualityControls#4716,finemappingMethod#4717,credibleSetIndex#4718,credibleSetlog10BF#4719,purityMeanR2#4720,purityMinR2#4721,locusStart#4722,locusEnd#4723,sampleSize#4724,ldSet#4725,locus#4726,... 2 more fields] parquet
                                                   :                                                        :                                   +- Project [id#687 AS diseaseId#4766, name#691, parents#694, therapeuticAreas#699]
                                                   :                                                        :                                      +- Relation [id#687,code#688,dbXRefs#689,description#690,name#691,directLocationIds#692,obsoleteTerms#693,parents#694,synonyms#695,ancestors#696,descendants#697,children#698,therapeuticAreas#699,indirectLocationIds#700,ontology#701] parquet
                                                   :                                                        +- Project [leftStudyLocusId#5907, geneId#5867 AS targetId#5912, distanceFootprintMean#5897, distanceTssMean#5898]
                                                   :                                                           +- Project [studyLocusId#5866 AS leftStudyLocusId#5907, geneId#5867, distanceFootprintMean#5897, distanceTssMean#5898]
                                                   :                                                              +- Project [studyLocusId#5866, geneId#5867, __pivot_first(value) AS `first(value)`#5896[0] AS distanceFootprintMean#5897, __pivot_first(value) AS `first(value)`#5896[1] AS distanceTssMean#5898]
                                                   :                                                                 +- Aggregate [studyLocusId#5866, geneId#5867], [studyLocusId#5866, geneId#5867, pivotfirst(key#5874, first(value)#5890, distanceFootprintMean, distanceTssMean, 0, 0) AS __pivot_first(value) AS `first(value)`#5896]
                                                   :                                                                    +- Aggregate [studyLocusId#5866, geneId#5867, key#5874], [studyLocusId#5866, geneId#5867, key#5874, first(value#5875, false) AS first(value)#5890]
                                                   :                                                                       +- Filter key#5874 IN (distanceFootprintMean,distanceTssMean)
                                                   :                                                                          +- Project [studyLocusId#5866, geneId#5867, key#5874, value#5875]
                                                   :                                                                             +- Generate explode(locusToGeneFeatures#5869), true, [key#5874, value#5875]
                                                   :                                                                                +- Relation [studyLocusId#5866,geneId#5867,score#5868,locusToGeneFeatures#5869] parquet
                                                   +- Project [targetId#718, diseaseId#818, maxClinPhase#8314, coherencyDiagonal#13753 AS coherencyDiagonal_ch#13774, coherencyOneCell#13763 AS coherencyOneCell_ch#13775, LoF_protect#11572L AS LoF_protect_ch#13776L, GoF_protect#11571L AS GoF_protect_ch#13777L]
                                                      +- Filter ((isnotnull(GoF_protect#11571L) OR isnotnull(LoF_protect#11572L)) AND (coherencyDiagonal#13753 = coherent))
                                                         +- Project [targetId#718, diseaseId#818, maxClinPhase#8314, GoF_protect#11571L, LoF_protect#11572L, noEvaluable#11573L, GoF_risk#13442, LoF_risk#13587, coherencyDiagonal#13753, CASE WHEN ((((isnull(LoF_risk#13587) AND isnull(LoF_protect#11572L)) AND isnull(GoF_risk#13442)) AND isnull(GoF_protect#11571L)) AND isnull(noEvaluable#11573L)) THEN noEvid WHEN ((((isnull(LoF_risk#13587) AND isnull(LoF_protect#11572L)) AND isnull(GoF_risk#13442)) AND isnull(GoF_protect#11571L)) AND isnotnull(noEvaluable#11573L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#13587) OR isnotnull(LoF_protect#11572L)) OR isnotnull(GoF_risk#13442)) OR isnotnull(GoF_protect#11571L)) THEN CASE WHEN (isnotnull(LoF_risk#13587) AND ((isnull(LoF_protect#11572L) AND isnull(GoF_risk#13442)) AND isnull(GoF_protect#11571L))) THEN coherent WHEN (isnotnull(GoF_risk#13442) AND ((isnull(LoF_protect#11572L) AND isnull(LoF_risk#13587)) AND isnull(GoF_protect#11571L))) THEN coherent WHEN (isnotnull(LoF_protect#11572L) AND ((isnull(LoF_risk#13587) AND isnull(GoF_risk#13442)) AND isnull(GoF_protect#11571L))) THEN coherent WHEN (isnotnull(GoF_protect#11571L) AND ((isnull(LoF_protect#11572L) AND isnull(GoF_risk#13442)) AND isnull(LoF_risk#13587))) THEN coherent ELSE dispar END END AS coherencyOneCell#13763]
                                                            +- Project [targetId#718, diseaseId#818, maxClinPhase#8314, GoF_protect#11571L, LoF_protect#11572L, noEvaluable#11573L, GoF_risk#13442, LoF_risk#13587, CASE WHEN ((((isnull(LoF_risk#13587) AND isnull(LoF_protect#11572L)) AND isnull(GoF_risk#13442)) AND isnull(GoF_protect#11571L)) AND isnull(noEvaluable#11573L)) THEN noEvid WHEN ((((isnull(LoF_risk#13587) AND isnull(LoF_protect#11572L)) AND isnull(GoF_risk#13442)) AND isnull(GoF_protect#11571L)) AND isnotnull(noEvaluable#11573L)) THEN EvidNotDoE WHEN (((isnotnull(LoF_risk#13587) OR isnotnull(LoF_protect#11572L)) OR isnotnull(GoF_risk#13442)) OR isnotnull(GoF_protect#11571L)) THEN CASE WHEN (isnotnull(GoF_risk#13442) AND isnotnull(LoF_risk#13587)) THEN dispar WHEN (isnotnull(LoF_protect#11572L) AND isnotnull(LoF_risk#13587)) THEN dispar WHEN (isnotnull(GoF_protect#11571L) AND isnotnull(GoF_risk#13442)) THEN dispar WHEN (isnotnull(GoF_protect#11571L) AND isnotnull(LoF_protect#11572L)) THEN dispar ELSE coherent END END AS coherencyDiagonal#13753]
                                                               +- Project [targetId#718, diseaseId#818, maxClinPhase#8314, GoF_protect#11571L, LoF_protect#11572L, noEvaluable#11573L, GoF_risk#13442, null AS LoF_risk#13587]
                                                                  +- Project [targetId#718, diseaseId#818, maxClinPhase#8314, GoF_protect#11571L, LoF_protect#11572L, noEvaluable#11573L, null AS GoF_risk#13442]
                                                                     +- Project [targetId#718, diseaseId#818, maxClinPhase#8314, __pivot_count(targetId) AS `count(targetId)`#11570[0] AS GoF_protect#11571L, __pivot_count(targetId) AS `count(targetId)`#11570[1] AS LoF_protect#11572L, __pivot_count(targetId) AS `count(targetId)`#11570[2] AS noEvaluable#11573L]
                                                                        +- Aggregate [targetId#718, diseaseId#818, maxClinPhase#8314], [targetId#718, diseaseId#818, maxClinPhase#8314, pivotfirst(homogenized#7553, count(targetId)#11562L, GoF_protect, LoF_protect, noEvaluable, 0, 0) AS __pivot_count(targetId) AS `count(targetId)`#11570]
                                                                           +- Aggregate [targetId#718, diseaseId#818, maxClinPhase#8314, homogenized#7553], [targetId#718, diseaseId#818, maxClinPhase#8314, homogenized#7553, count(targetId#718) AS count(targetId)#11562L]
                                                                              +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 97 more fields]
                                                                                 +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 98 more fields]
                                                                                    +- Window [max(clinicalPhase#737) windowspecdefinition(targetId#718, diseaseId#818, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS maxClinPhase#8314], [targetId#718, diseaseId#818]
                                                                                       +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 96 more fields]
                                                                                          +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 96 more fields]
                                                                                             +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 95 more fields]
                                                                                                +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 95 more fields]
                                                                                                   +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 95 more fields]
                                                                                                      +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 96 more fields]
                                                                                                         +- Window [collect_set(intogen_function#7068, 0, 0) windowspecdefinition(targetId#718, diseaseId#818, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS _we0#7191], [targetId#718, diseaseId#818]
                                                                                                            +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 94 more fields]
                                                                                                               +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 94 more fields]
                                                                                                                  +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 93 more fields]
                                                                                                                     +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 92 more fields]
                                                                                                                        +- Join LeftOuter, ((drugId2#5999 = drugId#755) AND (targetId2#6006 = targetId#718))
                                                                                                                           :- Join LeftOuter, (target_id#6049 = targetId#718)
                                                                                                                           :  :- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, concat_ws(,, clinicalSignificances#738) AS clinicalSignificances#6271, clinicalStatus#739, cohortDescription#740, ... 83 more fields]
                                                                                                                           :  :  +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#738, clinicalStatus#739, cohortDescription#740, ... 83 more fields]
                                                                                                                           :  :     +- Project [datasourceId#717, targetId#718, alleleOrigins#719, allelicRequirements#720, ancestry#721, ancestryId#722, assays#723, assessments#724, cast(beta#725 as double) AS beta#6055, betaConfidenceIntervalLower#726, betaConfidenceIntervalUpper#727, biologicalModelAllelicComposition#728, biologicalModelGeneticBackground#729, biologicalModelId#730, biomarkerList#731, biomarkerName#732, biomarkers#733, biosamplesFromSource#734, cellLineBackground#735, cellType#736, clinicalPhase#737, clinicalSignificances#738, clinicalStatus#739, cohortDescription#740, ... 83 more fields]
                                                                                                                           :  :        +- Filter datasourceId#717 IN (ot_genetics_portal,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                                                                                           :  :           +- Filter (datasourceId#717 = chembl)
                                                                                                                           :  :              +- Filter datasourceId#717 IN (ot_genetics_portal,gene_burden,eva,eva_somatic,gene2phenotype,orphanet,cancer_gene_census,intogen,impc,chembl)
                                                                                                                           :  :                 +- Relation [datasourceId#717,targetId#718,alleleOrigins#719,allelicRequirements#720,ancestry#721,ancestryId#722,assays#723,assessments#724,beta#725,betaConfidenceIntervalLower#726,betaConfidenceIntervalUpper#727,biologicalModelAllelicComposition#728,biologicalModelGeneticBackground#729,biologicalModelId#730,biomarkerList#731,biomarkerName#732,biomarkers#733,biosamplesFromSource#734,cellLineBackground#735,cellType#736,clinicalPhase#737,clinicalSignificances#738,clinicalStatus#739,cohortDescription#740,... 83 more fields] parquet
                                                                                                                           :  +- Project [id#5928 AS target_id#6049, approvedSymbol#5929, description#6034, description_splited#6038, TSorOncogene#6043]
                                                                                                                           :     +- Project [id#5928, approvedSymbol#5929, description#6034, description_splited#6038, CASE WHEN (RLIKE(description_splited#6038, ncogene) AND RLIKE(description_splited#6038, TSG)) THEN bivalent WHEN RLIKE(description_splited#6038, ncogene(\s|$)) THEN oncogene WHEN RLIKE(description_splited#6038, TSG(\s|$)) THEN TSG ELSE noEvaluable END AS TSorOncogene#6043]
                                                                                                                           :        +- Project [id#5928, approvedSymbol#5929, description#6034, concat_ws(,, description#6034) AS description_splited#6038]
                                                                                                                           :           +- Aggregate [id#5928, approvedSymbol#5929], [id#5928, approvedSymbol#5929, collect_set(description#6026, 0, 0) AS description#6034]
                                                                                                                           :              +- Filter description#6026 IN (TSG,oncogene,Oncogene,oncogene,oncogene,TSG,TSG,oncogene,fusion,oncogene,oncogene,fusion)
                                                                                                                           :                 +- Project [id#5928, approvedSymbol#5929, col#6021.description AS description#6026]
                                                                                                                           :                    +- Project [id#5928, approvedSymbol#5929, col#6021]
                                                                                                                           :                       +- Generate explode(hallmarks#5938.attributes), true, [col#6021]
                                                                                                                           :                          +- Relation [id#5928,approvedSymbol#5929,biotype#5930,transcriptIds#5931,canonicalTranscript#5932,canonicalExons#5933,genomicLocation#5934,alternativeGenes#5935,approvedName#5936,go#5937,hallmarks#5938,synonyms#5939,symbolSynonyms#5940,nameSynonyms#5941,functionDescriptions#5942,subcellularLocations#5943,targetClass#5944,obsoleteSymbols#5945,obsoleteNames#5946,constraint#5947,tep#5948,proteinIds#5949,dbXrefs#5950,chemicalProbes#5951,... 4 more fields] parquet
                                                                                                                           +- Aggregate [targetId2#6006, drugId2#5999], [targetId2#6006, drugId2#5999, collect_set(actionType#5984, 0, 0) AS actionType#6016]
                                                                                                                              +- Project [targetId2#6006, drugId2#5999, actionType#5984, mechanismOfAction#5985]
                                                                                                                                 +- Generate explode(targets#5989), true, [targetId2#6006]
                                                                                                                                    +- Project [drugId2#5999, actionType#5984, mechanismOfAction#5985, targets#5989]
                                                                                                                                       +- Generate explode(chemblIds#5986), true, [drugId2#5999]
                                                                                                                                          +- Relation [actionType#5984,mechanismOfAction#5985,chemblIds#5986,targetName#5987,targetType#5988,targets#5989,references#5990] parquet


In [5]:
aggSetups_original = comparisons_df(df_analysis)

In [6]:
aggSetups_original

[Row(comparison='diagonalYes', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='oneCellYes', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='hasGeneticEvidence', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='L2GAndColoc', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='min_tssDistance_rank>=0_1', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='min_tssDistance_rank>=0_15', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='min_tssDistance_rank>=0_2', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='min_tssDistance_rank>=0_25', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='min_tssDistance_rank>=0_3', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='min_tssDistance_rank>=0_35', comparisonType='byDatatype', _1='Phase4', _2='clinical'),
 Row(comparison='

In [2]:
for key, df_analysis in datasetDict.items():
    print(key, df_analysis)

df_l2g_original DataFrame[targetId: string, diseaseId: string, max_L2GScore: double, GoF_protect: bigint, GoF_risk: bigint, LoF_protect: bigint, LoF_risk: bigint, noEvaluable: bigint, coherencyDiagonal: string, coherencyOneCell: string, maxClinPhase: double, coherencyDiagonal_ch: string, coherencyOneCell_ch: string, LoF_protect_ch: bigint, GoF_protect_ch: bigint, diagonalAgreeWithDrugs: string, oneCellAgreeWithDrugs: string, Phase4: string, Phase>=3: string, Phase>=2: string, Phase>=1: string, Phase0: string, diagonalYes: string, oneCellYes: string, hasGeneticEvidence: string, L2GAndColoc: string, max_L2GScore>=0_1: string, max_L2GScore>=0_15: string, max_L2GScore>=0_2: string, max_L2GScore>=0_25: string, max_L2GScore>=0_3: string, max_L2GScore>=0_35: string, max_L2GScore>=0_4: string, max_L2GScore>=0_45: string, max_L2GScore>=0_5: string, max_L2GScore>=0_55: string, max_L2GScore>=0_6: string, max_L2GScore>=0_65: string, max_L2GScore>=0_7: string, max_L2GScore>=0_75: string, max_L2GSco

### debugging ignorenulls=True

In [None]:
#### 10.12.2024
from array import ArrayType
from functions import (
    relative_success,
    spreadSheetFormatter,
    discrepancifier,
    temporary_directionOfEffect,
)
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
import pandas as pd

spark = SparkSession.builder.getOrCreate()

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

evidences = spark.read.parquet(f"{path}evidence")

credible = spark.read.parquet(f"{path}credibleSet")

### index with new fix" "gs://ot-team/irene/gentropy/study_index_2412_fixed"
index = spark.read.parquet(f"gs://ot-team/irene/gentropy/study_index_2412_fixed")

new = spark.read.parquet(f"{path}colocalisation/coloc")

variantIndex = spark.read.parquet(f"{path}variantIndex")

biosample = spark.read.parquet(f"{path}biosample")


#### Fixing scXQTL as XQTLs:
## code provided by @ireneisdoomed
pd.DataFrame.iteritems = pd.DataFrame.items

raw_studies_metadata_schema: StructType = StructType(
    [
        StructField("study_id", StringType(), True),
        StructField("dataset_id", StringType(), True),
        StructField("study_label", StringType(), True),
        StructField("sample_group", StringType(), True),
        StructField("tissue_id", StringType(), True),
        StructField("tissue_label", StringType(), True),
        StructField("condition_label", StringType(), True),
        StructField("sample_size", IntegerType(), True),
        StructField("quant_method", StringType(), True),
        StructField("pmid", StringType(), True),
        StructField("study_type", StringType(), True),
    ]
)
raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/fe3c4b4ed911b3a184271a6aadcd8c8769a66aba/data_tables/dataset_metadata.tsv"

study_table = spark.createDataFrame(
    pd.read_csv(raw_studies_metadata_path, sep="\t"),
    schema=raw_studies_metadata_schema,
)

# index = spark.read.parquet("gs://open-targets-pre-data-releases/24.12-uo_test-3/output/genetics/parquet/study_index")

study_index_w_correct_type = (
    study_table.select(
        F.concat_ws(
            "_",
            F.col("study_label"),
            F.col("quant_method"),
            F.col("sample_group"),
        ).alias("extracted_column"),
        "study_type",
    )
    .join(
        index
        # Get eQTL Catalogue studies
        .filter(F.col("studyType") != "gwas").filter(
            ~F.col("studyId").startswith("UKB_PPP")
        )
        # Remove measured trait
        .withColumn(
            "extracted_column",
            F.regexp_replace(F.col("studyId"), r"(_ENS.*|_ILMN.*|_X.*|_[0-9]+:.*)", ""),
        ).withColumn(
            "extracted_column",
            # After the previous cleanup, there are multiple traits from the same publication starting with the gene symbol that need to be removed (e.g. `Sun_2018_aptamer_plasma_ANXA2.4961.17.1..1`)
            F.when(
                F.col("extracted_column").startswith("Sun_2018_aptamer_plasma"),
                F.lit("Sun_2018_aptamer_plasma"),
            ).otherwise(F.col("extracted_column")),
        ),
        on="extracted_column",
        how="right",
    )
    .persist()
)

fixed = (
    study_index_w_correct_type.withColumn(
        "toFix",
        F.when(
            (F.col("study_type") != "single-cell")
            & (F.col("studyType").startswith("sc")),
            F.lit(True),
        ).otherwise(F.lit(False)),
    )
    # Remove the substring "sc" from the studyType column
    .withColumn(
        "newStudyType",
        F.when(
            F.col("toFix"), F.regexp_replace(F.col("studyType"), r"sc", "")
        ).otherwise(F.col("studyType")),
    ).drop("toFix", "extracted_column", "study_type")
).persist()
all_studies = index.join(
    fixed.selectExpr("studyId", "newStudyType"), on="studyId", how="left"
).persist()
fixedIndex = all_studies.withColumn(
    "studyType",
    F.when(F.col("newStudyType").isNotNull(), F.col("newStudyType")).otherwise(
        F.col("studyType")
    ),
).drop("newStudyType")
#### fixed

newColoc = (
    new.join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on left side
            "studyLocusId as leftStudyLocusId",
            "StudyId as leftStudyId",
            "variantId as leftVariantId",
            "studyType as credibleLeftStudyType",
        ),
        on="leftStudyLocusId",
        how="left",
    )
    .join(
        credible.selectExpr(  #### studyLocusId from credible set to uncover the codified variants on right side
            "studyLocusId as rightStudyLocusId",
            "studyId as rightStudyId",
            "variantId as rightVariantId",
            "studyType as credibleRightStudyType",
        ),
        on="rightStudyLocusId",
        how="left",
    )
    .join(
        fixedIndex.selectExpr(  ### bring modulated target on right side (QTL study)
            "studyId as rightStudyId",
            "geneId",
            "projectId",
            "studyType as indexStudyType",
            "condition",
            "biosampleId",
        ),
        on="rightStudyId",
        how="left",
    )
    .persist()
)
# remove columns without content (only null values on them)
df = evidences.filter((F.col("datasourceId") == "gwas_credible_sets"))

# Use an aggregation to determine non-null columns
non_null_counts = df.select(
    *[F.sum(F.col(col).isNotNull().cast("int")).alias(col) for col in df.columns]
)

# Collect the counts for each column
non_null_columns = [
    row[0] for row in non_null_counts.collect()[0].asDict().items() if row[1] > 0
]

# Select only the non-null columns
filtered_df = df.select(*non_null_columns).persist()

## bring studyId, variantId, beta from Gwas and pValue
gwasComplete = filtered_df.join(
    credible.selectExpr(
        "studyLocusId", "studyId", "variantId", "beta as betaGwas", "pValueExponent"
    ),
    on="studyLocusId",
    how="left",
).persist()

resolvedColoc = (
    (
        newColoc.withColumnRenamed("geneId", "targetId")
        .join(
            gwasComplete.withColumnRenamed("studyLocusId", "leftStudyLocusId"),
            on=["leftStudyLocusId", "targetId"],
            how="inner",
        )
        .join(  ### propagated using parent terms
            diseases.selectExpr(
                "id as diseaseId", "name", "parents", "therapeuticAreas"
            ),
            on="diseaseId",
            how="left",
        )
        .withColumn(
            "diseaseId",
            F.explode_outer(F.concat(F.array(F.col("diseaseId")), F.col("parents"))),
        )
        .drop("parents", "oldDiseaseId")
    )
    .withColumn(
        "colocDoE",
        F.when(
            F.col("rightStudyType").isin(
                ["eqtl", "pqtl", "tuqtl", "sceqtl", "sctuqtl"]
            ),
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_protect"),
            ),
        ).when(
            F.col("rightStudyType").isin(
                ["sqtl", "scsqtl"]
            ),  ### opposite directionality than sqtl
            F.when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("LoF_risk"),
            )
            .when(
                (F.col("betaGwas") > 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("GoF_risk"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") > 0),
                F.lit("GoF_protect"),
            )
            .when(
                (F.col("betaGwas") < 0) & (F.col("betaRatioSignAverage") < 0),
                F.lit("LoF_protect"),
            ),
        ),
    )
    .persist()
)

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

datasource_filter = [
    "ot_genetics_portal",
    "gwas_credible_sets",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path, datasource_filter
)

drugApproved = (
    spark.read.parquet("gs://ot-team/irene/l2g/validation/chembl_w_flags")
    .drop("clinicalTrialId", "isComplex")
    .withColumn(
        "isApproved",
        F.when(F.col("isApproved") == "true", F.lit(1)).otherwise(F.lit(0)),
    )
    .distinct()
)

analysis_chembl_indication = (
    discrepancifier(
        assessment.filter((F.col("datasourceId") == "chembl"))
        .join(
            drugApproved.filter(F.col("isApproved") == 1),
            on=["targetId", "diseaseId", "drugId"],
            how="left",
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .withColumn(
            "approvedDrug",
            F.max(F.col("isApproved")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase", "approvedDrug")
        .pivot("homogenized")
        .agg(F.count("targetId"))
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
    .withColumn(
        "approved",
        F.when(F.col("approvedDrug") == 1, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "newPhases",
        F.when(F.col("approvedDrug") == 1, F.lit(4)).when(
            F.col("approvedDrug").isNull(),
            F.when(F.col("maxClinPhase") == 4, F.lit(3)).otherwise(
                F.col("maxClinPhase")
            ),
        ),
    )
    .persist()
)

chemblAssoc = (
    discrepancifier(
        assessment.filter(
            (F.col("datasourceId") == "chembl")
            & (F.col("homogenized") != "noEvaluable")
        )
        .withColumn(
            "maxClinPhase",
            F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .count()
    )
    .filter(F.col("coherencyDiagonal") == "coherent")
    .drop(
        "coherencyDiagonal", "coherencyOneCell", "noEvaluable", "GoF_risk", "LoF_risk"
    )
    .withColumnRenamed("GoF_protect", "drugGoF_protect")
    .withColumnRenamed("LoF_protect", "drugLoF_protect")
)

benchmark = (
    (
        resolvedColoc.filter(F.col("betaGwas") < 0)
        .join(  ### select just GWAS giving protection
            analysis_chembl_indication, on=["targetId", "diseaseId"], how="inner"
        )
        .withColumn(
            "AgreeDrug",
            F.when(
                (F.col("drugGoF_protect").isNotNull())
                & (F.col("colocDoE") == "GoF_protect"),
                F.lit("yes"),
            )
            .when(
                (F.col("drugLoF_protect").isNotNull())
                & (F.col("colocDoE") == "LoF_protect"),
                F.lit("yes"),
            )
            .otherwise(F.lit("no")),
        )
    ).filter(
        F.col("name") != "COVID-19"
    )  #### remove COVID-19 associations
).join(biosample.select("biosampleId", "biosampleName"), on="biosampleId", how="left")

#### Analysis

#### 1 Build a dictionary with the distinct values as key and column names as value
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

# Initialize an empty dictionary
disdic = {}

# Iterate over the list of column names
for col_name in variables_study:
    # Extract distinct values for the column
    distinct_values = benchmark.select(col_name).distinct().collect()

    # Populate the dictionary
    for row in distinct_values:
        distinct_value = row[col_name]
        if distinct_value is not None:  # Exclude None (null) values
            disdic[distinct_value] = col_name

####2 Define agregation function
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio
from pyspark.sql.types import *


def convertTuple(tup):
    st = ",".join(map(str, tup))
    return st


#####3 run in a function
def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):
    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)
    results = []
    # uniqIds = df.select("targetId", "diseaseId").distinct().count()
    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("dataset", F.lit(data))
        .withColumn("predictionType", F.lit(predictionType))
        # .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn("comparisonColumn", F.lit(comparisonColumn))
        .withColumn("predictionColumnValue", F.lit(predictionColumn))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "dataset",
            "comparisonColumn",
            "predictionColumnValue",
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )
    """
    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    """

    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + comparisonType
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    path = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + comparisonType
        + "_"
        + predictionColumn
        + ".parquet"
    )
    print(path)
    ### making analysis
    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)
    results.extend(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            # studies,
            # tissues,
            path,
        ]
    )
    return results


#### 3 Loop over different datasets (as they will have different rows and columns)


def comparisons_df_iterative(disdic, projectId):
    toAnalysis = [(key, value) for key, value in disdic.items() if value == projectId]
    schema = StructType(
        [
            StructField("comparison", StringType(), True),
            StructField("comparisonType", StringType(), True),
        ]
    )

    comparisons = spark.createDataFrame(toAnalysis, schema=schema)
    ### include all the columns as predictor

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("nPhase4", "clinical"),
            ("nPhase>=3", "clinical"),
            ("nPhase>=2", "clinical"),
            ("nPhase>=1", "clinical"),
            ("approved", "clinical"),
            # ("PhaseT", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
print("created full_data and lists")

result = []
result_st = []
result_ci = []
array2 = []
listado = []
result_all = []
today_date = str(date.today())
variables_study = ["projectId", "biosampleName", "rightStudyType", "colocDoE"]

print("looping for variables_study")

for variable in variables_study:
    print("analysing", variable)
    #### build list of comparison and prediction columns
    rows = comparisons_df_iterative(disdic, variable)
    #### prepare aggregation depending on the variable problem
    window_spec = Window.partitionBy("targetId", "diseaseId", variable).orderBy(
        F.col("pValueExponent").asc()
    )  ### ignore nulls aded 29.01.2025
    #### take directionality from lowest p value
    bench2 = (
        benchmark.withColumn(
            "agree_lowestPval", F.first("AgreeDrug", ignorenulls=True).over(window_spec)
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase", "approved", "newPhases")
        .pivot(variable)
        .agg(F.collect_set("agree_lowestPVal"))
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(  ###  new phases extracted from aproved label
            "nPhase4",
            F.when(F.col("newPhases") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "nPhase>=3",
            F.when(F.col("newPhases") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "nPhase>=2",
            F.when(F.col("newPhases") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "nPhase>=1",
            F.when(F.col("newPhases") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "approved",
            F.when(F.col("approved") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
    )

### debugging doe fro 

In [None]:
#### 11.12.2024
#######
##########     ATENTION
#### change code to work with generated dataframe instead of reading the parquet

"""
This scripts run Odds ratio analysis for DoE and 
genetic information on drug clinical success

"""
from functions import (
    discrepancifier,
    build_gwasResolvedColoc,
    temporary_directionOfEffect,
)
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    ArrayType,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
today_date = str(date.today())
print("spark session created at", c)

print("Analysis started on " + today_date + " at ", c)
"""
#coloc = spark.read.parquet(
#    "gs://genetics-portal-dev-data/22.09.1/outputs/v2d_coloc"
#).filter(F.col("right_type") != "gwas")
"""

#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

evidences = (
    spark.read.parquet(f"{path}evidence")
    .filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
                "gene_burden",
                "eva",
                "eva_somatic",
                "gene2phenotype",
                "orphanet",
                "cancer_gene_census",
                "intogen",
                "impc",
                "chembl",
            ]
        )
    )
    .persist()
)
ot_genetics = evidences.filter(F.col("datasourceId") == "ot_genetics_portal")

#### Now load sources of data to generate credible_set_OT_genetics evidences and associations.

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

gwasResolvedColoc = build_gwasResolvedColoc(path)

#### take the direction from the lowest p value
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    gwasResolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId", "homogenized")
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

### datasource_filter for temporaryDoE
datasource_filter = [
    "ot_genetics_portal",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

spark session created at 2025-01-30 11:50:25.301146
Analysis started on 2025-01-30 at  2025-01-30 11:50:25.301146


25/01/30 11:50:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


read spark files
fixing scXQTL and XQTL studies
fixed scXQTL and XQTL studies
creating new coloc


                                                                                

creating new gwasResolvedColoc


AttributeError: 'tuple' object has no attribute 'persist'

In [5]:
un, dos, tre, cuatro = temporary_directionOfEffect(path, datasource_filter)

25/01/30 11:53:58 WARN CacheManager: Asked to cache already cached data.
25/01/30 11:53:59 WARN CacheManager: Asked to cache already cached data.


In [6]:
un.show()

[Stage 29:>                                                         (0 + 1) / 1]

+------------------+---------------+-------------+-------------------+--------+----------+------+-----------+----------+---------------------------+---------------------------+---------------------------------+--------------------------------+-----------------+-------------+-------------+----------+--------------------+------------------+--------+-------------+---------------------+--------------+-----------------+--------+----------------+---------------+----------+--------+-------------------+-------------------+----------------+--------------------+-------------------+-------------------------+-------------------------------------+-------------------------------------+--------------+------+------------+-------------------+-----------------+------------------------+-----------------------+-----------------------------+---------------------+--------------------+----------------------------+-------------------+--------------------+------------+--------------------------------+---------

                                                                                

In [None]:
print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    prueba_assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

assessment = prueba_assessment.unionByName(
    gwasCredibleAssoc.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)

print("defining non propagated,propagated and analysis_drugs functions")


def analysis_nonPropagated(assessment, analysisDatasources):
    return discrepancifier(
        assessment.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


def analysis_propagated(assessment, analysisDatasources):
    return discrepancifier(
        assessment.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


chembl_ds = ["chembl"]


def analysis_drugs(assessment, chembl_ds):
    return discrepancifier(
        assessment.filter((F.col("datasourceId").isin(chembl_ds)))
        .join(
            drugApproved.filter(F.col("isApproved") == 1),
            on=["targetId", "diseaseId", "drugId"],
            how="left",
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .withColumn(
            "approvedDrug",
            F.max(F.col("isApproved")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .withColumn(
            "approved",
            F.when(F.col("approvedDrug") == 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase", "approved")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


##### approved dataset added 30.01.2025
drugApproved = (
    spark.read.parquet("gs://ot-team/irene/l2g/validation/chembl_w_flags")
    .drop("clinicalTrialId", "isComplex")
    .withColumn(
        "isApproved",
        F.when(F.col("isApproved") == "true", F.lit(1)).otherwise(F.lit(0)),
    )
    .distinct()
)


analysis_chembl = analysis_drugs(assessment, chembl_ds)

#######
## include here the analysis
#######

analysisDatasources = []

print("defining full_analysis_propagation")


def full_analysis_propagation(
    assessment, analysisDatasources, analysis_chembl, terminated_array, diseaseTA
):
    return (
        analysis_propagated(assessment, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "approved",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        #### Should remove the coherencyDiagonal.isNotNull()
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "approved",
            F.when(F.col("approved") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .persist()
    )


#####
## no propag
#####
print("defining full analysis no propagation")


def full_analysis_noPropagation(
    assessment, analysisDatasources, analysis_chembl, terminated_array, diseaseTA
):
    return (
        analysis_nonPropagated(assessment, analysisDatasources)
        .join(
            analysis_chembl.selectExpr(
                "targetId",
                "diseaseId",
                "maxClinPhase",
                "coherencyDiagonal as coherencyDiagonal_ch",
                "coherencyOneCell as coherencyOneCell_ch",
                "LoF_protect as LoF_protect_ch",
                "GoF_protect as GoF_protect_ch",
            ),
            on=["targetId", "diseaseId"],
            how="right",
        )
        .withColumn(
            "geneticEvidence",
            F.when(
                F.col("coherencyDiagonal").isNotNull(), F.lit("hasGeneticEvidence")
            ).otherwise(F.lit("noGeneticEvidence")),
        )
        # .filter(F.col("coherencyDiagonal_ch").isNotNull())
        .withColumn(
            "diagonalAgreeWithDrugs",
            F.when(
                (F.col("coherencyDiagonal_ch") == "coherent")
                & (F.col("coherencyDiagonal") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        F.col("GoF_risk").isNotNull() | F.col("LoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .when(
                    F.col("GoF_protect_ch").isNotNull()
                    & (
                        F.col("LoF_risk").isNotNull() | F.col("GoF_protect").isNotNull()
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "oneCellAgreeWithDrugs",
            F.when(
                (F.col("coherencyOneCell_ch") == "coherent")
                & (F.col("coherencyOneCell") == "coherent"),
                F.when(
                    (F.col("LoF_protect_ch").isNotNull())
                    & (
                        (F.col("LoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("GoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .when(
                    (F.col("GoF_protect_ch").isNotNull())
                    & (
                        (F.col("GoF_protect").isNotNull())
                        & (F.col("LoF_risk").isNull())
                        & (F.col("LoF_protect").isNull())
                        & (F.col("GoF_risk").isNull())
                    ),
                    F.lit("coherent"),
                )
                .otherwise(F.lit("dispar")),
            ),
        )
        .withColumn(
            "Phase4",
            F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=3",
            F.when(F.col("maxClinPhase") >= 3, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=2",
            F.when(F.col("maxClinPhase") >= 2, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase>=1",
            F.when(F.col("maxClinPhase") >= 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "Phase0",
            F.when(F.col("maxClinPhase") == 0, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "approved",
            F.when(F.col("approved") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(terminated_array, on=["targetId", "diseaseId"], how="left")
        .withColumn(
            "PhaseT",
            F.when(F.col("prediction") == "yes", F.lit("yes")).otherwise(F.lit("no")),
        )
        .join(
            diseaseTA.select("diseaseId", "taLabelSimple"), on="diseaseId", how="left"
        )
        .withColumn(
            "hasGeneticEvidence",
            F.when(
                F.col("geneticEvidence") == "hasGeneticEvidence", F.lit("yes")
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "diagonalYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("diagonalAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("diagonalAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .withColumn(
            "oneCellYes",
            F.when(
                F.col("hasGeneticEvidence") == "yes",
                F.when(F.col("oneCellAgreeWithDrugs") == "coherent", F.lit("yes"))
                .when(F.col("oneCellAgreeWithDrugs") == "dispar", F.lit("no"))
                .otherwise(F.lit("no")),
            ).otherwise(F.lit("no")),
        )
        .persist()
    )


print("moving to Step 3")

from functions import relative_success, spreadSheetFormatter, convertTuple
import re
import pandas as pd
import numpy as np
from scipy.stats import fisher_exact
from scipy.stats.contingency import odds_ratio, relative_risk

full_data = spark.createDataFrame(
    data=[
        ("yes", "yes"),
        ("yes", "no"),
        ("no", "yes"),
        ("no", "no"),
    ],
    schema=StructType(
        [
            StructField("prediction", StringType(), True),
            StructField("comparison", StringType(), True),
        ]
    ),
)
c = datetime.now()
print("starting dictionaries at", c)

#### continue here on 10.07.2024

## 1nd dictionary
dfs_dict = {}  ### checked and changed on 01.06.2023
dfs_dict_propag = {}


wocgc_list = [
    "gene_burden",
    "intogen",
    "eva",
    "eva_somatic",
    # "ot_genetics_portal",
    "impc",
    "orphanet",
    "gene2phenotype",
    "gwas_credible_set",
]
datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "ot_genetics_portal",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
    "WOcgc",
    "somatic",
    "germline",
]

germline_list = [
    "gene_burden",
    "eva",
    # "ot_genetics_portal",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]

somatic_list = ["intogen", "cancer_gene_census", "eva_somatic"]

# assessment = prueba_assessment.filter(F.col("datasourceId").isin(datasources_analysis))


def dataset_builder(assessment, value, analysis_chembl, terminated_array, diseaseTA):
    nonPropagated = full_analysis_noPropagation(
        assessment, value, analysis_chembl, terminated_array, diseaseTA
    )
    propagated = full_analysis_propagation(
        assessment, value, analysis_chembl, terminated_array, diseaseTA
    )
    return (
        # Non propagation
        ## All
        nonPropagated,
        ## Other
        nonPropagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        nonPropagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        nonPropagated.filter(F.col("taLabelSimple") == "Oncology"),
        # Propagation
        ## All
        propagated,
        ## Other
        propagated.filter(F.col("taLabelSimple") == "Other"),
        ## Other&Null
        propagated.filter(
            (F.col("taLabelSimple").isNull()) | (F.col("taLabelSimple") == "Other")
        ),
        ## Oncology
        propagated.filter(F.col("taLabelSimple") == "Oncology"),
    )


for value in datasource_list:
    print(value)
    if value == "WOcgc":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment, wocgc_list, analysis_chembl, terminated_array, diseaseTA
        )
    elif value == "germline":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment,
            germline_list,
            analysis_chembl,
            terminated_array,
            diseaseTA,
        )

    elif value == "somatic":
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment,
            somatic_list,
            analysis_chembl,
            terminated_array,
            diseaseTA,
        )

    else:
        (
            dfs_dict[f"df_{value}_All_original"],
            dfs_dict[f"df_{value}_Other_original"],
            dfs_dict[f"df_{value}_OtherNull_original"],
            dfs_dict[f"df_{value}_Oncology_original"],
            dfs_dict_propag[f"df_{value}_All_propag"],
            dfs_dict_propag[f"df_{value}_Other_propag"],
            dfs_dict_propag[f"df_{value}_OtherNull_propag"],
            dfs_dict_propag[f"df_{value}_Oncology_propag"],
        ) = dataset_builder(
            assessment, value, analysis_chembl, terminated_array, diseaseTA
        )


def comparisons_df() -> list:
    """Return list of all comparisons to be used in the analysis"""
    comparisons = spark.createDataFrame(
        data=[
            ("hasGeneticEvidence", "byDatatype"),
            ("diagonalYes", "byDatatype"),
            ("oneCellYes", "byDatatype"),
        ],
        schema=StructType(
            [
                StructField("comparison", StringType(), True),
                StructField("comparisonType", StringType(), True),
            ]
        ),
    )

    predictions = spark.createDataFrame(
        data=[
            ("Phase4", "clinical"),
            ("Phase>=3", "clinical"),
            ("Phase>=2", "clinical"),
            ("Phase>=1", "clinical"),
            ("PhaseT", "clinical"),
            ("approved", "clinical"),
        ]
    )
    return comparisons.join(predictions, how="full").collect()


result = []
result_st = []
result_ci = []
array2 = []
results = []


def aggregations_original(
    df,
    data,
    listado,
    comparisonColumn,
    comparisonType,
    predictionColumn,
    predictionType,
    today_date,
):

    wComparison = Window.partitionBy(comparisonColumn)
    wPrediction = Window.partitionBy(predictionColumn)
    wPredictionComparison = Window.partitionBy(comparisonColumn, predictionColumn)

    uniqIds = df.select("targetId", "diseaseId").distinct().count()

    out = (
        df.withColumn("comparisonType", F.lit(comparisonType))
        .withColumn("predictionType", F.lit(predictionType))
        .withColumn("total", F.lit(uniqIds))
        .withColumn("a", F.count("targetId").over(wPredictionComparison))
        .withColumn(
            "predictionTotal",
            F.count("targetId").over(wPrediction),
        )
        .withColumn(
            "comparisonTotal",
            F.count("targetId").over(wComparison),
        )
        .select(
            F.col(predictionColumn).alias("prediction"),
            F.col(comparisonColumn).alias("comparison"),
            "comparisonType",
            "predictionType",
            "a",
            "predictionTotal",
            "comparisonTotal",
            "total",
        )
        .filter(F.col("prediction").isNotNull())
        .filter(F.col("comparison").isNotNull())
        .distinct()
    )

    out.write.mode("overwrite").parquet(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )

    filePath = "gs://ot-team/jroldan/" + str(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    listado.append(
        "gs://ot-team/jroldan/"
        + str(
            today_date
            + "_"
            + "analysis/"
            + data
            # + "_propagated"
            + "/"
            + comparisonColumn
            + "_"
            + predictionColumn
            + ".parquet"
        )
    )
    print(
        today_date
        + "_"
        + "analysis/"
        + data
        # + "_propagated"
        + "/"
        + comparisonColumn
        + "_"
        + predictionColumn
        + ".parquet"
    )
    c = datetime.now()
    c.strftime("%H:%M:%S")
    print(c)

    array1 = np.delete(
        out.join(full_data, on=["prediction", "comparison"], how="outer")
        .groupBy("comparison")
        .pivot("prediction")
        .agg(F.first("a"))
        .sort(F.col("comparison").desc())
        .select("comparison", "yes", "no")
        .fillna(0)
        .toPandas()
        .to_numpy(),
        [0],
        1,
    )
    total = np.sum(array1)
    res_npPhaseX = np.array(array1, dtype=int)
    resX = convertTuple(fisher_exact(res_npPhaseX, alternative="two-sided"))
    resx_CI = convertTuple(
        odds_ratio(res_npPhaseX).confidence_interval(confidence_level=0.95)
    )

    result_st.append(resX)
    result_ci.append(resx_CI)
    (rs_result, rs_ci) = relative_success(array1)

    results.append(
        [
            comparisonType,
            comparisonColumn,
            predictionColumn,
            round(float(resX.split(",")[0]), 2),
            float(resX.split(",")[1]),
            round(float(resx_CI.split(",")[0]), 2),
            round(float(resx_CI.split(",")[1]), 2),
            str(total),
            np.array(res_npPhaseX).tolist(),
            round(float(rs_result), 2),
            round(float(rs_ci[0]), 2),
            round(float(rs_ci[1]), 2),
            filePath,
        ]
    )
    return results


c = datetime.now()

print("start doing aggregations and writing")
today_date = str(date.today())
aggSetups_original = comparisons_df()
listado = []

print("starting with non-propagated aggregations at", c)

for key, df in dfs_dict.items():
    df = df.persist()
    for row in aggSetups_original:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

print("non propagated files wroten succesfully at", c)

print("starting with non-propagated aggregations at", c)
for key, df in dfs_dict_propag.items():
    df = df.persist()
    for row in aggSetups_original:
        aggregations_original(df, key, listado, *row, today_date)
    df.unpersist()
    print(key + " df unpersisted")

#### make the DoE predicting drug success for T-D with multiple MoA in drugs

In [48]:
#### 11.12.2024
#######
##########     ATENTION
#### change code to work with generated dataframe instead of reading the parquet

"""
This scripts run Odds ratio analysis for DoE and 
genetic information on drug clinical success

"""
from functions import (
    discrepancifier,
    build_gwasResolvedColoc,
    temporary_directionOfEffect,
)
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    ArrayType,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
from datetime import datetime


spark = SparkSession.builder.getOrCreate()
c = datetime.now()
today_date = str(date.today())
print("spark session created at", c)

print("Analysis started on " + today_date + " at ", c)
"""
#coloc = spark.read.parquet(
#    "gs://genetics-portal-dev-data/22.09.1/outputs/v2d_coloc"
#).filter(F.col("right_type") != "gwas")
"""

#### make the dataset from stopped clin trials
### read supplementary table 9
""" ### just showing how i did the dataset
st9 = spark.read.csv("/Users/juanr/Downloads/ST9.csv", sep=",", header=True)
st9.filter(
    (F.col("clinicalStatus").isin(["Terminated", "Withdrawn", "Suspended"]))
    & (F.col("prediction") == "Negative")
).groupBy(
    "targetId", "diseaseId", "clinicalStatus", "prediction"
).count().toPandas().to_csv(
    "targetDiseaseStoppedNegative.csv"
)
"""
### target-diseases terminated&withdrawal in clin trials
terminated = spark.read.csv(
    "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv",
    sep=",",
    header=True,
).drop("_c0", "Withdrawn")

path = "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/"

evidences = (
    spark.read.parquet(f"{path}evidence")
    .filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
                "gene_burden",
                "eva",
                "eva_somatic",
                "gene2phenotype",
                "orphanet",
                "cancer_gene_census",
                "intogen",
                "impc",
                "chembl",
            ]
        )
    )
    .persist()
)
ot_genetics = evidences.filter(F.col("datasourceId") == "ot_genetics_portal")

#### Now load sources of data to generate credible_set_OT_genetics evidences and associations.

target = spark.read.parquet(f"{path}targets/")

diseases = spark.read.parquet(f"{path}diseases/")

gwasResolvedColoc = build_gwasResolvedColoc(path)

#### take the direction from the lowest p value
window_spec = Window.partitionBy("targetId", "diseaseId").orderBy(
    F.col("pValueExponent").asc()
)
gwasCredibleAssoc = (
    gwasResolvedColoc.withColumn(
        "homogenized", F.first("colocDoE", ignorenulls=True).over(window_spec)
    )  ## added 30.01.2025
    .select("targetId", "diseaseId", "homogenized")
    .withColumn(
        "homogenized",
        F.when(F.col("homogenized").isNull(), F.lit("noEvaluable")).otherwise(
            F.col("homogenized")
        ),
    )
)

### datasource_filter for temporaryDoE
datasource_filter = [
    "ot_genetics_portal",
    "gene_burden",
    "eva",
    "eva_somatic",
    "gene2phenotype",
    "orphanet",
    "cancer_gene_census",
    "intogen",
    "impc",
    "chembl",
]

prueba_assessment, evidences, actionType, oncolabel = temporary_directionOfEffect(
    path, datasource_filter
)

print("Moving to step 2")

columns_chembl = ["LoF_protect", "GoF_protect"]
columns_dataset = ["LoF_protect", "GoF_protect", "LoF_risk", "GoF_risk", "evidenceDif"]
columns = ["GoF_risk", "LoF_protect", "LoF_risk", "GoF_protect"]
terms = ["noEvaluable", "bivalent_risk", "null", "dispar"]

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)

#### give us propagation of diseases and list of therapeutic areas associated
diseases2 = diseases.select("id", "parents").withColumn(
    "diseaseIdPropagated",
    F.explode_outer(F.concat(F.array(F.col("id")), F.col("parents"))),
)

chembl_trials = (
    prueba_assessment.filter((F.col("datasourceId").isin(["chembl"])))
    .groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
)

terminated_array = (
    terminated.groupBy("targetId", "diseaseId")
    .agg(F.collect_set("clinicalStatus").alias("clinicalStatus"))
    .withColumn("prediction", F.when(F.col("clinicalStatus").isNotNull(), F.lit("yes")))
)

assessment = prueba_assessment.unionByName(
    gwasCredibleAssoc.withColumn("datasourceId", F.lit("gwas_credible_set")),
    allowMissingColumns=True,
)

print("defining non propagated,propagated and analysis_drugs functions")


def analysis_nonPropagated(assessment, analysisDatasources):
    return discrepancifier(
        assessment.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


def analysis_propagated(assessment, analysisDatasources):
    return discrepancifier(
        assessment.filter(F.col("datasourceId").isin(analysisDatasources))
        .withColumn(
            "datasources",
            F.collect_set(F.col("datasourceId")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .join(
            diseases2.selectExpr("id as diseaseId", "diseaseIdPropagated"),
            on="diseaseId",
            how="left",
        )
        .withColumnRenamed("diseaseId", "oldDiseaseId")
        .withColumnRenamed("diseaseIdPropagated", "diseaseId")
        .groupBy(
            "targetId",
            "diseaseId",
        )
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


chembl_ds = ["chembl"]
drugApproved = (  ### added on 30.01.2025
    spark.read.parquet("gs://ot-team/irene/l2g/validation/chembl_w_flags")
    .drop("clinicalTrialId", "isComplex")
    .withColumn(
        "isApproved",
        F.when(F.col("isApproved") == "true", F.lit(1)).otherwise(F.lit(0)),
    )
    .distinct()
)


def analysis_drugs(assessment, chembl_ds, drugApproved):
    return discrepancifier(
        assessment.filter((F.col("datasourceId").isin(chembl_ds)))
        .join(
            drugApproved.filter(F.col("isApproved") == 1),
            on=["targetId", "diseaseId", "drugId"],
            how="left",
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .withColumn(
            "approvedDrug",
            F.max(F.col("isApproved")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .withColumn(
            "approved",
            F.when(F.col("approvedDrug") == 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase", "approved")
        .pivot("homogenized")
        .agg(F.count("targetId"))
        .persist()
    )


analysis_chembl = analysis_drugs(assessment, chembl_ds, drugApproved)

taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

### give us a classification of Oncology VS non oncology
wByDisease = Window.partitionBy("diseaseId")  #### checked 31.05.2023
diseaseTA = (
    diseases.withColumn("taId", F.explode("therapeuticAreas"))
    .select(F.col("id").alias("diseaseId"), "taId", "parents")
    .join(taDf, on="taId", how="left")
    .withColumn("minRank", F.min("taRank").over(wByDisease))
    .filter(F.col("taRank") == F.col("minRank"))
    .drop("taRank", "minRank")
)


def analysis_drugs_dispar(assessment, chembl_ds):
    return discrepancifier(
        assessment.filter((F.col("datasourceId").isin(chembl_ds)))
        .withColumn(
            "maxClinPhase_moa",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId", "homogenized")
            ),
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.first(F.col("maxClinPhase_moa")).alias("doe"))
        .persist()
    )


analysis_chembl_dispar = analysis_drugs_dispar(assessment, chembl_ds)


def analysis_drugs_dispar_approved(assessment, chembl_ds, drugApproved):
    return discrepancifier(
        assessment.filter((F.col("datasourceId").isin(chembl_ds)))
        .join(
            drugApproved.filter(F.col("isApproved") == 1),
            on=["targetId", "diseaseId", "drugId"],
            how="left",
        )
        .withColumn(
            "approvedDrug",
            F.max(F.col("isApproved")).over(
                Window.partitionBy("targetId", "diseaseId", "homogenized")
            ),
        )
        .withColumn(
            "approved",
            F.when(F.col("approvedDrug") == 1, F.lit("yes")).otherwise(F.lit("no")),
        )
        .withColumn(
            "maxClinPhase",
            F.max(F.col("clinicalPhase")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy("targetId", "diseaseId", "maxClinPhase")
        .pivot("homogenized")
        .agg(F.first(F.col("approved")).alias("doe"))
        .persist()
    )


drugApp = (
    analysis_drugs_dispar_approved(assessment, chembl_ds, drugApproved)
    .filter(F.col("coherencyDiagonal") == "dispar")
    .withColumnRenamed("GoF_protect", "GoF_protectApp")
    .withColumnRenamed("LoF_protect", "LoF_protectApp")
).select("targetId", "diseaseId", "maxClinPhase", "GoF_protectApp", "LoF_protectApp")


print("there are the same numbers of T-D:", drugApp.count())

disparDrug = analysis_chembl_dispar.filter(
    F.col("coherencyDiagonal") == "dispar"
).selectExpr(
    "targetId",
    "diseaseId",
    "GoF_protect as GoF_protect_ch",
    "LoF_protect as LoF_protect_ch",
)
print("there are same numbers of T-D:", disparDrug.count())

spark session created at 2025-02-03 16:22:40.625638
Analysis started on 2025-02-03 at  2025-02-03 16:22:40.625638


25/02/03 16:22:42 WARN CacheManager: Asked to cache already cached data.


read spark files
fixing scXQTL and XQTL studies
fixed scXQTL and XQTL studies
creating new coloc


25/02/03 16:23:28 WARN CacheManager: Asked to cache already cached data.        


creating new gwasResolvedColoc


25/02/03 16:23:29 WARN CacheManager: Asked to cache already cached data.
25/02/03 16:23:30 WARN CacheManager: Asked to cache already cached data.


Moving to step 2
defining non propagated,propagated and analysis_drugs functions


                                                                                

there are the same numbers of T-D: 1571




there are same numbers of T-D: 1571


                                                                                

In [78]:
drugsDisparities = discrepancifier(
    assessment.filter((F.col("datasourceId").isin(chembl_ds)))
    .join(
        drugApproved.filter(F.col("isApproved") == 1),
        on=["targetId", "diseaseId", "drugId"],
        how="left",
    )
    .withColumn(
        "approvedDrug",
        F.max(F.col("isApproved")).over(
            Window.partitionBy("targetId", "diseaseId", "homogenized")
        ),
    )
    .withColumn(
        "approved",
        F.when(F.col("approvedDrug") == 1, F.lit("yes")).otherwise(F.lit("no")),
    )
    .withColumn(
        "maxClinPhase",
        F.max(F.col("clinicalPhase")).over(Window.partitionBy("targetId", "diseaseId")),
    )
    .groupBy("targetId", "diseaseId", "maxClinPhase")
    .pivot("approved")
    .agg(F.first(F.col("homogenized")).alias("doe"))
    .persist()
)

                                                                                

In [89]:
datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]
genEvidForDrug = analysis_propagated(assessment, datasource_list)
drugDispar.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "dispar"
).groupBy("maxClinPhase").count().show()

25/02/03 17:28:32 WARN CacheManager: Asked to cache already cached data.        


+------------+-----+
|maxClinPhase|count|
+------------+-----+
|         1.0|   47|
|         4.0|   95|
|         2.0|   93|
|         3.0|   32|
+------------+-----+



### adding warnings 

In [95]:
### load black box warnings
warnings = spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/drugWarnings/"
).select(F.explode_outer("chemblIds").alias("chemblId"), "warningType", "toxicityClass")

In [100]:
drugWarning = (
    warnings.groupBy("chemblId")
    .pivot("warningType")
    .count()
    .withColumnRenamed("chemblId", "drugId")
)

In [98]:
drugWarning.show()

+-------------+-----------------+---------+
+-------------+-----------------+---------+
|CHEMBL2107840|                1|     null|
|CHEMBL1200632|                2|     null|
|    CHEMBL852|                4|     null|
| CHEMBL188921|             null|       10|
|CHEMBL3989522|                1|     null|
|CHEMBL1200879|                2|     null|
|    CHEMBL468|                2|     null|
|CHEMBL2023898|                1|     null|
|   CHEMBL1544|                1|     null|
|CHEMBL3137358|                1|     null|
|CHEMBL2303621|                1|     null|
|CHEMBL1201827|                1|     null|
|    CHEMBL979|             null|        4|
|  CHEMBL87493|                1|        1|
|CHEMBL1200585|                1|     null|
|  CHEMBL24944|             null|        2|
|CHEMBL4594242|             null|        1|
|   CHEMBL1233|             null|        5|
|CHEMBL1201867|                1|     null|
|CHEMBL1201486|                1|     null|
+-------------+-----------------

In [113]:
drugsDisparities = discrepancifier(
    assessment.filter(F.col("datasourceId").isin(chembl_ds))
    .join(drugWarning, on="drugId", how="left")
    .withColumn(
        "BBW",
        F.size(
            F.collect_set(F.col("`Black Box Warning`")).over(
                Window.partitionBy("targetId", "diseaseId", "drugId")
            ),
        ),
    )
    .groupBy("targetId", "diseaseId")
    .pivot("homogenized")
    .agg(F.first("BBW").alias("doe"))
    .persist()
)

In [127]:
drugWarning.show()

+-------------+-----------------+---------+
+-------------+-----------------+---------+
|CHEMBL2107840|                1|     null|
|CHEMBL1200632|                2|     null|
|    CHEMBL852|                4|     null|
| CHEMBL188921|             null|       10|
|CHEMBL3989522|                1|     null|
|CHEMBL1200879|                2|     null|
|    CHEMBL468|                2|     null|
|CHEMBL2023898|                1|     null|
|   CHEMBL1544|                1|     null|
|CHEMBL3137358|                1|     null|
|CHEMBL2303621|                1|     null|
|CHEMBL1201827|                1|     null|
|    CHEMBL979|             null|        4|
|  CHEMBL87493|                1|        1|
|CHEMBL1200585|                1|     null|
|  CHEMBL24944|             null|        2|
|CHEMBL4594242|             null|        1|
|   CHEMBL1233|             null|        5|
|CHEMBL1201867|                1|     null|
|CHEMBL1201486|                1|     null|
+-------------+-----------------

In [128]:
drugsDisparitiesWD = discrepancifier(
    assessment.filter(F.col("datasourceId").isin(chembl_ds))
    .join(drugWarning, on="drugId", how="left")
    .withColumn(
        "WD",
        F.size(
            F.collect_set(F.col("Withdrawn")).over(
                Window.partitionBy("targetId", "diseaseId", "drugId")
            ),
        ),
    )
    .groupBy("targetId", "diseaseId")
    .pivot("homogenized")
    .agg(F.first("WD").alias("doe"))
    .persist()
)

                                                                                

In [None]:
### black bos warnings
drugsDisparities.withColumnRenamed("GoF_protect", "GoF_protect_ch").withColumnRenamed(
    "LoF_protect", "LoF_protect_ch"
).withColumnRenamed("coherencyDiagonal", "coherencyDiagonal_ch").drop(
    "GoF_risk", "LoF_risk"
).join(
    genEvidForDrug, on=["targetId", "diseaseId"], how="left"
).filter(
    F.col("coherencyDiagonal").isin(["dispar", "coherent"])
).withColumn(
    "bbw",
    F.when(
        (F.col("GoF_protect_ch") > 0) | (F.col("LoF_protect_ch") > 0), F.lit("yes")
    ).otherwise(F.lit("no")),
).groupBy(
    "coherencyDiagonal"
).pivot(
    "bbw"
).count().show()



+-----------------+----+----+
|coherencyDiagonal|  no| yes|
+-----------------+----+----+
|           dispar| 194|  73|
|         coherent|3284|1243|
+-----------------+----+----+



                                                                                

In [130]:
drugsDisparitiesWD.withColumnRenamed("GoF_protect", "GoF_protect_ch").withColumnRenamed(
    "LoF_protect", "LoF_protect_ch"
).withColumnRenamed("coherencyDiagonal", "coherencyDiagonal_ch").drop(
    "GoF_risk", "LoF_risk"
).join(
    genEvidForDrug, on=["targetId", "diseaseId"], how="left"
).filter(
    F.col("coherencyDiagonal").isin(["dispar", "coherent"])
).withColumn(
    "wd",
    F.when(
        (F.col("GoF_protect_ch") > 0) | (F.col("LoF_protect_ch") > 0), F.lit("yes")
    ).otherwise(F.lit("no")),
).groupBy(
    "coherencyDiagonal"
).pivot(
    "wd"
).count().show()



+-----------------+----+---+
|coherencyDiagonal|  no|yes|
+-----------------+----+---+
|           dispar| 266|  1|
|         coherent|4506| 21|
+-----------------+----+---+



                                                                                

In [None]:
drugsDisparities.withColumnRenamed("GoF_protect", "GoF_protect_ch").withColumnRenamed(
    "LoF_protect", "LoF_protect_ch"
).withColumnRenamed("coherencyDiagonal", "coherencyDiagonal_ch").drop(
    "GoF_risk", "LoF_risk"
).join(
    genEvidForDrug, on=["targetId", "diseaseId"], how="left"
).filter(
    F.col("coherencyDiagonal").isin(["dispar", "coherent"])
).withColumn(
    "bbw",
    F.when(
        (F.col("GoF_protect_ch") > 0) | (F.col("LoF_protect_ch") > 0), F.lit("yes")
    ).otherwise(F.lit("no")),
).groupBy(
    "coherencyDiagonal"
).pivot(
    "bbw"
).count().show()

In [92]:
drugDispar.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "dispar"
).drop("noEvaluable").show()

+---------------+-------------+------------+--------------+--------------+-----------+--------+-----------+--------+-----------------+----------------+
|       targetId|    diseaseId|maxClinPhase|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+------------+--------------+--------------+-----------+--------+-----------+--------+-----------------+----------------+
|ENSG00000140443|MONDO_0015892|         2.0|           2.0|          null|       null|    null|          1|      13|           dispar|          dispar|
|ENSG00000141510|  EFO_0004251|         1.0|          null|           1.0|       null|     186|       null|      14|           dispar|          dispar|
|ENSG00000157388|MONDO_0001422|         2.0|          null|           2.0|       null|       1|       null|       1|           dispar|          dispar|
|ENSG00000165731|  EFO_0002892|         4.0|          null|           4.0|       null|  

In [77]:
disparDrug.selectExpr(
    "targetId",
    "diseaseId",
    "stack(2, 'GoF_protect_ch', GoF_protect_ch, 'LoF_protect_ch', LoF_protect_ch) as (var, val)",
).selectExpr("targetId", "diseaseId", "var as doeDrug", "val as maxPhase").join(
    genEvidForDrug, on=["targetId", "diseaseId"], how="left"
).filter(
    F.col("coherencyDiagonal").isNotNull()
).join(
    diseaseTA.select("diseaseId", "taLabel", "taLabelSimple"),
    on="diseaseId",
    how="left",
).withColumn(
    "agreeGenetic",
    F.when(
        (F.col("doeDrug") == "LoF_protect_ch")
        & ((F.col("LoF_protect").isNotNull()) | (F.col("GoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("doeDrug") == "GoF_protect_ch")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .otherwise(F.lit("notAgree")),
).withColumn(
    "phase4", F.when(F.col("maxPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
).show()



+-------------+---------------+--------------+--------+-----------+--------+-----------+--------+-----------+-----------------+----------------+--------------------+-------------+------------+------+
|    diseaseId|       targetId|       doeDrug|maxPhase|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|             taLabel|taLabelSimple|agreeGenetic|phase4|
+-------------+---------------+--------------+--------+-----------+--------+-----------+--------+-----------+-----------------+----------------+--------------------+-------------+------------+------+
|MONDO_0005148|ENSG00000112164|LoF_protect_ch|     3.0|          9|    null|       null|       4|          2|         coherent|          dispar|    pancreas disease|        Other|    notAgree|    no|
|MONDO_0005148|ENSG00000112164|GoF_protect_ch|     4.0|          9|    null|       null|       4|          2|         coherent|          dispar|    pancreas disease|        Other|       agree|   yes|


                                                                                

In [54]:
datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]
genEvidForDrug = analysis_propagated(assessment, datasource_list).filter(
    F.col("coherencyDiagonal") == "coherent"
)

drugDispar = analysis_drugs_dispar(assessment, chembl_ds).selectExpr(
    "targetId",
    "diseaseId",
    "maxClinPhase",
    "GoF_protect as GoF_protect_ch",
    "LoF_protect as LoF_protect_ch",
)

25/02/03 16:33:12 WARN CacheManager: Asked to cache already cached data.        
25/02/03 16:33:13 WARN CacheManager: Asked to cache already cached data.
25/02/03 16:33:13 WARN CacheManager: Asked to cache already cached data.
25/02/03 16:33:13 WARN CacheManager: Asked to cache already cached data.


#### rows per dif doe 

In [50]:
datasource_list = [
    "gene_burden",
    "intogen",
    "cancer_gene_census",
    "eva",
    "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]
genEvidForDrug = analysis_propagated(assessment, datasource_list).filter(
    F.col("coherencyDiagonal") == "coherent"
)

drugDispar = analysis_drugs_dispar_approved(
    assessment, chembl_ds, drugApproved
).selectExpr(
    "targetId",
    "diseaseId",
    "maxClinPhase",
    "GoF_protect as GoF_protect_ch",
    "LoF_protect as LoF_protect_ch",
)

25/02/03 16:29:22 WARN CacheManager: Asked to cache already cached data.        
25/02/03 16:29:23 WARN CacheManager: Asked to cache already cached data.        
25/02/03 16:29:23 WARN CacheManager: Asked to cache already cached data.
25/02/03 16:29:23 WARN CacheManager: Asked to cache already cached data.


In [None]:
drugDispar.show()

81435

In [40]:
drugDispar.groupBy("targetId", "diseaseId").pivot("homogenized").count().show()

+---------------+-------------+-----------+-----------+-----------+
|       targetId|    diseaseId|GoF_protect|LoF_protect|noEvaluable|
+---------------+-------------+-----------+-----------+-----------+
|ENSG00000007314|  EFO_0000555|       null|          1|       null|
|ENSG00000007314|  EFO_0003102|       null|          1|       null|
|ENSG00000007314|  EFO_0003894|       null|          1|       null|
|ENSG00000007314|  EFO_0004699|       null|          1|       null|
|ENSG00000007314|  EFO_0801084|       null|          1|       null|
|ENSG00000007314|  EFO_1000249|       null|          1|       null|
|ENSG00000008018|  EFO_1000453|       null|          1|       null|
|ENSG00000010310|  EFO_0003884|          1|       null|       null|
|ENSG00000012504|  EFO_0004210|          1|       null|       null|
|ENSG00000012504|MONDO_0019052|          1|       null|       null|
|ENSG00000012779|MONDO_0004235|       null|          1|       null|
|ENSG00000014138|  EFO_1001945|       null|     

In [None]:
drugDispar.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).show()

+---------------+-------------+-----------+------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|    diseaseId|homogenized|maxClinPhase|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+-----------+------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000145675|  EFO_0004142|LoF_protect|         1.0|          null|          null|       null|    null|       null|       1|          1|         coherent|        coherent|
|ENSG00000186868|MONDO_0019037|LoF_protect|         2.0|          null|          null|       null|    null|       null|       1|         31|         coherent|        coherent|
|ENSG00000037280|  EFO_0003897|LoF_protect|         2.0|          null|          null|       null|       2|       null| 

In [144]:
assessment.filter((F.col("datasourceId").isin(chembl_ds))).withColumn(
    "minStartDate",
    F.min(F.col("studyStartDate")).over(
        Window.partitionBy("targetId", "diseaseId", "homogenized")
    ),
).withColumn(
    "maxClinPhase_moa",
    F.max(F.col("clinicalPhase")).over(
        Window.partitionBy("targetId", "diseaseId", "homogenized")
    ),
).withColumn(
    "maxStartDate",
    F.when(
        F.col("clinicalPhase") == F.col("maxClinPhase_moa"),
        F.max(F.col("studyStartDate")).over(
            Window.partitionBy("targetId", "diseaseId", "homogenized")
        ),
    ).otherwise(F.lit(None)),
).groupBy(
    "targetId",
    "diseaseId",
    "homogenized",
    "minStartDate",
    "maxStartDate",
    "maxClinPhase_moa",
).count().withColumn(
    "rest", F.col("maxStartDate") - F.col("minStartDate")
).show()



+---------------+-------------+-----------+------------+------------+----------------+-----+----+
|       targetId|    diseaseId|homogenized|minStartDate|maxStartDate|maxClinPhase_moa|count|rest|
+---------------+-------------+-----------+------------+------------+----------------+-----+----+
|ENSG00000000938|  EFO_0000183|LoF_protect|  2008-10-01|  2012-05-01|             1.0|    2|null|
|ENSG00000000938|  EFO_0000519|LoF_protect|  2009-05-01|  2009-10-01|             2.0|    2|null|
|ENSG00000000938|  EFO_0000519|LoF_protect|  2009-05-01|        null|             2.0|    2|null|
|ENSG00000000938|  EFO_0000565|LoF_protect|  2003-11-01|        null|             3.0|   16|null|
|ENSG00000000938|  EFO_0000565|LoF_protect|  2003-11-01|  2022-08-25|             3.0|    2|null|
|ENSG00000000938|  EFO_0000702|LoF_protect|  2007-04-01|  2007-04-01|             2.0|    1|null|
|ENSG00000000938|  EFO_0002918|LoF_protect|  2017-07-07|  2017-07-07|             1.0|    1|null|
|ENSG00000000938|  E

                                                                                

In [71]:
drugApp.join(
    diseaseTA.select("diseaseId", "taLabelSimple", "taLabel"),
    on="diseaseId",
    how="left",
).join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeAgreeApp",
    F.when(
        (F.col("GoF_protectApp") == "yes")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes")
        & ((F.col("LoF_protect").isNotNull()) | (F.col("GoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("GoF_protectApp") == "no") & (F.col("LoF_protectApp") == "no"),
        F.lit("no"),
    )
    .otherwise(F.lit("notAgree")),
).groupBy(
    "doeAgreeApp"
).count().show()

                                                                                ]

+-----------+-----+
|doeAgreeApp|count|
+-----------+-----+
|   notAgree|   11|
|         no|   80|
|      agree|   26|
+-----------+-----+



                                                                                

In [5]:
drugDispar.filter(F.col("coherencyDiagonal") == "dispar").drop(
    "coherencyDiagonal", "coherencyOneCell"
).join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).join(
    diseaseTA.select("diseaseId", "taLabelSimple", "taLabel"),
    on="diseaseId",
    how="left",
).withColumn(
    "doeMaxClinPhase",
    F.when(F.col("GoF_protect_ch") == F.col("maxClinPhase"), F.lit("GoF_protect"))
    .when(F.col("LoF_protect_ch") == F.col("maxClinPhase"), F.lit("LoF_protect"))
    .when(F.col("GoF_protect_ch") == F.col("LoF_protect_ch"), F.lit("same"))
    .otherwise(F.lit("notConsidered")),
).withColumn(
    "doeAgreeMaxPhase",
    F.when(
        (F.col("doeMaxClinPhase") == "GoF_protect")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("doeMaxClinPhase") == "LoF_protect")
        & ((F.col("LoF_protect").isNotNull()) | (F.col("GoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .otherwise(F.lit("notAgree")),
).join(
    drugApp.drop("maxClinPhase"), on=["targetId", "diseaseId"], how="left"
).withColumn(
    "Phase4", F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
).show()

"""
.groupBy(
    "taLabelSimple", "doeAgreeMaxPhase"
).pivot(
    "Phase4"
).count().select(
    "taLabelSimple", "doeAgreeMaxPhase", "yes", "no"
).sort(
    F.col("taLabelSimple").desc(), F.col("doeAgreeMaxPhase").desc()
).show()
"""



+---------------+-------------+------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+-------------+--------------------+---------------+----------------+--------------+--------------+------+
|       targetId|    diseaseId|maxClinPhase|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|taLabelSimple|             taLabel|doeMaxClinPhase|doeAgreeMaxPhase|GoF_protectApp|LoF_protectApp|Phase4|
+---------------+-------------+------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+-------------+--------------------+---------------+----------------+--------------+--------------+------+
|ENSG00000134460|  EFO_0000198|         3.0|           3.0|           2.0|       null|    null|       null|       2|       null|         coherent|        coherent|     Oncology|cell proliferati

                                                                                

'\n.groupBy(\n    "taLabelSimple", "doeAgreeMaxPhase"\n).pivot(\n    "Phase4"\n).count().select(\n    "taLabelSimple", "doeAgreeMaxPhase", "yes", "no"\n).sort(\n    F.col("taLabelSimple").desc(), F.col("doeAgreeMaxPhase").desc()\n).show()\n'

In [12]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).show()

+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|    diseaseId|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000112164|MONDO_0005148|           4.0|           3.0|          9|    null|       null|       4|          2|         coherent|          dispar|
|ENSG00000132170|  EFO_0002618|           2.0|           1.0|       null|    null|       null|       1|       null|         coherent|        coherent|
|ENSG00000198793|MONDO_0007254|           2.0|           3.0|       null|       1|       null|    null|       null|         coherent|        coherent|
|ENSG00000102468|MONDO_0002009|           3.0|           4.0|       null|    null|       null|

In [7]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).show()

+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+
|       targetId|    diseaseId|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|maxClinPhase|GoF_protectApp|LoF_protectApp|
+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+
|ENSG00000112164|MONDO_0005148|           4.0|           3.0|          9|    null|       null|       4|          2|         coherent|          dispar|         4.0|           yes|            no|
|ENSG00000132170|  EFO_0002618|           2.0|           1.0|       null|    null|       null|       1|       null|         coherent|        coherent|         2.0|            no|            no|
|ENSG00000198793|MONDO_0007254

In [6]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).show()

+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+
|       targetId|    diseaseId|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|maxClinPhase|GoF_protectApp|LoF_protectApp|
+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+
|ENSG00000112164|MONDO_0005148|           4.0|           3.0|          9|    null|       null|       4|          2|         coherent|          dispar|         4.0|           yes|            no|
|ENSG00000132170|  EFO_0002618|           2.0|           1.0|       null|    null|       null|       1|       null|         coherent|        coherent|         2.0|            no|            no|
|ENSG00000198793|MONDO_0007254

In [17]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeMaxClinPhase",
    F.when(F.col("GoF_protect_ch") == F.col("maxClinPhase"), F.lit("GoF_protect"))
    .when(F.col("LoF_protect_ch") == F.col("maxClinPhase"), F.lit("LoF_protect"))
    .when(F.col("GoF_protect_ch") == F.col("LoF_protect_ch"), F.lit("same"))
    .otherwise(F.lit("notConsidered")),
).withColumn(
    "doeAgreeMaxPhase",
    F.when(
        (F.col("doeMaxClinPhase") == "GoF_protect")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("doeMaxClinPhase") == "LoF_protect")
        & ((F.col("LoF_protect").isNotNull()) | (F.col("GoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .otherwise(F.lit("notAgree")),
).withColumn(
    "Phase4", F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
).groupBy(
    "doeAgreeMaxPhase"
).pivot(
    "Phase4"
).count().show()



+----------------+---+---+
|doeAgreeMaxPhase| no|yes|
+----------------+---+---+
|        notAgree| 21| 22|
|           agree| 40| 34|
+----------------+---+---+



                                                                                

In [19]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeMaxClinPhase",
    F.when(F.col("GoF_protect_ch") == F.col("maxClinPhase"), F.lit("GoF_protect"))
    .when(F.col("LoF_protect_ch") == F.col("maxClinPhase"), F.lit("LoF_protect"))
    .when(F.col("GoF_protect_ch") == F.col("LoF_protect_ch"), F.lit("same"))
    .otherwise(F.lit("notConsidered")),
).withColumn(
    "doeAgreeMaxPhase",
    F.when(
        (F.col("doeMaxClinPhase") == "GoF_protect")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("doeMaxClinPhase") == "LoF_protect")
        & ((F.col("LoF_protect").isNotNull()) | (F.col("GoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .otherwise(F.lit("notAgree")),
).show()

+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+---------------+----------------+
|       targetId|    diseaseId|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|maxClinPhase|GoF_protectApp|LoF_protectApp|doeMaxClinPhase|doeAgreeMaxPhase|
+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+---------------+----------------+
|ENSG00000112164|MONDO_0005148|           4.0|           3.0|          9|    null|       null|       4|          2|         coherent|          dispar|         4.0|           yes|            no|    GoF_protect|           agree|
|ENSG00000132170|  EFO_0002618|           2.0|           1.0|       null|    null|       nul

In [29]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeMaxClinPhase",
    F.when(F.col("GoF_protect_ch") == F.col("maxClinPhase"), F.lit("GoF_protect"))
    .when(F.col("LoF_protect_ch") == F.col("maxClinPhase"), F.lit("LoF_protect"))
    .when(F.col("GoF_protect_ch") == F.col("LoF_protect_ch"), F.lit("same"))
    .otherwise(F.lit("notConsidered")),
).withColumn(
    "doeAgreeMaxPhase",
    F.when(
        (F.col("doeMaxClinPhase") == "GoF_protect")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .when(
        (F.col("doeMaxClinPhase") == "LoF_protect")
        & ((F.col("LoF_protect").isNotNull()) | (F.col("GoF_risk").isNotNull())),
        F.lit("agree"),
    )
    .otherwise(F.lit("notAgree")),
).withColumn(
    "Phase4", F.when(F.col("maxClinPhase") == 4, F.lit("yes")).otherwise(F.lit("no"))
).withColumn(
    "doeAdvancedApp",
    F.when(
        (F.col("GoF_protectApp") == "yes")
        & (F.col("LoF_protectApp") == "no")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("agreeAndApproved"),
    )
    .when(
        (F.col("GoF_protectApp") == "no")
        & (F.col("LoF_protectApp") == "yes")
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("agreeAndApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no")
        & (F.col("GoF_protectApp") == "no")
        & (F.col("doeAgreeMaxPhase") == "agree"),
        F.lit("AgreeNoApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no")
        & (F.col("GoF_protectApp") == "no")
        & (F.col("doeAgreeMaxPhase") == "notAgree"),
        F.lit("NotAgreeNotApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes")
        & (F.col("GoF_protectApp") == "no")
        & (F.col("doeAgreeMaxPhase") == "agree"),
        F.lit("agreeAndApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no")
        & (F.col("GoF_protectApp") == "yes")
        & (F.col("doeAgreeMaxPhase") == "agree"),
        F.lit("agreeAndApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes")
        & (F.col("GoF_protectApp") == "no")
        & (F.col("doeAgreeMaxPhase") == "notAgree"),
        F.lit("agreeAndApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no")
        & (F.col("GoF_protectApp") == "yes")
        & (F.col("doeAgreeMaxPhase") == "notAgree"),
        F.lit("NotAgreeNotApproved"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes") & (F.col("GoF_protectApp") == "yes"),
        F.lit("agreeAndApproved"),
    )
    .otherwise(F.lit("notConsidered")),
).show()
#### here adding the columns

+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+---------------+----------------+------+-------------------+
|       targetId|    diseaseId|GoF_protect_ch|LoF_protect_ch|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|maxClinPhase|GoF_protectApp|LoF_protectApp|doeMaxClinPhase|doeAgreeMaxPhase|Phase4|     doeAdvancedApp|
+---------------+-------------+--------------+--------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+------------+--------------+--------------+---------------+----------------+------+-------------------+
|ENSG00000112164|MONDO_0005148|           4.0|           3.0|          9|    null|       null|       4|          2|         coherent|          dispar|         4.0|           yes|            no|    GoF_protect|           agree|   yes|   ag

In [None]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeAdvanced",
    F.when(  ### doe agreement and advanced
        (F.col("GoF_protect_ch") > F.col("LoF_protect_ch"))
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("AgreeAndAdvanced"),
    )
    .when(  ### doE agreement and advanced
        (F.col("LoF_protect_ch") > F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("AgreeAndAdvanced"),
    )
    .when(  ### doE NO agreement and advanced
        (F.col("LoF_protect_ch") < F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("NoAgreeAdvanced"),
    )
    .when(
        (F.col("GoF_protect_ch") < F.col("LoF_protect_ch"))
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("NoAgreeAdvanced"),
    )
    .when(  ###
        (F.col("LoF_protect_ch") == F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("noDif"),
    )
    .otherwise(F.lit("behind")),
).withColumn(
    "doeAdvancedApp",
    F.when(
        (F.col("GoF_protectApp") == "yes")
        & (F.col("LoF_protectApp") == "no")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("approved"),
    )
    .when(
        (F.col("GoF_protectApp") == "no")
        & (F.col("LoF_protectApp") == "yes")
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("approved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no") & (F.col("GoF_protectApp") == "no"),
        F.lit("noDif"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes") & (F.col("GoF_protectApp") == "yes"),
        F.lit("noDif"),
    )
    .otherwise(F.lit("behind")),
).withColumn(
    "Phase4",
    F.when(
        (F.col("GoF_protect_ch") == 4) | (F.col("LoF_protect") == 4),
        F.when((F.col("doeAdvanced") == "advanced"), F.lit("yes"))
        .when(F.col("doeAdvanced") == "noDif", F.lit("yes"))
        .when(F.col("doeAdvanced") == "behind", F.lit("no")),
    ).otherwise(F.lit("no")),
).withColumn(
    "doeAgree",
    F.when(F.col("doeAdvanced").isin(["advanced", "noDif"]), F.lit("yes")).otherwise(
        F.lit("no")
    ),
).groupBy(
    "doeAdvanced"
).count().show()


"""
.select(
    "GoF_protect_ch",
    "LoF_protect_ch",
    "GoF_protect",
    "GoF_risk",
    "LoF_protect",
    "LoF_risk",
    "doeAdvanced",
    "Phase4",
    "doeAgree",
).groupBy(
    "doeAgree"
).pivot(
    "Phase4"
).count().show()
"""



+---------------+-----+
|    doeAdvanced|count|
+---------------+-----+
|       advanced|   49|
|advancedNoagree|   35|
|         behind|   25|
|          noDif|    8|
+---------------+-----+



                                                                                

'\n.select(\n    "GoF_protect_ch",\n    "LoF_protect_ch",\n    "GoF_protect",\n    "GoF_risk",\n    "LoF_protect",\n    "LoF_risk",\n    "doeAdvanced",\n    "Phase4",\n    "doeAgree",\n).groupBy(\n    "doeAgree"\n).pivot(\n    "Phase4"\n).count().show()\n'

### to test 

In [10]:
disparDrug.join(genEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeAdvanced",
    F.when(  ### doe agreement and advanced
        (F.col("GoF_protect_ch") > F.col("LoF_protect_ch"))
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("yes"),
    )
    .when(  ### doE agreement and advanced
        (F.col("LoF_protect_ch") > F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("yes"),
    )
    .when(  ### doE NO agreement and advanced
        (F.col("LoF_protect_ch") < F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("no"),
    )
    .when(
        (F.col("GoF_protect_ch") < F.col("LoF_protect_ch"))
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("no"),
    )
    .when(  ###
        (F.col("LoF_protect_ch") == F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("noDif"),
    )
    .when(  ###
        (F.col("LoF_protect_ch") == F.col("GoF_protect_ch"))
        & ((F.col("LoF_risk").isNotNull()) | (F.col("GoF_protect").isNotNull())),
        F.lit("noDif"),
    ),
    #    .otherwise(F.lit("behind")),
).withColumn(
    "doeAdvancedApp",
    F.when(
        (F.col("GoF_protectApp") == "yes")
        & (F.col("LoF_protectApp") == "no")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("approved"),
    )
    .when(
        (F.col("GoF_protectApp") == "no")
        & (F.col("LoF_protectApp") == "yes")
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("approved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no") & (F.col("GoF_protectApp") == "no"),
        F.lit("noDif"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes") & (F.col("GoF_protectApp") == "yes"),
        F.lit("noDif"),
    )
    .otherwise(F.lit("behind")),
).withColumn(
    "Phase4",
    F.when(
        (F.col("GoF_protect_ch") == 4) | (F.col("LoF_protect") == 4),
        F.when((F.col("doeAdvanced") == "advanced"), F.lit("yes"))
        .when(F.col("doeAdvanced") == "noDif", F.lit("yes"))
        .when(F.col("doeAdvanced") == "behind", F.lit("no")),
    ).otherwise(F.lit("no")),
).withColumn(
    "doeAgree",
    F.when(F.col("doeAdvanced").isin(["advanced", "noDif"]), F.lit("yes")).otherwise(
        F.lit("no")
    ),
).groupBy(
    "doeAdvanced"
).count().show()

"""
.select(
    "GoF_protect_ch",
    "LoF_protect_ch",
    "GoF_protect",
    "GoF_risk",
    "LoF_protect",
    "LoF_risk",
    "doeAdvanced",
    "Phase4",
    "doeAgree",
).groupBy(
    "doeAgree"
).pivot(
    "Phase4"
).count().show()
"""



+-----------+-----+
|doeAdvanced|count|
+-----------+-----+
|        yes|   49|
|         no|   35|
|      noDif|   33|
+-----------+-----+



                                                                                

'\n.select(\n    "GoF_protect_ch",\n    "LoF_protect_ch",\n    "GoF_protect",\n    "GoF_risk",\n    "LoF_protect",\n    "LoF_risk",\n    "doeAdvanced",\n    "Phase4",\n    "doeAgree",\n).groupBy(\n    "doeAgree"\n).pivot(\n    "Phase4"\n).count().show()\n'

In [None]:
disparDrug.join(genEvidForDrugDispar, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
)

In [8]:
genEvidForDrugDispar = analysis_propagated(assessment, datasource_list).filter(
    F.col("coherencyDiagonal") == "dispar"
)

25/02/03 10:31:02 WARN CacheManager: Asked to cache already cached data.        


                                                                                

In [None]:
### let's do the same but using just germline variation

In [11]:
datasource_list = [
    "gene_burden",
    # "intogen",
    # "cancer_gene_census",
    "eva",
    # "eva_somatic",
    "gwas_credible_set",
    "impc",
    "orphanet",
    "gene2phenotype",
]
germlineEvidForDrug = analysis_propagated(assessment, datasource_list).filter(
    F.col("coherencyDiagonal") == "coherent"
)

                                                                                

In [12]:
germlineEvidForDrug.show()



+---------------+---------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|       targetId|      diseaseId|GoF_protect|GoF_risk|LoF_protect|LoF_risk|noEvaluable|coherencyDiagonal|coherencyOneCell|
+---------------+---------------+-----------+--------+-----------+--------+-----------+-----------------+----------------+
|ENSG00000198646|  MONDO_0011060|       null|    null|       null|      33|       null|         coherent|        coherent|
|ENSG00000161133|  MONDO_0031520|       null|    null|       null|      13|       null|         coherent|        coherent|
|ENSG00000075043|  MONDO_0100172|       null|    null|       null|       4|          1|         coherent|        coherent|
|ENSG00000227507|    EFO_1001901|       null|    null|       null|       1|       null|         coherent|        coherent|
|ENSG00000186350|  MONDO_0016145|       null|    null|       null|       1|       null|         coherent|        coherent|
|ENSG00000102001

                                                                                

In [119]:
disparDrug.join(germlineEvidForDrug, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal").isNotNull()
).join(drugApp, on=["targetId", "diseaseId"], how="left").filter(
    F.col("coherencyDiagonal") == "coherent"
).withColumn(
    "doeAdvanced",
    F.when(
        (F.col("GoF_protect_ch") > F.col("LoF_protect_ch"))
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("advanced"),
    )
    .when(
        (F.col("LoF_protect_ch") > F.col("GoF_protect_ch"))
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("advanced"),
    )
    .when(
        (F.col("LoF_protect_ch") == F.col("GoF_protect_ch")),
        F.lit("noDif"),
    )
    .otherwise(F.lit("behind")),
).withColumn(
    "doeAdvancedApp",
    F.when(
        (F.col("GoF_protectApp") == "yes")
        & (F.col("LoF_protectApp") == "no")
        & ((F.col("GoF_protect").isNotNull()) | (F.col("LoF_risk").isNotNull())),
        F.lit("approved"),
    )
    .when(
        (F.col("GoF_protectApp") == "no")
        & (F.col("LoF_protectApp") == "yes")
        & ((F.col("GoF_risk").isNotNull()) | (F.col("LoF_protect").isNotNull())),
        F.lit("approved"),
    )
    .when(
        (F.col("LoF_protectApp") == "no") & (F.col("GoF_protectApp") == "no"),
        F.lit("noDif"),
    )
    .when(
        (F.col("LoF_protectApp") == "yes") & (F.col("GoF_protectApp") == "yes"),
        F.lit("noDif"),
    )
    .otherwise(F.lit("behind")),
).groupBy(
    "doeAdvanced", "doeAdvancedApp"
).count().show()



+-----------+--------------+-----+
|doeAdvanced|doeAdvancedApp|count|
+-----------+--------------+-----+
|   advanced|         noDif|   20|
|     behind|        behind|    9|
|      noDif|         noDif|   20|
|   advanced|      approved|    8|
|     behind|         noDif|   19|
|      noDif|      approved|    7|
|      noDif|        behind|    1|
+-----------+--------------+-----+



                                                                                

In [129]:
spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/mechanismOfAction/"
).groupBy("actionType").count().show(200, truncate=False)



+-----------------------------+-----+
|actionType                   |count|
+-----------------------------+-----+
|RNAI INHIBITOR               |21   |
|NEGATIVE ALLOSTERIC MODULATOR|24   |
|CROSS-LINKING AGENT          |27   |
|ANTAGONIST                   |967  |
|BINDING AGENT                |232  |
|EXOGENOUS PROTEIN            |39   |
|PARTIAL AGONIST              |64   |
|ANTISENSE INHIBITOR          |38   |
|PROTEOLYTIC ENZYME           |3    |
|ACTIVATOR                    |70   |
|BLOCKER                      |179  |
|VACCINE ANTIGEN              |19   |
|INHIBITOR                    |3303 |
|SUBSTRATE                    |8    |
|POSITIVE ALLOSTERIC MODULATOR|82   |
|POSITIVE MODULATOR           |36   |
|OTHER                        |31   |
|AGONIST                      |942  |
|OPENER                       |39   |
|INVERSE AGONIST              |15   |
|HYDROLYTIC ENZYME            |9    |
|RELEASING AGENT              |26   |
|MODULATOR                    |106  |
|STABILISER 

                                                                                

In [None]:
spark.read.parquet(
    "gs://open-targets-pre-data-releases/24.12-uo_test-3/output/etl/parquet/mechanismOfAction/"
).groupBy("actionType").count().show(200, truncate=False)