In [1]:
""" this scripts run the analysis for comparing QTL studies, tissues together with therapy areas matched"""

from functions import relative_success, spreadSheetFormatter
from stoppedTrials import terminated_td
from DoEAssessment import directionOfEffect
from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)

# Reuse the active Spark session if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Open Targets Platform release that the platform inputs below are read from.
platform_v = "24.09"

# Locations of the platform ETL outputs for that release.
target_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
disease_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
evidence_path = f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"

target = spark.read.parquet(target_path)
diseases = spark.read.parquet(disease_path)
evidences = spark.read.parquet(evidence_path)

# Colocalisation results (Genetics Portal 22.09.1 output); rows whose
# right-hand study type is "gwas" are excluded.
coloc_path = "gs://genetics-portal-dev-data/22.09.1/outputs/v2d_coloc"
coloc = spark.read.parquet(coloc_path).where(F.col("right_type") != "gwas")


### Load sample sizes (see the accompanying document)
sampleSize = spark.read.option("header", True).csv(
    "gs://ot-team/jroldan/colocSampleSize.csv"
)

### Output of the terminated_td helper for the stopped-trials CSV
### (its exact contents are defined in the stoppedTrials module).
terminated = terminated_td(
    spark, "gs://ot-team/jroldan/analysis/targetDiseaseStoppedNegative.csv"
)

### Load QTL tissues mapped to therapy areas
onto_samples = spark.read.option("header", True).csv(
    "gs://ot-team/jroldan/20240112_mappedSQLtissuesGTP3_5.csv"
)

#### GSEA annotation for hallmark inflammation targets: one row per entry of
#### `geneSymbols`, each flagged with isInflam = "yes".
hallmark_inflammatory = spark.read.json(
    "gs://ot-team/jroldan/analysis/HALLMARK_INFLAMMATORY_RESPONSE.v2023.2.Hs_edited.json"
)
immflam_annot = hallmark_inflammatory.select(
    F.explode_outer("geneSymbols").alias("approvedSymbol"),
    F.lit("yes").alias("isInflam"),
)

#### Build the target annotation for uniqueBetas_analyse: the set of
#### target-class labels per target, plus the inflammation flag.
target_class_labels = (
    target.select(
        "id",
        "approvedSymbol",
        F.explode_outer("targetClass").alias("targetClassEntry"),
    )
    .select("id", "approvedSymbol", F.col("targetClassEntry.label").alias("label"))
    .groupBy("id", "approvedSymbol")
    .agg(F.collect_set("label").alias("label"))
    # NOTE(review): collect_set yields an empty array, not NULL, when there are
    # no labels, so this filter probably keeps every row -- confirm whether an
    # emptiness check was intended.
    .filter(F.col("label").isNotNull())
)

targetType = target_class_labels.select(
    F.col("id").alias("targetIdtargetType"),
    F.col("approvedSymbol"),
    F.col("label").alias("targetType"),
).join(immflam_annot, on="approvedSymbol", how="left")

### Take the ontology annotation of the samples
# Both source columns are split on " - "; the pieces are picked by position.
original_parts = F.split(F.col("original"), " - ")
curated_parts = F.split(F.col("20231207_curated_simplified"), " - ")

samplesOnto = (
    onto_samples.withColumn("right_bio_feature", original_parts[0])
    .withColumn("therapyArea", curated_parts[2])
    .withColumn("EFO", curated_parts[1])
    .withColumn("right_bio_feature2", curated_parts[0])
    # Only the derived columns are needed from here on.
    .drop(
        "curated_simplified",
        "20231207_curated_simplified",
        "original",
        "curated",
        "_c3",
        "_c4",
    )
)

# Building blocks for the beta assessment. The sign of the left variant's beta
# in the right study is read one way for every study except "GTEx-sQTL",
# where it is read the opposite way.
right_study_beta = F.col("left_var_right_study_beta")
beta_positive = right_study_beta > 0
beta_negative = right_study_beta < 0
is_gtex_sqtl = F.col("right_study_id") == "GTEx-sQTL"

coloc2 = (
    coloc.select(
        F.concat_ws("_", "left_chrom", "left_pos", "left_ref", "left_alt").alias(
            "left_locus_id"
        ),
        F.concat_ws("_", "right_chrom", "right_pos", "right_ref", "right_alt").alias(
            "right_locus_id"
        ),
        F.col("left_study").alias("left_study_id"),
        F.col("right_study").alias("right_study_id"),
        F.col("right_gene_id"),
        F.col("coloc_h4"),
        F.col("left_var_right_study_beta"),
        F.col("right_phenotype"),
        F.col("left_type"),
        F.col("right_type"),
        F.col("right_bio_feature"),
        F.col("is_flipped"),
        F.col("left_var_right_study_pval"),
    )
    .withColumn(
        "beta_assessed",
        # gof: positive beta outside GTEx-sQTL, or negative beta within it.
        F.when(
            (beta_positive & ~is_gtex_sqtl) | (beta_negative & is_gtex_sqtl),
            F.lit("gof"),
        )
        # lof: the mirror image of the gof rule.
        .when(
            (beta_negative & ~is_gtex_sqtl) | (beta_positive & is_gtex_sqtl),
            F.lit("lof"),
        )
        # Zero or missing beta (or missing study id) ends up here.
        .otherwise(F.lit("neutral")),
    )
    # Bring in the sample ontology, then drop the raw tissue label used as key.
    .join(samplesOnto, on="right_bio_feature", how="left")
    .drop("right_bio_feature")
)
### Check for disparities: count the distinct beta assessments found for each
### (left locus, left study, right gene) combination.
locus_gene_keys = ["left_locus_id", "left_study_id", "right_gene_id"]

disparities = coloc2.groupBy(*locus_gene_keys).agg(
    F.size(F.collect_set("beta_assessed")).alias("count")
)

### Attach that count to every coloc row, so combinations with contradicting
### assessments (count above 1) can be told apart.
coloc_with_disparities = coloc2.withColumnRenamed(
    "right_bio_feature2", "right_bio_feature"
).join(disparities, on=locus_gene_keys, how="left")
coloc3 = coloc_with_disparities.persist()
#### Run directionOfEffect
# directionOfEffect is imported from DoEAssessment. The code below filters and
# pivots its result on datasourceId, homogenized, directionOnTrait and
# variantEffect, so it presumably returns the evidence annotated with those
# direction-of-effect columns -- TODO confirm against the helper.
prueba_assessment = directionOfEffect(evidences, platform_v)

## Lookup table used to add therapeutic-area names to diseases:
## (taId, taLabel, taLabelSimple). Only "cell proliferation disorder" has the
## simplified label "Oncology"; every other area is labelled "Other".
therapeutic_area_rows = [
    ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
    ("EFO_0005741", "infectious disease", "Other"),
    ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
    ("EFO_0005932", "animal disease", "Other"),
    ("MONDO_0024458", "disease of visual system", "Other"),
    ("EFO_0000319", "cardiovascular disease", "Other"),
    ("EFO_0009605", "pancreas disease", "Other"),
    ("EFO_0010282", "gastrointestinal disease", "Other"),
    ("OTAR_0000017", "reproductive system or breast disease", "Other"),
    ("EFO_0010285", "integumentary system disease", "Other"),
    ("EFO_0001379", "endocrine system disease", "Other"),
    ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
    ("EFO_0009690", "urinary system disease", "Other"),
    ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
    ("MONDO_0021205", "disease of ear", "Other"),
    ("EFO_0000540", "immune system disease", "Other"),
    ("EFO_0005803", "hematologic disease", "Other"),
    ("EFO_0000618", "nervous system disease", "Other"),
    ("MONDO_0002025", "psychiatric disorder", "Other"),
    ("OTAR_0000020", "nutritional or metabolic disease", "Other"),
    ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
    ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
    ("EFO_0000651", "phenotype", "Other"),
    ("EFO_0001444", "measurement", "Other"),
    ("GO_0008150", "biological process", "Other"),
]

# All three columns are nullable strings.
therapeutic_area_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("taId", "taLabel", "taLabelSimple")
    ]
)

taDf = spark.createDataFrame(
    data=therapeutic_area_rows, schema=therapeutic_area_schema
)
#### Prepare for disease propagation (the explode over parents happens later).
# One row per (disease, therapeutic area), labelled through the lookup table.
diseases_by_area = diseases.withColumn(
    "taId", F.explode_outer("therapeuticAreas")
).join(taDf.select("taId", "taLabel"), on="taId", how="left")

# Collapse back to one row per disease, keeping the sets of area ids and labels.
diseases2 = diseases_by_area.groupBy("id", "parents", "name").agg(
    F.collect_set("taId").alias("taId"),
    F.collect_set("taLabel").alias("diseaseTherapyAreas"),
)
#### Load the OT Genetics evidence together with its assessments
delimiter = ","

# One row per (target, disease, variant, study) with one count column per
# directionOnTrait value; the two non-evaluable columns are discarded.
ot_genetics_counts = (
    prueba_assessment.filter(F.col("datasourceId").isin("ot_genetics_portal"))
    .groupBy("targetId", "diseaseId", "variantId", "studyId")
    .pivot("directionOnTrait")
    .count()
    .drop("noEvaluable", "conflict/noEvaluable")
    .persist()
)

# Propagate every row to the disease itself plus each entry of its `parents`
# array; the disease id the evidence came with is kept as oldDiseaseId.
disease_and_parents = F.concat(F.array(F.col("id")), F.col("parents"))

evidence_ot_genetics = (
    ot_genetics_counts.join(diseases2, F.col("diseaseId") == diseases2.id, "left")
    .withColumnRenamed("diseaseId", "oldDiseaseId")
    .withColumn("diseaseId", F.explode_outer(disease_and_parents))
    .drop("id")
)

# Rename the coloc keys to the names used by the evidence table so the two can
# be joined on study, locus and target.
coloc_keyed = coloc3
for _coloc_name, _evidence_name in (
    ("left_study_id", "studyId"),
    ("left_locus_id", "locusId"),
    ("right_gene_id", "targetId"),
):
    coloc_keyed = coloc_keyed.withColumnRenamed(_coloc_name, _evidence_name)

# On the evidence side the variant id plays the role of the locus id.
ot_genetics_by_locus = evidence_ot_genetics.withColumnRenamed("variantId", "locusId")

coloc_otgene = (
    coloc_keyed.join(
        ot_genetics_by_locus, on=["studyId", "locusId", "targetId"], how="left"
    )
    .withColumnRenamed("GoF", "GoF_OT")  # remove
    .withColumnRenamed("LoF", "LoF_OT")  # remove
).persist()

# Every ChEMBL row of the direction-of-effect assessment.
chembl_evidence = prueba_assessment.filter(F.col("datasourceId").isin("chembl"))

# Highest clinical phase per target-disease pair, computed over all ChEMBL rows.
chembl_trials = chembl_evidence.groupBy("targetId", "diseaseId").agg(
    F.max(F.col("clinicalPhase")).alias("maxClinPhase")
)

# Per target-disease pair: one count column per variantEffect (the LoF / GoF
# columns are renamed LoF_Ch / GoF_Ch), using only rows whose `homogenized`
# value is neither "noEvaluable" nor "dispar", plus the maximum clinical phase.
chembl = (
    chembl_evidence.filter(~F.col("homogenized").isin("noEvaluable", "dispar"))
    .groupBy("targetId", "diseaseId")
    .pivot("variantEffect")
    .count()
    .withColumnRenamed("LoF", "LoF_Ch")
    .withColumnRenamed("GoF", "GoF_Ch")
    .join(chembl_trials, on=["targetId", "diseaseId"], how="left")
    .persist()
)

# Keep only the coloc rows whose target-disease pair is also present in the
# ChEMBL table (inner join).
coloc_bnch = coloc_otgene.join(chembl, on=["targetId", "diseaseId"], how="inner")

# Add the sample size recorded for each right-hand study, where available.
study_sample_sizes = sampleSize.select("right_study_id", "sampleSize")
coloc_bnch2 = coloc_bnch.join(
    study_sample_sizes, on="right_study_id", how="left"
).persist()

# GoF_Ch / LoF_Ch come from a pivoted count, so NULL means ChEMBL has no
# evidence of that kind for the target-disease pair.
gof_drug_only = F.col("GoF_Ch").isNotNull() & F.col("LoF_Ch").isNull()
lof_drug_only = F.col("LoF_Ch").isNotNull() & F.col("GoF_Ch").isNull()
gof_and_lof_drug = F.col("GoF_Ch").isNotNull() & F.col("LoF_Ch").isNotNull()

withEvidence = (
    coloc_bnch2.withColumn(
        # Which drug mechanism direction(s) ChEMBL reports for the pair.
        "ChEMBL",
        F.when(gof_and_lof_drug, F.lit("gof&lof"))
        .when(lof_drug_only, F.lit("lof"))
        .when(gof_drug_only, F.lit("gof")),
    )
    .withColumn(
        # Agreement between the QTL-derived direction and the drug direction.
        # Only assessed when `protect` is set; pairs with drugs in both
        # directions (gof&lof) get no label and stay NULL.
        "Coherency_chembl",
        F.when(
            F.col("protect").isNotNull(),
            F.when(
                F.col("beta_assessed") == "gof",
                F.when(gof_drug_only, F.lit("coherent")).when(
                    lof_drug_only, F.lit("dispar")
                ),
            ).when(
                F.col("beta_assessed") == "lof",
                F.when(gof_drug_only, F.lit("dispar")).when(
                    lof_drug_only, F.lit("coherent")
                ),
            ),
        ),
    )
    # Add the approved gene symbol of the target.
    .join(target.select("id", "approvedSymbol"), target.id == F.col("targetId"), "left")
    .drop("id")
    .persist()
)

# withEvidence is already persisted above; persisting it a second time only
# logged "Asked to cache already cached data", so the name is simply bound to
# the same DataFrame.
uniqueBetas = withEvidence

# Four nullable columns: three strings followed by a DecimalType(38, 37).
# The _c0.._c3 names match Spark's defaults for headerless CSV input --
# presumably this schema is for such a file; it is not used in this cell.
custom_schema = (
    StructType()
    .add("_c0", StringType(), True)
    .add("_c1", StringType(), True)
    .add("_c2", StringType(), True)
    .add("_c3", DecimalType(38, 37), True)
    # Add more fields as needed
)

spark session created at 2024-11-14 09:12:37.189029
Analysis started on 2024-11-14 at  2024-11-14 09:12:37.189029


24/11/14 09:12:42 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/11/14 09:14:31 WARN CacheManager: Asked to cache already cached data.        


In [14]:
# NOTE: this cell rebinds `chembl`. The first cell defined it as a pivoted
# table (LoF_Ch / GoF_Ch / maxClinPhase); from here on it is the three-column
# table (targetId, diseaseId, drugDoE) built below.

# ChEMBL rows whose `homogenized` value is neither "noEvaluable" nor "dispar".
chembl_evaluable = prueba_assessment.filter(
    F.col("datasourceId").isin(["chembl"])
    & ~F.col("homogenized").isin(["noEvaluable", "dispar"])
)

# Number of distinct variantEffect values seen for each target-disease pair.
effects_per_pair = F.size(
    F.collect_set("variantEffect").over(Window.partitionBy("targetId", "diseaseId"))
)

chembl = (
    # No pivot happens here, so there are no "LoF"/"GoF" columns to rename; the
    # two withColumnRenamed calls that used to follow count() were no-ops.
    chembl_evaluable.groupBy("targetId", "diseaseId", "variantEffect")
    .count()
    .withColumn("numbers", effects_per_pair)
    # Drop pairs that have drugs with more than one direction of effect.
    .filter(F.col("numbers") < 2)
    .withColumn(
        "drugDoE",
        F.when(F.col("variantEffect") == "LoF", F.lit("LoF_protect"))
        .when(F.col("variantEffect") == "GoF", F.lit("GoF_protect"))
        # Any other value is passed through unchanged.
        .otherwise(F.col("variantEffect")),
    )
    .drop("count", "numbers", "variantEffect")
)

In [None]:
# Preview the first 20 rows of `chembl` (this cell has no execution count; the
# same preview is repeated in the following cell).
chembl.show(n=20, truncate=True)

In [15]:
# Preview the first 20 rows of the target / disease / drugDoE table.
chembl.show(n=20, truncate=True)

+---------------+-------------+-----------+
|       targetId|    diseaseId|    drugDoE|
+---------------+-------------+-----------+
|ENSG00000007314|  EFO_0000555|LoF_protect|
|ENSG00000007314|  EFO_0003102|LoF_protect|
|ENSG00000007314|  EFO_0003894|LoF_protect|
|ENSG00000007314|  EFO_0004699|LoF_protect|
|ENSG00000007314|  EFO_0801084|LoF_protect|
|ENSG00000007314|  EFO_1000249|LoF_protect|
|ENSG00000008018|  EFO_1000453|LoF_protect|
|ENSG00000010310|  EFO_0003884|GoF_protect|
|ENSG00000012504|  EFO_0004210|GoF_protect|
|ENSG00000012504|MONDO_0019052|GoF_protect|
|ENSG00000012779|MONDO_0004235|LoF_protect|
|ENSG00000014138|  EFO_1001945|LoF_protect|
|ENSG00000023228|  EFO_1000657|LoF_protect|
|ENSG00000023445|  EFO_0000616|LoF_protect|
|ENSG00000025708|  EFO_0000702|LoF_protect|
|ENSG00000037280|  EFO_0003897|LoF_protect|
|ENSG00000058091|  EFO_0000574|LoF_protect|
|ENSG00000062822|  EFO_0004991|LoF_protect|
|ENSG00000062822|  EFO_0005537|LoF_protect|
|ENSG00000062822|  EFO_0005952|L

In [17]:
# Save the ChEMBL direction-of-effect table. The default save mode raises once
# the path exists, which made this cell fail on every re-run of the notebook;
# overwrite mode replaces the previous output instead.
chembl.write.mode("overwrite").parquet(
    "gs://ot-team/jroldan/analysis/chemblDoE.parquet"
)

                                                                                

In [18]:
# Read the saved table back from storage and preview its first 20 rows.
chembl_doe_saved = spark.read.parquet(
    "gs://ot-team/jroldan/analysis/chemblDoE.parquet"
)
chembl_doe_saved.show(n=20, truncate=True)

+---------------+-------------+-----------+
|       targetId|    diseaseId|    drugDoE|
+---------------+-------------+-----------+
|ENSG00000004779|  EFO_0000341|LoF_protect|
|ENSG00000004779|  EFO_0003060|LoF_protect|
|ENSG00000006071|MONDO_0009061|LoF_protect|
|ENSG00000007314|   HP_0000726|LoF_protect|
|ENSG00000007314|MONDO_0041052|LoF_protect|
|ENSG00000011677|  EFO_0005407|GoF_protect|
|ENSG00000011677|  EFO_0009551|GoF_protect|
|ENSG00000011677|MONDO_0007113|GoF_protect|
|ENSG00000014138|  EFO_1001365|LoF_protect|
|ENSG00000022355|MONDO_0002050|GoF_protect|
|ENSG00000022355|MONDO_0005129|GoF_protect|
|ENSG00000023228|  EFO_0000233|LoF_protect|
|ENSG00000023228|  EFO_0000764|LoF_protect|
|ENSG00000023228|MONDO_0005575|LoF_protect|
|ENSG00000036530|  EFO_0000474|LoF_protect|
|ENSG00000037280|  EFO_0000768|LoF_protect|
|ENSG00000048052|MONDO_0001056|LoF_protect|
|ENSG00000050628|MONDO_0005178|GoF_protect|
|ENSG00000053918|  EFO_0008522|LoF_protect|
|ENSG00000059758|  EFO_0001378|L