In [2]:
from DoEAssessment import directionOfEffect
from functions import discrepancifier
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
from datetime import datetime


# Attach to the active Spark session, or create one if none is running yet.
spark = (
    SparkSession.builder
    .getOrCreate()
)

spark session created at 2024-10-16 07:50:49.235859
Analysis started on 2024-10-16 at  2024-10-16 07:50:49.235859


24/10/16 07:50:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [2]:
# Open Targets Platform release that every input path below is built from.
platform_v = "24.09"

# Locations of the disease and target indexes for that release.
disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)

# Evidence rows restricted to the datasources analysed in this notebook.
# persist() marks the filtered frame for caching so later actions can reuse it.
evidences = (
    spark.read.parquet(
        f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
    )
    .where(
        F.col("datasourceId").isin(
            "ot_genetics_portal",
            "gene_burden",
            "eva",
            "eva_somatic",
            "gene2phenotype",
            "orphanet",
            "cancer_gene_census",
            "intogen",
            "impc",
            "chembl",
        )
    )
    .persist()
)

diseases = spark.read.parquet(disease_path)
target = spark.read.parquet(target_path)

                                                                                

In [3]:
# Every per-association aggregate below is computed over the same
# (targetId, diseaseId) partition.
pair_window = Window.partitionBy("targetId", "diseaseId")

# Output of directionOfEffect (project helper) minus the rows whose
# `homogenized` value is "noEvaluable", annotated per association with the set
# of contributing datasources and the maximum clinical phase.
assessed_evidences = (
    directionOfEffect(evidences, platform_v)
    .where(F.col("homogenized") != "noEvaluable")
    .withColumn("datasources", F.collect_set("datasourceId").over(pair_window))
    .withColumn("maxClinPhase", F.max("clinicalPhase").over(pair_window))
)

# One row per association, with one evidence-count column per `homogenized`
# value produced by the pivot.
evidence_counts = (
    assessed_evidences.withColumn(
        "arrayDiseaseFromSource",
        F.collect_set(F.col("diseaseFromSource")).over(pair_window),
    )
    .groupBy(
        "targetId", "diseaseId", "datasources", "maxClinPhase", "arrayDiseaseFromSource"
    )
    .pivot("homogenized")
    .count()
)

# NOTE(review): discrepancifier (project helper) presumably adds the
# coherencyDiagonal / coherencyOneCell columns used further down; confirm in
# functions.py.
coherency_assessed = discrepancifier(evidence_counts).persist()

                                                                                

### Let's generate a dataset that lists the contributing datasources under each `homogenized` direction-of-effect column:

In [47]:
# For each (targetId, diseaseId) pair, show the set of datasources behind every
# `homogenized` value (one array column per value).
(
    assessed_evidences
    .groupBy("targetId", "diseaseId")
    .pivot("homogenized")
    .agg(F.collect_set(F.col("datasourceId")))
    .show()
)

+---------------+-------------+-----------+--------------------+--------------------+--------+
|       targetId|    diseaseId|GoF_protect|            GoF_risk|         LoF_protect|LoF_risk|
+---------------+-------------+-----------+--------------------+--------------------+--------+
|ENSG00000000938|MONDO_0019156|         []|                  []|                  []|  [impc]|
|ENSG00000000971|MONDO_0012104|         []|                  []|                  []|  [impc]|
|ENSG00000001617|MONDO_0016305|         []|                  []|                  []|  [impc]|
|ENSG00000001626|  EFO_0000768|         []|                  []|                  []|  [impc]|
|ENSG00000001626|MONDO_0011766|         []|                  []|                  []|  [impc]|
|ENSG00000001626| Orphanet_124|         []|                  []|                  []|  [impc]|
|ENSG00000001630|MONDO_0010144|         []|                  []|                  []|  [impc]|
|ENSG00000002746|MONDO_0015494|         []|       

In [4]:
# Therapeutic-area lookup rows: (taId, taLabel, taLabelSimple).
# taLabelSimple collapses every area except oncology into "Other".
ta_rows = [
    ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
    ("EFO_0005741", "infectious disease", "Other"),
    ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
    ("EFO_0005932", "animal disease", "Other"),
    ("MONDO_0024458", "disease of visual system", "Other"),
    ("EFO_0000319", "cardiovascular disease", "Other"),
    ("EFO_0009605", "pancreas disease", "Other"),
    ("EFO_0010282", "gastrointestinal disease", "Other"),
    ("OTAR_0000017", "reproductive system or breast disease", "Other"),
    ("EFO_0010285", "integumentary system disease", "Other"),
    ("EFO_0001379", "endocrine system disease", "Other"),
    ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
    ("EFO_0009690", "urinary system disease", "Other"),
    ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
    ("MONDO_0021205", "disease of ear", "Other"),
    ("EFO_0000540", "immune system disease", "Other"),
    ("EFO_0005803", "hematologic disease", "Other"),
    ("EFO_0000618", "nervous system disease", "Other"),
    ("MONDO_0002025", "psychiatric disorder", "Other"),
    ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
    ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
    ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
    ("EFO_0000651", "phenotype", "Other"),
    ("EFO_0001444", "measurement", "Other"),
    ("GO_0008150", "biological process", "Other"),
]

# All three columns are nullable strings.
ta_schema = StructType(
    [
        StructField(column, StringType(), True)
        for column in ("taId", "taLabel", "taLabelSimple")
    ]
)

# taRank comes from monotonically_increasing_id, which yields increasing but
# not necessarily consecutive values.
taDf = spark.createDataFrame(ta_rows, ta_schema).withColumn(
    "taRank", F.monotonically_increasing_id()
)

In [7]:
# Discrepant associations enriched with the disease name, therapeutic areas,
# the target's approved symbol and a link to the Open Targets Platform
# evidence page. The result is persisted because the cells below query it
# repeatedly.
complete = (
    coherency_assessed.join(
        diseases.selectExpr("id as diseaseId", "name", "therapeuticAreas"),
        on="diseaseId",
        how="left",
    )
    # One row per therapeutic area; explode_outer keeps associations whose
    # disease has a null or empty therapeuticAreas array.
    .select(
        "*",
        F.explode_outer(F.col("therapeuticAreas")).alias("therapeuticAreas_expl"),
    )
    # Translate therapeutic-area ids into labels through the taDf lookup.
    .join(
        taDf.selectExpr("taId as therapeuticAreas_expl", "taLabel", "taLabelSimple"),
        on="therapeuticAreas_expl",
        how="left",
    )
    # Collapse back to one row per association, gathering the area ids and
    # labels into arrays.
    .groupBy(
        "targetId",
        "diseaseId",
        "arrayDiseaseFromSource",
        "name",
        "GoF_protect",
        "GoF_risk",
        "LoF_protect",
        "LoF_risk",
        "coherencyDiagonal",
        "coherencyOneCell",
        "datasources",
        "maxClinPhase",
    )
    .agg(
        F.collect_set("therapeuticAreas_expl").alias("therapeuticAreas"),
        F.collect_set("taLabel").alias("taName"),
        F.collect_set("taLabelSimple").alias("taLabelSimple"),
    )
    # Keep associations flagged "dispar" by at least one coherency criterion.
    .filter(
        (F.col("coherencyDiagonal") == "dispar")
        | (F.col("coherencyOneCell") == "dispar")
    )
    .join(
        target.selectExpr("id as targetId", "approvedSymbol"), on="targetId", how="left"
    )
    # Platform evidence pages are addressed as /evidence/<Ensembl gene id>/<disease id>,
    # so the link is built from targetId rather than approvedSymbol. This also
    # keeps the link non-null when the left join above finds no symbol
    # (concat returns null if any input is null).
    .withColumn(
        "linkOT",
        F.concat(
            F.lit("https://platform.opentargets.org/evidence/"),
            F.col("targetId"),
            F.lit("/"),
            F.col("diseaseId"),
        ),
    )
    .persist()
)

24/10/15 15:30:01 WARN CacheManager: Asked to cache already cached data.


#### Some Key numbers

In [39]:
# Headline counts: all assessed associations, then the discrepant subset as a
# whole and per coherency criterion.
diagonal_discrepant = F.col("coherencyDiagonal") == "dispar"
one_cell_discrepant = F.col("coherencyOneCell") == "dispar"

print("Total associations with DoE =", coherency_assessed.count())
print(
    "Total associations being discrepant on at least one criteria =",
    complete.count(),
)
print(
    "Associations discrepant on coherency Diagonal =",
    complete.where(diagonal_discrepant).count(),
)
print(
    "Associations discrepant on coherency oneCell =",
    complete.where(one_cell_discrepant).count(),
)

Total associations with DoE = 865816
Total associations being discrepant on at least one criteria = 15885
Associations discrepant on coherency Diagonal = 9137
Associations discrepant on coherency oneCell = 15885


#### More detailed comparisons within each coherency criterion

In [23]:
# Associations whose therapeutic-area labels include "measurement" or
# "phenotype", counted per label set and per combination of coherency flags.
measurement_or_phenotype = F.array(F.lit("measurement"), F.lit("phenotype"))
overlap_size = F.size(F.array_intersect(F.col("taName"), measurement_or_phenotype))

(
    complete.where(overlap_size > 0)
    .groupBy("taName", "coherencyDiagonal", "coherencyOneCell")
    .count()
    .show()
)

+--------------------+-----------------+----------------+-----+
|              taName|coherencyDiagonal|coherencyOneCell|count|
+--------------------+-----------------+----------------+-----+
|         [phenotype]|         coherent|          dispar|   44|
|         [phenotype]|           dispar|          dispar|  317|
|       [measurement]|         coherent|          dispar| 4882|
|       [measurement]|           dispar|          dispar| 5755|
|[genetic, familia...|         coherent|          dispar|    4|
|[genetic, familia...|           dispar|          dispar|    3|
|[urinary system d...|           dispar|          dispar|    3|
|[hematologic dise...|         coherent|          dispar|    1|
+--------------------+-----------------+----------------+-----+



#### 2/3 of associations under coherency Diagonal are Measurement/Phenotypes

In [40]:
# Number of associations discrepant on the diagonal criterion whose
# therapeutic-area labels include "measurement" or "phenotype".
in_measurement_or_phenotype = (
    F.size(
        F.array_intersect(
            F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
        )
    )
    > 0
)
diagonal_discrepant = F.col("coherencyDiagonal") == "dispar"

complete.where(in_measurement_or_phenotype & diagonal_discrepant).count()

6078

#### After removing measurement/phenotypes:

In [44]:
# Number of associations discrepant on the diagonal criterion whose
# therapeutic-area labels include neither "measurement" nor "phenotype".
not_measurement_or_phenotype = (
    F.size(
        F.array_intersect(
            F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
        )
    )
    == 0
)
diagonal_discrepant = F.col("coherencyDiagonal") == "dispar"

complete.where(not_measurement_or_phenotype & diagonal_discrepant).count()

3059

#### Of these 3,059 associations, how many are ChEMBL vs. genetics?

In [1]:
# Diagonal-discrepant associations outside the measurement/phenotype
# therapeutic areas that have ChEMBL among their datasources.
#
# `datasources` is produced by collect_set, so it holds each datasource at most
# once. Its intersection with the one-element array ["chembl"] therefore never
# exceeds size 1, and a `> 1` comparison would always yield zero rows. A plain
# membership test is what is needed.
#
# NOTE(review): no genetic-datasource condition is applied here; add one if
# the comparison should require ChEMBL and genetic evidence together.
complete.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        == 0
    )
    & (F.col("coherencyDiagonal") == "dispar")
).filter(F.array_contains(F.col("datasources"), "chembl")).count()

NameError: name 'complete' is not defined

In [None]:
# Number of associations discrepant on the diagonal criterion whose
# therapeutic-area labels include "measurement" or "phenotype".
# NOTE(review): this repeats an earlier cell of the notebook and was never
# executed; consider removing it.
ta_overlap = F.array_intersect(
    F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
)

(
    complete
    .where((F.size(ta_overlap) > 0) & (F.col("coherencyDiagonal") == "dispar"))
    .count()
)

In [42]:
# Diagonal-discrepant associations counted per set of therapeutic-area labels
# (all areas, not only measurement/phenotype), most frequent first.
(
    complete.where(F.col("coherencyDiagonal") == "dispar")
    .groupBy("taName")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

+--------------------+-----+
|              taName|count|
+--------------------+-----+
|       [measurement]| 5755|
|         [phenotype]|  317|
|[psychiatric diso...|  307|
|[cardiovascular d...|  207|
|[cell proliferati...|  178|
|[cell proliferati...|  155|
|[cell proliferati...|  148|
|[genetic, familia...|  106|
|[cell proliferati...|   98|
|[cell proliferati...|   82|
|[cell proliferati...|   76|
|[hematologic dise...|   64|
|[cell proliferati...|   61|
|[respiratory or t...|   60|
|[endocrine system...|   56|
|[hematologic dise...|   56|
|[cell proliferati...|   54|
|[genetic, familia...|   53|
|[biological process]|   49|
|[hematologic dise...|   49|
+--------------------+-----+
only showing top 20 rows



In [30]:
# Measurement/phenotype associations discrepant on the diagonal criterion,
# counted per disease/trait name (up to 250 rows, most frequent first).
ta_overlap_size = F.size(
    F.array_intersect(
        F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
    )
)

(
    complete.where((ta_overlap_size > 0) & (F.col("coherencyDiagonal") == "dispar"))
    .groupBy("name")
    .count()
    .orderBy(F.desc("count"))
    .show(250)
)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|mean corpuscular ...|  297|
|      platelet count|  235|
|mean platelet volume|  224|
|mean corpuscular ...|  224|
|  reticulocyte count|  222|
|         body height|  218|
|      lean body mass|  181|
|Red cell distribu...|  165|
|sex hormone-bindi...|  156|
|    eosinophil count|  137|
|heel bone mineral...|  137|
|    lymphocyte count|  137|
|   erythrocyte count|  132|
|      monocyte count|  128|
|appendicular lean...|  128|
|          hair color|  120|
|    neutrophil count|  120|
|hemoglobin measur...|  113|
|     leukocyte count|  105|
|reticulocyte meas...|   99|
|testosterone meas...|   96|
|forced expiratory...|   96|
|       fat body mass|   91|
|      vital capacity|   87|
| body fat percentage|   84|
|hair shape measur...|   79|
|       platelet crit|   78|
|mean corpuscular ...|   73|
|platelet componen...|   70|
|mean reticulocyte...|   70|
|          hematocrit|   67|
|blood protein

In [31]:
# Measurement/phenotype associations discrepant on the one-cell criterion,
# counted per disease/trait name (up to 250 rows, most frequent first).
ta_overlap_size = F.size(
    F.array_intersect(
        F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
    )
)

(
    complete.where((ta_overlap_size > 0) & (F.col("coherencyOneCell") == "dispar"))
    .groupBy("name")
    .count()
    .orderBy(F.desc("count"))
    .show(250)
)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|mean corpuscular ...|  550|
|      platelet count|  511|
|mean platelet volume|  417|
|         body height|  416|
|  reticulocyte count|  401|
|mean corpuscular ...|  332|
|      lean body mass|  320|
|sex hormone-bindi...|  319|
|Red cell distribu...|  313|
|    lymphocyte count|  290|
|appendicular lean...|  280|
|    eosinophil count|  270|
|   erythrocyte count|  265|
|heel bone mineral...|  264|
|    neutrophil count|  254|
|      monocyte count|  229|
|     leukocyte count|  224|
|hemoglobin measur...|  213|
|       platelet crit|  192|
|reticulocyte meas...|  188|
|      vital capacity|  182|
|       fat body mass|  172|
|forced expiratory...|  168|
|platelet componen...|  153|
| body fat percentage|  148|
|mean corpuscular ...|  145|
|          hematocrit|  139|
|mean reticulocyte...|  136|
|testosterone meas...|  131|
|          hair color|  129|
|eosinophil percen...|  115|
|lymphocyte pe

In [10]:
# Number of rows in `complete` with a non-null diseaseId whose therapeutic-area
# labels contain "measurement" or "phenotype".
has_disease = F.col("diseaseId").isNotNull()
is_measurement = F.array_contains(F.col("taName"), "measurement")
is_phenotype = F.array_contains(F.col("taName"), "phenotype")

complete.where(has_disease & (is_measurement | is_phenotype)).count()

                                                                                

11009

In [44]:
# Row counts per (therapeutic-area ids, therapeutic-area labels) combination,
# most frequent first. Pass truncate=False to show() to see the full arrays.
# NOTE(review): `test3` is not defined in the visible part of this notebook;
# its columns and counts look like those of `complete`. Confirm its origin.
(
    test3
    .groupBy("therapeuticAreas", "taName")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

+--------------------+--------------------+-----+
|    therapeuticAreas|              taName|count|
+--------------------+--------------------+-----+
|       [EFO_0001444]|       [measurement]|10637|
|       [EFO_0000651]|         [phenotype]|  361|
|[EFO_0000618, MON...|[psychiatric diso...|  325|
|       [EFO_0000319]|[cardiovascular d...|  317|
|     [MONDO_0045024]|[cell proliferati...|  293|
|[MONDO_0045024, E...|[cell proliferati...|  279|
|[OTAR_0000017, MO...|[cell proliferati...|  242|
|[MONDO_0045024, O...|[cell proliferati...|  190|
|[EFO_0009690, MON...|[cell proliferati...|  162|
|[OTAR_0000018, EF...|[genetic, familia...|  162|
|[MONDO_0045024, E...|[cell proliferati...|  129|
|[EFO_0005803, MON...|[hematologic dise...|  114|
|[OTAR_0000017, MO...|[cell proliferati...|  112|
|[OTAR_0000017, MO...|[cell proliferati...|  103|
|[MONDO_0045024, E...|[cell proliferati...|  101|
|[MONDO_0045024, E...|[cell proliferati...|   96|
|[EFO_0005803, MON...|[hematologic dise...|   96|


In [45]:
# Row counts per therapeutic-area combination and diagonal-coherency flag,
# most frequent first. Pass truncate=False to show() to see the full arrays.
# NOTE(review): `test3` is not defined in the visible part of this notebook;
# confirm where it is created.
(
    test3
    .groupBy("therapeuticAreas", "taName", "coherencyDiagonal")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

+--------------------+--------------------+-----------------+-----+
|    therapeuticAreas|              taName|coherencyDiagonal|count|
+--------------------+--------------------+-----------------+-----+
|       [EFO_0001444]|       [measurement]|           dispar| 5755|
|       [EFO_0001444]|       [measurement]|         coherent| 4882|
|       [EFO_0000651]|         [phenotype]|           dispar|  317|
|[EFO_0000618, MON...|[psychiatric diso...|           dispar|  296|
|       [EFO_0000319]|[cardiovascular d...|           dispar|  207|
|     [MONDO_0045024]|[cell proliferati...|           dispar|  178|
|[OTAR_0000017, MO...|[cell proliferati...|           dispar|  155|
|[MONDO_0045024, E...|[cell proliferati...|           dispar|  148|
|[MONDO_0045024, E...|[cell proliferati...|         coherent|  131|
|     [MONDO_0045024]|[cell proliferati...|         coherent|  115|
|       [EFO_0000319]|[cardiovascular d...|         coherent|  110|
|[EFO_0009690, MON...|[cell proliferati...|     

In [46]:
# Row counts per therapeutic-area combination and one-cell-coherency flag,
# most frequent first. Pass truncate=False to show() to see the full arrays.
# NOTE(review): `test3` is not defined in the visible part of this notebook;
# confirm where it is created.
(
    test3
    .groupBy("therapeuticAreas", "taName", "coherencyOneCell")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

+--------------------+--------------------+----------------+-----+
|    therapeuticAreas|              taName|coherencyOneCell|count|
+--------------------+--------------------+----------------+-----+
|       [EFO_0001444]|       [measurement]|          dispar|10637|
|       [EFO_0000651]|         [phenotype]|          dispar|  361|
|[EFO_0000618, MON...|[psychiatric diso...|          dispar|  325|
|       [EFO_0000319]|[cardiovascular d...|          dispar|  317|
|     [MONDO_0045024]|[cell proliferati...|          dispar|  293|
|[MONDO_0045024, E...|[cell proliferati...|          dispar|  279|
|[OTAR_0000017, MO...|[cell proliferati...|          dispar|  242|
|[MONDO_0045024, O...|[cell proliferati...|          dispar|  190|
|[EFO_0009690, MON...|[cell proliferati...|          dispar|  162|
|[OTAR_0000018, EF...|[genetic, familia...|          dispar|  162|
|[MONDO_0045024, E...|[cell proliferati...|          dispar|  129|
|[EFO_0005803, MON...|[hematologic dise...|          dispar|  

In [37]:
# Untruncated preview of the rows whose therapeutic-area labels contain
# "measurement".
# NOTE(review): `test3` is not defined in the visible part of this notebook;
# confirm where it is created.
(
    test3
    .where(F.array_contains(F.col("taName"), "measurement"))
    .show(truncate=False)
)

+---------------+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------+-----------+--------+-----------+--------+-----------------+----------------+---------------------------------+------------+----------------+-------------+-------------+--------------+-------------------------------------------------------------+
|targetId       |diseaseId  |arrayDiseaseFromSource                                                                                                                                                                                                                 |name                                            |GoF_protect|GoF_risk|LoF_protect|LoF_risk|coherencyDiagonal|coherencyOneCell|datasources                      |maxClinPhase|therapeut