In [3]:
""" this scripts run the analysis for comparing QTL studies, tissues together with therapy areas matched"""

# from functions import relative_success, spreadSheetFormatter
# from stoppedTrials import terminated_td
# from DoEAssessment import directionOfEffect
# from membraneTargets import target_membrane
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from datetime import datetime
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)

spark = SparkSession.builder.getOrCreate()

In [27]:
platform_v = "24.09"

chembl = spark.read.parquet(
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence/sourceId=chembl"
)

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

path_s25 = "gs://ot-team/jroldan/GIR_S01.csv"
nelson = spark.read.csv(path_s25, header=True)

path_mapping = "gs://ot-team/jroldan/mappings.csv"
mapping = spark.read.csv(path_mapping, header=True)

                                                                                

[Stage 11:>                                                         (0 + 1) / 1]

In [28]:
nelson.show()

+-------------+---------------------------------+------+------------------+--------------------+--------------------+----------------+------------------+--------+--------+--------+--------+------+-----------+-------------+--------------------+------------+----------+--------------------+--------------------+----------+------------+------+-------------+---------+--------+-----------+----------+-----------+---------+------+--------------------+---------------+------------------+------------+------------+-----------+----------------+------+-----------+------+
|       ti_uid|indication_association_similarity|target|indication_mesh_id|indication_mesh_term|historical_max_phase|active_max_phase|combined_max_phase|succ_p_1|succ_1_2|succ_2_3|succ_3_a|orphan|year_launch|assoc_mesh_id|     assoc_mesh_term|assoc_source|assoc_info|      original_trait|       original_link|assoc_year|pic_qtl_pval|pic_h4|af_gnomad_nfe|l2g_share|l2g_rank|assoc_share|assoc_rank|source_name| abs_beta|abs_or|            

24/09/26 10:39:15 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[Stage 11:>                                                         (0 + 1) / 1]

In [None]:
mappings.count()

                                                                                

2320

[Stage 11:>                                                         (0 + 1) / 1]

In [29]:
### prepare mapping EFO-MeSH dataset  (#2,320 pairs)
mappings = (
    mapping.withColumn(
        "efo_column",
        F.regexp_replace(F.col("curie_id"), ":", "_"),  # Replaces ":" with "_"
    )
    .withColumn(
        "mesh_column",
        F.regexp_replace(
            F.col("mapped_curie"), "^MeSH:", ""
        ),  # Removes "MeSH:" from the start
    )
    .selectExpr("label", "efo_column", "mesh_column as indication_mesh_id")
)
### Get csv from Nelson and join bring EFO column to MeSH terms
nelson_efo = (
    nelson.join(mappings, on="indication_mesh_id", how="left")
    .withColumn("diseaseId", F.col("efo_column"))
    .withColumnRenamed("target", "approvedSymbol")
)
### Make chembl T-D-MaxClinPhase pairs and join with approved Symbol
chemblTD = (
    chembl.groupBy("targetId", "diseaseId")
    .agg(F.max(F.col("clinicalPhase")).alias("maxClinPhase"))
    .join(
        target.selectExpr("id as targetId", "approvedSymbol"), on="targetId", how="left"
    )
)
#### Join Nelson dataset T-I-MaxClinPhase mapped to EFO with Chembl pairs
matches = nelson_efo.join(chemblTD, on=["approvedSymbol", "diseaseId"], how="outer")

[Stage 11:>                                                         (0 + 1) / 1]

In [11]:
chemblTD.count()

                                                                                

81365

[Stage 11:>                                                         (0 + 1) / 1]

In [23]:
matches.toPandas().to_csv(f"gs://ot-team/jroldan/analysis/matchesNelsonChembl.csv")

[Stage 11:>                                                         (0 + 1) / 1]

[Stage 11:>                                                         (0 + 1) / 1]

In [39]:
matches.toPandas().to_csv(f"gs://ot-team/jroldan/analysis/matchesNelsonChembl_all.csv")

[Stage 11:>                                                         (0 + 1) / 1]

In [46]:
matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(
    F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()
).groupBy(
    "combinedPhase", "maxClinPhase"
).count().withColumn(
    "differences",
    F.when(F.col("combinedPhase") == F.col("maxClinPhase"), F.lit("equal"))
    .when(F.col("combinedPhase") > F.col("maxClinPhase"), F.lit("Nelson>>Chembl"))
    .when(F.col("combinedPhase") < F.col("maxClinPhase"), F.lit("Nelson<<Chembl")),
).sort(
    F.col("differences").desc()
).groupBy(
    "differences"
).agg(
    F.sum(F.col("count"))
).show(
    100
)

                                                                                

+--------------+----------+
|   differences|sum(count)|
+--------------+----------+
|         equal|      1835|
|Nelson<<Chembl|      1912|
|Nelson>>Chembl|       701|
+--------------+----------+



                                                                                

[Stage 335:>                                                        (0 + 1) / 1]

In [47]:
matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(
    F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()
).groupBy(
    "combinedPhase", "maxClinPhase"
).count().withColumn(
    "differences",
    F.when(F.col("combinedPhase") == F.col("maxClinPhase"), F.lit("equal"))
    .when(F.col("combinedPhase") > F.col("maxClinPhase"), F.lit("Nelson>>Chembl"))
    .when(F.col("combinedPhase") < F.col("maxClinPhase"), F.lit("Nelson<<Chembl")),
).sort(
    F.col("differences").desc()
).show(
    100
)

[Stage 335:>                (0 + 1) / 1][Stage 343:>                (0 + 1) / 1]

+-------------+------------+-----+--------------+
|combinedPhase|maxClinPhase|count|   differences|
+-------------+------------+-----+--------------+
|          0.5|         0.5|    5|         equal|
|          4.0|         4.0|  505|         equal|
|          1.0|         1.0|  224|         equal|
|          3.0|         3.0|  188|         equal|
|          2.0|         2.0|  913|         equal|
|          3.0|         2.0|  145|Nelson>>Chembl|
|          4.0|         2.0|   74|Nelson>>Chembl|
|          2.0|         0.5|    7|Nelson>>Chembl|
|          2.0|         1.0|  320|Nelson>>Chembl|
|          4.0|         3.0|  108|Nelson>>Chembl|
|          4.0|         1.0|   13|Nelson>>Chembl|
|          1.0|         0.5|    1|Nelson>>Chembl|
|          3.0|         1.0|   32|Nelson>>Chembl|
|          3.0|         0.5|    1|Nelson>>Chembl|
|          0.5|         1.0|  164|Nelson<<Chembl|
|          0.5|         4.0|  309|Nelson<<Chembl|
|          1.0|         3.0|   59|Nelson<<Chembl|


                                                                                

                                                                                

In [51]:
matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 4.0) & (F.col("maxClinPhase") == 0.5)
).show()

                                                                                

+--------------+---------+------------------+------+---------------------------------+--------------------+--------------------+----------------+------------------+--------+--------+--------+--------+------+-----------+-------------+---------------+------------+----------+--------------+-------------+----------+------------+------+-------------+---------+--------+-----------+----------+-----------+--------+------+-----+---------------+-------------+------------+------------+-----------+----------------+------+-----------+------+-----+----------+--------+------------+-------------+
|approvedSymbol|diseaseId|indication_mesh_id|ti_uid|indication_association_similarity|indication_mesh_term|historical_max_phase|active_max_phase|combined_max_phase|succ_p_1|succ_1_2|succ_2_3|succ_3_a|orphan|year_launch|assoc_mesh_id|assoc_mesh_term|assoc_source|assoc_info|original_trait|original_link|assoc_year|pic_qtl_pval|pic_h4|af_gnomad_nfe|l2g_share|l2g_rank|assoc_share|assoc_rank|source_name|abs_beta|ab

In [60]:
matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 0.5) & (F.col("maxClinPhase") == 4.0)
).groupBy(
    "label"
).count().sort(
    F.col("count").desc()
).show(
    309, truncate=False
)

[Stage 435:>                                                        (0 + 1) / 1]

+----------------------------------------+-----+
|label                                   |count|
+----------------------------------------+-----+
|immune system disease                   |36   |
|neoplasm                                |30   |
|cardiovascular disease                  |25   |
|diabetes mellitus                       |12   |
|obesity                                 |10   |
|epilepsy                                |10   |
|post-traumatic stress disorder          |8    |
|atherosclerosis                         |8    |
|inflammatory bowel disease              |5    |
|heart failure                           |5    |
|mental or behavioural disorder          |5    |
|chronic obstructive pulmonary disease   |5    |
|myocardial infarction                   |4    |
|insomnia                                |4    |
|Hepatitis, Alcoholic                    |4    |
|atrial fibrillation                     |4    |
|blood coagulation disease               |4    |
|non-small cell lung

                                                                                

In [61]:
matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 0.5) & (F.col("maxClinPhase") == 4.0)
).groupBy(
    "approvedSymbol"
).count().sort(
    F.col("count").desc()
).show(
    309, truncate=False
)

                                                                                

+--------------+-----+
|approvedSymbol|count|
+--------------+-----+
|NR3C1         |11   |
|PTGS2         |8    |
|PTGS1         |6    |
|CNR1          |5    |
|ESR1          |5    |
|PGF           |5    |
|ACE           |4    |
|GRIN1         |4    |
|OPRM1         |4    |
|PPARG         |4    |
|ADORA2B       |4    |
|HMGCR         |4    |
|VEGFA         |3    |
|ADRA2A        |3    |
|OPRD1         |3    |
|AGTR1         |3    |
|VDR           |3    |
|PDE3A         |3    |
|ADORA3        |3    |
|HRH1          |3    |
|TRPV1         |3    |
|SLC5A2        |3    |
|RXRG          |2    |
|GABRA4        |2    |
|FKBP1A        |2    |
|SLC6A4        |2    |
|PGR           |2    |
|CRBN          |2    |
|SCN5A         |2    |
|MT-ND1        |2    |
|HTR3A         |2    |
|PDE10A        |2    |
|ABAT          |2    |
|TUBB          |2    |
|SCN1A         |2    |
|TNF           |2    |
|TNFSF13B      |2    |
|IFNG          |2    |
|PDE4A         |2    |
|MS4A1         |2    |
|AR        

In [63]:
### cases where Nelson is Phase 4 and Chembl below 4

matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 4.0) & (F.col("maxClinPhase") < 4.0)
).show(
    309, truncate=False
)

                                                                                

+--------------+-----------+------------------+---------------+---------------------------------+--------------------------------------+--------------------+----------------+------------------+--------+--------+--------+--------+------+-----------+-------------+-------------------------------------------+------------+--------------------------------------------+----------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------+----------+------------+--------+-------------+---------+--------+-----------+----------+-----------+----------+-----------+------------------------------------------------------+---------------+----------------------------+------------+------------+-----------+----------------+------+-----------+------+-------------------------------------+-----------+---------------+------------+-----------

In [62]:
### Diseases from cases where Nelson is Phase 4 and Chembl below 2

matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 4.0) & (F.col("maxClinPhase") < 4.0)
).groupBy(
    "label"
).count().sort(
    F.col("count").desc()
).show(
    309, truncate=False
)

                                                                                

+-------------------------------------+-----+
|label                                |count|
+-------------------------------------+-----+
|non-small cell lung carcinoma        |14   |
|kidney neoplasm                      |14   |
|liver neoplasm                       |13   |
|acute myeloid leukemia               |11   |
|pancreatic neoplasm                  |10   |
|ovarian neoplasm                     |9    |
|stomach neoplasm                     |7    |
|breast neoplasm                      |6    |
|small cell lung carcinoma            |6    |
|Behcet's syndrome                    |5    |
|endometrial neoplasm                 |5    |
|status epilepticus                   |4    |
|non-Hodgkins lymphoma                |4    |
|colorectal neoplasm                  |4    |
|ventricular fibrillation             |3    |
|thyroid neoplasm                     |3    |
|melanoma                             |3    |
|Vitiligo                             |2    |
|chronic lymphocytic leukemia     

In [66]:
matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 4.0) & (F.col("maxClinPhase") < 4.0)
).count()

                                                                                

195

In [65]:
### Targets from cases where Nelson is Phase 4 and Chembl below 2

matches.withColumn(
    "combinedPhase",
    F.when(F.col("combined_max_phase") == "Preclinical", F.lit(0.5))
    .when(F.col("combined_max_phase") == "Phase I", F.lit(1.0))
    .when(F.col("combined_max_phase") == "Phase II", F.lit(2.0))
    .when(F.col("combined_max_phase") == "Phase III", F.lit(3.0))
    .when(F.col("combined_max_phase") == "Launched", F.lit(4.0)),
).filter(F.col("combinedPhase").isNotNull() & F.col("maxClinPhase").isNotNull()).filter(
    (F.col("combinedPhase") == 4.0) & (F.col("maxClinPhase") < 4.0)
).groupBy(
    "approvedSymbol"
).count().sort(
    F.col("count").desc()
).show(
    309, truncate=False
)

                                                                                

+--------------+-----+
|approvedSymbol|count|
+--------------+-----+
|FGFR1         |8    |
|FLT1          |7    |
|KIT           |6    |
|BRAF          |6    |
|FGFR3         |6    |
|PDCD1         |6    |
|IL2RA         |5    |
|KDR           |5    |
|FLT4          |5    |
|TOP2A         |5    |
|CD274         |4    |
|BTK           |4    |
|RET           |4    |
|PDGFRA        |4    |
|FGFR2         |4    |
|ABCB1         |4    |
|NR3C1         |4    |
|VEGFA         |3    |
|PDGFRB        |3    |
|IL17RA        |3    |
|JAK1          |3    |
|FGFR4         |3    |
|FLT3          |3    |
|IL1B          |2    |
|IL17F         |2    |
|ERBB4         |2    |
|F10           |2    |
|TUBB          |2    |
|CD3E          |2    |
|PIK3CD        |2    |
|PPARA         |2    |
|CSF1R         |2    |
|IL4R          |2    |
|APOC3         |2    |
|JAK3          |2    |
|TEK           |2    |
|MAP2K2        |2    |
|MAP2K1        |2    |
|LYN           |2    |
|CHRM1         |1    |
|TOP1      