In [2]:
from DoEAssessment import directionOfEffect
from functions import discrepancifier
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.types import (
    StructType,
    StructField,
    DoubleType,
    DecimalType,
    StringType,
    FloatType,
)
from datetime import datetime


spark = SparkSession.builder.getOrCreate()

spark session created at 2024-10-16 07:50:49.235859
Analysis started on 2024-10-16 at  2024-10-16 07:50:49.235859


24/10/16 07:50:54 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [3]:
platform_v = "24.09"
evidences = (
    spark.read.parquet(
        f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/evidence"
    )
    .filter(
        F.col("datasourceId").isin(
            [
                "ot_genetics_portal",
                "gene_burden",
                "eva",
                "eva_somatic",
                "gene2phenotype",
                "orphanet",
                "cancer_gene_census",
                "intogen",
                "impc",
                "chembl",
            ]
        )
    )
    .persist()
)

disease_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/diseases/"
)
diseases = spark.read.parquet(disease_path)

target_path = (
    f"gs://open-targets-data-releases/{platform_v}/output/etl/parquet/targets/"
)
target = spark.read.parquet(target_path)

In [4]:
assessed_evidences = (
    directionOfEffect(evidences, platform_v)
    .filter(F.col("homogenized") != "noEvaluable")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "maxClinPhase",
        F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    )
)
coherency_assessed = discrepancifier(
    assessed_evidences.withColumn(
        "arrayDiseaseFromSource",
        F.collect_set(F.col("diseaseFromSource")).over(
            Window.partitionBy("targetId", "diseaseId")
        ),
    )
    .groupBy(
        "targetId", "diseaseId", "datasources", "maxClinPhase", "arrayDiseaseFromSource"
    )
    .pivot("homogenized")
    .count()
).persist()

                                                                                

#### dataset where datasources are collected per DoE assessment

In [52]:
coherency_assessed_datasourcesPerDoE = discrepancifier(
    assessed_evidences.withColumn(
        "arrayDiseaseFromSource",
        F.collect_set(F.col("diseaseFromSource")).over(
            Window.partitionBy("targetId", "diseaseId")
        ),
    )
    .groupBy(
        "targetId", "diseaseId", "datasources", "maxClinPhase", "arrayDiseaseFromSource"
    )
    .pivot("homogenized")
    .agg(F.collect_set(F.col("datasourceId")))
).select("targetId", "diseaseId", "GoF_protect", "GoF_risk", "LoF_protect", "LoF_risk")

24/10/16 10:35:34 WARN CacheManager: Asked to cache already cached data.


#### Make a dataset where I can have if genetics (no chembl datasource) is in agreement

In [57]:
assessed_evidences_genetics = (
    directionOfEffect(evidences, platform_v)
    .filter(F.col("homogenized") != "noEvaluable")
    .filter(F.col("datasourceId") != "chembl")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "maxClinPhase",
        F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    )
)
coherency_assessed_genetics = (
    discrepancifier(
        assessed_evidences_genetics.withColumn(
            "arrayDiseaseFromSource",
            F.collect_set(F.col("diseaseFromSource")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
            "datasources",
            "maxClinPhase",
            "arrayDiseaseFromSource",
        )
        .pivot("homogenized")
        .count()
    )
    .withColumn(
        "geneticsAgreeDiagonal",
        F.when(F.col("coherencyDiagonal") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .withColumn(
        "geneticsAgreeOneCell",
        F.when(F.col("coherencyOneCell") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .select("targetId", "diseaseId", "geneticsAgreeDiagonal", "geneticsAgreeOneCell")
    .persist()
)

24/10/16 10:39:23 WARN CacheManager: Asked to cache already cached data.
24/10/16 10:39:24 WARN CacheManager: Asked to cache already cached data.


#### Make a dataset where I can have if genetics (no chembl nor IMPC datasource) is in agreement

In [59]:
assessed_evidences_genetics_noIMPC = (
    directionOfEffect(evidences, platform_v)
    .filter(F.col("homogenized") != "noEvaluable")
    .filter(F.col("datasourceId") != "chembl")
    .filter(F.col("datasourceId") != "IMPC")
    .withColumn(
        "datasources",
        F.collect_set("datasourceId").over(Window.partitionBy("targetId", "diseaseId")),
    )
    .withColumn(
        "maxClinPhase",
        F.max("clinicalPhase").over(Window.partitionBy("targetId", "diseaseId")),
    )
)
coherency_assessed_genetics_noIMPC = (
    discrepancifier(
        assessed_evidences_genetics_noIMPC.withColumn(
            "arrayDiseaseFromSource",
            F.collect_set(F.col("diseaseFromSource")).over(
                Window.partitionBy("targetId", "diseaseId")
            ),
        )
        .groupBy(
            "targetId",
            "diseaseId",
            "datasources",
            "maxClinPhase",
            "arrayDiseaseFromSource",
        )
        .pivot("homogenized")
        .count()
    )
    .withColumn(
        "geneticsAgreeDiagonal_noIMPC",
        F.when(F.col("coherencyDiagonal") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .withColumn(
        "geneticsAgreeOneCell_noIMPC",
        F.when(F.col("coherencyOneCell") == "coherent", F.lit("yes")).otherwise(
            F.lit("no")
        ),
    )
    .select(
        "targetId",
        "diseaseId",
        "geneticsAgreeDiagonal_noIMPC",
        "geneticsAgreeOneCell_noIMPC",
    )
    .persist()
)

24/10/16 10:40:57 WARN CacheManager: Asked to cache already cached data.
24/10/16 10:40:57 WARN CacheManager: Asked to cache already cached data.


#### Make a dataset where we collect the diferente disease from source depending on the DoE assessment

In [85]:
coherency_assessed_diseaseFromSource = discrepancifier(
    assessed_evidences.withColumn(
        "arrayDiseaseFromSource",
        F.collect_set(F.col("diseaseFromSource")).over(
            Window.partitionBy("targetId", "diseaseId")
        ),
    )
    .groupBy("targetId", "diseaseId", "datasources", "maxClinPhase")
    .pivot("homogenized")
    .agg(F.collect_set(F.col("diseaseFromSource")))
).persist()

24/10/16 11:29:17 WARN CacheManager: Asked to cache already cached data.
24/10/16 11:29:17 WARN CacheManager: Asked to cache already cached data.


In [88]:
columns_to_rename = ["GoF_protect", "GoF_risk", "LoF_protect", "LoF_risk"]

# Rename all the specified columns by adding 'disSource' to their names
for col in columns_to_rename:
    coherency_assessed_diseaseFromSource = (
        coherency_assessed_diseaseFromSource.withColumnRenamed(col, f"{col}_disSource")
    )

# After renaming, you can select the desired columns
coherency_assessed_diseaseFromSource_new = coherency_assessed_diseaseFromSource.select(
    "targetId",
    "diseaseId",
    "GoF_protect_disSource",
    "LoF_protect_disSource",
    "GoF_risk_disSource",
    "LoF_risk_disSource",
)
# Show the

In [89]:
coherency_assessed_diseaseFromSource_new.show()

+---------------+-------------+---------------------+---------------------+--------------------+--------------------+
|       targetId|    diseaseId|GoF_protect_disSource|LoF_protect_disSource|  GoF_risk_disSource|  LoF_risk_disSource|
+---------------+-------------+---------------------+---------------------+--------------------+--------------------+
|ENSG00000000938|MONDO_0019156|                   []|                   []|                  []|[Angioosteohypotr...|
|ENSG00000000971|MONDO_0012104|                   []|                   []|                  []|[Lipodystrophy, P...|
|ENSG00000001617|MONDO_0016305|                   []|                   []|                  []|[Atypical Pantoth...|
|ENSG00000001626|  EFO_0000768|                   []|                   []|                  []|[Pulmonary Fibros...|
|ENSG00000001626|MONDO_0011766|                   []|                   []|                  []|[46,Xy Gonadal Dy...|
|ENSG00000001626| Orphanet_124|                   []|   

#### Join complete dataset with the others:

In [63]:
complete_whole = (
    complete.join(
        coherency_assessed_datasourcesPerDoE, on=["targetId", "diseaseId"], how="left"
    )
    .join(coherency_assessed_genetics, on=["targetId", "diseaseId"], how="left")
    .join(coherency_assessed_genetics_noIMPC, on=["targetId", "diseaseId"], how="left")
).persist()

### Now we need to join with the dataset with manual annotations

In [70]:
### manual annotations dataset:


annotations = spark.read.csv(
    "gs://ot-team/jroldan/analysis/jroldan_analysis_20240516_subsetInterestingSpace_20241016.csv",
    header=True,
    inferSchema=True,
).select(
    "targetId",
    "diseaseId",
    "_c15",
    "comment",
    "type",
    "data type",
    "is_indication_the_main_disease?",
    "is_the_causality_target-disease_the_reason_of_the_drug?",
    "New_Coherency_Diagonal?",
    "New_Coherency_One_Cell?",
    "Potential_Safety",
)

In [73]:
annotations.count()

3771

#### Join manual annotations and diseases from Source with the complete whole 

In [91]:
complete_whole_ready = complete_whole.join(
    annotations, on=["targetId", "diseaseId"], how="left"
).join(
    coherency_assessed_diseaseFromSource_new, on=["targetId", "diseaseId"], how="left"
)

### Check number of measurement and phenotypes with discrepancies where disease from source is different:

In [None]:
complete_whole_ready.withColumn().filter(F.col("taName").isin(["measurement","phenotype"])).groupBy("")

In [112]:
columns_to_check = [
    "GoF_protect_disSource",
    "LoF_protect_disSource",
    "GoF_risk_disSource",
    "LoF_risk"
]
columns_to_check2 = [
    "LoF_protect_disSource",
    "GoF_protect_disSource",
]
columns_to_check2 = [
    "LoF_protect_disSource",
    "GoF_protect_disSource",
]

# Create a condition that checks if any of the arrays have at least 1 element
condition = F.expr(" OR ".join([f"size({col}) > 0" for col in columns_to_check]))
condition2 = F.expr(" OR ".join([f"size({col}) > 0" for col in columns_to_check2]))
# Apply the condition to filter the rows where any of the arrays have at least 1 element
complete_whole_ready2 = (
    complete_whole_ready.withColumn(
        "diagonalBreak1",
        F.when(condition, F.lit("True")).otherwise(F.lit("False")),
    )
    .withColumn(
        "diagonalBreak2", F.when(condition2, F.lit("True")).otherwise(F.lit("False"))
    )
    .withColumn(
        "diagonalBreakBoth",
        F.when(
            (F.col("diagonalBreak1") == True) | (F.col("diagonalBreak2") == True),
            F.lit("dispar"),
        ).otherwise(F.lit("coherent")),
    )
)

In [113]:
complete_whole_ready2.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        > 0
    )
).groupBy("diagonalBreakBoth").count().show()



+-----------------+-----+
|diagonalBreakBoth|count|
+-----------------+-----+
|           dispar|11009|
+-----------------+-----+



                                                                                

In [108]:
complete_whole_ready2.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        > 0
    )
).show(vertical=True, truncate=False)

-RECORD 0---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 targetId                                                | ENSG00000011304                                                                                                                                                                                                                                                                                                    
 diseaseId                                               | EFO_0004509                                                                                                                                                                                                    

                                                                                

In [104]:
complete_whole_ready2.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        >1
    )
).filter(F.col("diagonalBreakBoth") == "dispar").show()

                                                                                

+---------------+-------------+----------------------+--------------------+-----------+--------+-----------+--------+-----------------+----------------+--------------------+------------+--------------------+--------------------+-----------------+--------------+--------------------+-----------+--------------------+-----------+--------+---------------------+--------------------+----------------------------+---------------------------+----+-------+----+---------+-------------------------------+-------------------------------------------------------+-----------------------+-----------------------+----------------+---------------------+---------------------+------------------+------------------+--------------+--------------+-----------------+
|       targetId|    diseaseId|arrayDiseaseFromSource|                name|GoF_protect|GoF_risk|LoF_protect|LoF_risk|coherencyDiagonal|coherencyOneCell|         datasources|maxClinPhase|    therapeuticAreas|              taName|    taLabelSimple|approv

In [96]:
complete_whole_ready2.withColumn("breakDiagonalDiseaseSource", F.when(F.col("coherencyDiagonal")=="dispar"))

                                                                                

15885

In [77]:
complete_whole_ready.toPandas().to_csv(
    "gs://ot-team/jroldan/analysis/20241016_completeWholeReady.csv"
)

In [48]:
coherency_assessed_genetics_noIMPC.filter(
    F.col("geneticsAgreeDiagonal") == "no"
).count()

6777

In [40]:
coherency_assessed_genetics.filter(F.col("geneticsAgreeOneCell") == "no").count()

                                                                                

12094

In [45]:
coherency_assessed_genetics.filter(F.col("geneticsAgreeDiagonal") == "no").groupBy(
    "datasources"
).count().sort(F.col("count").desc()).show(truncate=False)

+------------------------------------------+-----+
|datasources                               |count|
+------------------------------------------+-----+
|[ot_genetics_portal]                      |5629 |
|[ot_genetics_portal, gene_burden]         |580  |
|[intogen, cancer_gene_census]             |156  |
|[gene_burden]                             |99   |
|[impc, cancer_gene_census]                |81   |
|[eva, orphanet]                           |36   |
|[impc, orphanet]                          |33   |
|[impc, ot_genetics_portal]                |29   |
|[gene_burden, cancer_gene_census]         |14   |
|[impc, eva, orphanet]                     |13   |
|[eva, cancer_gene_census]                 |11   |
|[ot_genetics_portal, cancer_gene_census]  |9    |
|[impc, intogen, cancer_gene_census]       |9    |
|[impc, intogen]                           |8    |
|[eva, ot_genetics_portal]                 |6    |
|[intogen, eva_somatic, cancer_gene_census]|6    |
|[impc, ot_genetics_portal, gen

In [44]:
coherency_assessed_genetics.filter(F.col("geneticsAgreeOneCell") == "yes").groupBy(
    "datasources"
).count().sort(F.col("count").desc()).show(truncate=False)

+---------------------------------+------+
|datasources                      |count |
+---------------------------------+------+
|[impc]                           |638624|
|[ot_genetics_portal]             |83041 |
|[cancer_gene_census]             |33261 |
|[eva]                            |10918 |
|[gene_burden]                    |5739  |
|[impc, eva]                      |1684  |
|[intogen]                        |1604  |
|[eva, gene2phenotype]            |834   |
|[orphanet]                       |490   |
|[intogen, cancer_gene_census]    |486   |
|[gene2phenotype]                 |350   |
|[eva, orphanet]                  |321   |
|[ot_genetics_portal, gene_burden]|265   |
|[impc, eva, gene2phenotype]      |250   |
|[impc, cancer_gene_census]       |198   |
|[eva, orphanet, gene2phenotype]  |162   |
|[eva_somatic]                    |149   |
|[impc, eva, orphanet]            |115   |
|[impc, orphanet]                 |82    |
|[eva, gene_burden]               |76    |
+----------

### Let's generate a dataset with datasources in the column of homogenized:

In [5]:
complementary = (
    assessed_evidences.groupBy("targetId", "diseaseId")
    .pivot("homogenized")
    .agg(F.collect_set("datasourceId"))
)

In [6]:
taDf = spark.createDataFrame(
    data=[
        ("MONDO_0045024", "cell proliferation disorder", "Oncology"),
        ("EFO_0005741", "infectious disease", "Other"),
        ("OTAR_0000014", "pregnancy or perinatal disease", "Other"),
        ("EFO_0005932", "animal disease", "Other"),
        ("MONDO_0024458", "disease of visual system", "Other"),
        ("EFO_0000319", "cardiovascular disease", "Other"),
        ("EFO_0009605", "pancreas disease", "Other"),
        ("EFO_0010282", "gastrointestinal disease", "Other"),
        ("OTAR_0000017", "reproductive system or breast disease", "Other"),
        ("EFO_0010285", "integumentary system disease", "Other"),
        ("EFO_0001379", "endocrine system disease", "Other"),
        ("OTAR_0000010", "respiratory or thoracic disease", "Other"),
        ("EFO_0009690", "urinary system disease", "Other"),
        ("OTAR_0000006", "musculoskeletal or connective tissue disease", "Other"),
        ("MONDO_0021205", "disease of ear", "Other"),
        ("EFO_0000540", "immune system disease", "Other"),
        ("EFO_0005803", "hematologic disease", "Other"),
        ("EFO_0000618", "nervous system disease", "Other"),
        ("MONDO_0002025", "psychiatric disorder", "Other"),
        ("MONDO_0024297", "nutritional or metabolic disease", "Other"),
        ("OTAR_0000018", "genetic, familial or congenital disease", "Other"),
        ("OTAR_0000009", "injury, poisoning or other complication", "Other"),
        ("EFO_0000651", "phenotype", "Other"),
        ("EFO_0001444", "measurement", "Other"),
        ("GO_0008150", "biological process", "Other"),
    ],
    schema=StructType(
        [
            StructField("taId", StringType(), True),
            StructField("taLabel", StringType(), True),
            StructField("taLabelSimple", StringType(), True),
        ]
    ),
).withColumn("taRank", F.monotonically_increasing_id())

In [7]:
complete = (
    coherency_assessed.join(
        diseases.selectExpr("id as diseaseId", "name", "therapeuticAreas"),
        on="diseaseId",
        how="left",
    )
    .select(
        "*",
        F.explode_outer(F.col("therapeuticAreas")).alias("therapeuticAreas_expl"),
    )
    .join(
        taDf.selectExpr("taId as therapeuticAreas_expl", "taLabel", "taLabelSimple"),
        on="therapeuticAreas_expl",
        how="left",
    )
    .groupBy(
        "targetId",
        "diseaseId",
        "arrayDiseaseFromSource",
        "name",
        "GoF_protect",
        "GoF_risk",
        "LoF_protect",
        "LoF_risk",
        "coherencyDiagonal",
        "coherencyOneCell",
        "datasources",
        "maxClinPhase",
    )
    .agg(
        F.collect_set("therapeuticAreas_expl").alias("therapeuticAreas"),
        F.collect_set("taLabel").alias("taName"),
        F.collect_set("taLabelSimple").alias("taLabelSimple"),
    )
    .filter(
        (F.col("coherencyDiagonal") == "dispar")
        | (F.col("coherencyOneCell") == "dispar")
    )
    .join(
        target.selectExpr("id as targetId", "approvedSymbol"), on="targetId", how="left"
    )
    .withColumn(
        "linkOT",
        F.concat(
            F.lit("https://platform.opentargets.org/evidence/"),
            F.col("approvedSymbol"),
            F.lit("/"),
            F.col("diseaseId"),  # Column containing "EFO_0009188"
        ),
    )
    .persist()
)

In [38]:
complete.count()

15885

#### Some Key numbers

In [39]:
print("Total associations with DoE =", coherency_assessed.count())
print(
    "Total associations being discrepant on at least one criteria =", complete.count()
)
print(
    "Associations discrepant on coherency Diagonal =",
    complete.filter(F.col("coherencyDiagonal") == "dispar").count(),
)
print(
    "Associations discrepant on coherency oneCell =",
    complete.filter(F.col("coherencyOneCell") == "dispar").count(),
)

Total associations with DoE = 865816
Total associations being discrepant on at least one criteria = 15885
Associations discrepant on coherency Diagonal = 9137
Associations discrepant on coherency oneCell = 15885


#### More detailed comparisons inside of each criteria for coherency

In [23]:
complete.filter(
    F.size(
        F.array_intersect(
            F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
        )
    )
    > 0
).groupBy("taName", "coherencyDiagonal", "coherencyOneCell").count().show()

+--------------------+-----------------+----------------+-----+
|              taName|coherencyDiagonal|coherencyOneCell|count|
+--------------------+-----------------+----------------+-----+
|         [phenotype]|         coherent|          dispar|   44|
|         [phenotype]|           dispar|          dispar|  317|
|       [measurement]|         coherent|          dispar| 4882|
|       [measurement]|           dispar|          dispar| 5755|
|[genetic, familia...|         coherent|          dispar|    4|
|[genetic, familia...|           dispar|          dispar|    3|
|[urinary system d...|           dispar|          dispar|    3|
|[hematologic dise...|         coherent|          dispar|    1|
+--------------------+-----------------+----------------+-----+



#### 2/3 of associations under coherency Diagonal are Measurement/Phenotypes

In [40]:
#### Associations in coherency diagonal that are measurements/phenotypes
complete.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        > 0
    )
    & (F.col("coherencyDiagonal") == "dispar")
).count()

6078

#### After removing measurement/phenotypes:

In [44]:
complete.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        == 0
    )
    & (F.col("coherencyDiagonal") == "dispar")
).count()

3059

#### From this 3,059 associations, how many are Chembl VS Genetics?

In [9]:
complete.show()

+---------------+-------------+----------------------+--------------------+-----------+--------+-----------+--------+-----------------+----------------+--------------------+------------+--------------------+--------------------+-----------------+--------------+--------------------+
|       targetId|    diseaseId|arrayDiseaseFromSource|                name|GoF_protect|GoF_risk|LoF_protect|LoF_risk|coherencyDiagonal|coherencyOneCell|         datasources|maxClinPhase|    therapeuticAreas|              taName|    taLabelSimple|approvedSymbol|              linkOT|
+---------------+-------------+----------------------+--------------------+-----------+--------+-----------+--------+-----------------+----------------+--------------------+------------+--------------------+--------------------+-----------------+--------------+--------------------+
|ENSG00000151577|MONDO_0005351|  [Anorexia Nervosa...|    anorexia nervosa|          1|    null|          4|    null|           dispar|          dispar

### how many of genetics VS drugs are from cancer:

In [20]:
complete.filter(
    (F.size(F.col("datasources")) > 1)
    & (F.array_contains(F.col("datasources"), "chembl"))
).filter(F.array_contains(F.col("taLabelSimple"), "Oncology")).count()

1708

### Datasources composition for Genetics VS drugs in Oncology

In [26]:
complete.filter(
    (F.size(F.col("datasources")) > 1)
    & (F.array_contains(F.col("datasources"), "chembl"))
).filter(F.array_contains(F.col("taLabelSimple"), "Oncology")).groupBy(
    "datasources"
).count().sort(
    F.col("count").desc()
).show(
    100, truncate=False
)

+-------------------------------------------------------------------+-----+
|datasources                                                        |count|
+-------------------------------------------------------------------+-----+
|[chembl, cancer_gene_census]                                       |1260 |
|[impc, chembl]                                                     |172  |
|[intogen, chembl]                                                  |104  |
|[intogen, chembl, cancer_gene_census]                              |79   |
|[impc, chembl, cancer_gene_census]                                 |21   |
|[eva, chembl]                                                      |11   |
|[ot_genetics_portal, chembl]                                       |10   |
|[gene_burden, chembl, cancer_gene_census]                          |8    |
|[gene_burden, chembl]                                              |8    |
|[chembl, eva_somatic]                                              |6    |
|[eva, chemb

### Datasources composition for Genetics VS drugs in Other/Non oncology

In [28]:
complete.filter(
    (F.size(F.col("datasources")) > 1)
    & (F.array_contains(F.col("datasources"), "chembl"))
).filter(~F.array_contains(F.col("taLabelSimple"), "Oncology")).groupBy(
    "datasources"
).count().sort(
    F.col("count").desc()
).show(
    100, truncate=False
)

+----------------------------------------------------+-----+
|datasources                                         |count|
+----------------------------------------------------+-----+
|[impc, chembl]                                      |451  |
|[ot_genetics_portal, chembl]                        |110  |
|[eva, chembl]                                       |34   |
|[impc, eva, chembl]                                 |17   |
|[gene_burden, chembl]                               |10   |
|[impc, ot_genetics_portal, chembl]                  |9    |
|[ot_genetics_portal, gene_burden, chembl]           |5    |
|[impc, eva, orphanet, chembl]                       |3    |
|[impc, eva, ot_genetics_portal, chembl]             |3    |
|[orphanet, chembl]                                  |2    |
|[impc, eva, ot_genetics_portal, gene_burden, chembl]|2    |
|[impc, orphanet, chembl]                            |2    |
|[eva, orphanet, chembl]                             |2    |
|[eva, ot_genetics_porta

In [None]:
#### Associations in coherency diagonal that are measurements/phenotypes
complete.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        > 0
    )
    & (F.col("coherencyDiagonal") == "dispar")
).count()

In [42]:
#### Associations in coherency diagonal that are measurements/phenotypes
complete.filter((F.col("coherencyDiagonal") == "dispar")).groupBy(
    "taName"
).count().sort(F.col("count").desc()).show()

+--------------------+-----+
|              taName|count|
+--------------------+-----+
|       [measurement]| 5755|
|         [phenotype]|  317|
|[psychiatric diso...|  307|
|[cardiovascular d...|  207|
|[cell proliferati...|  178|
|[cell proliferati...|  155|
|[cell proliferati...|  148|
|[genetic, familia...|  106|
|[cell proliferati...|   98|
|[cell proliferati...|   82|
|[cell proliferati...|   76|
|[hematologic dise...|   64|
|[cell proliferati...|   61|
|[respiratory or t...|   60|
|[endocrine system...|   56|
|[hematologic dise...|   56|
|[cell proliferati...|   54|
|[genetic, familia...|   53|
|[biological process]|   49|
|[hematologic dise...|   49|
+--------------------+-----+
only showing top 20 rows



In [30]:
complete.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        > 0
    )
    & (F.col("coherencyDiagonal") == "dispar")
).groupBy("name").count().sort(F.col("count").desc()).show(250)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|mean corpuscular ...|  297|
|      platelet count|  235|
|mean platelet volume|  224|
|mean corpuscular ...|  224|
|  reticulocyte count|  222|
|         body height|  218|
|      lean body mass|  181|
|Red cell distribu...|  165|
|sex hormone-bindi...|  156|
|    eosinophil count|  137|
|heel bone mineral...|  137|
|    lymphocyte count|  137|
|   erythrocyte count|  132|
|      monocyte count|  128|
|appendicular lean...|  128|
|          hair color|  120|
|    neutrophil count|  120|
|hemoglobin measur...|  113|
|     leukocyte count|  105|
|reticulocyte meas...|   99|
|testosterone meas...|   96|
|forced expiratory...|   96|
|       fat body mass|   91|
|      vital capacity|   87|
| body fat percentage|   84|
|hair shape measur...|   79|
|       platelet crit|   78|
|mean corpuscular ...|   73|
|platelet componen...|   70|
|mean reticulocyte...|   70|
|          hematocrit|   67|
|blood protein

In [31]:
complete.filter(
    (
        F.size(
            F.array_intersect(
                F.col("taName"), F.array(F.lit("measurement"), F.lit("phenotype"))
            )
        )
        > 0
    )
    & (F.col("coherencyOneCell") == "dispar")
).groupBy("name").count().sort(F.col("count").desc()).show(250)

+--------------------+-----+
|                name|count|
+--------------------+-----+
|mean corpuscular ...|  550|
|      platelet count|  511|
|mean platelet volume|  417|
|         body height|  416|
|  reticulocyte count|  401|
|mean corpuscular ...|  332|
|      lean body mass|  320|
|sex hormone-bindi...|  319|
|Red cell distribu...|  313|
|    lymphocyte count|  290|
|appendicular lean...|  280|
|    eosinophil count|  270|
|   erythrocyte count|  265|
|heel bone mineral...|  264|
|    neutrophil count|  254|
|      monocyte count|  229|
|     leukocyte count|  224|
|hemoglobin measur...|  213|
|       platelet crit|  192|
|reticulocyte meas...|  188|
|      vital capacity|  182|
|       fat body mass|  172|
|forced expiratory...|  168|
|platelet componen...|  153|
| body fat percentage|  148|
|mean corpuscular ...|  145|
|          hematocrit|  139|
|mean reticulocyte...|  136|
|testosterone meas...|  131|
|          hair color|  129|
|eosinophil percen...|  115|
|lymphocyte pe

In [10]:
complete.filter(
    (F.col("diseaseId").isNotNull())
    & (
        (F.array_contains(F.col("taName"), "measurement"))
        | (F.array_contains(F.col("taName"), "phenotype"))
    )
).count()

                                                                                

11009

In [44]:
test3.groupBy("therapeuticAreas", "taName").count().sort(F.col("count").desc()).show(
    # truncate=False
)

+--------------------+--------------------+-----+
|    therapeuticAreas|              taName|count|
+--------------------+--------------------+-----+
|       [EFO_0001444]|       [measurement]|10637|
|       [EFO_0000651]|         [phenotype]|  361|
|[EFO_0000618, MON...|[psychiatric diso...|  325|
|       [EFO_0000319]|[cardiovascular d...|  317|
|     [MONDO_0045024]|[cell proliferati...|  293|
|[MONDO_0045024, E...|[cell proliferati...|  279|
|[OTAR_0000017, MO...|[cell proliferati...|  242|
|[MONDO_0045024, O...|[cell proliferati...|  190|
|[EFO_0009690, MON...|[cell proliferati...|  162|
|[OTAR_0000018, EF...|[genetic, familia...|  162|
|[MONDO_0045024, E...|[cell proliferati...|  129|
|[EFO_0005803, MON...|[hematologic dise...|  114|
|[OTAR_0000017, MO...|[cell proliferati...|  112|
|[OTAR_0000017, MO...|[cell proliferati...|  103|
|[MONDO_0045024, E...|[cell proliferati...|  101|
|[MONDO_0045024, E...|[cell proliferati...|   96|
|[EFO_0005803, MON...|[hematologic dise...|   96|
