### Investigation of potential new mechanisms of action or indications

In [2]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

In [3]:
# Local Spark session on all available cores ('local[*]'), with the driver
# memory raised to 15g. getOrCreate() reuses an already running session.
spark_builder = SparkSession.builder.master('local[*]')
spark_builder = spark_builder.config("spark.driver.memory", "15g")
spark = spark_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/12 09:24:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Evidence table read from the "pdb2variants2disease" parquet dataset.
# The 'name' column is renamed to 'targetName', and every row containing at
# least one null value is discarded.
# NOTE: an earlier draft aggregated pdbCompound / drugName / variantIds per
# (diseaseName, symbol, targetName) at this point; that step is disabled.
evidence_raw = spark.read.parquet("pdb2variants2disease")
disease_evidence = (
    evidence_raw
    .withColumnRenamed('name', 'targetName')
    .dropna(how="any")
)

# Quick sanity check: print a single row.
disease_evidence.show(n=1)

                                                                                

+-----------+-----------------+--------------------+------+--------------------+------------+
|pdbCompound|       variantIds|         diseaseName|symbol|          targetName|    drugName|
+-----------+-----------------+--------------------+------+--------------------+------------+
|        CJH|[16_72014532_C_T]|Postaxial acrofac...| DHODH|dihydroorotate de...|CHEMBL483995|
+-----------+-----------------+--------------------+------+--------------------+------------+
only showing top 1 row



                                                                                

In [37]:
# Target index: keep only the identifier and the approved name, renamed to
# 'targetId' and 'targetName'. Persisted because it is reused in later joins.
target_columns = [
    f.col('id').alias('targetId'),
    f.col('approvedName').alias('targetName'),
]
targets = spark.read.parquet('../../targets').select(*target_columns).persist()

# Disease index: keep only the identifier and the name, renamed to
# 'diseaseId' and 'diseaseName'. Persisted because it is reused in later joins.
diseases = (
    spark.read.parquet('../../diseases')
    .select(
        f.col('id').alias('diseaseId'),
        f.col('name').alias('diseaseName')
    )
    .persist()
)

# DataFrame.show() prints the table itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" to the cell output).
diseases.show()

# Molecule index: identifier, InChIKey (as 'inchikey'), name (as 'drugName')
# and the 'linkedTargets' / 'linkedDiseases' columns, whose 'rows' field is
# exploded further down in this notebook.
molecule_columns = [
    'id',
    f.col('inchiKey').alias('inchikey'),
    f.col('name').alias('drugName'),
    'linkedTargets',
    'linkedDiseases',
]
molecule = (
    spark.read
    .parquet("../../molecule/")
    .select(*molecule_columns)
    .persist()
)

22/05/12 10:41:19 WARN CacheManager: Asked to cache already cached data.
22/05/12 10:41:19 WARN CacheManager: Asked to cache already cached data.


+-------------+--------------------+
|    diseaseId|         diseaseName|
+-------------+--------------------+
|   GO_0044238|primary metabolic...|
|   HP_0002350|     Cerebellar cyst|
|   HP_0003401|         Paresthesia|
|   HP_0004532|Sacral hypertrich...|
|   HP_0012758|Neurodevelopmenta...|
|   HP_0032155|    Abdominal cramps|
|MONDO_0000473|   arterial disorder|
|MONDO_0000726|idiopathic scoliosis|
|MONDO_0001370|pericardial effusion|
|MONDO_0001574|  capillary disorder|
|MONDO_0001673|   diarrheal disease|
|MONDO_0003724|non-proliferative...|
|MONDO_0005299|      brain ischemia|
|MONDO_0011895|idiopathic hypere...|
|MONDO_0012187|Fanconi anemia co...|
|MONDO_0014662|congenital insens...|
|MONDO_0017375|congenital entero...|
|MONDO_0021041|pleural solitary ...|
|MONDO_0022208| crystal arthropathy|
|MONDO_0024305|acquired hyperpro...|
+-------------+--------------------+
only showing top 20 rows

None


22/05/12 10:41:19 WARN CacheManager: Asked to cache already cached data.


In [34]:
def _collect_linked_names(molecules, linked_col, index, id_col, name_col, out_col):
    """Collect, per drug, the set of names of the entities linked to it.

    Args:
        molecules: DataFrame with an 'id' column and a `linked_col` column
            exposing a 'rows' array of identifiers.
        linked_col: Name of the column holding the linked identifiers
            (e.g. 'linkedTargets').
        index: Lookup DataFrame containing `id_col` and `name_col`.
        id_col: Name given to the exploded identifier; also the join key
            against `index`.
        name_col: Column of `index` holding the human-readable name.
        out_col: Name of the resulting array column.

    Returns:
        A persisted DataFrame with columns 'drugId' and `out_col`.
    """
    return (
        molecules
        # Molecules without any linked entity are skipped here.
        .filter(f.col(linked_col).isNotNull())
        .select(
            f.col('id').alias('drugId'),
            # One row per (drug, linked identifier).
            f.explode(f.col(f'{linked_col}.rows')).alias(id_col)
        )
        # Left join: identifiers missing from the index are kept with a null
        # name, which collect_set ignores.
        .join(index, on=id_col, how='left')
        .groupby('drugId')
        .agg(
            f.collect_set(name_col).alias(out_col)
        )
        .persist()
    )


# Names of the targets linked to each drug.
drugs_w_linked_target_names = _collect_linked_names(
    molecule, 'linkedTargets', targets,
    id_col='targetId', name_col='targetName', out_col='linkedTargetName',
)

# Names of the diseases linked to each drug.
drugs_w_linked_disease_names = _collect_linked_names(
    molecule, 'linkedDiseases', diseases,
    id_col='diseaseId', name_col='diseaseName', out_col='linkedDiseaseName',
)

# Molecule id and name, left-joined with the linked disease names and the
# linked target names. Drugs missing from either table keep a null in the
# corresponding column.
molecule_names = molecule.select(f.col('id').alias('drugId'), 'drugName')
resolved_molecules = (
    molecule_names
    .join(drugs_w_linked_disease_names, on='drugId', how='left')
    .join(drugs_w_linked_target_names, on='drugId', how='left')
)


# Step 1: for every (drugName, variantIds) pair in the evidence, gather the
# distinct disease names and target names that were mapped to it.
# The output column 'mappedtargetName' keeps its original lowercase 't'.
evidence_per_drug_variant = (
    disease_evidence
    .select('drugName', 'diseaseName', 'targetName', 'variantIds')
    .groupBy('drugName', 'variantIds')
    .agg(
        f.collect_set('diseaseName').alias('mappedDiseaseName'),
        f.collect_set('targetName').alias('mappedtargetName'),
    )
)

# Step 2: attach the linked diseases / targets already known for the drug.
# NOTE(review): the join key is the drug *name*, not its id; this presumes
# names are unique in the molecule index and spelled identically in both
# tables. TODO confirm.
joined_mapped_linked = (
    evidence_per_drug_variant
    .join(resolved_molecules, on='drugName', how='left')
    .persist()
)

22/05/12 10:27:15 WARN CacheManager: Asked to cache already cached data.
22/05/12 10:27:15 WARN CacheManager: Asked to cache already cached data.


In [38]:
# Show 10 rows whose 'linkedDiseaseName' is populated (non-null), without
# truncating values and with one field per line (vertical layout).
has_linked_disease = f.col('linkedDiseaseName').isNotNull()
joined_mapped_linked.filter(has_linked_disease).show(n=10, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
%%bash

# List the target index directory: long format, hidden entries included,
# sizes in human-readable units.
ls -lah ../../targets

total 69M
drwxrwxr-x  2 marinegirardey marinegirardey  24K Apr 14 11:06 .
drwxr-xr-x 22 marinegirardey marinegirardey 4.0K May 11 13:28 ..
-rwxrwxr-x  1 marinegirardey marinegirardey    0 Apr 14 11:06 _SUCCESS
-rwxrwxr-x  1 marinegirardey marinegirardey 352K Apr 14 11:06 part-00000-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet
-rwxrwxr-x  1 marinegirardey marinegirardey 326K Apr 14 11:06 part-00001-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet
-rwxrwxr-x  1 marinegirardey marinegirardey 284K Apr 14 11:06 part-00002-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet
-rwxrwxr-x  1 marinegirardey marinegirardey 306K Apr 14 11:06 part-00003-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet
-rwxrwxr-x  1 marinegirardey marinegirardey 362K Apr 14 11:06 part-00004-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet
-rwxrwxr-x  1 marinegirardey marinegirardey 379K Apr 14 11:06 part-00005-ad8db45e-239a-4036-88a1-012033909e5a-c000.snappy.parquet
-rwxrwxr-x