In [1]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

In [2]:
# Obtain the Spark session used by every cell below: reuse the active one
# if it exists, otherwise create a session with the default configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/25 20:17:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Variant-level disease evidence, restricted to rows that carry a variant ID.
#
# `variantId` is split on "_" and its first two fields are exposed as the
# chromosome (`chr`) and the position (`genomicLocation`), e.g.
# "9_126696728_C_T" -> chr "9", genomicLocation "126696728". Both derived
# columns are strings, because they come straight out of `split`.
#
# `datasourceId` is selected as well: the residue/disease aggregation further
# down groups on it, so leaving it out makes that cell fail on a fresh kernel.
variant_id_fields = f.split(f.col("variantId"), "_")  # built once, reused for both columns

evidence = (
    spark.read.parquet("../evidence")
    .select("datasourceId", "targetId", "variantId", "diseaseId", "diseaseFromSource")
    .filter(f.col("variantId").isNotNull())
    .withColumn("chr", variant_id_fields.getItem(0))
    .withColumn("genomicLocation", variant_id_fields.getItem(1))
    .persist()
)

                                                                                

In [4]:
# Preview the first 20 evidence rows (long cell values are truncated).
evidence.show(n=20, truncate=True)

                                                                                

+------------+---------------+----------------+---------------+--------------------+---+---------------+
|datasourceId|       targetId|       variantId|      diseaseId|   diseaseFromSource|chr|genomicLocation|
+------------+---------------+----------------+---------------+--------------------+---+---------------+
|         eva|ENSG00000198727|    MT_15404_T_C|   Orphanet_506|      Leigh syndrome| MT|          15404|
|         eva|ENSG00000136944| 9_126696728_C_T|  Orphanet_2614|Nail-patella synd...|  9|      126696728|
|         eva|ENSG00000140368| 15_77028657_G_A| Orphanet_69126|Pyogenic arthriti...| 15|       77028657|
|         eva|ENSG00000187741| 16_89740085_C_T|    Orphanet_84|      Fanconi anemia| 16|       89740085|
|         eva|ENSG00000007372| 11_31793299_G_A|  MONDO_0007350|Coloboma, ocular,...| 11|       31793299|
|         eva|ENSG00000007372| 11_31786949_C_T|    Orphanet_77| Congenital aniridia| 11|       31786949|
|         eva|ENSG00000013503|12_106427339_T_C|Orphanet

In [5]:
# Residue-level interaction records with their genomic positions.
# Each record nests its details in a `resInfos` struct; the fields listed
# below are lifted to top-level columns and the struct is then discarded.
gen_location_raw = spark.read.json("residue_gen_pos_output/residue_genomic_position.json")

# (new top-level column, path of the source field inside the record)
RES_INFO_FIELDS = [
    ("pdbCompound", "resInfos.compound"),
    ("resNb", "resInfos.res_nb"),
    ("chain", "resInfos.chain"),
    ("resType", "resInfos.res_type"),
    ("interType", "resInfos.inter_type"),
    ("chr", "resInfos.chromosome"),
    ("genLocation_1", "resInfos.genLocation.res_pos_1"),
    ("genLocation_2", "resInfos.genLocation.res_pos_2"),
    ("genLocation_3", "resInfos.genLocation.res_pos_3"),
]

gen_location_flat = gen_location_raw
for flat_name, nested_path in RES_INFO_FIELDS:
    gen_location_flat = gen_location_flat.withColumn(flat_name, gen_location_raw[nested_path])

gen_location = gen_location_flat.drop("resInfos").persist()

                                                                                

In [6]:
# Show a single record, untruncated, one field per line.
gen_location.show(n=1, truncate=False, vertical=True)

[Stage 6:>                                                          (0 + 1) / 1]

-RECORD 0------------------------
 geneId        | ENSG00000001626 
 pdbStructId   | 1xmj            
 pdbCompound   | ATP             
 resNb         | 466             
 chain         | A               
 resType       | SER             
 interType     | hbond           
 chr           | 7               
 genLocation_1 | 117559467       
 genLocation_2 | 117559468       
 genLocation_3 | 117559469       
only showing top 1 row



                                                                                

In [19]:
# Attach disease evidence to ligand-interacting residues.
#
# A residue row matches an evidence row when both sit on the same chromosome
# and the variant position equals any of the three genomic positions recorded
# for the residue (genLocation_1..3). Matches are then collapsed to one row
# per residue / compound / data source / disease, with the distinct variants
# gathered in `variants`.
#
# `pdbCompound` is one of the grouping keys because the compound-name join
# further down joins on it; an aggregation only keeps its keys and aggregates,
# so leaving it out would make that join fail.
#
# NOTE(review): `chain` is not a grouping key, so rows that differ only by
# chain are merged into one group — confirm this is intended.
variant_hits_residue = (
    (gen_location.genLocation_1 == evidence.genomicLocation)
    | (gen_location.genLocation_2 == evidence.genomicLocation)
    | (gen_location.genLocation_3 == evidence.genomicLocation)
)

res_with_disease = (
    gen_location
    .join(evidence, (gen_location.chr == evidence.chr) & variant_hits_residue)
    .groupby(
        "geneId",
        "pdbStructId",
        "pdbCompound",
        "resNb",
        "resType",
        "interType",
        "datasourceId",
        "diseaseId",
        "diseaseFromSource",
    )
    # Named explicitly; otherwise Spark labels the column with the expression text.
    .agg(f.collect_set(f.struct(f.col("variantId"))).alias("variants"))
    .persist()
)

In [20]:
# Show five aggregated rows, untruncated, one field per line.
res_with_disease.show(n=5, truncate=False, vertical=True)

[Stage 35:====>                                                  (15 + 8) / 200]

In [9]:
# MOLECULES
# Molecule table reduced to its InChIKey and name. The key is renamed to
# lower-case `inchikey`, the column name used for the join with `inchikey_df`.
molecule_columns = [f.col("inchiKey").alias("inchikey"), "name"]

molecule_df = (
    spark.read.parquet("../input_files/molecule/")
    .select(*molecule_columns)
    .persist()
)

In [10]:
# INCHIKEY MOLECULES
# Lookup from InChIKey to component ID, read from a comma-separated file
# with a header row; lines starting with "#" are skipped as comments.
# `CCD_ID` is exposed as `pdbCompound`.
inchikey_csv = (
    spark.read
    .option("sep", ",")
    .option("header", True)
    .option("comment", "#")
    .csv("../input_files/inchikeys/components_inchikeys.csv")
)

inchikey_df = inchikey_csv.select(
    f.col("InChIKey").alias("inchikey"),
    f.col("CCD_ID").alias("pdbCompound"),
).persist()

In [42]:
# MOLECULE WITH COMPOUND ID
# Inner-join molecules to the InChIKey lookup so each molecule name carries
# its `pdbCompound` ID; the join key itself is not needed afterwards.
molecules_with_compound = molecule_df.join(inchikey_df, on="inchikey", how="inner")

molecules_inchikey_join = molecules_with_compound.drop("inchikey").persist()

In [None]:
# Preview the first 20 molecule/compound rows (long cell values are truncated).
molecules_inchikey_join.show(n=20, truncate=True)

In [43]:
# COMPOUND NAME
# Inner-join the named compounds to the residue/disease table on the
# shared `pdbCompound` column.
disease_ass_comp_name = molecules_inchikey_join.join(
    res_with_disease,
    on="pdbCompound",
    how="inner",
)

                                                                                

+-----------+------------+---------------+-----------+-----+-----+-------+-------------+---+-------------+-------------+-------------+------------+---------------+-------------+-----------+--------------------+---+---------------+
|pdbCompound|        name|         geneId|pdbStructId|resNb|chain|resType|    interType|chr|genLocation_1|genLocation_2|genLocation_3|datasourceId|       targetId|    variantId|  diseaseId|   diseaseFromSource|chr|genomicLocation|
+-----------+------------+---------------+-----------+-----+-----+-------+-------------+---+-------------+-------------+-------------+------------+---------------+-------------+-----------+--------------------+---+---------------+
|        GDP|CHEMBL384759|ENSG00000136238|       1g4u|   17|    R|    THR|        hbond|  7|      6387227|      6387228|      6387229|         eva|ENSG00000136238|7_6387229_G_A|EFO_0009156|Intellectual disa...|  7|        6387229|
|        GDP|CHEMBL384759|ENSG00000136238|       1g4u|   16|    R|    LYS|  

In [44]:
# Show two joined rows, one field per line (long cell values are truncated).
disease_ass_comp_name.show(n=2, truncate=True, vertical=True)

ERROR:root:KeyboardInterrupt while sending command.>               (8 + 3) / 11]
Traceback (most recent call last):
  File "/Users/marinegirardey/miniforge3/envs/plip_env/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/marinegirardey/miniforge3/envs/plip_env/lib/python3.8/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/marinegirardey/miniforge3/envs/plip_env/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 