In [13]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, IntegerType, StringType

In [14]:
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
    )

### Open the crossref Uniprot Ensembl to add the Ensembl id and merge

In [15]:
residue_index = (
    spark.read.json("residue_gen_pos_output/res_genomic_pos_mapped.json")
    .withColumnRenamed("pdb_structure_id", "pdbStructureId")
    .withColumnRenamed("prot_chain_id", "chainId")
    .withColumnRenamed("compound_id", "pdbCompoundId")
)
residue_index.show(2)

+---------+-------------+--------------------+--------------+-------+-------------------+-----------------+---------------+----------------+
|accession|pdbCompoundId|   interaction_types|pdbStructureId|chainId|prot_residue_number|prot_residue_type| translation_id|uniprot_position|
+---------+-------------+--------------------+--------------+-------+-------------------+-----------------+---------------+----------------+
|   P09211|          GSH|             [hbond]|          11gs|      A|                 52|              LEU|ENSP00000381607|              53|
|   P09211|          GSH|[hydroph_interact...|          11gs|      B|                 13|              ARG|ENSP00000381607|              14|
+---------+-------------+--------------------+--------------+-------+-------------------+-----------------+---------------+----------------+
only showing top 2 rows



In [16]:
schema = StructType([
    StructField("UniProtKB-AC", StringType(), True),
    StructField("ID_type", StringType(), True),
    StructField("ID", StringType(), True)
    ])

cross_ref_uniprot = (
                    spark
                        .read.csv("files_to_merge_genomic_loc/HUMAN_9606_idmapping.tsv", sep="\t", schema=schema)
                        .filter(f.col('ID_type').rlike('Ensembl_PRO'))
                        .select(f.col('UniProtKB-AC'), f.col('ID'))
                        .withColumnRenamed("UniProtKB-AC", "accession")
                        .withColumnRenamed("ID", "ensemblProtId")
    )
cross_ref_uniprot.show(2)

+---------+---------------+
|accession|  ensemblProtId|
+---------+---------------+
|   P31946|ENSP00000300161|
|   P31946|ENSP00000361930|
+---------+---------------+
only showing top 2 rows



In [17]:
residue_index_1 = (
    cross_ref_uniprot
    .join(residue_index, on='accession', how='inner')
    .drop("translation_id")
)
residue_index_1.show(2)



+---------+---------------+-------------+-----------------+--------------+-------+-------------------+-----------------+----------------+
|accession|  ensemblProtId|pdbCompoundId|interaction_types|pdbStructureId|chainId|prot_residue_number|prot_residue_type|uniprot_position|
+---------+---------------+-------------+-----------------+--------------+-------+-------------------+-----------------+----------------+
|   P00918|ENSP00000285379|           ZN|  [metal_complex]|          12ca|      A|                 94|              HIS|              94|
|   P00918|ENSP00000285379|           ZN|  [metal_complex]|          12ca|      A|                119|              HIS|             119|
+---------+---------------+-------------+-----------------+--------------+-------+-------------------+-----------------+----------------+
only showing top 2 rows



                                                                                

In [18]:
generated_mapping = (
    spark.read.csv("files_to_merge_genomic_loc/generated_mappings.tsv", sep="\t", header=True)
    .withColumnRenamed("protein_id", "ensemblProtId")
    .withColumnRenamed("gene_id", "geneId")
    .withColumnRenamed("amino_acid_position", "prot_residue_number")
)
generated_mapping.show(2)

+---------+---------+---------+---------------+---------------+---+------+-------------------+
|     pos1|     pos2|     pos3|  ensemblProtId|         geneId|chr|strand|prot_residue_number|
+---------+---------+---------+---------------+---------------+---+------+-------------------+
|127588499|127588500|127588501|ENSP00000000233|ENSG00000004059|  7|     +|                  1|
|127588502|127588503|127588504|ENSP00000000233|ENSG00000004059|  7|     +|                  2|
+---------+---------+---------+---------------+---------------+---+------+-------------------+
only showing top 2 rows



In [19]:
residue_gen_pos = (
    residue_index_1
    .join(generated_mapping, on=['ensemblProtId', 'prot_residue_number'], how='inner')
    .drop("uniprot_position")
)
residue_gen_pos.show(2)



+---------------+-------------------+---------+-------------+-----------------+--------------+-------+-----------------+--------+--------+--------+---------------+---+------+
|  ensemblProtId|prot_residue_number|accession|pdbCompoundId|interaction_types|pdbStructureId|chainId|prot_residue_type|    pos1|    pos2|    pos3|         geneId|chr|strand|
+---------------+-------------------+---------+-------------+-----------------+--------------+-------+-----------------+--------+--------+--------+---------------+---+------+
|ENSP00000002596|                221|   O14792|          A3P|        [pistack]|          1zrh|      A|              PHE|11399345|11399344|11399343|ENSG00000002587|  4|     -|
|ENSP00000005178|                330|   Q16654|          P4A|          [hbond]|          2zdx|      A|              PHE|95587117|95587116|95587115|ENSG00000004799|  7|     -|
+---------------+-------------------+---------+-------------+-----------------+--------------+-------+-----------------+-----

                                                                                

In [20]:
evidence = (
    spark.read.parquet("../../evidence")
    .filter(f.col("variantId").isNotNull())
    .withColumn('chromosome', f.split(f.col('variantId'), '_').getItem(0))
    .withColumn('genomicLocation', f.split(f.col('variantId'), '_').getItem(1))
    .groupBy('chromosome', 'genomicLocation')
    .agg(
        f.collect_set(f.struct(
            f.col('variantId'), 
            f.col('diseaseId'), 
            f.col('diseaseFromSource'),
            f.col('datasourceId'))).alias('evidenceInfo')
    )
)
evidence.show(2, truncate=False)



+----------+---------------+-----------------------------------------------------------------+
|chromosome|genomicLocation|evidenceInfo                                                     |
+----------+---------------+-----------------------------------------------------------------+
|1         |100188948      |[{1_100188948_C_G, Orphanet_511, Maple syrup urine disease, eva}]|
|1         |100192396      |[{1_100192396_A_G, Orphanet_511, Maple syrup urine disease, eva}]|
+----------+---------------+-----------------------------------------------------------------+
only showing top 2 rows



                                                                                

In [21]:
unpivot_expression = '''stack(3, 'pos1', pos1, 'pos2', pos2, 'pos3', pos3) as (genLocation_label, genLocation_val)'''

gen_location_unpivot = (
    residue_gen_pos
        .select('geneId', 'accession', 'pdbStructureId', 'pdbCompoundId', 'prot_residue_number', 'chainId', 'prot_residue_type', 'chr', f.expr(unpivot_expression))
)
gen_location_unpivot.show(5)



+---------------+---------+--------------+-------------+-------------------+-------+-----------------+---+-----------------+---------------+
|         geneId|accession|pdbStructureId|pdbCompoundId|prot_residue_number|chainId|prot_residue_type|chr|genLocation_label|genLocation_val|
+---------------+---------+--------------+-------------+-------------------+-------+-----------------+---+-----------------+---------------+
|ENSG00000002587|   O14792|          1zrh|          A3P|                221|      A|              PHE|  4|             pos1|       11399345|
|ENSG00000002587|   O14792|          1zrh|          A3P|                221|      A|              PHE|  4|             pos2|       11399344|
|ENSG00000002587|   O14792|          1zrh|          A3P|                221|      A|              PHE|  4|             pos3|       11399343|
|ENSG00000004799|   Q16654|          2zdx|          P4A|                330|      A|              PHE|  7|             pos1|       95587117|
|ENSG00000004

                                                                                

In [22]:
disease_association = (
    gen_location_unpivot
    .join(
        evidence, 
        (gen_location_unpivot.chr == evidence.chromosome) &
        (gen_location_unpivot.genLocation_val == evidence.genomicLocation),
        how='left'
    )
)
disease_association.show(5)



+---------------+---------+--------------+-------------+-------------------+-------+-----------------+---+-----------------+---------------+----------+---------------+------------+
|         geneId|accession|pdbStructureId|pdbCompoundId|prot_residue_number|chainId|prot_residue_type|chr|genLocation_label|genLocation_val|chromosome|genomicLocation|evidenceInfo|
+---------------+---------+--------------+-------------+-------------------+-------+-----------------+---+-----------------+---------------+----------+---------------+------------+
|ENSG00000004478|   Q02790|          4drj|          RAP|                 87|      A|              ILE| 12|             pos3|        2797739|      null|           null|        null|
|ENSG00000004478|   Q02790|          4lax|          FK5|                 87|      A|              ILE| 12|             pos3|        2797739|      null|           null|        null|
|ENSG00000002587|   O14792|          1zrh|          A3P|                 35|      A|           

                                                                                

In [23]:
# MOLECULES
molecule = (
    spark.read
    .parquet("../../molecule/")
    .select(
        f.col('inchiKey').alias('inchikey'), 
        f.col('name').alias('drugName')
        #, 'linkedTargets', 'linkedDiseases'
    )
    .persist()
)
# INCHIKEY MOLECULES
inchikey = (
    spark.read
        .csv("../../inchikey/components_inchikeys.csv", sep=',', header=True, comment='#')
        .select(
            f.col('InChIKey').alias('inchikey'), 
            f.col('CCD_ID').alias('pdbCompound')
        )
    .persist()
)
# MOLECULE WITH COMPOUND ID
molecules_inchikey_join = (
    molecule
    .join(inchikey, on='inchikey')
    .drop("inchikey")
    .persist()
)
molecules_inchikey_join.show(1)

+--------------------+-----------+
|            drugName|pdbCompound|
+--------------------+-----------+
|(1-Phenylcyclopen...|        007|
+--------------------+-----------+
only showing top 1 row



In [24]:
# TARGET
targets = (
    spark.read
    .parquet("../../targets/")
    .select(
        f.col('id').alias('targetId'), 
        f.col('approvedSymbol').alias('symbol'),
        f.col('approvedName').alias('name')
    )
    .persist()
)
targets.show()

+---------------+-------+--------------------+
|       targetId| symbol|                name|
+---------------+-------+--------------------+
|ENSG00000002016|  RAD52|RAD52 homolog, DN...|
|ENSG00000002745|  WNT16|Wnt family member 16|
|ENSG00000033867| SLC4A7|solute carrier fa...|
|ENSG00000035499|DEPDC1B|DEP domain contai...|
|ENSG00000038532|CLEC16A|C-type lectin dom...|
|ENSG00000051382| PIK3CB|phosphatidylinosi...|
|ENSG00000068137|PLEKHH3|pleckstrin homolo...|
|ENSG00000075826| SEC31B|SEC31 homolog B, ...|
|ENSG00000079616|  KIF22|kinesin family me...|
|ENSG00000083845|   RPS5|ribosomal protein S5|
|ENSG00000084628| NKAIN1|sodium/potassium ...|
|ENSG00000089356|  FXYD3|FXYD domain conta...|
|ENSG00000099974|   DDTL|D-dopachrome taut...|
|ENSG00000100191| SLC5A4|solute carrier fa...|
|ENSG00000101049|   SGK2|serum/glucocortic...|
|ENSG00000101448|  EPPIN|epididymal peptid...|
|ENSG00000102409|   BEX4|brain expressed X...|
|ENSG00000102466|  FGF14|fibroblast growth...|
|ENSG00000104

In [32]:
# DISEASE
diseases = (
    spark.read
    .parquet("../../diseases/")
    .select(
        f.col('id').alias('diseaseId'), f.col('name').alias('diseaseName')
    )
    .persist()
)
diseases.show(1, True, True)

-RECORD 0-----------------------------------
 id                  | GO_0044238           
 code                | http://purl.oboli... 
 dbXRefs             | []                   
 description         | The chemical reac... 
 name                | primary metabolic... 
 directLocationIds   | null                 
 obsoleteTerms       | null                 
 parents             | [GO_0008152]         
 sko                 | null                 
 synonyms            | {null, [primary m... 
 ancestors           | [GO_0008152, GO_0... 
 descendants         | [GO_0005975, GO_0... 
 children            | [GO_0005975, GO_0... 
 therapeuticAreas    | [GO_0008150]         
 indirectLocationIds | [GO_0009101]         
 ontology            | {false, false, {h... 
only showing top 1 row



In [26]:
associations = ( # <- 13_422
    disease_association
    .select('*', f.explode('evidenceInfo').alias('evidence'))
    .select('*', f.col('evidence.*'))
    .select(
        f.col('geneId').alias('targetId'), 
        f.col('pdbCompoundId').alias('pdbCompound'), 
        'diseaseId', 
        'variantId'
    )
    .distinct()
    .persist()
)

In [27]:
associations.show(2)



+---------------+-----------+-------------+---------------+
|       targetId|pdbCompound|    diseaseId|      variantId|
+---------------+-----------+-------------+---------------+
|ENSG00000081307|        ATP|Orphanet_1934|3_132665825_G_A|
|ENSG00000097007|        DB8|  EFO_0000339|9_130872896_C_T|
+---------------+-----------+-------------+---------------+
only showing top 2 rows



                                                                                

In [31]:
# Let's join:
mapped_associations = (
    associations
    .groupBy('targetId', 'pdbCompound', 'diseaseId')
    .agg(f.collect_set('variantId').alias('variantIds'))

    # Joining with disease name -> name:
    .join(diseases, on='diseaseId', how='left')

    # Joining with gene name -> symbol:
    .join(targets, on='targetId', how='left')

    # Joining with drug name -> :
    .join(molecules_inchikey_join, on='pdbCompound')

    .persist()
)

excluded = ['ATP', 'ZN', 'GLU']
(
    mapped_associations
    .filter(
        (f.col('pdbCompound') != 'ATP')
        & (f.col('pdbCompound') != 'ZN')
    )
    .drop('targetId', 'diseaseId')
    # .write.parquet('pdb2variants2disease')
    .count()
)

22/05/11 10:19:12 WARN CacheManager: Asked to cache already cached data.


3602

In [None]:
# %%bash 

# gsutil cp -r pdb2variants2disease gs://ot-team/marine/
# #rm -rf pdb2variants2disease

Copying file://pdb2variants2disease/part-00118-c6da1fb5-98b2-4314-81f5-2acc0ed86b45-c000.snappy.parquet [Content-Type=application/octet-stream]...
Copying file://pdb2variants2disease/part-00146-c6da1fb5-98b2-4314-81f5-2acc0ed86b45-c000.snappy.parquet [Content-Type=application/octet-stream]...
Copying file://pdb2variants2disease/.part-00112-c6da1fb5-98b2-4314-81f5-2acc0ed86b45-c000.snappy.parquet.crc [Content-Type=application/octet-stream]...
Copying file://pdb2variants2disease/part-00173-c6da1fb5-98b2-4314-81f5-2acc0ed86b45-c000.snappy.parquet [Content-Type=application/octet-stream]...
/ [4 files][  8.2 KiB/  8.2 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying file://pdb2variants2disease/part-00127-c6da1fb5-98b2-4314-81f5-2ac