# Unichem and Inchikeys mapping of molecules from OT parquet files comparison

In [2]:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession

molecule file here (Drug): http://ftp.ebi.ac.uk/pub/databases/opentargets/platform/21.11/output/etl/parquet/molecule/*

inchikey file : https://ftp.ebi.ac.uk/pub/databases/msd/pdbechem_v2/components_inchikeys.csv

unichem file : https://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src3.txt.gz

ensembl file : https://ftp.ebi.ac.uk/pub/databases/msd/sifts/csv/pdb_chain_ensembl.csv

In [11]:
molecule = 'input_files/molecule/'
unichem = 'input_files/unichem/src1src3.txt'
ensembl = 'input_files/ensembl/pdb_chain_ensembl.csv'
inchikey = 'input_files/inchikeys/components_inchikeys.csv'

In [4]:
spark = SparkSession.builder.getOrCreate()

22/05/04 12:03:26 WARN Utils: Your hostname, Marines-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 172.23.42.25 instead (on interface en0)
22/05/04 12:03:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/04 12:03:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/04 12:03:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/04 12:03:30 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/04 12:03:30 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/04 12:03:30 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


### Molecule from parquet dataset

In [13]:
molecule_df = (
    spark.read
    .parquet(molecule)
    .select(f.col('inchiKey').alias('InChIKey'), f.col('id').alias('chembl_id'), 'name', 'linkedTargets', 'linkedDiseases')
)
molecule_df.show(2)
molecule_df.count()

                                                                                

+--------------------+-------------+--------------------+--------------------+--------------------+
|            InChIKey|    chembl_id|                name|       linkedTargets|      linkedDiseases|
+--------------------+-------------+--------------------+--------------------+--------------------+
|GTTBEUCJPZQMDZ-UH...|CHEMBL1079742|ERLOTINIB HYDROCH...|{[ENSG00000146648...|{[MONDO_0044926, ...|
|ITPDYQOUSLNIHG-UH...|CHEMBL1083993|AMIODARONE HYDROC...|{[ENSG00000055118...|{[EFO_0000275, EF...|
+--------------------+-------------+--------------------+--------------------+--------------------+
only showing top 2 rows



                                                                                

12594

### Unichem dataset

In [15]:
unichem_df = (
    spark.read.csv(unichem, sep=r'\t', header=True)
    .withColumnRenamed('From src:\'1\'', 'chembl_id')
    .withColumnRenamed('To src:\'3\'', 'pdb_compound_id')
)
unichem_df.show(2)
unichem_df.count()

                                                                                

+-------------+---------------+
|    chembl_id|pdb_compound_id|
+-------------+---------------+
| CHEMBL313405|            ABZ|
|CHEMBL1399676|            MQN|
+-------------+---------------+
only showing top 2 rows



                                                                                

14452

### Inchikey dataset

In [16]:
inchikey_df = (
    spark.read
        .csv(inchikey, sep=',', header=True, comment='#')
)
inchikey_df.show(2, truncate=False)
inchikey_df.count()

                                                                                

+------+---------------------------+
|CCD_ID|InChIKey                   |
+------+---------------------------+
|000   |CXHHBNMLPJOKQD-UHFFFAOYSA-N|
|001   |NBYCDVVSYOMFMS-VMPREFPWSA-N|
+------+---------------------------+
only showing top 2 rows



                                                                                

36500

### Join molecule and Inchikey

In [17]:
inchikey_molecule_df = (molecule_df
                       .join(inchikey_df, on='InChIKey')
                       )
inchikey_molecule_df.show(2)
inchikey_molecule_df.count()

                                                                                

+--------------------+------------+--------------------+-------------+--------------+------+
|            InChIKey|   chembl_id|                name|linkedTargets|linkedDiseases|CCD_ID|
+--------------------+------------+--------------------+-------------+--------------+------+
|SJWOFBVBNFLWLP-UH...|CHEMBL382127|(1-Phenylcyclopen...|         null|          null|   007|
|OEVYDSSAPNIURZ-AE...|CHEMBL381806|        CHEMBL381806|         null|          null|   008|
+--------------------+------------+--------------------+-------------+--------------+------+
only showing top 2 rows



                                                                                

3752

### Join molecule and Unichem

In [18]:
unichem_molecule_df = (molecule_df
                       .join(unichem_df, on='chembl_id')
                       )
unichem_molecule_df.show(2)
unichem_molecule_df.count()

                                                                                

+-------------+--------------------+--------------+--------------------+-------------------+---------------+
|    chembl_id|            InChIKey|          name|       linkedTargets|     linkedDiseases|pdb_compound_id|
+-------------+--------------------+--------------+--------------------+-------------------+---------------+
| CHEMBL110458|LXBIFEVIBLOUGU-DP...|    MIGALASTAT|{[ENSG00000102393...|{[Orphanet_324], 1}|            DGJ|
|CHEMBL1169388|RTHCYVBBDHJXIQ-IN...|(S)-Fluoxetine|                null|               null|            SFX|
+-------------+--------------------+--------------+--------------------+-------------------+---------------+
only showing top 2 rows



                                                                                

3657

### Mapping of Unichem and Inchikey together

In [19]:
out_join = (unichem_molecule_df
            .join(inchikey_molecule_df, on='chembl_id', how='outer')
            )
out_join.show(2)
out_join.count()

                                                                                

+------------+--------------------+--------------+--------------------+--------------------+---------------+--------------------+--------------+--------------------+--------------------+------+
|   chembl_id|            InChIKey|          name|       linkedTargets|      linkedDiseases|pdb_compound_id|            InChIKey|          name|       linkedTargets|      linkedDiseases|CCD_ID|
+------------+--------------------+--------------+--------------------+--------------------+---------------+--------------------+--------------+--------------------+--------------------+------+
|CHEMBL100109|UCQIHCRMWNRFNP-QY...|  CHEMBL100109|                null|                null|            DI2|UCQIHCRMWNRFNP-QY...|  CHEMBL100109|                null|                null|   DI2|
|  CHEMBL1002|NDAUXUAQIAJITI-LB...|LEVOSALBUTAMOL|{[ENSG00000169252...|{[EFO_0000270, EF...|            68H|NDAUXUAQIAJITI-LB...|LEVOSALBUTAMOL|{[ENSG00000169252...|{[EFO_0000270, EF...|   68H|
+------------+----------------

                                                                                

3806

Improvement: Better to use both