In [43]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
from pyspark.sql import SparkSession
# Use the "vegafusion" data transformer for Altair charts in this session.
alt.data_transformers.enable("vegafusion")
# Set the default matplotlib figure resolution to 120 dpi.
plt.rcParams.update({"figure.dpi": 120})

# Functions

In [44]:
def check_dir(dir: str) -> None:
    """
    Create the directory "dir" (including missing parents) if it does not exist.

    Args:
        dir (str): Path to the directory.

    Raises:
        FileExistsError: If "dir" already exists but is not a directory.
    """
    # exist_ok=True folds the existence test into the creation call, so there
    # is no window between "check" and "create" in which another process could
    # create the directory and make os.makedirs() fail.
    os.makedirs(dir, exist_ok=True)

# Directories

In [45]:
# Folder holding the supplementary tables and the Open Targets datasets;
# it is created on first use.
datadir = "OpenTargets/"
check_dir(dir=datadir)

# Load Datasets

In [46]:
# Read both supplementary tables (comma-separated) from the data directory.
suptabel1, suptabel2 = (
    pd.read_csv(datadir + filename, sep=",")
    for filename in ("supptable1.csv", "supptable2.csv")
)

In [47]:
# Row counts of the two supplementary tables, one per line.
print(len(suptabel1), len(suptabel2), sep="\n")

273
43


In [48]:
# Keep only the first row for every "Group.1" value in each table,
# then report the remaining row counts (one per line).
suptabel1, suptabel2 = (
    table.drop_duplicates(subset="Group.1")
    for table in (suptabel1, suptabel2)
)
print(len(suptabel1), len(suptabel2), sep="\n")

247
39


In [49]:
# Stack the two tables row-wise: rows of the first table, then rows of the second.
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# both inputs keep their own labels, so the same label would point at two rows.
supptable = pd.concat([suptabel1, suptabel2], axis=0, ignore_index=True)
print(len(supptable))

286


In [50]:
# Expose the "Group.1" column under the name "gene_id", the key used by later merges.
supptable = supptable.rename({"Group.1": "gene_id"}, axis="columns")
print(supptable.head(5))

  gene_id   x    n       bin        sd      prob           pbn         z  pos  \
0   ACTR3  18   66  3.633663  2.816817  0.055056  1.142413e-08  5.100203   18   
1   ANXA2  20  109  6.009901  3.694577  0.055137  2.057292e-06  3.786658   20   
2   ASF1B   9   22  1.316832  1.303309  0.059856  2.394384e-06  5.895125    9   
3  ATAD3A  18   64  3.534653  2.528100  0.055229  7.014642e-09  5.721826   18   
4  ATP1B3  10   27  1.376238  1.247810  0.050972  4.470500e-07  6.911118    9   

   neg  ...          survsig     psurvsig         nsurvsig  survsign  \
0    0  ...      significant  significant  non significant  positive   
1    0  ...      significant  significant  non significant  positive   
2    0  ...      significant  significant  non significant  positive   
3    0  ...      significant  significant  non significant  positive   
4    1  ...  non significant  significant  non significant  positive   

   survposfrac  kdriver  canonical  oncogene  tsg  type  
0     0.875000        

## Open Targets Search

### Load Database

In [51]:
# Paths of the Open Targets datasets inside the data directory.
targetsPath = f"{datadir}targets"
moleculePath = f"{datadir}molecule"
associationPath = f"{datadir}associationByOverallDirect"
diseasePath = f"{datadir}diseases"

In [52]:
# Get the active Spark session, or start one with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [53]:
# Open the four parquet datasets as Spark DataFrames.
targets, diseases, molecules, association = (
    spark.read.parquet(path)
    for path in (targetsPath, diseasePath, moleculePath, associationPath)
)

### Targets

In [54]:
# Browse the targets schema (column names, types and nullability)
targets.printSchema()

root
 |-- id: string (nullable = true)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- canonicalTranscript: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: string (nullable = true)
 |-- canonicalExons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = tru

In [55]:
# Keep only the target identifier and its approved gene symbol, then preview 5 rows.
targetSelect = targets.select("id", "approvedSymbol")
targetSelect.show(5)

+---------------+--------------+
|             id|approvedSymbol|
+---------------+--------------+
|ENSG00000002586|          CD99|
|ENSG00000015479|         MATR3|
|ENSG00000037280|          FLT4|
|ENSG00000038427|          VCAN|
|ENSG00000050730|         TNIP3|
+---------------+--------------+
only showing top 5 rows



In [56]:
# Attach the Ensembl id to every supplementary-table gene by matching the
# approved symbol (renamed to "gene_id") against supptable's "gene_id".
target_df = pd.merge(
    targetSelect.toPandas().rename(
        columns={"id": "ensembl_id", "approvedSymbol": "gene_id"}
    ),
    supptable,
    how="inner",
    on="gene_id",
)
print(len(target_df))
target_df.head(2)

286


Unnamed: 0,ensembl_id,gene_id,x,n,bin,sd,prob,pbn,z,pos,...,survsig,psurvsig,nsurvsig,survsign,survposfrac,kdriver,canonical,oncogene,tsg,type
0,ENSG00000109805,NCAPG,14,50,2.90099,2.184056,0.05802,6.252813e-07,5.081835,14,...,significant,significant,non significant,positive,0.9,1,0,0,0,BCT
1,ENSG00000138160,KIF11,20,54,3.663366,3.383126,0.06784,1.423412e-10,4.828857,19,...,significant,significant,non significant,positive,0.75,0,0,0,0,BCT


In [57]:
# Count repeated "ensembl_id" values (0 means every Ensembl id appears once)
print(target_df["ensembl_id"].duplicated().sum())

0


### Diseases

In [59]:
# Browse the disease schema (column names, types and nullability)
diseases.printSchema()

root
 |-- id: string (nullable = true)
 |-- code: string (nullable = true)
 |-- dbXRefs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- name: string (nullable = true)
 |-- directLocationIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- obsoleteTerms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: struct (nullable = true)
 |    |-- hasBroadSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasExactSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasNarrowSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasRelatedSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- ancestors: array (null

In [60]:
# Keep the disease identifier, its name and its list of ancestor terms, then preview 5 rows.
diseaseSelect = diseases.select("id", "name", "ancestors")
diseaseSelect.show(5)

+-----------+--------------------+--------------------+
|         id|                name|           ancestors|
+-----------+--------------------+--------------------+
|EFO_0000255|angioimmunoblasti...|[MONDO_0000430, O...|
|EFO_0000508|    genetic disorder|      [OTAR_0000018]|
|EFO_0001054|             leprosy|[EFO_0009387, OTA...|
|EFO_0004287|ventricular fibri...|[EFO_0003777, EFO...|
|EFO_0004302|anthropometric me...|       [EFO_0001444]|
+-----------+--------------------+--------------------+
only showing top 5 rows



[MONDO_0004992](https://platform.opentargets.org/disease/MONDO_0004992) is the main identifier for cancer. Since this *Disease Category* includes all cancer types among its `descendants`, we will use it to filter targets and drugs.

In [61]:
# Diseases that list MONDO_0004992 among their ancestors.
disease_df = (
    diseaseSelect
    .toPandas()
    # One row per (disease, ancestor) pair so ancestors can be compared directly.
    .explode("ancestors")
    .loc[
        lambda exploded: exploded["ancestors"].eq("MONDO_0004992"),
        ["id", "name"],
    ]
    # "id" -> "disease_id", "name" -> "disease_name"
    .rename(columns=lambda col: f"disease_{col}")
)
print(len(disease_df))
disease_df.head(2)

1715


Unnamed: 0,disease_id,disease_name
74,MONDO_0002759,bladder verrucous carcinoma
77,MONDO_0003086,thymic mucoepidermoid carcinoma


In [81]:
# Add MONDO_0004992 itself to disease_df: the frame built above only contains
# diseases that have MONDO_0004992 as an ancestor, so without this row entries
# annotated directly with the root term would be dropped by later merges on
# "disease_id".
disease_df = (
    pd.concat([
        disease_df,
        diseaseSelect
        .toPandas()
        .loc[lambda d: d["id"] == "MONDO_0004992", ["id", "name"]]
        .rename(columns={"id": "disease_id", "name": "disease_name"}),
    ])
    # Keep the cell idempotent: re-running it must not append the root term a
    # second time, because a repeated disease_id multiplies the matching rows
    # in every subsequent merge.
    .drop_duplicates(subset="disease_id")
)

### Evidence

In [62]:
# Browse the association schema (column names, types and nullability)
association.printSchema()

root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- score: double (nullable = true)
 |-- evidenceCount: long (nullable = true)



In [63]:
# Keep the target id, the disease id and the association score, then preview 5 rows.
associationSelect = association.select("targetId", "diseaseId", "score")
associationSelect.show(5)

+---------------+-----------+--------------------+
|       targetId|  diseaseId|               score|
+---------------+-----------+--------------------+
|ENSG00000004399|EFO_0000580|0.005174117965585...|
|ENSG00000012048|EFO_0000580|0.017054967553679457|
|ENSG00000026025|EFO_0000580|0.001478319418738...|
|ENSG00000026103|EFO_0000580|0.029566388374776147|
|ENSG00000035862|EFO_0000580|0.007391597093694037|
+---------------+-----------+--------------------+
only showing top 5 rows



In [64]:
# Associations restricted to diseases present in disease_df and to the genes
# present in target_df. The join keys are spelled out: with the default
# (every column name shared by both frames) a newly added common column would
# silently become part of the join condition.
association_df = (
    associationSelect
    .toPandas()
    .rename(columns={
        "targetId": "ensembl_id",
        "diseaseId": "disease_id"
    })
    .merge(disease_df, on="disease_id", how="inner")
    .merge(target_df, on="ensembl_id", how="inner")
    [["gene_id", "disease_id", "disease_name", "score"]]
)
print(len(association_df))
association_df.head(2)

                                                                                

16509


Unnamed: 0,gene_id,disease_id,disease_name,score
0,BRCA1,EFO_0000580,medullary breast carcinoma,0.017055
1,MCM6,EFO_0000580,medullary breast carcinoma,0.003696


In [None]:
# Write the gene-disease associations to the data directory (row index omitted).
association_df.to_csv(f"{datadir}target_disease_association.csv", index=False)

### Molecules

In [65]:
# Browse the molecule schema (column names, types and nullability)
molecules.printSchema()

root
 |-- id: string (nullable = true)
 |-- canonicalSmiles: string (nullable = true)
 |-- inchiKey: string (nullable = true)
 |-- drugType: string (nullable = true)
 |-- name: string (nullable = true)
 |-- yearOfFirstApproval: long (nullable = true)
 |-- maximumClinicalTrialPhase: double (nullable = true)
 |-- parentId: string (nullable = true)
 |-- hasBeenWithdrawn: boolean (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- tradeNames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- crossReferences: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- childChemblIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- linkedDiseases: struct (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (cont

In [66]:
# Keep the drug identity, its approval / clinical-phase fields, its type and
# the linked target and disease structs, then preview 2 rows.
moleculeSelect = molecules.select(
    "id",
    "name",
    "isApproved",
    "linkedTargets",
    "linkedDiseases",
    "maximumClinicalTrialPhase",
    "drugType",
)
moleculeSelect.show(2)

+-------------+-------------+----------+--------------------+--------------------+-------------------------+--------------+
|           id|         name|isApproved|       linkedTargets|      linkedDiseases|maximumClinicalTrialPhase|      drugType|
+-------------+-------------+----------+--------------------+--------------------+-------------------------+--------------+
|CHEMBL1086582|CHEMBL1086582|      NULL|                NULL|                NULL|                     NULL|Small molecule|
|CHEMBL1173055|    RUCAPARIB|      true|{[ENSG00000143799...|{[EFO_0003060, MO...|                      4.0|Small molecule|
+-------------+-------------+----------+--------------------+--------------------+-------------------------+--------------+
only showing top 2 rows



In [67]:
# Long table of drugs linked to the genes in target_df.
molecule_df = (
    moleculeSelect
    .toPandas()
    # Drugs lacking either linked targets or linked diseases cannot be used.
    .dropna(subset=["linkedTargets", "linkedDiseases"])
    # Both linked* columns hold struct values; their "rows" field is the list of ids.
    .assign(ensembl_id=lambda df: df.linkedTargets.apply(
        lambda linked: linked.asDict()["rows"]))
    .assign(disease_id=lambda df: df.linkedDiseases.apply(
        lambda linked: linked.asDict()["rows"]))
    .drop(columns=["linkedTargets", "linkedDiseases"])
    # Exploding both list columns in turn yields one row per
    # (drug, linked target, linked disease) combination.
    .explode("ensembl_id")
    .explode("disease_id")
    .rename(columns={
        "id": "drug",
        "name": "drug_name"
    })
    # Explicit key: do not rely on "whatever columns the two frames share".
    .merge(target_df, on="ensembl_id", how="inner")
)
print(len(molecule_df))
molecule_df.head()

8919


Unnamed: 0,drug,drug_name,isApproved,maximumClinicalTrialPhase,drugType,ensembl_id,disease_id,gene_id,x,n,...,survsig,psurvsig,nsurvsig,survsign,survposfrac,kdriver,canonical,oncogene,tsg,type
0,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,EFO_0003869,PIK3R1,56,168,...,non significant,non significant,significant,negative,0.0,1,1,0,1,BCT
1,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,MONDO_0007254,PIK3R1,56,168,...,non significant,non significant,significant,negative,0.0,1,1,0,1,BCT
2,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,MONDO_0004992,PIK3R1,56,168,...,non significant,non significant,significant,negative,0.0,1,1,0,1,BCT
3,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000117461,EFO_0003869,PIK3R3,15,75,...,significant,non significant,significant,negative,0.25,1,0,0,0,BCT
4,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000117461,MONDO_0007254,PIK3R3,15,75,...,significant,non significant,significant,negative,0.25,1,0,0,0,BCT


In [83]:
# Keep the drug rows whose linked disease appears in disease_df, adding its name.
cancer_associated_drugs = pd.merge(
    molecule_df,
    disease_df,
    how="inner",
    on="disease_id",
)

In [84]:
# Display the drug rows whose linked disease is listed in disease_df
cancer_associated_drugs

Unnamed: 0,drug,drug_name,isApproved,maximumClinicalTrialPhase,drugType,ensembl_id,disease_id,gene_id,x,n,...,psurvsig,nsurvsig,survsign,survposfrac,kdriver,canonical,oncogene,tsg,type,disease_name
0,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,MONDO_0007254,PIK3R1,56,168,...,non significant,significant,negative,0.000000,1,1,0,1,BCT,breast cancer
1,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,MONDO_0004992,PIK3R1,56,168,...,non significant,significant,negative,0.000000,1,1,0,1,BCT,cancer
2,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000117461,MONDO_0007254,PIK3R3,15,75,...,non significant,significant,negative,0.250000,1,0,0,0,BCT,breast cancer
3,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000117461,MONDO_0004992,PIK3R3,15,75,...,non significant,significant,negative,0.250000,1,0,0,0,BCT,cancer
4,CHEMBL1683544,ERIBULIN MESYLATE,True,4.0,Small molecule,ENSG00000196230,EFO_0003060,TUBB,25,155,...,significant,non significant,positive,0.666667,0,0,0,0,BCT,non-small cell lung carcinoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4867,CHEMBL451887,CARFILZOMIB,True,4.0,Protein,ENSG00000108671,MONDO_0008315,PSMD11,19,58,...,significant,non significant,positive,0.900000,0,0,0,0,BCT,prostate cancer
4868,CHEMBL451887,CARFILZOMIB,True,4.0,Protein,ENSG00000108671,MONDO_0002367,PSMD11,19,58,...,significant,non significant,positive,0.900000,0,0,0,0,BCT,kidney cancer
4869,CHEMBL451887,CARFILZOMIB,True,4.0,Protein,ENSG00000108671,EFO_0000702,PSMD11,19,58,...,significant,non significant,positive,0.900000,0,0,0,0,BCT,small cell lung carcinoma
4870,CHEMBL451887,CARFILZOMIB,True,4.0,Protein,ENSG00000108671,EFO_0000222,PSMD11,19,58,...,significant,non significant,positive,0.900000,0,0,0,0,BCT,acute myeloid leukemia


### Final Considerations

In [74]:
# Distinct genes that appear in molecule_df, followed by how many there are.
print(pd.unique(molecule_df["gene_id"]))
print(len(pd.unique(molecule_df["gene_id"])))

['PIK3R1' 'PIK3R3' 'TUBB' 'TUBA1B' 'TUBA1C' 'CHEK1' 'TOP2A' 'TYMS' 'CDK4'
 'EZH2' 'PSMD14' 'PSMA1' 'PSMA7' 'PSMC2' 'PSMC4' 'PSMD11' 'CDK1' 'CDK2'
 'AURKB' 'AURKA' 'RRM2' 'POLD1' 'POLE' 'POLA2' 'P4HB' 'KIF11' 'E2F1'
 'E2F2' 'CRBN' 'ATP1B3' 'PLK1' 'VAMP2' 'TFRC' 'TTK' 'GMPS' 'PLK4' 'COL6A3'
 'LAMB3']
38


We found that 38 of our selected neighbours (34 neighbours enriched in positive driver and survival associations and 4 enriched in negative driver and survival associations, supplementary tables 1 and 2) are known drug targets. 

In [88]:
# Number of cancer-associated drug rows per gene, most frequent first.
cancer_associated_drugs["gene_id"].value_counts()


gene_id
TUBB      678
TUBA1B    672
TUBA1C    672
TOP2A     441
TYMS      289
RRM2      250
POLD1     246
POLA2     246
POLE      246
CDK4      162
CRBN       84
PSMA1      78
PSMA7      78
PSMD14     74
PSMC2      74
PSMC4      74
PSMD11     74
PIK3R1     71
PIK3R3     71
CDK2       64
CHEK1      49
AURKA      42
AURKB      33
CDK1       29
KIF11      23
PLK1       17
EZH2       16
ATP1B3      7
PLK4        6
TFRC        4
TTK         2
Name: count, dtype: int64

Of these 38 targets, 31 were targeted by drugs developed for cancer treatment, confirming the therapeutic potential of neighbours enriched in driver associations.

In [86]:
# Genes present in molecule_df but absent from cancer_associated_drugs,
# with the number of molecule_df rows for each.
molecule_df.loc[
    ~molecule_df["gene_id"].isin(cancer_associated_drugs["gene_id"]),
    "gene_id",
].value_counts()

gene_id
COL6A3    21
P4HB      17
VAMP2      9
LAMB3      8
GMPS       3
E2F1       1
E2F2       1
Name: count, dtype: int64

In [82]:
# Genes none of whose drug rows is linked to a disease in disease_df or to
# EFO_0000616; these are the drug-repurposing candidates.
# The result is the number of distinct drugs per such gene.
drug_repurposing = (
    molecule_df
    .groupby("gene_id")
    # Retain a gene only when not a single one of its rows carries an excluded disease id.
    .filter(lambda gene_rows: not gene_rows["disease_id"].isin(
        [*disease_df["disease_id"], "EFO_0000616"]).any())
    .groupby("gene_id")["drug"]
    .nunique()
)
drug_repurposing.head(10)

gene_id
COL6A3    2
E2F1      1
E2F2      1
GMPS      1
LAMB3     1
P4HB      4
VAMP2     1
Name: drug, dtype: int64

Interestingly, 7 neighbours (_COL6A3_, _E2F1_, _E2F2_, _GMPS_, _LAMB3_, _P4HB_ and _VAMP2_) were targeted by drugs for non-cancer diseases, suggesting an opportunity for drug repurposing.

In [35]:
# Write the drug-target-disease table to the data directory (row index omitted).
molecule_df.to_csv(f"{datadir}molecule_target_disease_association.csv", index=False)