# Imports

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from IPython.display import display
from python_functions import check_dir

# Directories

In [2]:
# Folder holding the downloaded Open Targets parquet datasets (relative path)
rawdata = "data/raw/opentargets/"
# Folder the candidate tables are read from and the result tables are written to
processeddata = "data/processed/"
# check_dir is a project helper imported from python_functions;
# presumably it verifies/creates the directory -- TODO confirm
check_dir(rawdata)

# Load Datasets

In [3]:
# Candidate tables (one per association sign) stored in the processed-data folder
positive = pd.read_csv(f"{processeddata}bctPcand.csv")
negative = pd.read_csv(f"{processeddata}bctNcand.csv")
# Preview both tables; the last expression is rendered by the notebook itself
display(positive.head())
negative.head()

Unnamed: 0,gene,nint,nsig,z,npos,nneg,posfrac,enriched,posstrict,pos,...,SAneg,type,DAsign,SAsign,driver,canonical,oncogene,tsgene,references,opentargets
0,AATF,79,18,4.460348,18,0,1.0,1,2,8,...,0,BCT,positive,positive,0,0,0,0,https://doi.org/10.1038/s41467-025-60228-z; ht...,Not Target
1,ADRM1,50,11,5.342295,11,0,1.0,1,2,8,...,0,BCT,positive,positive,0,0,0,0,https://doi.org/10.1002/kjm2.12298,Target (Cancer)
2,AP2S1,31,8,4.646239,8,0,1.0,1,3,8,...,0,BCT,positive,positive,0,0,0,0,0,Not Target
3,ARL14,25,7,4.857009,7,0,1.0,1,2,7,...,0,BCT,positive,positive,0,0,0,0,https://doi.org/10.3389/fcell.2019.00238,Not Target
4,ASNS,26,7,4.158855,6,1,0.857143,1,3,8,...,0,BCT,positive,positive,0,0,0,0,https://doi.org/10.1038/s41419-022-05015-0,Not Target


Unnamed: 0,gene,nint,nsig,z,npos,nneg,posfrac,enriched,posstrict,pos,...,SAneg,type,DAsign,SAsign,driver,canonical,oncogene,tsgene,references,opentargets
0,AHCYL1,69,20,7.102729,0,20,0.0,1,0,2,...,2,BCT,negative,negative,0,0,0,0,https://doi.org/10.1186/s13062-023-00364-y; ht...,Not Target
1,AP3S2,10,4,4.828624,0,4,0.0,1,0,3,...,2,BCT,negative,negative,0,0,0,0,0,Not Target
2,ARL3,9,4,4.14,0,4,0.0,1,0,5,...,2,BCT,negative,negative,0,0,0,0,0,Not Target
3,C3orf18,22,7,4.625119,0,7,0.0,1,0,6,...,2,BCT,negative,negative,0,0,0,0,0,Not Target
4,CLCC1,32,9,3.941381,0,9,0.0,1,0,4,...,2,BCT,negative,negative,0,0,0,0,0,Not Target


In [4]:
# Unique gene symbols present in either candidate table
all_candidates = positive.gene.tolist()
all_candidates.extend(negative.gene.tolist())
genes = list(set(all_candidates))
print(len(genes))

144


# Open Targets Search

## Load Database

In [5]:
# One sub-folder of the raw Open Targets directory per dataset
targetsPath = f"{rawdata}targets"
moleculePath = f"{rawdata}molecule"
associationPath = f"{rawdata}associationByOverallDirect"
diseasePath = f"{rawdata}diseases"

In [6]:
# Start (or reuse) a local Spark session that may use every available core
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/07/26 16:18:40 WARN Utils: Your hostname, thinkpad-x1, resolves to a loopback address: 127.0.1.1; using 192.168.1.179 instead (on interface wlan0)
25/07/26 16:18:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/26 16:18:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Open the four parquet datasets as Spark DataFrames
targets, diseases, molecules, association = (
    spark.read.parquet(dataset_path)
    for dataset_path in (targetsPath, diseasePath, moleculePath, associationPath)
)

                                                                                

### Targets

In [8]:
# Print the column names and types of the targets table to decide which fields to keep
targets.printSchema()

root
 |-- id: string (nullable = true)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- canonicalTranscript: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: string (nullable = true)
 |-- canonicalExons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = tru

In [9]:
# Keep only the target identifier and its approved symbol
targetSelect = targets.select("id", "approvedSymbol")
targetSelect.show(5)

                                                                                

+---------------+--------------+
|             id|approvedSymbol|
+---------------+--------------+
|ENSG00000002586|          CD99|
|ENSG00000015479|         MATR3|
|ENSG00000037280|          FLT4|
|ENSG00000038427|          VCAN|
|ENSG00000050730|         TNIP3|
+---------------+--------------+
only showing top 5 rows


In [10]:
# Convert to pandas, switch to the column names shared by the later merges
# and keep only the candidate genes
target_df = (
    targetSelect
    .toPandas()
    .rename(columns={"id": "ensembl_id", "approvedSymbol": "gene_id"})
    .loc[lambda frame: frame["gene_id"].isin(genes)]
)
print(len(target_df))
target_df.head(2)

                                                                                

144


Unnamed: 0,ensembl_id,gene_id
1052,ENSG00000138160,KIF11
2057,ENSG00000091428,RAPGEF4


In [11]:
# Check whether any "ensembl_id" occurs more than once (0 means no duplicates)
print(target_df["ensembl_id"].duplicated().sum())

0


### Diseases

In [12]:
# Print the column names and types of the diseases table to decide which fields to keep
diseases.printSchema()

root
 |-- id: string (nullable = true)
 |-- code: string (nullable = true)
 |-- dbXRefs: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- name: string (nullable = true)
 |-- directLocationIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- obsoleteTerms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- parents: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: struct (nullable = true)
 |    |-- hasBroadSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasExactSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasNarrowSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- hasRelatedSynonym: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- ancestors: array (null

In [13]:
# Keep the disease identifier, its name and the list of its ancestor terms
diseaseSelect = diseases.select("id", "name", "ancestors")
diseaseSelect.show(5)

+-----------+--------------------+--------------------+
|         id|                name|           ancestors|
+-----------+--------------------+--------------------+
|EFO_0000255|angioimmunoblasti...|[MONDO_0000430, O...|
|EFO_0000508|    genetic disorder|      [OTAR_0000018]|
|EFO_0001054|             leprosy|[EFO_0009387, OTA...|
|EFO_0004287|ventricular fibri...|[EFO_0003777, EFO...|
|EFO_0004302|anthropometric me...|       [EFO_0001444]|
+-----------+--------------------+--------------------+
only showing top 5 rows


[MONDO_0004992](https://platform.opentargets.org/disease/MONDO_0004992) is the main identifier for cancer. Since this *Disease Category* includes all cancer types among its descendants (i.e. it appears in their `ancestors` field), we will use it to filter targets and drugs.

In [14]:
# One row per (disease, ancestor) pair, then keep the diseases that list
# MONDO_0004992 among their ancestors
disease_df = (
    diseaseSelect
    .toPandas()
    .explode(column="ancestors")
    .loc[
        lambda frame: frame["ancestors"].eq("MONDO_0004992"),
        ["id", "name"],
    ]
    .rename(columns={"id": "disease_id", "name": "disease_name"})
)
print(len(disease_df))
disease_df.head(2)

                                                                                

1715


Unnamed: 0,disease_id,disease_name
74,MONDO_0002759,bladder verrucous carcinoma
77,MONDO_0003086,thymic mucoepidermoid carcinoma


In [15]:
# Add MONDO_0004992 itself to the list of its descendants, so that rows
# annotated directly with the root term are also matched when filtering
# for cancer-related diseases.
disease_df = (
    pd.concat([
        disease_df,
        (
            diseaseSelect
            # Pick the single root row in Spark instead of converting the
            # whole disease table to pandas a second time
            .filter(diseaseSelect["id"] == "MONDO_0004992")
            .select("id", "name")
            .toPandas()
            .rename(columns={"id": "disease_id", "name": "disease_name"})
        )
    ])
    # Keeps the cell idempotent: re-running it must not append the root
    # term again (duplicates would multiply rows in the later merges)
    .drop_duplicates(subset="disease_id")
)

### Evidence

In [16]:
# Print the column names and types of the association table (disease, target, score, evidence count)
association.printSchema()

root
 |-- diseaseId: string (nullable = true)
 |-- targetId: string (nullable = true)
 |-- score: double (nullable = true)
 |-- evidenceCount: long (nullable = true)



In [17]:
# Keep the target, the disease and the association score
associationSelect = association.select("targetId", "diseaseId", "score")
associationSelect.show(5)

+---------------+-----------+--------------------+
|       targetId|  diseaseId|               score|
+---------------+-----------+--------------------+
|ENSG00000004399|EFO_0000580|0.005174117965585...|
|ENSG00000012048|EFO_0000580|0.017054967553679457|
|ENSG00000026025|EFO_0000580|0.001478319418738...|
|ENSG00000026103|EFO_0000580|0.029566388374776147|
|ENSG00000035862|EFO_0000580|0.007391597093694037|
+---------------+-----------+--------------------+
only showing top 5 rows


In [18]:
# Associations between the candidate genes and cancer-related diseases.
association_df = (
    associationSelect
    # Restrict to the candidate targets inside Spark, so only the relevant
    # rows are collected into pandas instead of the whole association table.
    # The inner merge with target_df below would discard the other rows anyway.
    .filter(associationSelect["targetId"].isin(target_df.ensembl_id.tolist()))
    .toPandas()
    .rename(columns={
        "targetId": "ensembl_id",
        "diseaseId": "disease_id"
    })
    # Inner joins with explicit keys: keep cancer-related diseases only,
    # then attach the gene symbol of each target
    .merge(disease_df, on="disease_id")
    .merge(target_df, on="ensembl_id")
    [["gene_id", "disease_id", "disease_name", "score"]]
)
print(len(association_df))
association_df.head(2)

                                                                                

6990


Unnamed: 0,gene_id,disease_id,disease_name,score
0,AURKA,EFO_0000580,medullary breast carcinoma,0.003696
1,TRIM28,EFO_0000580,medullary breast carcinoma,0.003696


In [19]:
# Persist the gene-disease associations without the pandas index column
association_df.to_csv(f"{processeddata}target_disease_association.csv", index=False)

### Molecules

In [20]:
# Print the column names and types of the molecule table to decide which fields to keep
molecules.printSchema()

root
 |-- id: string (nullable = true)
 |-- canonicalSmiles: string (nullable = true)
 |-- inchiKey: string (nullable = true)
 |-- drugType: string (nullable = true)
 |-- name: string (nullable = true)
 |-- yearOfFirstApproval: long (nullable = true)
 |-- maximumClinicalTrialPhase: double (nullable = true)
 |-- parentId: string (nullable = true)
 |-- hasBeenWithdrawn: boolean (nullable = true)
 |-- isApproved: boolean (nullable = true)
 |-- tradeNames: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- synonyms: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- crossReferences: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- childChemblIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- linkedDiseases: struct (nullable = true)
 |    |-- rows: array (nullable = true)
 |    |    |-- element: string (cont

In [21]:
# Keep the drug identity, its approval / clinical-phase information and the
# targets and diseases it is linked to
moleculeSelect = molecules.select(
    "id", "name", "isApproved",
    "linkedTargets", "linkedDiseases",
    "maximumClinicalTrialPhase", "drugType",
)
moleculeSelect.show(2)

+-------------+-------------+----------+--------------------+--------------------+-------------------------+--------------+
|           id|         name|isApproved|       linkedTargets|      linkedDiseases|maximumClinicalTrialPhase|      drugType|
+-------------+-------------+----------+--------------------+--------------------+-------------------------+--------------+
|CHEMBL1086582|CHEMBL1086582|      NULL|                NULL|                NULL|                     NULL|Small molecule|
|CHEMBL1173055|    RUCAPARIB|      true|{[ENSG00000143799...|{[EFO_0003060, MO...|                      4.0|Small molecule|
+-------------+-------------+----------+--------------------+--------------------+-------------------------+--------------+
only showing top 2 rows


In [22]:
# Long-format table with one row per (drug, target, disease) combination,
# restricted to drugs that target one of the candidate genes
molecule_df = (
    moleculeSelect
    .toPandas()
    # Drugs lacking either linked targets or linked diseases are discarded
    .dropna(subset=["linkedTargets", "linkedDiseases"])
    # Pull the list stored under 'rows' out of each linked* struct
    .assign(
        ensembl_id=lambda frame: frame["linkedTargets"].apply(
            lambda linked: linked.asDict()["rows"]),
        disease_id=lambda frame: frame["linkedDiseases"].apply(
            lambda linked: linked.asDict()["rows"]),
    )
    .drop(columns=["linkedTargets", "linkedDiseases"])
    # One row per target, then one row per disease
    .explode("ensembl_id")
    .explode("disease_id")
    .rename(columns={"id": "drug", "name": "drug_name"})
    # Inner join on the shared ensembl_id column adds the gene symbol
    .merge(target_df)
)
print(len(molecule_df))
molecule_df.head()

                                                                                

5469


Unnamed: 0,drug,drug_name,isApproved,maximumClinicalTrialPhase,drugType,ensembl_id,disease_id,gene_id
0,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,EFO_0003869,PIK3R1
1,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,MONDO_0007254,PIK3R1
2,CHEMBL1234354,PF-04691502,False,2.0,Small molecule,ENSG00000145675,MONDO_0004992,PIK3R1
3,CHEMBL1683544,ERIBULIN MESYLATE,True,4.0,Small molecule,ENSG00000196230,EFO_0003060,TUBB
4,CHEMBL1683544,ERIBULIN MESYLATE,True,4.0,Small molecule,ENSG00000196230,EFO_0000616,TUBB


In [23]:
# Keep only the drug-target rows whose disease is cancer-related
# (inner join with disease_df on disease_id)
cancer_associated_drugs = pd.merge(molecule_df, disease_df, on="disease_id")

## Drug Targets

In [24]:
# For every gene, count the distinct drugs that target it (n_drugs) and the
# distinct drugs among them that are linked to a cancer-related disease
# (n_cancerdrugs). Genes without cancer-linked drugs get 0 through the left join.
drugtargets = pd.merge(
    molecule_df.groupby("gene_id")["drug"].nunique().rename("n_drugs"),
    cancer_associated_drugs.groupby("gene_id")["drug"].nunique().rename("n_cancerdrugs"),
    left_index=True, right_index=True, how="left"
).fillna(0).astype(int)

# One summary record per candidate table; turned into a DataFrame once at the end
stats_records = []
for label, df in zip(["bctPcand", "bctNcand"], [positive, negative]):
    # Deliberately not named `targets`: that name holds the Spark targets table
    # read earlier in the notebook, and overwriting it breaks the cells that
    # use it when they are re-run.
    annotated = (
        df
        .merge(drugtargets, left_on="gene", right_index=True, how="left")
        # Genes absent from drugtargets get NaN counts; 0 means "no known drug".
        # NOTE(review): this fills missing values in every column, not only
        # the two count columns -- confirm that this is intended.
        .fillna(0)
        .assign(
            drugtarget=lambda x: x.n_drugs.astype(bool),
            cancer_drugtarget=lambda x: (
                x.n_drugs.astype(bool) & x.n_cancerdrugs.astype(bool)
            ),
            # 0 = no drug, 1 = drug(s) but none cancer-linked, 2 = cancer-linked drug
            opentargets=lambda x: (
                x.drugtarget.astype(int) + x.cancer_drugtarget.astype(int)
            ).map({
                0: "Not Target",
                1: "Target (Not Cancer)",
                2: "Target (Cancer)"
            })
        )
        # Helper columns are only needed to derive `opentargets`
        .drop(columns=["n_drugs", "n_cancerdrugs", "drugtarget", "cancer_drugtarget"])
    )
    stats_records.append({
        "type": label,
        "targets": len(annotated[annotated.opentargets!="Not Target"]),
        "cancer targets": len(annotated[annotated.opentargets=="Target (Cancer)"]),
    })
    # Overwrites the candidate table on disk with the `opentargets` annotation
    annotated.to_csv(processeddata+f"{label}.csv", index=False)
stats = pd.DataFrame(stats_records)
display(stats)

Unnamed: 0,type,targets,cancer targets
0,bctPcand,25,16
1,bctNcand,3,2


In [25]:
# Genes targeted only by drugs that have no link to cancer: these drugs are
# candidates for drug repurposing.
# Excluded disease ids: the cancer term with its descendants (disease_df) plus
# the neoplasm id EFO_0000616. Built once here instead of once per gene inside
# the filter callback.
cancer_disease_ids = disease_df.disease_id.tolist() + ["EFO_0000616"]
drug_repurposing = (
    molecule_df
    .groupby("gene_id")
    # Keep a gene only when none of its drug-disease links is in the excluded set
    .filter(lambda gene_rows: gene_rows.disease_id.isin(cancer_disease_ids).sum() == 0)
    # Number of distinct drugs for each remaining gene
    .groupby("gene_id")["drug"]
    .nunique()
)
print(len(drug_repurposing))

10


In [26]:
# Persist the drug-target-disease table without the pandas index column
molecule_df.to_csv(f"{processeddata}molecule_target_disease_association.csv", index=False)

## Summary

In [27]:
# Reload the candidate tables from disk so that they include the
# `opentargets` column written in the "Drug Targets" section
positive = pd.read_csv(f"{processeddata}bctPcand.csv")
negative = pd.read_csv(f"{processeddata}bctNcand.csv")

In [28]:
# Boolean masks reused by the counts below (summing a mask counts its True rows)
pos_nondriver = ~positive.driver.astype(bool)
neg_nondriver = ~negative.driver.astype(bool)
pos_is_target = positive.opentargets.ne("Not Target")
neg_is_target = negative.opentargets.ne("Not Target")
pos_cancer = positive.opentargets.eq("Target (Cancer)")
neg_cancer = negative.opentargets.eq("Target (Cancer)")
pos_noncancer = positive.opentargets.eq("Target (Not Cancer)")
neg_noncancer = negative.opentargets.eq("Target (Not Cancer)")

print("# positive non-driver neighbours which are known drug targets: ",
      (pos_nondriver & pos_is_target).sum())
print("# negative non-driver neighbours which are known drug targets : ",
      (neg_nondriver & neg_is_target).sum())
print("# positive neighbours which are targeted by cancer drugs: ",
      pos_cancer.sum())
print("# negative neighbours which are targeted by cancer drugs: ",
      neg_cancer.sum())
print("# positive neighbours which are targeted by non-cancer drugs: ",
      pos_noncancer.sum())
print("# negative neighbours which are targeted by non-cancer drugs: ",
      neg_noncancer.sum())
# Gene symbols behind the two non-cancer counts
print(positive.loc[pos_noncancer, "gene"].tolist())
print(negative.loc[neg_noncancer, "gene"].tolist())

# positive non-driver neighbours which are known drug targets:  18
# negative non-driver neighbours which are known drug targets :  1
# positive neighbours which are targeted by cancer drugs:  16
# negative neighbours which are targeted by cancer drugs:  2
# positive neighbours which are targeted by non-cancer drugs:  9
# negative neighbours which are targeted by non-cancer drugs:  1
['E2F1', 'LAMA3', 'MIF', 'RPL28', 'RPL35', 'RPL7A', 'RPLP0', 'RPS15A', 'RPS21']
['CX3CL1']


We found that 28 of our candidate neighbours (25 neighbours enriched in positive driver and survival associations and 3 enriched in negative driver and survival associations; Supplementary Tables 1 and 2) are known drug targets. Of these, 18 (16 positive and 2 negative) are targeted by drugs developed for cancer treatment, confirming the therapeutic potential of neighbours enriched in driver associations. Interestingly, the remaining 10 (9 positive and 1 negative) are targeted only by drugs for non-cancer diseases, suggesting an opportunity for drug repurposing.