In [2]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
from tqdm import tqdm as tqdm

options = FirefoxOptions()
# options = Options()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

protac_df = pd.read_csv("protac.csv")
protac_ids = list(protac_df["Compound ID"])
warheads = [] # need to reverse smiles string
e3_ligands = [] # smiles string do not need reversing
linker_ids = []

warhead_ids = []
e3_ligand_ids = []

for p in tqdm(protac_ids):
    url = "http://cadd.zju.edu.cn/protacdb/compound/dataset=protac&id=" + str(p)
    driver.get(url)
    WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "e3_ligand")))
    content = driver.page_source
    soup = BeautifulSoup(content)
    
    # find id of linker and smiles strings of e3 ligands and warheads
    warhead = soup.find("img", {"id":"warhead"})
    e3_ligand = soup.find("img", {"id":"e3_ligand"})
    linker = soup.find("img", {"id":"linker"})
    warheads.append(warhead.get("alt"))
    e3_ligands.append(e3_ligand.get("alt"))
    if linker is not None:
        linker_ids.append(linker.get("alt"))
    else:
        linker_ids.append(0)
        
    # get id of warhead and e3 ligand
    buttons = driver.find_elements(By.CLASS_NAME, "detail_button")
    buttons[0].click()
    WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "tbMain_protac")))
    if "warhead" in driver.current_url:
        arr = driver.current_url.split("=")
        _id = arr[len(arr)-1]
        warhead_ids.append(_id)
    else:
        print("got the wrong url!!!")
    driver.back()
    WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "e3_ligand")))
    buttons = driver.find_elements(By.CLASS_NAME, "detail_button")
    buttons[len(buttons)-1].click()
    WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.ID, "tbMain_protac")))
    if "e3_ligand" in driver.current_url:
        arr = driver.current_url.split("=")
        _id = arr[len(arr)-1]
        e3_ligand_ids.append(_id)
    else:
        print("got the wrong url!!!")


for w, e, l, p, e_id, w_id in zip(warheads, e3_ligands, linker_ids, protac_ids, e3_ligand_ids, warhead_ids):
    print(w, " ", e, " ", l, " ", p, " ", e_id, " ", w_id)

100%|███████████████████████████████████████████████████████████████████████████| 5388/5388 [13:05:11<00:00,  8.74s/it]

[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C   1   1   12   1
[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C   1   1   12   1
[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C   2   2   12   1
[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C   2   2   12   1
[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]NC1=CC=CC2=C1C(=O)N(C2=O)C3CCC(=O)NC3=O   3   3   2   1
[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]NC1=CC=CC2=C1C(=O)N(C2=O)C3CCC(=O)NC3=O   4   4   2   1
[R1]N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC   [R2]NCC1=CC=C(C=C1)S(=O)(=O)NC2=CC=CC3=C2NC=C3Cl  




In [4]:
# process linkers
linker_df = pd.read_csv("linker.csv")
my_ids = list(linker_df["Compound ID"])
linker_smiles = list(linker_df["Smiles"])

linker_dictionary = {}
for ids, smiles in zip(my_ids, linker_smiles):
    linker_dictionary[ids] = smiles


no_linker_count = 0
the_linkers = []
for idx, i in enumerate(linker_ids):
    i = (int) (i)
    if i == 0: # some protacs do not have linkers
        print(idx)
        the_linkers.append("")
        no_linker_count += 1
    else:
        current_linker = linker_dictionary[i]
        the_linkers.append(current_linker)
        
print("protacs with no linkers: ", no_linker_count)

74
235
236
237
357
819
1034
1035
1886
2566
3972
4294
4298
4306
4307
5082
protacs with no linkers:  16


In [5]:
# process e3 ligands
the_e3_ligands = []
for e in e3_ligands:
    e = e[4:]
    if "[R2]" in e:
        e = e.replace("[R2]", "")
    the_e3_ligands.append(e)

for e in the_e3_ligands:
    print(e)

C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
NC1=CC=CC2=C1C(=O)N(C2=O)C3CCC(=O)NC3=O
NC1=CC=CC2=C1C(=O)N(C2=O)C3CCC(=O)NC3=O
NCC1=CC=C(C=C1)S(=O)(=O)NC2=CC=CC3=C2NC=C3Cl
NCC1=CC=C(C=C1)S(=O)(=O)NC2=CC=CC3=C2NC=C3Cl
NCC1=CC=C(C=C1)S(=O)(=O)NC2=CC=CC3=C2NC=C3Cl
NCC1=CC=C(C=C1)S(=O)(=O)NC2=CC=CC3=C2NC=C3Cl
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O)NCC2=CC=C(C=C2)C3=C(N=CS3)C)O)C(C)(C)C
C(=O)N[C@H](C(=O)N1C[C@@H](C[C@H]1C(=O

In [6]:
import requests as r
from Bio import SeqIO
from io import StringIO
import time

# process the protein target
def convert_uniprot_to_sequence(uniprot_list):
    sequences = []

    for prot in tqdm(uniprot_list):
        cID = prot
        baseUrl = "http://www.uniprot.org/uniprot/"
        currentUrl = baseUrl+cID+".fasta"
        response = r.post(currentUrl)
        cData = ''.join(response.text)
        cData = str(cData).split("\n")
        seq = ""
        for i in range(len(cData)):
            if i == 0:
                continue
            else:
                seq += cData[i]

        if seq == "The 'accession' value has invalid format. It should be a valid UniProtKB accession":
            sequences.append(prot)
        else:
            sequences.append(seq)
        time.sleep(0.1) 
        
    return np.array(sequences)

protac_df = pd.read_csv("protac.csv")
protein_target = list(protac_df["Uniprot"])
the_protein_targets = convert_uniprot_to_sequence(protein_target)
for p in the_protein_targets:
    print(p[:10])

100%|██████████████████████████████████████████████████████████████████████████████| 5388/5388 [35:46<00:00,  2.51it/s]

MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKA
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKA
MGKKHKKHKA
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKS
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MGKKHKKHKS
MGKKHKKHKA
MFGLKRNAVI
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT
MCNTNMSVPT




In [10]:
e3_targets = list(protac_df["E3 ligase"])
the_e3_targets = convert_uniprot_to_sequence(e3_targets)

100%|██████████████████████████████████████████████████████████████████████████████| 5388/5388 [35:27<00:00,  2.53it/s]


In [11]:
# process warheads
the_warheads = []
for w in warheads:
    corrected = w.replace("[R1]", "") 
    the_warheads.append(corrected)
    
for w in the_warheads:
    print(w)

N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=C4)C)OC
N1CCN(CC1)CC2=C(C=C(C=C2OC)C3=CN(C(=O)C4=C3C=CN=

In [12]:
import csv

with open('filtered_protacs.csv', 'w', newline='') as file:
    writer = csv.writer(file)

    writer.writerow(["Protac ID", "Warhead ID", "Linker ID", "E3 Ligand ID", "Target Protein", "E3 Target", "Warhead", "Linker", "E3 Ligand"])
    for i in range(len(protac_ids)):
        writer.writerow([protac_ids[i], warhead_ids[i], linker_ids[i], e3_ligand_ids[i], the_protein_targets[i], the_e3_targets[i], the_warheads[i], the_linkers[i], the_e3_ligands[i]])
        

In [15]:
for e in the_e3_targets:
    print(e)

VHL
VHL
VHL
VHL
CRBN
CRBN
DCAF15
DCAF15
DCAF15
DCAF15
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
VHL
VHL
VHL
CRBN
CRBN
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
VHL
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRBN
CRB

In [3]:
# import random
# # generate negative examples so we have 1:1 positive to negative ratio
# while len(negative_target_proteins) < len(target_protein):
#     index = random.randint(0,len(target_protein)-1)
#     current_target_protein = target_protein[index]
#     current_e3_target = e3_target[index]
#     current_warhead = warheads[index]
#     current_linker = linkers[index]
#     current_e3_ligand = e3_ligands[index]
    
#     idx = random.randint(0, len(unique_warheads)-1)
#     random_warhead = unique_warheads[idx]
#     idx = random.randint(0, len(unique_target_proteins)-1)
#     random_target_protein = unique_target_proteins[idx]
    
#     if current_target_protein != random_target_protein and current_warhead != random_warhead:
#         negative_target_proteins.append(random_target_protein) #  change target protein
#         negative_e3_targets.append(current_e3_target)
#         negative_warheads.append(random_warhead) # change warhead
#         negative_linkers.append(current_linker)
#         negative_e3_ligands.append(current_e3_ligand)
        
# print(len(negative_target_proteins))

5379


In [4]:
# import csv

# with open('filtered_protacs.csv', 'a', newline='') as file:
#     writer = csv.writer(file)

#     # writer.writerow(["Target Protein", "E3 Target", "Warhead", "Linker", "E3 Ligand", "Labels"])
#     for i in range(len(negative_target_proteins)):
#         writer.writerow([negative_target_proteins[i], negative_e3_targets[i], negative_warheads[i], negative_linkers[i], negative_e3_ligands[i],-1.0])
        

In [5]:
# # shuffle rows of csv file
# df = pd.read_csv("filtered_protacs.csv")
# shuffled_df = df.sample(frac=1)

In [6]:
# # save changes
# shuffled_df.to_csv("filtered_protacs.csv")