In [1]:
import fastparquet
import polars as pl
import pandas as pd
from tqdm import tqdm
import numpy as np

pd.set_option('display.max_columns', 200)  # Show all columns
pd.set_option('display.max_rows', 100)     # Show all rows
pd.set_option('display.max_colwidth', 100) # Show full column width
pd.set_option('display.width', 1000)   

In [2]:
columns = [
    "Ligand SMILES",
    "IC50 (nM)",
    "Number of Protein Chains in Target (>1 implies a multichain complex)",
    "BindingDB Target Chain Sequence",
    "Ki (nM)",
    "Kd (nM)"
]

In [3]:
CHUNK_SIZE = 10_000

# Create an iterator for the chunks
chunk_iterator = pd.read_csv(
    '../data/BindingDB/BindingDB_All.tsv',
    sep='\t',
    on_bad_lines='skip', 
    usecols=columns,
    encoding_errors='ignore',
    chunksize=CHUNK_SIZE
)

In [5]:
file_exists = False

TOTAL_LINES = 2923144

# Wrap the chunk_iterator with tqdm to add a progress bar
for chunk in tqdm(chunk_iterator, desc="Processing chunks", total=TOTAL_LINES // CHUNK_SIZE):
    
    # Preprocess the chunk
    
    # Skip rows where 'Number of Protein Chains in Target (>1 implies a multichain complex)' > 1
    chunk = chunk[chunk['Number of Protein Chains in Target (>1 implies a multichain complex)'] <= 1]
    
    # Exclude the 'Number of Protein Chains in Target (>1 implies a multichain complex)' column
    chunk = chunk.drop(columns=['Number of Protein Chains in Target (>1 implies a multichain complex)'])
    
    # Ensure the columns are treated as strings before using the .str accessor
    chunk.loc[:, 'IC50 (nM)'] = chunk['IC50 (nM)'].astype(str)
    chunk.loc[:, 'Ki (nM)'] = chunk['Ki (nM)'].astype(str)
    chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
    
    # Convert the columns to numeric, handling non-numeric values
    chunk.loc[:, 'IC50 (nM)'] = pd.to_numeric(chunk['IC50 (nM)'].str.replace('>', ''), errors='coerce')
    chunk.loc[:, 'Ki (nM)'] = pd.to_numeric(chunk['Ki (nM)'].str.replace('>', ''), errors='coerce')
    chunk.loc[:, 'Kd (nM)'] = pd.to_numeric(chunk['Kd (nM)'].str.replace('>', ''), errors='coerce')
    
    # Write the chunk to a Parquet file
    if not file_exists:
        chunk.to_parquet('../data/BindingDB_predprocessed/BindingDB_v0.parquet', index=False, engine='fastparquet')
        file_exists = True
    else:
        chunk.to_parquet('../data/BindingDB_predprocessed/BindingDB_v0.parquet', index=False, engine='fastparquet', append=True)

  for obj in iterable:
  for obj in iterable:
  for obj in iterable:
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Ki (nM)'] = chunk['Ki (nM)'].astype(str)
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Ki (nM)'] = chunk['Ki (nM)'].astype(str)
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Ki (nM)'] = chunk['Ki (nM)'].astype(str)
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  for obj in iterable:
  chunk.loc[:, 'Kd (nM)'] = chunk['Kd (nM)'].astype(str)
  for obj in iterable:
  chunk.loc[:, 'K

In [6]:
df = pl.read_parquet('../data/BindingDB_predprocessed/BindingDB_v0.parquet')

In [8]:
df

Ligand SMILES,Ki (nM),IC50 (nM),Kd (nM),BindingDB Target Chain Sequence
str,f64,f64,f64,str
"""ONS(=O)(=O)C(F)(F)C(F)(F)C(F)(…",19000.0,,,"""MEALMARGALTGPLRALCLLGCLLSHAAAA…"
"""ONS(=O)(=O)C(F)(F)C(F)(F)C(F)(…",14000.0,,,"""MEALMARGALTGPLRALCLLGCLLSHAAAA…"
"""CN(C)S(=O)(=O)NO""",100000.0,,,"""MEALMARGALTGPLRALCLLGCLLSHAAAA…"
"""ONS(=O)(=O)Cc1ccccc1""",73000.0,,,"""MEALMARGALTGPLRALCLLGCLLSHAAAA…"
"""ONS(=O)(=O)c1ccccc1""",70000.0,,,"""MEALMARGALTGPLRALCLLGCLLSHAAAA…"
…,…,…,…,…
"""CC[C@H]1CN(Cc2cc(C)cc(CC(O)=O)…",,,,"""MGETLGDSPVDPEHGAFADALPMSTSQEIT…"
"""CC[C@@H]1CN(Cc2cc(C)cc(CC(O)=O…",,,,"""MGETLGDSPVDPEHGAFADALPMSTSQEIT…"
"""CC(C)[C@@H]1CN(Cc2cc(C)cc(CC(O…",,,,"""MGETLGDSPVDPEHGAFADALPMSTSQEIT…"
"""COc1ccc(cc1)N(C)c1nc(C)nc2[nH]…",,2600.0,,"""CVSASPSTLARLVSRSAMPAGSSTAWNTAF…"


In [7]:
df.describe()

statistic,Ligand SMILES,Ki (nM),IC50 (nM),Kd (nM),BindingDB Target Chain Sequence
str,str,f64,f64,f64,str
"""count""","""2753719""",547810.0,1687796.0,102479.0,"""2753719"""
"""null_count""","""0""",2205909.0,1065923.0,2651240.0,"""0"""
"""mean""",,365470000.0,670800000.0,243886.725605,
"""std""",,191070000000.0,244100000000.0,13934000.0,
"""min""","""B.CP(c1ccccc1)c1ccc(O)cc1""",0.0,0.0,0.0,"""AAACPRGQGRTLVSGLIYYITGSSKTNTEE…"
"""25%""",,9.4,28.0,60.0,
"""50%""",,111.0,411.0,2500.0,
"""75%""",,1670.0,6700.0,10000.0,
"""max""","""n1nc2c([nH]1)c1nn[nH]c1c1nn[nH…",100000000000000.0,100000000000000.0,3492000000.0,"""sfldnhkkltprrdvptypkyllspetiea…"


In [9]:
print("mollen: " + str(len(set(df["Ligand SMILES"]))))
print("seqlen: " + str(len(set(df["BindingDB Target Chain Sequence"]))))

mollen: 1199936
seqlen: 8583


In [10]:
dataki = df[["Ligand SMILES","Ki (nM)","BindingDB Target Chain Sequence"]].drop_nulls() # remove data without ki and kd value
datakd = df[["Ligand SMILES","Kd (nM)","BindingDB Target Chain Sequence"]].drop_nulls() # remove data without ki and kd value
dataic50 = df[["Ligand SMILES","IC50 (nM)","BindingDB Target Chain Sequence"]].drop_nulls() # remove data without ki and kd value
print("data have ki value: " + str(len(dataki)))
print("data have kd value: " + str(len(datakd)))
print("data have ic50 value: " + str(len(dataic50)))

data have ki value: 547810
data have kd value: 102479
data have ic50 value: 1687796
