In [10]:
import polars as pl
df = pl.read_parquet('../data/BindingDB_predprocessed/BindingDB_v0.parquet')

In [11]:
dataic50 = df[["Ligand SMILES","IC50 (nM)","BindingDB Target Chain Sequence"]].drop_nulls() 

In [12]:
dataic50 

Ligand SMILES,IC50 (nM),BindingDB Target Chain Sequence
str,f64,str
"""NS(=O)(=O)c1ccc(Nc2cc(OCC3CCCC…",29000.0,"""MSGRPRTTSFAESCKPVQQPSAFGSMKVSR…"
"""NS(=O)(=O)c1ccc(Nc2cc(OC3CCCCC…",190.0,"""MSGRPRTTSFAESCKPVQQPSAFGSMKVSR…"
"""NS(=O)(=O)c1ccc(Nc2cc(NC3CCCCC…",970.0,"""MSGRPRTTSFAESCKPVQQPSAFGSMKVSR…"
"""CCN(CC)c1cc(Nc2ccc(cc2)S(N)(=O…",11000.0,"""MSGRPRTTSFAESCKPVQQPSAFGSMKVSR…"
"""N[C@H]1CC[C@@H](CC1)Nc1cc(Nc2c…",780.0,"""MSGRPRTTSFAESCKPVQQPSAFGSMKVSR…"
…,…,…
"""O[C@@H]1CCCN(C1)C(=O)c1cccc(c1…",90.0,"""MSSWIRWHGPAMARLWGFCWLVVGFWRAAF…"
"""O[C@H]1CCCN(C1)C(=O)c1cccc(c1)…",118.0,"""MSSWIRWHGPAMARLWGFCWLVVGFWRAAF…"
"""COc1nc2ccc(Br)cc2cc1[C@@H](c1c…",1600.0,"""MPVRRGHVAPQNTFLDTIIRKFEGQSRKFI…"
"""COc1ccc(cc1)N(C)c1nc(C)nc2[nH]…",2600.0,"""CVSASPSTLARLVSRSAMPAGSSTAWNTAF…"


In [13]:
ligand_smiles = dataic50['Ligand SMILES'].to_list()
target_chain_sequence = dataic50['BindingDB Target Chain Sequence'].to_list()

In [14]:
def create_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [17]:
from transformers import BertTokenizerFast, BertModel
checkpoint = 'unikei/bert-base-smiles'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)
smiles_tokenizer = BertModel.from_pretrained(checkpoint)

In [18]:
from transformers import BertModel, BertTokenizer
tokenizer_rostlab = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False )
model_rostlab = BertModel.from_pretrained("Rostlab/prot_bert")

In [8]:
import re
# Function to generate embeddings for SMILES
def generate_embedding_smiles(smile):
    tokens = tokenizer(smile, return_tensors='pt')
    embedding = smiles_tokenizer(**tokens)
    return embedding[0].detach().numpy().flatten().tolist() 

# Function to generate embeddings for protein sequences
def generate_embedding_protein(sequence):
    sequence_Example = re.sub(r"[UZOB]", "X", sequence)
    encoded_input = tokenizer_rostlab(sequence_Example, return_tensors='pt')
    embedding = model_rostlab(**encoded_input)
    return embedding[0].detach().numpy().flatten().tolist() 

In [15]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
# Create a Parquet writer schema

schema = pa.schema([
    ('ID', pa.string()),
    ('encoding', pa.list_(pa.float64()))
])

# Universal function to write embeddings to Parquet
def write_embeddings_to_parquet(data, batch_size, parquet_file, generate_embedding_func):
    num_newlines = len(data)
    total_batches = (num_newlines + batch_size - 1) // batch_size

    writer = None

    for batch in tqdm(create_batches(data, batch_size), total=total_batches, desc="Processing batches"):
        embeddings = [generate_embedding_func(item) for item in batch]
        df = pd.DataFrame({'ID': batch, 'encoding': embeddings})
        
        table = pa.Table.from_pandas(df, schema=schema)
        
        if writer is None:
            writer = pq.ParquetWriter(parquet_file, schema)
        
        writer.write_table(table)
        break

    if writer:
        writer.close()

In [7]:
batch_size = 512
parquet_file_smiles = '../data/embeddings/ligand_embeddings.parquet'
parquet_file_protein = '../data/embeddings/protein_embeddings.parquet'

In [19]:
# Write SMILES embeddings to Parquet
write_embeddings_to_parquet(ligand_smiles, batch_size, parquet_file_smiles, generate_embedding_smiles)

# Write protein sequence embeddings to Parquet
write_embeddings_to_parquet(target_chain_sequence, batch_size, parquet_file_protein, generate_embedding_protein)

Processing batches:   0%|          | 0/3297 [00:00<?, ?it/s]

Processing batches:   0%|          | 0/3297 [00:54<?, ?it/s]
Processing batches:   0%|          | 0/3297 [01:31<?, ?it/s]
