In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

In [14]:
# Function to read SDF file and extract features
def read_sdf_file(file_path):
    suppl = Chem.SDMolSupplier(file_path)
    smiles_list = []
    for mol in suppl:
        if mol is not None:
            # Extract SMILES structure
            smiles = Chem.MolToSmiles(mol)
            smiles_list.append(smiles)
    return np.array(smiles_list) 

# Function to create DataLoader
def create_dataloader(data, batch_size=32):
    tensor_data = torch.tensor(data, dtype=torch.float32)
    dataset = TensorDataset(tensor_data)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

In [15]:
# Example usage
sdf_file_path = '../data/screening_data/screening.sdf'
data = read_sdf_file(sdf_file_path)

[09:57:53] Explicit valence for atom # 1 B, 5, is greater than permitted
[09:57:53] ERROR: Could not sanitize molecule ending on line 34094083
[09:57:53] ERROR: Explicit valence for atom # 1 B, 5, is greater than permitted
[09:58:05] Explicit valence for atom # 2 B, 5, is greater than permitted
[09:58:05] ERROR: Could not sanitize molecule ending on line 37735146
[09:58:05] ERROR: Explicit valence for atom # 2 B, 5, is greater than permitted
[09:58:05] Explicit valence for atom # 1 B, 5, is greater than permitted
[09:58:05] ERROR: Could not sanitize molecule ending on line 37844463
[09:58:05] ERROR: Explicit valence for atom # 1 B, 5, is greater than permitted
[09:59:29] Explicit valence for atom # 0 C, 6, is greater than permitted
[09:59:29] ERROR: Could not sanitize molecule ending on line 64433972
[09:59:29] ERROR: Explicit valence for atom # 0 C, 6, is greater than permitted
[09:59:35] Explicit valence for atom # 0 C, 6, is greater than permitted
[09:59:35] ERROR: Could not sanitiz

In [39]:
data

array(['CCCOc1ccc(C(O)(CC)C(CN2CCOCC2)c2ccccc2)cc1',
       'Cc1cccc(N2C(=O)C(Cl)=C(Nc3ccccc3O)C2=O)c1C',
       'O=C(Cc1cccs1)Nc1cccc(-c2nc3cc4ccccc4cc3[nH]2)c1', ...,
       'CCN1CCN(C(=O)c2cc(C(C)C)n[nH]2)CC1',
       'O=C(c1ccccc1)N1CC(c2ncco2)C2(CCN(CC3CCCCC3)CC2)C1',
       'Cc1ncc(C(=O)NCc2ccccc2)c(C2CCN(CCc3cccnc3)CC2)n1'], dtype='<U141')

In [26]:
def create_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [32]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the number of GPUs available
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
    
    # Print the name of each GPU
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    
    # Select a specific GPU (e.g., GPU 0)
    selected_gpu = 1
    device = torch.device(f"cuda:{selected_gpu}")
    print(f"Using GPU {selected_gpu}: {torch.cuda.get_device_name(selected_gpu)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

Number of GPUs available: 3
GPU 0: NVIDIA A100 80GB PCIe
GPU 1: NVIDIA A100 80GB PCIe
GPU 2: NVIDIA A100 80GB PCIe
Using GPU 1: NVIDIA A100 80GB PCIe


In [34]:
from transformers import BertTokenizerFast, BertModel
checkpoint = 'unikei/bert-base-smiles'
tokenizer_smiles = BertTokenizerFast.from_pretrained(checkpoint)
model_smiles = BertModel.from_pretrained(checkpoint)
model_smiles.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [43]:
def generate_embeddings(batch, tokenizer, model,device):
    batch_list = batch.tolist()
    inputs = tokenizer(batch_list, return_tensors='pt', padding=True, truncation=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 1:-1, :].mean(dim=1).cpu().numpy().tolist() 
    return embeddings

In [36]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

# Create a Parquet writer schema
schema = pa.schema([
    ('ID', pa.string()),
    ('encoding', pa.list_(pa.float64()))
])

# Universal function to write embeddings to Parquet
def write_embeddings_to_parquet(data, batch_size, parquet_file, generate_embedding_func, tokenizer, model, device):
    num_newlines = len(data)
    total_batches = (num_newlines + batch_size - 1) // batch_size

    writer = None

    for batch in tqdm(create_batches(data, batch_size), total=total_batches, desc="Processing batches"):
        embeddings = generate_embedding_func(batch,tokenizer, model, device)
        df = pd.DataFrame({'ID': batch, 'encoding': embeddings})
        
        table = pa.Table.from_pandas(df, schema=schema)
        
        if writer is None:
            writer = pq.ParquetWriter(parquet_file, schema)
        
        writer.write_table(table)
        
        # Clear CUDA cache to free up memory
        torch.cuda.empty_cache()

    if writer:
        writer.close()

In [45]:
batch_size = 256
parquet_file_smiles = '../data/screening_data/ligand_embeddings.parquet'

In [51]:
np.save('../data/screening_data/smiles.npy', data)

In [None]:
data

array(['CCCOc1ccc(C(O)(CC)C(CN2CCOCC2)c2ccccc2)cc1',
       'Cc1cccc(N2C(=O)C(Cl)=C(Nc3ccccc3O)C2=O)c1C',
       'O=C(Cc1cccs1)Nc1cccc(-c2nc3cc4ccccc4cc3[nH]2)c1', ...,
       'CCN1CCN(C(=O)c2cc(C(C)C)n[nH]2)CC1',
       'O=C(c1ccccc1)N1CC(c2ncco2)C2(CCN(CC3CCCCC3)CC2)C1',
       'Cc1ncc(C(=O)NCc2ccccc2)c(C2CCN(CCc3cccnc3)CC2)n1'], dtype='<U141')

In [47]:
# Write SMILES embeddings to Parquet
write_embeddings_to_parquet(data, batch_size, parquet_file_smiles, generate_embeddings, tokenizer_smiles, model_smiles, device)

Processing batches: 100%|██████████| 4961/4961 [17:50<00:00,  4.64it/s]


In [61]:
import pandas as pd

In [62]:
df = pd.read_parquet("../data/screening_data/ligand_embeddings.parquet")

In [63]:
df

Unnamed: 0,ID,encoding
0,CCCOc1ccc(C(O)(CC)C(CN2CCOCC2)c2ccccc2)cc1,"[0.4631887376308441, 1.1496385335922241, -0.16..."
1,Cc1cccc(N2C(=O)C(Cl)=C(Nc3ccccc3O)C2=O)c1C,"[0.3505532741546631, 0.6738343238830566, 0.027..."
2,O=C(Cc1cccs1)Nc1cccc(-c2nc3cc4ccccc4cc3[nH]2)c1,"[0.2411477416753769, -0.02009040117263794, -0...."
3,Cn1ncc(N2CCC(C(=O)Nc3cccc(-c4nc5ccccc5[nH]4)c3...,"[0.20003139972686768, -0.06338706612586975, -0..."
4,CCOC(=O)c1c(N2C(=O)C=CC2=O)sc2c1CCCC2,"[0.6468941569328308, 0.5003923773765564, 0.236..."
...,...,...
1269938,CCCN(C)c1ccc2ncc(=O)n(C)c2n1,"[0.6567842960357666, 0.25491511821746826, -0.2..."
1269939,CN(C)C(=O)c1cc(N(C)C)nc2ccccc12,"[0.9823178052902222, 0.6226070523262024, -0.19..."
1269940,CCN1CCN(C(=O)c2cc(C(C)C)n[nH]2)CC1,"[0.423491895198822, 0.19044129550457, 0.111522..."
1269941,O=C(c1ccccc1)N1CC(c2ncco2)C2(CCN(CC3CCCCC3)CC2)C1,"[0.567745566368103, 0.27047884464263916, -0.18..."


In [64]:
# To load the data back
loaded_data = np.load('../data/proteins/embeddings/6VKV_GAG_embeddings.npy')
print(len(loaded_data))

320
