In [165]:
# NOTE(review): `!` runs the command in a temporary subshell, so environment
# changes made by `module load` presumably do not carry over to the running
# kernel -- confirm; if so, load the CUDA module before launching Jupyter.
!module load cuda

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("./materials")
sys.path.append("./materials/models")
from materials.models import fm4m
from multimolecule import RnaTokenizer, RiNALMoModel
import torch

2024-12-04 00:33:24.746725: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-04 00:33:24.749718: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-04 00:33:24.760065: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-04 00:33:24.776868: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-04 00:33:24.781915: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-04 00:33:24.794214: I tensorflow/core/platform/cpu_feature_gu

# RNA embedding

In [120]:
# RNA sequence encoding function with batch processing and memory optimization
# max_length = general_df['rna_sequence'].apply(len).max()
def encode_rna_sequences(rna_sequences, batch_size=64, max_length=2048):
    """Embed RNA sequences with the pretrained RiNALMo model, batch by batch.

    Each sequence is represented by the hidden state at token position 0 of the
    model's ``last_hidden_state`` (the [CLS] position); the pooler output is
    not used.

    Args:
        rna_sequences: Sized, sliceable collection of RNA strings (pandas
            Series, NumPy array, list or tuple).
        batch_size: Number of sequences tokenized and run through the model at
            once. Must be a positive integer.
        max_length: Maximum tokenized length; longer inputs are truncated by
            the tokenizer.

    Returns:
        A float32 CPU tensor with one row per input sequence. For empty input
        an empty 1-D float32 tensor is returned without loading the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    num_sequences = len(rna_sequences)
    if num_sequences == 0:
        # Nothing to encode: skip loading the (large) pretrained model and keep
        # the shape the previous implementation produced for empty input.
        return torch.tensor([], dtype=torch.float32)

    # Run on the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the pretrained tokenizer and model
    rna_tokenizer = RnaTokenizer.from_pretrained('multimolecule/rinalmo')
    rna_model = RiNALMoModel.from_pretrained('multimolecule/rinalmo')
    # Loop-invariant setup: place the model once instead of on every batch.
    rna_model.to(device)
    rna_model.eval()

    num_batches = (num_sequences + batch_size - 1) // batch_size
    batch_embeddings = []
    for batch_number, start in enumerate(range(0, num_sequences, batch_size), start=1):
        batch_sequences = rna_sequences[start:start + batch_size]
        print(f"Processing batch {batch_number}/{num_batches}")

        # The tokenizer is given a plain Python list; Series/ndarray slices
        # provide tolist(), lists and tuples are copied with list().
        if hasattr(batch_sequences, 'tolist'):
            batch_list = batch_sequences.tolist()
        else:
            batch_list = list(batch_sequences)

        # Tokenize the RNA sequences in the batch
        rna_input = rna_tokenizer(batch_list, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        rna_input = {key: val.to(device) for key, val in rna_input.items()}

        # Get model output and extract [CLS] token embedding
        with torch.no_grad():  # Disable gradient computation to save memory
            rna_output = rna_model(**rna_input)
            cls_embeddings = rna_output['last_hidden_state'][:, 0, :]

        # Keep only a CPU copy so accelerator memory can be released per batch.
        batch_embeddings.append(cls_embeddings.detach().cpu().to(torch.float32))

        # Clear cache to avoid memory overflow
        if device.type == 'cuda':
            torch.cuda.empty_cache()

    # Concatenate the per-batch tensors directly rather than building a tensor
    # from a Python list of NumPy arrays.
    return torch.cat(batch_embeddings, dim=0)

# SMILES embedding

In [121]:
# The accepted model_type names can be listed with fm4m.avail_models()
def encode_smiles(ligand_smiles, model_type='SMI-TED'):
    """Embed ligand SMILES strings with an fm4m model.

    Args:
        ligand_smiles: Object exposing ``.values`` (e.g. a pandas Series) that
            holds the SMILES strings.
        model_type: Name of the fm4m representation model to use.

    Returns:
        A float32 tensor built from the representation fm4m returns.
    """
    representations = fm4m.get_representation_x(
        [*ligand_smiles.values],
        model_type=model_type,
        return_tensor=False,
    )
    # Go through a NumPy array so the tensor is created from one contiguous block.
    return torch.tensor(np.array(representations), dtype=torch.float32)

# General dataset

In [122]:
# Load the general RNA/ligand index. keep_default_na=False stops pandas from
# converting blank cells and literal tokens such as "NA" into NaN, so those
# fields stay as plain strings.
general_df = pd.read_csv('general_dataset/general_processed_index.csv', keep_default_na=False)

In [123]:
# Display the loaded table as a quick visual check of its columns and rows.
general_df

Unnamed: 0,pdb_id,ligand_id,ligand_chain,rna_chain,rna_sequence,ligand_smiles
0,1aju,ARG,B,A,GGCCAGAUUGAGCCUGGGAGCUCUCUGGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...
1,1akx,ARG,B,A,GGCCAGAUUGAGCCUGGGAGCUCUCUGGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...
2,1am0,AMP,B,A,GGGUUGGGAAGAAACUGUGGCACUUCGGUGCCAGCAACCC,[H]O[C@@]1([H])[C@@]([H])(O[H])[C@]([H])(N2C([...
3,1arj,ARG,B,N,GGCAGAUCUGAGCCUGGGAGCUCUCUGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...
4,1eht,TEP,B,A,GGCGAUACCAGCCGAAAGGCCCUUGGCAGCGUC,[H]C1NC2C(C(O)N(C([H])([H])[H])C(O)N2C([H])([H...
...,...,...,...,...,...,...
1385,6yl5,SAH,,K,GGUCACAACGGCUUCCUGGCGUGACCAUUGGAGCA,[H]O[C@@]1([H])[C@@]([H])(O[H])[C@]([H])(N2C([...
1386,6ymi,AMP,Z,O,GGUCACAACGGCUUCCUGGCGUGACC,NC1NCNC2C1NCN2[C@@H]1O[C@H](CO[PH](O)(O)O)[C@@...
1387,6ymj,ADN,AA,O,GGUCACAACGGCUUCCUGGCGUGACC,NC1NCNC2C1NCN2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O
1388,7tql,5GP,,3,AGCAGAGUGGCGCAGCGGAAGCGUGCUGGGCCCAUAACCCAGAGGU...,NC1NC(O)C2NCN([C@@H]3O[C@H](COP(O)O)[C@@H](O)[...


In [124]:
# List the representation models fm4m provides; presumably the "Model Name"
# values are what encode_smiles accepts as model_type (e.g. 'SMI-TED').
fm4m.avail_models()

Unnamed: 0,Model Name,Description
0,SMI-TED,SMILES based encoder decoder model
1,SELFIES-TED,BART model for string based SELFIES modality
2,MolFormer,MolFormer model for string based SMILES modality
3,MHG-GED,Molecular hypergraph model
4,Mordred,Baseline: A descriptor-calculation software ap...
5,MorganFingerprint,Baseline: Circular atom environments based des...


In [157]:
# Embed every ligand SMILES string and store one list of floats per row.
# Tensor.tolist() yields built-in Python floats rather than NumPy scalars, so
# the text written when the frame is saved to CSV does not depend on how the
# installed NumPy version prints its scalar types.
smiles_embeddings = encode_smiles(general_df['ligand_smiles'], model_type='SMI-TED')
general_df['smiles_embedding'] = [embedding.tolist() for embedding in smiles_embeddings]

In [156]:
# Embed every RNA sequence and store one list of floats per row.
# Tensor.tolist() yields built-in Python floats rather than NumPy scalars, so
# the text written when the frame is saved to CSV does not depend on how the
# installed NumPy version prints its scalar types.
rna_embeddings = encode_rna_sequences(general_df['rna_sequence'])
general_df['rna_embedding'] = [embedding.tolist() for embedding in rna_embeddings]

In [138]:
# Display the table again to check the newly added embedding columns.
general_df

Unnamed: 0,pdb_id,ligand_id,ligand_chain,rna_chain,rna_sequence,ligand_smiles,smiles_embedding,rna_embedding
0,1aju,ARG,B,A,GGCCAGAUUGAGCCUGGGAGCUCUCUGGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...,"[0.4606024, -0.552954, 0.07159625, 0.42867935,...","[0.19807585, 0.038253188, -1.0105137, -1.19123..."
1,1akx,ARG,B,A,GGCCAGAUUGAGCCUGGGAGCUCUCUGGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...,"[0.4606024, -0.552954, 0.07159625, 0.42867935,...","[0.19807585, 0.038253188, -1.0105137, -1.19123..."
2,1am0,AMP,B,A,GGGUUGGGAAGAAACUGUGGCACUUCGGUGCCAGCAACCC,[H]O[C@@]1([H])[C@@]([H])(O[H])[C@]([H])(N2C([...,"[0.3783685, -0.48514688, 0.08168459, 0.4056929...","[0.2500691, 0.100058, -1.3738682, -1.2410955, ..."
3,1arj,ARG,B,N,GGCAGAUCUGAGCCUGGGAGCUCUCUGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...,"[0.47757724, -0.5242162, 0.095895864, 0.414529...","[0.14929794, 0.073322795, -0.97397256, -1.3375..."
4,1eht,TEP,B,A,GGCGAUACCAGCCGAAAGGCCCUUGGCAGCGUC,[H]C1NC2C(C(O)N(C([H])([H])[H])C(O)N2C([H])([H...,"[0.41855708, -0.53773266, 0.03623496, 0.435665...","[0.113713376, 0.1409429, -1.1955042, -1.564314..."
...,...,...,...,...,...,...,...,...
1385,6yl5,SAH,,K,GGUCACAACGGCUUCCUGGCGUGACCAUUGGAGCA,[H]O[C@@]1([H])[C@@]([H])(O[H])[C@]([H])(N2C([...,"[0.39510193, -0.4880833, 0.06998497, 0.4085014...","[0.26429752, 0.32983544, -1.1726136, -1.339479..."
1386,6ymi,AMP,Z,O,GGUCACAACGGCUUCCUGGCGUGACC,NC1NCNC2C1NCN2[C@@H]1O[C@H](CO[PH](O)(O)O)[C@@...,"[0.3783685, -0.48514688, 0.08168459, 0.4056929...","[0.2047549, 0.40685564, -1.0418745, -1.4673102..."
1387,6ymj,ADN,AA,O,GGUCACAACGGCUUCCUGGCGUGACC,NC1NCNC2C1NCN2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,"[0.39568582, -0.5359654, 0.06406295, 0.4215734...","[0.2047549, 0.40685564, -1.0418745, -1.4673102..."
1388,7tql,5GP,,3,AGCAGAGUGGCGCAGCGGAAGCGUGCUGGGCCCAUAACCCAGAGGU...,NC1NC(O)C2NCN([C@@H]3O[C@H](COP(O)O)[C@@H](O)[...,"[0.38205087, -0.4702403, 0.08228299, 0.4623242...","[0.36936635, 0.4023114, -0.7774876, -1.8904905..."


In [159]:
# Write the table, including the embedding columns, to CSV without the index.
# NOTE: CSV has no list type, so list-valued columns are stored as their string
# form and must be parsed back into numbers when the file is read again.
encoded_data_path = 'general_dataset/general_processed_index_encoded.csv'
general_df.to_csv(encoded_data_path, index=False)

# PDBbind dataset

In [129]:
# Load the PDBbind RNA index. keep_default_na=False stops pandas from
# converting blank cells and literal tokens such as "NA" into NaN, so those
# fields stay as plain strings.
pdbbind_df = pd.read_csv('pdbbind_dataset_rna/pdbbind_rna_processed_index.csv', keep_default_na=False)

In [161]:
# Embed every ligand SMILES string and store one list of floats per row.
# Tensor.tolist() yields built-in Python floats rather than NumPy scalars, so
# the text written when the frame is saved to CSV does not depend on how the
# installed NumPy version prints its scalar types.
smiles_embeddings = encode_smiles(pdbbind_df['ligand_smiles'], model_type='SMI-TED')
pdbbind_df['smiles_embedding'] = [embedding.tolist() for embedding in smiles_embeddings]

  checkpoint = torch.load(ckpt_path, map_location=torch.device('cpu'))


Random Seed: 12345
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Using Rotation Embedding
Vocab size: 2393
[INFERENCE MODE - smi-ted-Light]


  return bound(*args, **kwds)
100%|██████████| 1/1 [00:04<00:00,  4.39s/it]


In [162]:
# Embed every RNA sequence and store one list of floats per row.
# Tensor.tolist() yields built-in Python floats rather than NumPy scalars, so
# the text written when the frame is saved to CSV does not depend on how the
# installed NumPy version prints its scalar types.
rna_embeddings = encode_rna_sequences(pdbbind_df['rna_sequence'])
pdbbind_df['rna_embedding'] = [embedding.tolist() for embedding in rna_embeddings]

Some weights of RiNALMoModel were not initialized from the model checkpoint at multimolecule/rinalmo and are newly initialized: ['rinalmo.pooler.dense.bias', 'rinalmo.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing batch 1/2
Processing batch 2/2


In [163]:
# Write the table, including the embedding columns, to CSV without the index.
# NOTE: CSV has no list type, so list-valued columns are stored as their string
# form and must be parsed back into numbers when the file is read again.
encoded_data_path = 'pdbbind_dataset_rna/pdbbind_rna_processed_index_encoded.csv'
pdbbind_df.to_csv(encoded_data_path, index=False)

In [168]:
# Inspect the 'set' column, which labels each row as train, valid or test.
pdbbind_df['set']

0      valid
1      train
2       test
3      train
4      valid
       ...  
113    train
114    train
115     test
116    train
117     test
Name: set, Length: 118, dtype: object