In [7]:
import numpy as np
import torch
from scipy.spatial import distance_matrix
import torch.nn.functional as F

# RNA-ligand structure pair distance embedding

In [8]:
from Bio.PDB import PDBParser, MMCIFParser
def load_structure(filepath):
    """
    Load every atom's coordinates from a PDB or mmCIF file.

    The parser is picked from the file extension, compared case-insensitively:
    ``.cif`` / ``.mmcif`` files are read with ``MMCIFParser``; everything else is
    read with ``PDBParser``.

    Args:
        filepath (str or path-like): Path to the PDB or CIF file.

    Returns:
        np.ndarray: float32 array of shape ``[num_atoms, 3]``, one row per atom in
        model -> chain -> residue -> atom traversal order. Atoms of *all* models in
        the file are included. A structure without atoms yields shape ``[0, 3]``.
    """
    # str() lets pathlib.Path objects through; lower() makes ".CIF" match as well.
    if str(filepath).lower().endswith(('.cif', '.mmcif')):
        parser = MMCIFParser(QUIET=True)
    else:
        parser = PDBParser(QUIET=True)

    structure = parser.get_structure('structure', filepath)

    # get_atoms() walks models, chains, residues and atoms in the same order as the
    # explicit four-level loop it replaces.
    coordinates = [atom.coord for atom in structure.get_atoms()]

    # reshape(-1, 3) keeps the result 2-D even when no atoms were found.
    return np.array(coordinates, dtype=np.float32).reshape(-1, 3)

In [9]:
import numpy as np
from scipy.spatial import distance_matrix
import torch

def _as_coordinate_array(pos, label):
    """Return ``pos`` as a 2-D NumPy array, accepting tensors, arrays or nested lists."""
    if isinstance(pos, torch.Tensor):
        # A bare .numpy() raises for tensors that require grad or live on an
        # accelerator; detach() and cpu() are no-ops for plain CPU tensors.
        pos = pos.detach().cpu().numpy()
    coords = np.asarray(pos)
    if coords.ndim != 2:
        raise ValueError(
            f"{label} must be a 2-D array of shape [num_atoms, 3], got shape {coords.shape}"
        )
    return coords


def get_combined_distance_matrix(mol_pos, pocket_pos):
    """
    Build one symmetric Euclidean distance matrix over pocket and molecule atoms.

    Pocket atoms come first and molecule atoms last, so the result has the block
    layout::

        [ pocket-pocket | pocket-mol ]
        [ mol-pocket    | mol-mol    ]

    Args:
        mol_pos (torch.Tensor or np.ndarray): Coordinates of the molecule (shape: [num_mol_atoms, 3]).
        pocket_pos (torch.Tensor or np.ndarray): Coordinates of the pocket (shape: [num_pocket_atoms, 3]).

    Returns:
        torch.Tensor: float32 CPU tensor of shape
        [(num_pocket_atoms + num_mol_atoms), (num_pocket_atoms + num_mol_atoms)].

    Raises:
        ValueError: If an input is not 2-D or the two inputs have a different
            number of coordinate columns.
    """
    mol_coords = _as_coordinate_array(mol_pos, "mol_pos")
    pocket_coords = _as_coordinate_array(pocket_pos, "pocket_pos")
    if mol_coords.shape[1] != pocket_coords.shape[1]:
        raise ValueError(
            "mol_pos and pocket_pos must have the same number of coordinate columns, "
            f"got {mol_coords.shape[1]} and {pocket_coords.shape[1]}"
        )

    # Stacking pocket atoms ahead of molecule atoms and taking all pairwise distances
    # once yields exactly the four blocks described in the docstring.
    all_coords = np.concatenate([pocket_coords, mol_coords], axis=0)
    combined_matrix = distance_matrix(all_coords, all_coords).astype(np.float32)

    return torch.from_numpy(combined_matrix)


In [10]:
# Example usage: build the combined matrix for a single ligand/pocket pair
# (entry 1aju, chain B, ligand ARG).
ligand_structure = load_structure(
    '/home/tzutang.lin/blue_ufhpc/CoPRA/CoRSA/general_dataset/parsed_ligands_pdb/1aju_B_ARG.pdb'
)
pocket_structure = load_structure(
    '/home/tzutang.lin/blue_ufhpc/CoPRA/CoRSA/general_dataset/pockets_6A_/1aju_B_ARG_pocket.cif'
)

# View both coordinate sets as [num_atoms, 3] before handing them over.
mol_pos, pocket_pos = (
    ligand_structure.reshape(-1, 3),
    pocket_structure.reshape(-1, 3),
)

# Pocket atoms and ligand atoms end up in one square distance matrix.
combined_dist_matrix = get_combined_distance_matrix(mol_pos, pocket_pos)

# General dataset

In [11]:
import pandas as pd

# keep_default_na=False stops pandas from turning fields such as an empty
# ligand_chain into NaN, so every value stays usable as text in the file names below.
encoded_df = pd.read_csv('general_dataset/general_processed_index_encoded.csv', keep_default_na=False)

# Define paths for ligands and pockets
ligand_path = '/home/tzutang.lin/blue_ufhpc/CoPRA/CoRSA/general_dataset/parsed_ligands_pdb/'
pocket_path = '/home/tzutang.lin/blue_ufhpc/CoPRA/CoRSA/general_dataset/pockets_6A_/'

# One combined pocket+ligand distance matrix per successfully processed row
combined_distance_matrices = []

# Index labels of rows that produced no matrix. Without this record a skipped row
# silently shifts every later matrix relative to encoded_df.
skipped_rows = []

# Largest matrix side seen so far
max_size = 0
for idx, row in encoded_df.iterrows():
    pdb_id = row['pdb_id']
    ligand_id = row['ligand_id']
    ligand_chain = row['ligand_chain']

    # File names for this dataset are <pdb_id>_<ligand_chain>_<ligand_id>
    ligand_file = f"{ligand_path}{pdb_id}_{ligand_chain}_{ligand_id}.pdb"
    pocket_file = f"{pocket_path}{pdb_id}_{ligand_chain}_{ligand_id}_pocket.cif"

    # Load structures; a missing file skips the row instead of aborting the run
    try:
        ligand_structure = load_structure(ligand_file)
        pocket_structure = load_structure(pocket_file)
    except FileNotFoundError as e:
        print(f"Error loading files for {pdb_id}: {e}")
        skipped_rows.append(idx)
        continue

    # Reshape and calculate combined distance matrix
    mol_pos = ligand_structure.reshape(-1, 3)
    pocket_pos = pocket_structure.reshape(-1, 3)
    combined_dist_matrix = get_combined_distance_matrix(mol_pos, pocket_pos)

    # Ensure combined_dist_matrix is a 2D tensor
    if combined_dist_matrix.ndim == 2:
        combined_distance_matrices.append(combined_dist_matrix)
        max_size = max(max_size, combined_dist_matrix.shape[0])
    else:
        print(f"Unexpected shape for combined distance matrix for {pdb_id}, skipping.")
        skipped_rows.append(idx)

# Make any misalignment visible; encoded_df.drop(index=skipped_rows) gives the rows
# that correspond one-to-one with combined_distance_matrices.
if skipped_rows:
    print(f"Skipped {len(skipped_rows)} of {len(encoded_df)} rows: "
          "combined_distance_matrices is not aligned with encoded_df (see skipped_rows).")

In [12]:
def get_max_matrix_size(matrices):
    """Return the largest first-dimension length found among ``matrices``."""
    row_counts = [m.shape[0] for m in matrices]
    return max(row_counts)

# Zero-pad every matrix to a common square size and stack them into one 3-D tensor.
if not combined_distance_matrices:
    raise ValueError("No valid combined distance matrices found.")

max_size = get_max_matrix_size(combined_distance_matrices)
matrix_data = torch.stack([
    F.pad(
        # torch.tensor(existing_tensor) copy-constructs and triggers a UserWarning;
        # as_tensor reuses a float32 tensor as-is and still converts other inputs.
        torch.as_tensor(matrix, dtype=torch.float32),
        # F.pad consumes the LAST dimension first: (left, right, top, bottom).
        # Columns (shape[1]) therefore belong to the first pair, rows (shape[0]) to the second.
        (0, max_size - matrix.shape[1], 0, max_size - matrix.shape[0]),
        value=0,
    )
    for matrix in combined_distance_matrices
])

# Convert the tensor to a NumPy array
matrix_data_np = matrix_data.cpu().numpy()

# Save matrix data to an .npy file
np.save('general_dataset/combined_distance_matrices.npy', matrix_data_np)

# Print shape to verify
print(f"Combined Distance Matrices Shape: {matrix_data.shape}")



  matrix_data = torch.stack([F.pad(torch.tensor(matrix, dtype=torch.float32),


Combined Distance Matrices Shape: torch.Size([1390, 621, 621])


In [13]:
# Display the general-dataset index table that was read from
# 'general_dataset/general_processed_index_encoded.csv' (notebook rich display).
encoded_df

Unnamed: 0,pdb_id,ligand_id,ligand_chain,rna_chain,rna_sequence,ligand_smiles,smiles_embedding,rna_embedding
0,1aju,ARG,B,A,GGCCAGAUUGAGCCUGGGAGCUCUCUGGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...,"[0.4606024, -0.552954, 0.07159625, 0.42867935,...","[0.19807585, 0.038253188, -1.0105137, -1.19123..."
1,1akx,ARG,B,A,GGCCAGAUUGAGCCUGGGAGCUCUCUGGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...,"[0.4606024, -0.552954, 0.07159625, 0.42867935,...","[0.19807585, 0.038253188, -1.0105137, -1.19123..."
2,1am0,AMP,B,A,GGGUUGGGAAGAAACUGUGGCACUUCGGUGCCAGCAACCC,[H]O[C@@]1([H])[C@@]([H])(O[H])[C@]([H])(N2C([...,"[0.3783685, -0.48514688, 0.08168459, 0.4056929...","[0.2500691, 0.100058, -1.3738682, -1.2410955, ..."
3,1arj,ARG,B,N,GGCAGAUCUGAGCCUGGGAGCUCUCUGCC,[H]N([H])C(N([H])[H])N([H])C([H])([H])C([H])([...,"[0.47757724, -0.5242162, 0.095895864, 0.414529...","[0.14929794, 0.073322795, -0.97397256, -1.3375..."
4,1eht,TEP,B,A,GGCGAUACCAGCCGAAAGGCCCUUGGCAGCGUC,[H]C1NC2C(C(O)N(C([H])([H])[H])C(O)N2C([H])([H...,"[0.41855708, -0.53773266, 0.03623496, 0.435665...","[0.113713376, 0.1409429, -1.1955042, -1.564314..."
...,...,...,...,...,...,...,...,...
1385,6yl5,SAH,,K,GGUCACAACGGCUUCCUGGCGUGACCAUUGGAGCA,[H]O[C@@]1([H])[C@@]([H])(O[H])[C@]([H])(N2C([...,"[0.39510193, -0.4880833, 0.06998497, 0.4085014...","[0.26429752, 0.32983544, -1.1726136, -1.339479..."
1386,6ymi,AMP,Z,O,GGUCACAACGGCUUCCUGGCGUGACC,NC1NCNC2C1NCN2[C@@H]1O[C@H](CO[PH](O)(O)O)[C@@...,"[0.3783685, -0.48514688, 0.08168459, 0.4056929...","[0.2047549, 0.40685564, -1.0418745, -1.4673102..."
1387,6ymj,ADN,AA,O,GGUCACAACGGCUUCCUGGCGUGACC,NC1NCNC2C1NCN2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O,"[0.39568582, -0.5359654, 0.06406295, 0.4215734...","[0.2047549, 0.40685564, -1.0418745, -1.4673102..."
1388,7tql,5GP,,3,AGCAGAGUGGCGCAGCGGAAGCGUGCUGGGCCCAUAACCCAGAGGU...,NC1NC(O)C2NCN([C@@H]3O[C@H](COP(O)O)[C@@H](O)[...,"[0.38205087, -0.4702403, 0.08228299, 0.4623242...","[0.36936635, 0.4023114, -0.7774876, -1.8904905..."


# PDBbind dataset

In [15]:
import pandas as pd

# keep_default_na=False stops pandas from turning fields such as an empty
# ligand_chain into NaN, so every value stays usable as text in the file names below.
encoded_df = pd.read_csv('pdbbind_dataset_rna/pdbbind_rna_processed_index_encoded.csv', keep_default_na=False)

# Define paths for ligands and pockets
ligand_path = '/home/tzutang.lin/blue_ufhpc/CoPRA/CoRSA/pdbbind_dataset_rna/parsed_ligands_pdb/'
pocket_path = '/home/tzutang.lin/blue_ufhpc/CoPRA/CoRSA/pdbbind_dataset_rna/pockets_6A/'

# One combined pocket+ligand distance matrix per successfully processed row
combined_distance_matrices = []

# Index labels of rows that produced no matrix. Without this record a skipped row
# silently shifts every later matrix relative to encoded_df.
skipped_rows = []

# Largest matrix side seen so far
max_size = 0
for idx, row in encoded_df.iterrows():
    pdb_id = row['pdb_id']
    ligand_id = row['ligand_id']
    ligand_chain = row['ligand_chain']

    # File names for this dataset are <pdb_id>_<ligand_id>_<ligand_chain>.
    # NOTE(review): the general-dataset cell uses <pdb_id>_<ligand_chain>_<ligand_id>;
    # confirm the differing order matches how each set of files was written.
    ligand_file = f"{ligand_path}{pdb_id}_{ligand_id}_{ligand_chain}.pdb"
    pocket_file = f"{pocket_path}{pdb_id}_{ligand_id}_{ligand_chain}_pocket.cif"

    # Load structures; a missing file skips the row instead of aborting the run
    try:
        ligand_structure = load_structure(ligand_file)
        pocket_structure = load_structure(pocket_file)
    except FileNotFoundError as e:
        print(f"Error loading files for {pdb_id}: {e}")
        skipped_rows.append(idx)
        continue

    # Reshape and calculate combined distance matrix
    mol_pos = ligand_structure.reshape(-1, 3)
    pocket_pos = pocket_structure.reshape(-1, 3)
    combined_dist_matrix = get_combined_distance_matrix(mol_pos, pocket_pos)

    # Ensure combined_dist_matrix is a 2D tensor
    if combined_dist_matrix.ndim == 2:
        combined_distance_matrices.append(combined_dist_matrix)
        max_size = max(max_size, combined_dist_matrix.shape[0])
    else:
        print(f"Unexpected shape for combined distance matrix for {pdb_id}, skipping.")
        skipped_rows.append(idx)

# Make any misalignment visible; encoded_df.drop(index=skipped_rows) gives the rows
# that correspond one-to-one with combined_distance_matrices.
if skipped_rows:
    print(f"Skipped {len(skipped_rows)} of {len(encoded_df)} rows: "
          "combined_distance_matrices is not aligned with encoded_df (see skipped_rows).")

In [16]:
def get_max_matrix_size(matrices):
    """Return the largest first-dimension length found among ``matrices``."""
    row_counts = [m.shape[0] for m in matrices]
    return max(row_counts)

# Zero-pad every matrix to a common square size and stack them into one 3-D tensor.
if not combined_distance_matrices:
    raise ValueError("No valid combined distance matrices found.")

max_size = get_max_matrix_size(combined_distance_matrices)
matrix_data = torch.stack([
    F.pad(
        # torch.tensor(existing_tensor) copy-constructs and triggers a UserWarning;
        # as_tensor reuses a float32 tensor as-is and still converts other inputs.
        torch.as_tensor(matrix, dtype=torch.float32),
        # F.pad consumes the LAST dimension first: (left, right, top, bottom).
        # Columns (shape[1]) therefore belong to the first pair, rows (shape[0]) to the second.
        (0, max_size - matrix.shape[1], 0, max_size - matrix.shape[0]),
        value=0,
    )
    for matrix in combined_distance_matrices
])

# Convert the tensor to a NumPy array
matrix_data_np = matrix_data.cpu().numpy()

# Save matrix data to an .npy file
np.save('pdbbind_dataset_rna/combined_distance_matrices.npy', matrix_data_np)

# Print shape to verify
print(f"Combined Distance Matrices Shape: {matrix_data.shape}")




  matrix_data = torch.stack([F.pad(torch.tensor(matrix, dtype=torch.float32),


Combined Distance Matrices Shape: torch.Size([118, 521, 521])


In [17]:
# Re-pad the same matrices to a fixed side length and save them under the
# "_finetune" file name.
if not combined_distance_matrices:
    raise ValueError("No valid combined distance matrices found.")

# Fixed target size. It equals the padded size printed for the general dataset
# (torch.Size([1390, 621, 621])) -- presumably so both datasets share one input
# shape; confirm and keep the two in sync if the general dataset changes.
max_size = 621

# A matrix larger than the target would make the pad amount negative, which F.pad
# treats as cropping -- fail loudly instead of silently truncating distances.
largest_side = max(max(matrix.shape) for matrix in combined_distance_matrices)
if largest_side > max_size:
    raise ValueError(
        f"Largest matrix side ({largest_side}) exceeds the fixed pad size ({max_size})."
    )

matrix_data = torch.stack([
    F.pad(
        # torch.tensor(existing_tensor) copy-constructs and triggers a UserWarning;
        # as_tensor reuses a float32 tensor as-is and still converts other inputs.
        torch.as_tensor(matrix, dtype=torch.float32),
        # F.pad consumes the LAST dimension first: (left, right, top, bottom).
        # Columns (shape[1]) therefore belong to the first pair, rows (shape[0]) to the second.
        (0, max_size - matrix.shape[1], 0, max_size - matrix.shape[0]),
        value=0,
    )
    for matrix in combined_distance_matrices
])

# Convert the tensor to a NumPy array
matrix_data_np = matrix_data.cpu().numpy()

# Save matrix data to an .npy file
np.save('pdbbind_dataset_rna/combined_distance_matrices_finetune.npy', matrix_data_np)

# Print shape to verify
print(f"Combined Distance Matrices Shape: {matrix_data.shape}")

  matrix_data = torch.stack([F.pad(torch.tensor(matrix, dtype=torch.float32),


Combined Distance Matrices Shape: torch.Size([118, 621, 621])


In [None]:
# Display the PDBbind index table that was read from
# 'pdbbind_dataset_rna/pdbbind_rna_processed_index_encoded.csv' (notebook rich display).
encoded_df