In [2]:
# %%
import os
import pickle
from rdkit import Chem
import pandas as pd
from tqdm import tqdm
import pymol
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
import glob

In [3]:
# Cutoff passed to generate_pocket / generate_complex; it is also embedded in the
# file names those functions use (Pocket_<distance>A.pdb, <pdbid>_<distance>A.rdkit).
distance = 5
# NOTE(review): not referenced by any cell visible in this notebook -- the ligand
# glob in generate_pocket hard-codes the ".mol2" suffix.
input_ligand_format = 'mol2'
# Dataset root, relative to the notebook's working directory.
data_root = './pdbbind_rna_data'
# Directory that generate_pocket lists to find one sub-directory per complex.
data_dir = os.path.join(data_root, 'pdbbind_rna_set_')
# Label table; other cells read its 'pdbid', 'set' and '-logKd/Ki' columns.
data_df = pd.read_csv(os.path.join(data_root, 'pdbbind_rna_labels_split.csv'))

In [4]:
# Quick look at the current contents of the per-complex directory.
# NOTE(review): this is the same location as data_dir, spelled without the "./" prefix.
input_folder = 'pdbbind_rna_data/pdbbind_rna_set_'
os.listdir(input_folder)

[]

In [14]:
# PDB ids of the rows labelled 'train', selected with a single .loc lookup.
data_df.loc[data_df['set'] == 'train', 'pdbid']

0      1arj
1      1byj
2      1f27
3      1f1t
4      1qd3
       ... 
111    6e8s
112    6hmo
114    6p2h
116    6e8u
117    6hbt
Name: pdbid, Length: 81, dtype: object

In [15]:
def generate_pocket(data_dir, distance=5):
    """Write a binding-pocket PDB file for every complex directory in ``data_dir``.

    For each sub-directory ``<cid>`` the function loads ``<cid>_rna.pdb`` and the
    first ``<cid>_*.mol2`` file (in sorted order) into PyMOL, removes ``HOH``
    residues from the RNA and all hydrogens, selects every residue that has an
    atom within ``distance`` of the ligand, and saves that selection as
    ``Pocket_<distance>A.pdb`` inside the same directory.

    Entries are skipped when they are not directories, when the pocket file
    already exists, or when the ligand or RNA file is missing (the last two
    cases print a message).

    Args:
        data_dir: Directory containing one sub-directory per complex id.
        distance: Cutoff given to PyMOL's ``around`` selection operator; also
            used in the output file name.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    # Sorted so the processing order does not depend on the filesystem.
    for cid in sorted(os.listdir(data_dir)):
        complex_dir = os.path.join(data_dir, cid)
        # Stray files next to the complex directories are not complexes.
        if not os.path.isdir(complex_dir):
            continue

        # An existing pocket file means this complex was already processed.
        pocket_path = os.path.join(complex_dir, f'Pocket_{distance}A.pdb')
        if os.path.exists(pocket_path):
            continue

        # glob returns matches in arbitrary order; sort so "first" is stable.
        lig_native_files = sorted(glob.glob(os.path.join(complex_dir, f"{cid}_*.mol2")))
        if not lig_native_files:
            print(f"No ligand file found for {cid}. Skipping.")
            continue
        lig_native_path = lig_native_files[0]  # Use the first matching file

        rna_path = os.path.join(complex_dir, f"{cid}_rna.pdb")
        if not os.path.isfile(rna_path):
            # Loading a missing file would raise and abort the whole loop.
            print(f"No RNA file found for {cid}. Skipping.")
            continue

        try:
            # Explicit object names: no guessing which loaded object is the ligand.
            pymol.cmd.load(rna_path, 'pocket_rna')
            # Waters are stripped before the ligand is loaded, as in the original order.
            pymol.cmd.remove('resn HOH')
            pymol.cmd.load(lig_native_path, 'pocket_lig')
            pymol.cmd.remove('hydrogens')
            pymol.cmd.select('Pocket', f'byres pocket_lig around {distance}')
            pymol.cmd.save(pocket_path, 'Pocket')
        finally:
            # Always reset the session so a failure cannot leak objects into
            # the next complex's selection.
            pymol.cmd.delete('all')

In [16]:
# Build Pocket_<distance>A.pdb for every complex directory under data_dir.
generate_pocket(data_dir=data_dir, distance=distance)

In [17]:
def generate_complex(data_dir, data_df, distance=5):
    """Pickle an RDKit ``(ligand, pocket)`` tuple for each row of ``data_df``.

    For every ``pdbid`` the ligand is read from the first file (in sorted
    order) matching ``<pdbid>_*_*.pdb`` inside ``<data_dir>/<pdbid>`` and the
    pocket from ``Pocket_<distance>A.pdb`` in the same directory. Both are
    parsed with ``Chem.MolFromPDBFile(..., removeHs=True)`` and the pair is
    written with ``pickle`` to ``<pdbid>_<distance>A.rdkit``.

    A complex is reported on stdout and skipped when an input file is missing
    or when RDKit returns ``None`` for the ligand or the pocket. The progress
    bar advances once per row, including skipped rows.

    Args:
        data_dir: Directory holding one sub-directory per complex id.
        data_df: DataFrame with a ``pdbid`` column.
        distance: Cutoff used in the pocket file name and the output file name.
    """
    # Wrapping the iterator lets tqdm count every row and close the bar itself.
    for _, row in tqdm(data_df.iterrows(), total=len(data_df)):
        cid = row['pdbid']
        complex_dir = os.path.join(data_dir, cid)
        pocket_path = os.path.join(complex_dir, f'Pocket_{distance}A.pdb')
        save_path = os.path.join(complex_dir, f"{cid}_{distance}A.rdkit")

        # glob returns matches in arbitrary order; sort so "first" is stable.
        lig_native_files = sorted(glob.glob(os.path.join(complex_dir, f"{cid}_*_*.pdb")))
        if not lig_native_files:
            # Indexing an empty match list would raise IndexError.
            print(f"No ligand file found for {cid}. Skipping.")
            continue
        ligand_path = lig_native_files[0]  # Use the first matching file

        if not os.path.isfile(pocket_path):
            print(f"No pocket file found for {cid}. Skipping.")
            continue

        ligand = Chem.MolFromPDBFile(ligand_path, removeHs=True)
        if ligand is None:
            print(f"Unable to process ligand of {cid}")
            continue

        pocket = Chem.MolFromPDBFile(pocket_path, removeHs=True)
        if pocket is None:
            print(f"Unable to process rna of {cid}")
            continue

        # Written only after both molecules parsed, so no partial output is left.
        with open(save_path, 'wb') as f:
            pickle.dump((ligand, pocket), f)

In [18]:
# Pickle the (ligand, pocket) RDKit pair for every row of the label table.
generate_complex(data_dir, data_df, distance=distance)

100%|██████████| 118/118 [00:00<00:00, 144.21it/s]


In [19]:
import os
import shutil
import random

def split_folder(input_folder, train_folder, test_folder, split_ratio=0.9, seed=None):
    """Randomly move the entries of ``input_folder`` into train and test folders.

    The entries are listed, shuffled, and the first
    ``int(len(entries) * split_ratio)`` of them are moved to ``train_folder``;
    the rest go to ``test_folder``. Both output folders are created if needed.

    Args:
        input_folder: Directory whose entries are split.
        train_folder: Destination for the training share.
        test_folder: Destination for the remaining entries.
        split_ratio: Fraction of entries sent to ``train_folder``; must lie in
            ``[0, 1]``.
        seed: Optional seed. When given, the split is reproducible for the same
            set of entry names. When ``None`` the module-level ``random`` state
            is used, as before.

    Raises:
        ValueError: If ``split_ratio`` is outside ``[0, 1]``.
        FileNotFoundError: If ``input_folder`` does not exist.
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

    # List first, so a missing input folder fails before output folders are
    # created, and sort so a seeded shuffle starts from a fixed order.
    files = sorted(os.listdir(input_folder))

    # Create output directories if they don't exist
    os.makedirs(train_folder, exist_ok=True)
    os.makedirs(test_folder, exist_ok=True)

    # Shuffle the files randomly
    if seed is None:
        random.shuffle(files)
    else:
        # A private generator leaves the global random state untouched.
        random.Random(seed).shuffle(files)

    # Calculate split index
    split_index = int(len(files) * split_ratio)

    # Move files to respective folders
    for file in files[:split_index]:
        shutil.move(os.path.join(input_folder, file), os.path.join(train_folder, file))

    for file in files[split_index:]:
        shutil.move(os.path.join(input_folder, file), os.path.join(test_folder, file))

# Example usage
# NOTE(review): the recorded output of this cell is a FileNotFoundError for this
# input path; the other cells use 'pdbbind_rna_set_' (with a trailing underscore).
# A later cell redefines split_folder to split by the 'set' column of the label
# table instead of randomly -- confirm whether this random split is still wanted.
input_folder = 'pdbbind_rna_data/pdbbind_rna_set'
train_folder = 'pdbbind_rna_data/train_set'
test_folder = 'pdbbind_rna_data/test_set'
split_folder(input_folder, train_folder, test_folder)


FileNotFoundError: [Errno 2] No such file or directory: 'pdbbind_rna_data/pdbbind_rna_set'

In [20]:
import os
import shutil
import random

def split_folder(input_folder, train_folder, val_folder, test_folder, labels_df=None):
    """Move complex directories into train/val/test folders by their ``set`` label.

    Rows of the label table whose ``set`` column is ``'train'``, ``'valid'`` or
    ``'test'`` have their ``pdbid`` entry moved from ``input_folder`` to
    ``train_folder``, ``val_folder`` or ``test_folder`` respectively. The
    output folders are created if needed.

    Entries that are not present in ``input_folder`` (for example because a
    previous run already moved them) or that already exist at the destination
    are reported on stdout and skipped, so the function can be re-run.

    Args:
        input_folder: Directory holding one entry per ``pdbid``.
        train_folder: Destination for ``set == 'train'`` rows.
        val_folder: Destination for ``set == 'valid'`` rows.
        test_folder: Destination for ``set == 'test'`` rows.
        labels_df: DataFrame with ``pdbid`` and ``set`` columns. Defaults to the
            notebook-level ``data_df`` to stay compatible with existing calls.

    Raises:
        FileNotFoundError: If ``input_folder`` does not exist.
    """
    if labels_df is None:
        # Backward-compatible fallback to the table loaded in the config cell.
        labels_df = data_df

    # Listing first fails early on a missing input folder and gives a cheap
    # membership test for the entries that can actually be moved.
    available = set(os.listdir(input_folder))

    destinations = {'train': train_folder, 'valid': val_folder, 'test': test_folder}

    # Create output directories if they don't exist
    for folder in destinations.values():
        os.makedirs(folder, exist_ok=True)

    # Move files to respective folders
    for set_name, folder in destinations.items():
        for file in labels_df.loc[labels_df['set'] == set_name, 'pdbid']:
            if file not in available:
                print(f"{file} not found in {input_folder}. Skipping.")
                continue
            target = os.path.join(folder, file)
            if os.path.exists(target):
                # shutil.move would nest the source inside an existing directory.
                print(f"{target} already exists. Skipping.")
                continue
            shutil.move(os.path.join(input_folder, file), target)

# Example usage
# Moves each pdbid entry out of input_folder into the folder for its 'set' label.
# NOTE(review): the move is destructive -- afterwards input_folder no longer
# holds the moved entries, so cells that read from it must run before this one.
input_folder = 'pdbbind_rna_data/pdbbind_rna_set_'
train_folder = 'pdbbind_rna_data/train_set'
val_folder = 'pdbbind_rna_data/val_set'
test_folder = 'pdbbind_rna_data/test_set'
split_folder(input_folder, train_folder, val_folder, test_folder)


In [24]:
 # NOTE(review): pandas and data_root are already set up by earlier cells, and
 # this reads the same CSV that data_df was loaded from, under the name df.
 import pandas as pd
 data_root = './pdbbind_rna_data'
 df = pd.read_csv(os.path.join(data_root, "pdbbind_rna_labels_split.csv"))

In [25]:
# Write one label CSV per split, keeping only the rows whose pdbid has a
# matching entry in that split's folder.
for _split_dir, _csv_name in (
    (train_folder, 'train_labels.csv'),
    (val_folder, 'val_labels.csv'),
    (test_folder, 'test_labels.csv'),
):
    _in_split = df['pdbid'].isin(os.listdir(_split_dir))
    df[_in_split].to_csv(os.path.join(data_root, _csv_name), index=False)