# In [5]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools

# Root directory of the PoseBusters benchmark set; its entries are listed
# to build the per-complex protein paths.  Plain string literal: there is
# nothing to interpolate, so no f-string prefix is needed.
posebusters_root = '/mnt/data/posebusters/posebusters_benchmark_set/'

def sdf_to_smile(sdf):
    """Return the SMILES string of the molecule stored in the file ``sdf``.

    Args:
        sdf: Path to a molecule file readable by ``Chem.MolFromMolFile``.

    Returns:
        The SMILES string produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: If RDKit cannot parse the file into a molecule.
    """
    mol = Chem.MolFromMolFile(sdf)
    # RDKit signals a parse/sanitization failure by returning None rather
    # than raising; fail here with the offending path instead of letting
    # MolToSmiles(None) die with an opaque argument-type error.
    if mol is None:
        raise ValueError(f"RDKit could not parse a molecule from {sdf!r}")
    return Chem.MolToSmiles(mol)

def _complex_id_from_path(value, suffix):
    """Return the ``<id>`` part of a ``.../<id><suffix>`` path, or None."""
    if not isinstance(value, str):
        return None
    file_name = os.path.basename(value)
    if not file_name.endswith(suffix):
        return None
    return file_name[:-len(suffix)]


def update_protein_path(df, posebusters_root):
    """Rewrite the first column of ``df`` to protein paths under ``posebusters_root``.

    Every new value has the form
    ``<posebusters_root>/<complex_id>/<complex_id>_protein.pdb``, where the
    complex ids are the entry names listed in ``posebusters_root``.

    Rows are matched to complexes by id when possible: if every value in
    the first column is a path whose file name is
    ``<complex_id>_protein.pdb`` for an entry present in
    ``posebusters_root``, each row receives the path of its own complex.
    This works regardless of row order and of how many rows share a complex.

    Otherwise the function falls back to pairing row ``i`` with the
    ``i``-th entry returned by ``os.listdir``; rows beyond the number of
    entries are left unchanged.
    NOTE(review): ``os.listdir`` order is arbitrary, so the fallback only
    yields a meaningful pairing if the rows of ``df`` happen to be in that
    same order -- confirm against how the CSV was produced.

    Args:
        df: DataFrame whose first column holds the protein path.
        posebusters_root: Directory whose entries are named by complex id.

    Returns:
        The same ``df`` object, modified in place.

    Raises:
        IndexError: In the positional fallback, if ``posebusters_root``
            has more entries than ``df`` has rows.
    """
    suffix = "_protein.pdb"
    complex_ids = os.listdir(posebusters_root)
    known_ids = set(complex_ids)

    def build_path(complex_id):
        return os.path.join(posebusters_root, complex_id, f"{complex_id}{suffix}")

    # Preferred: derive each row's complex id from its current protein path.
    row_ids = [_complex_id_from_path(value, suffix) for value in df.iloc[:, 0]]
    if row_ids and all(cid in known_ids for cid in row_ids):
        df.iloc[:, 0] = [build_path(cid) for cid in row_ids]
        return df

    # Fallback: positional pairing.  Validate up front so the frame is not
    # left half-updated when there are more entries than rows.
    if len(complex_ids) > len(df):
        raise IndexError(
            f"{posebusters_root!r} has {len(complex_ids)} entries but the "
            f"DataFrame has only {len(df)} rows"
        )
    for row, complex_id in enumerate(complex_ids):
        df.iloc[row, 0] = build_path(complex_id)
    return df

# Load the source table.
df = pd.read_csv('./origin.csv')

# Discard every column whose header matches "^Unnamed", then remove the
# predicted_affinity and RMSD columns.
is_unnamed = df.columns.str.contains('^Unnamed')
df = df.loc[:, ~is_unnamed].drop(columns=['predicted_affinity', 'RMSD'])

# Replace each ligand file path with the SMILES string of its molecule.
df['ligand'] = df['ligand'].apply(sdf_to_smile)

# Point the protein column at the files under posebusters_root, and only
# afterwards rename it from 'protein' to 'protein_path'.
df = update_protein_path(df, posebusters_root).rename(
    columns={'protein': 'protein_path'}
)

# Write the result without the index column.
df.to_csv('input.csv', index=False)