In [3]:
# without rdkit
import os
import csv
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import isomorphism

def parse_sdf(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    atoms = []
    bonds = []
    atom_section = False
    bond_section = False
    atom_count = int(lines[3][:3].strip())
    bond_count = int(lines[3][3:6].strip())

    for line in lines[4:]:
        if len(atoms) < atom_count:
            atoms.append(line.split())
        elif len(bonds) < bond_count:
            bonds.append(line.split())

    atom_coords = np.array([[float(atom[0]), float(atom[1]), float(atom[2])] for atom in atoms])
    atom_types = [atom[3] for atom in atoms]
    bond_pairs = [(int(bond[0]) - 1, int(bond[1]) - 1) for bond in bonds]
    
    return atom_coords, atom_types, bond_pairs

def sdf_to_graph(atom_types, bond_pairs):
    G = nx.Graph()
    for i, atom_type in enumerate(atom_types):
        G.add_node(i, atom_type=atom_type)
    for bond in bond_pairs:
        G.add_edge(*bond)
    return G

def calculate_rmsd(coords1, coords2, atom_map):
    mapped_coords1 = np.array([coords1[i] for i in atom_map.keys()])
    mapped_coords2 = np.array([coords2[i] for i in atom_map.values()])
    diff = mapped_coords1 - mapped_coords2
    return np.sqrt(np.mean(np.sum(diff**2, axis=1)))

def calculate_rmsd_without_rdkit(sdf_file1, sdf_file2):
    coords1, atom_types1, bonds1 = parse_sdf(sdf_file1)
    coords2, atom_types2, bonds2 = parse_sdf(sdf_file2)
    
    graph1 = sdf_to_graph(atom_types1, bonds1)
    graph2 = sdf_to_graph(atom_types2, bonds2)
    
    nm = isomorphism.GraphMatcher(graph1, graph2, node_match=lambda n1, n2: n1['atom_type'] == n2['atom_type'])
    
    if nm.is_isomorphic():
        atom_map = nm.mapping
        rmsd = calculate_rmsd(coords1, coords2, atom_map)
        return rmsd
    else:
        print("The molecules are not isomorphic.")
        return None

#prot = '7BMI_U4B'
#sdf_file1 = f'/home/megagatlingpea/workdir/FABind/FABind_plus/posebusters_benchmark/docking_results/{prot}.sdf'
#sdf_file2 = f'/mnt/data/posebusters/posebusters_benchmark_set/{prot}/{prot}_ligand.sdf'

#rmsd = calculate_rmsd_without_rdkit(sdf_file1, sdf_file2)
#if rmsd is not None:
#    print(f"RMSD without RDKit: {rmsd:.4f}")
#else:
#    print("The molecules are not isomorphic.")


In [4]:
def process_sdfs(inference_dir,reference_dir,output_csv):
    results = []
    
    for filename in os.listdir(inference_dir):
        if filename.endswith(".sdf"):
            prot = filename.split('.')[0]
            pred_sdf = os.path.join(inference_dir, filename)
            ref_sdf = os.path.join(reference_dir, prot, f"{prot}_ligand.sdf")
            
            if not os.path.exists(ref_sdf):
                print(f"Warning: Reference file not found for {prot}")
                results.append((prot, -1))
                continue
            
            rmsd = calculate_rmsd_without_rdkit(ref_sdf, pred_sdf)
            if rmsd is None:
                results.append((prot, -1))
            else:
                results.append((prot, rmsd))
    
    # Write results to CSV
    with open(output_csv, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['pdb_id', 'rmsd'])
        for prot, rmsd in results:
            writer.writerow([prot, f"{rmsd:.4f}" if rmsd != -1 else "-1"])

inference_dir = "./posebusters_benchmark/docking_results"
reference_dir = "/mnt/data/posebusters/posebusters_benchmark_set"
output_csv = "unaligned_rmsd_results_nordkit.csv"

process_sdfs(inference_dir, reference_dir, output_csv)
print(f"Results have been written to {output_csv}")

Results have been written to unaligned_rmsd_results_nordkit.csv


In [1]:
import pandas as pd
file_path = './unaligned_rmsd_results_nordkit.csv'
data = pd.read_csv(file_path)

# Calculate Success Rate
valid_rmsd = data[data['rmsd'] != -1]
rmsd_less_than_2 = valid_rmsd[valid_rmsd['rmsd'] < 2]
rmsd_less_than_2_ratio = len(rmsd_less_than_2) / len(data)

print(f"RMSD < 2: {rmsd_less_than_2_ratio:.2%}")

RMSD < 2: 11.21%
