In [7]:
import sys
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from openbabel import pybel
from pydantic import BaseModel, Field

# Set Open Babel's global error-log output level to 0.
# NOTE(review): presumably meant to quiet the mol2 parser warnings that
# pybel.readfile emits — confirm the level semantics in the Open Babel docs.
pybel.ob.obErrorLog.SetOutputLevel(0)


In [8]:
# Make the project's ``src`` directory importable from this notebook.
# The path is resolved relative to the current working directory, so the
# notebook has to be started from the directory that sits next to ``src``.
src_dir = Path.cwd().parent / "src"
# Only add the entry once: re-running this cell must not keep appending
# duplicates of the same directory to sys.path.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))


from preprocessing.models import PDBBindComplex, PDBBindDataset  # noqa: E402
from preprocessing.processors.featurizer import Featurizer  # noqa: E402
from preprocessing.processors.protein import (  # noqa: E402
    PocketProcessor,
    PocketProcessorConfig,
)

In [9]:
from pathlib import Path


class FeatureGeneratorConfig(BaseModel):
    """Settings object handed to ``FeatureGenerator``."""

    # Required: location of the element-property table (elements.csv).
    elements_csv_path: Path = Field(..., description="Path to elements.csv file")

    # Optional sample limit; defaults to ``None``.
    max_samples: Optional[int] = Field(
        default=None,
        description="Maximum number of samples to process",
    )


class FeatureGenerator:
    """Turn one protein-ligand complex into stacked per-atom arrays.

    For a complex, the ligand and the charged pocket mol2 files are read with
    pybel, featurized with ``Featurizer`` and paired with van der Waals radii
    looked up from the element table named in the config. Ligand rows always
    come first in every stacked array, followed by the pocket rows.
    """

    def __init__(self, config: FeatureGeneratorConfig):
        """Store the config, create the featurizer and load the element table."""
        self.config = config
        self.featurizer = Featurizer()
        self.element_dict = self._parse_element_description()

    def _parse_element_description(self):
        """Read the elements CSV into ``{atomic_number: {"vdWRadius": radius}}``.

        The CSV must provide the columns ``number`` and ``vdWRadius``; a
        ``KeyError`` is raised if either is missing.
        """
        element_info = pd.read_csv(self.config.elements_csv_path)

        # Walk the two needed columns directly instead of DataFrame.iterrows(),
        # which builds a full Series per row just to read two values.
        return {
            int(number): {"vdWRadius": float(radius)}
            for number, radius in zip(
                element_info["number"], element_info["vdWRadius"]
            )
        }

    def _get_vdw_radii(self, mol):
        """Return the vdW radius of every atom in ``mol`` with atomic number >= 2.

        Atoms with an atomic number below 2 (hydrogens, and atoms reported
        with atomic number 0) are skipped.

        Raises:
            KeyError: if an atom's element is absent from the element table.
        """
        radii = []
        for atom in mol.atoms:
            if atom.atomicnum < 2:
                continue
            try:
                radii.append(float(self.element_dict[atom.atomicnum]["vdWRadius"]))
            except KeyError:
                # Same exception type as a plain lookup, but say which table
                # is missing which element instead of surfacing a bare number.
                raise KeyError(
                    f"No vdWRadius entry for atomic number {atom.atomicnum} "
                    f"in {self.config.elements_csv_path}"
                ) from None
        return np.array(radii, dtype=float)

    @staticmethod
    def _read_first_molecule(mol2_path, role):
        """Read the first molecule from a mol2 file, failing with a clear error.

        Raises:
            ValueError: if ``mol2_path`` is ``None`` or the file holds no molecule.
        """
        if mol2_path is None:
            # str(None) would otherwise be handed to pybel as the file name "None".
            raise ValueError(
                f"The {role} mol2 path is not set on this complex "
                "(has the complex been processed yet?)"
            )
        molecule = next(pybel.readfile("mol2", str(mol2_path)), None)
        if molecule is None:
            # A bare StopIteration carries no message and is easy to miss.
            raise ValueError(f"No molecule could be read from {role} file {mol2_path}")
        return molecule

    def generate_features(self, complex: PDBBindComplex):
        """Featurize a complex and centre it on the ligand centroid.

        Args:
            complex: complex providing ``ligand_mol2``, ``charged_pocket_mol2``
                and ``affinity``.

        Returns:
            dict with keys ``"coords"``, ``"features"``, ``"vdw_radii"``
            (ligand rows first, then pocket rows) and ``"affinity"``.

        Raises:
            ValueError: if a mol2 path is missing, a file holds no molecule,
                or the number of radii does not match the number of
                featurized atoms.
        """
        # Process ligand
        ligand = self._read_first_molecule(complex.ligand_mol2, "ligand")
        lig_coords, lig_features = self.featurizer.get_features(ligand, molcode=1)
        lig_vdw = self._get_vdw_radii(ligand)

        # Process pocket
        pocket = self._read_first_molecule(complex.charged_pocket_mol2, "pocket")
        pocket_coords, pocket_features = self.featurizer.get_features(
            pocket, molcode=-1
        )
        pocket_vdw = self._get_vdw_radii(pocket)

        # The radii are selected here independently of the featurizer's own
        # atom selection; refuse to return arrays that do not line up per atom.
        if len(lig_vdw) != len(lig_coords) or len(pocket_vdw) != len(pocket_coords):
            raise ValueError(
                "vdW radii and featurized atoms are misaligned: "
                f"ligand {len(lig_vdw)} vs {len(lig_coords)}, "
                f"pocket {len(pocket_vdw)} vs {len(pocket_coords)}"
            )

        # Centre both molecules on the ligand centroid. New arrays are built
        # so the arrays returned by the featurizer are not modified in place.
        centroid = lig_coords.mean(axis=0)
        combined_coords = np.vstack([lig_coords - centroid, pocket_coords - centroid])
        combined_features = np.vstack([lig_features, pocket_features])
        combined_vdw = np.concatenate([lig_vdw, pocket_vdw])

        return {
            "coords": combined_coords,
            "features": combined_features,
            "vdw_radii": combined_vdw,
            "affinity": complex.affinity,
        }

In [10]:
# Data locations, all resolved relative to the parent of the working directory.
data_root = Path.cwd().parent.joinpath("data", "pdb_bind")
general_dataset_path = data_root.joinpath("general-set")
refined_dataset_path = data_root.joinpath("refined-set")
output_hdf = data_root.parent.joinpath("processed", "processed.hdf5")

# Build the dataset from the PDBbind root and look up a single complex
# to confirm that loading worked.
dataset = PDBBindDataset.from_root(data_root)
complex_1a4h = dataset.get_complex("1a4h")
print("Processing complex:", complex_1a4h)


Processing complex: pdb_id='1a4h' protein_pdb=PosixPath('/Users/marvinprakash/codes/PLB/plb_jnu/data/pdb_bind/general-set/1a4h/1a4h_protein.pdb') ligand_mol2=PosixPath('/Users/marvinprakash/codes/PLB/plb_jnu/data/pdb_bind/general-set/1a4h/1a4h_ligand.mol2') charged_pocket_mol2=None affinity=5.92 unrealistic_charge_present=False set_type='general'


In [11]:
# Pocket processor, built from a config created with no overrides.
pp_config = PocketProcessorConfig()
pocket_processor = PocketProcessor(config=pp_config)

# Feature generator; its element table is expected one level above data_root.
fg_config = FeatureGeneratorConfig(
    elements_csv_path=data_root.parent.joinpath("elements.csv")
)
feature_generator = FeatureGenerator(fg_config)

In [12]:
# Fail fast: HDF5Writer is used at the end of this cell but is neither
# imported nor defined anywhere in this notebook, so without this check the
# NameError only appears after every complex has already been processed.
# FIXME: import HDF5Writer from the project module that defines it.
if "HDF5Writer" not in globals():
    raise NameError(
        "name 'HDF5Writer' is not defined; import it before running this cell"
    )

# Get first n complexes
first_n = list(dataset.complexes.values())[:10]

# Process complexes; both dicts are keyed by PDB id.
valid_complexes = {}
features = {}

# The loop variable is deliberately not called `complex`: at notebook level
# that name would replace the builtin `complex` type for all later cells.
for pdb_complex in first_n:
    try:
        # Process pocket
        pocket_processor.process_complex(pdb_complex)

        if pdb_complex.unrealistic_charge_present:
            print(f"Skipping {pdb_complex.pdb_id} due to unrealistic charges")
            continue

        # Generate features
        features[pdb_complex.pdb_id] = feature_generator.generate_features(
            pdb_complex
        )
        valid_complexes[pdb_complex.pdb_id] = pdb_complex

    except Exception as e:
        # Best-effort batch: one bad complex must not abort the run. The
        # exception type is included because some exceptions carry no message.
        print(f"Failed to process {pdb_complex.pdb_id}: {type(e).__name__}: {e}")

# Write to HDF5 (make sure the target directory exists first)
output_hdf.parent.mkdir(parents=True, exist_ok=True)
writer = HDF5Writer(output_hdf)
writer.create_dataset(valid_complexes, features)
print(f"Created HDF5 file with {len(valid_complexes)} complexes")

NameError: name 'HDF5Writer' is not defined

In [6]:
# Load dataset
# NOTE(review): this repeats the dataset load from the earlier cell that
# defines `data_root`, and depends on that cell having been run first.
dataset = PDBBindDataset.from_root(data_root)
complex_1a4h = dataset.get_complex("1a4h")
print("Processing complex:", complex_1a4h)

# Initialize pocket processor
# Built without an explicit config here, unlike the `pocket_processor`
# instance that is created from a PocketProcessorConfig elsewhere in this notebook.
pp = PocketProcessor()
pp.process_complex(complex=complex_1a4h)

# Stand-alone featurizer used by the exploratory cells that follow.
featurizer = Featurizer()

Processing complex: pdb_id='1a4h' protein_pdb=PosixPath('/Users/marvinprakash/codes/PLB/plb_jnu/data/pdb_bind/general-set/1a4h/1a4h_protein.pdb') ligand_mol2=PosixPath('/Users/marvinprakash/codes/PLB/plb_jnu/data/pdb_bind/general-set/1a4h/1a4h_ligand.mol2') charged_pocket_mol2=None affinity=5.92 unrealistic_charge_present=False set_type='general'


In [7]:
# Read the first molecule from the complex's charged pocket mol2 file.
# NOTE(review): if charged_pocket_mol2 is still None, str() turns it into the
# file name "None" — presumably process_complex sets it beforehand; confirm.
pocket = next(pybel.readfile("mol2", str(complex_1a4h.charged_pocket_mol2)))

  Cannot perform atom type translation: table cannot find requested types.
  This Mol2 file is non-standard. Problem with molecule: 1a4h_pocket Cannot interpret atom types correctly, instead attempting to interpret atom type: O as elements instead.
  Cannot perform atom type translation: table cannot find requested types.
  Cannot perform atom type translation: table cannot find requested types.
  This Mol2 file is non-standard. Problem with molecule: 1a4h_pocket Cannot interpret atom types correctly, instead attempting to interpret atom type: O as elements instead.
  Cannot perform atom type translation: table cannot find requested types.
  Cannot perform atom type translation: table cannot find requested types.
  This Mol2 file is non-standard. Problem with molecule: 1a4h_pocket Cannot interpret atom types correctly, instead attempting to interpret atom type: O as elements instead.
  Cannot perform atom type translation: table cannot find requested types.
  Cannot perform atom type t

The issue with the provided `mol2` file appears to be related to how atom types are specified. Open Babel is having trouble interpreting the atom types in the file, which is leading to warnings and potential misinterpretation of the molecular structure.

### Key Observations:

1. **Atom Type Specification**:

    - In the `mol2` format, atom types are typically specified with a suffix (e.g., `O.2`, `C.3`, etc.) to indicate the specific type of atom. These types are often based on the Tripos atom types.
    - In the ligand version of the file, the atom types are specified correctly (e.g., `O.2`, `O.3`, etc.).
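    - In the pocket (protein) version of the file, the atom types are bare element symbols (`O`, `H`). Plain `H` is a valid Tripos type, but plain `O` is not — oxygen needs a suffix such as `O.3` or `O.2`. That is why the warnings above name only `O`: Open Babel cannot translate that type and falls back to reading it as an element symbol.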

2. **Charges**:
    - The protein version of the file includes charges, but the way the atom types are specified might be causing issues with how these charges are interpreted.

### Suggestions for Fixing the Issue:

1. **Specify Atom Types Correctly**:

    - Ensure that every atom in the `mol2` file carries a valid Tripos atom type. Oxygen always needs a suffix (e.g., `O.3` for sp3 oxygen such as water or hydroxyl, `O.2` for carbonyl oxygen), whereas hydrogen is normally written as plain `H`. This will help Open Babel interpret the file correctly.
    - For example, in the pocket file, change the water oxygens from `O` to `O.3`; the `H` entries can stay as they are.

2. **Consistency**:

    - Make sure that the atom types are consistent across the entire file. If you're using specific atom types for the ligand, use similar types for the protein.

3. **Check for Other Issues**:
    - Ensure that the charges are correctly formatted and that the `mol2` file adheres to the standard format.

### Example of Corrected Protein Version:

Here’s an example of how the protein version might look after correcting the atom types:

```mol2
@<TRIPOS>MOLECULE
1a4h_pocket
998 966
SMALL
USER_CHARGES
****
Charges calculated by ChargeFW2 0.1, method:
@<TRIPOS>ATOM
    1 O.2       22.122   -2.841   -7.617 O.3  1 HOH1 -0.842
    2 H.1       21.916   -2.532   -6.732 H    1 HOH1  0.387
    3 H.2       22.837   -2.314   -7.982 H    1 HOH1  0.378
    4 O.2       23.303   -7.498   -3.037 O.3  2 HOH2 -0.814
    5 H.1       23.442   -7.608   -2.093 H    2 HOH2  0.423
    6 H.2       22.962   -8.315   -3.408 H    2 HOH2  0.363
    7 O.2       20.404   -7.590   -2.633 O.3  3 HOH3 -0.838
    8 H.1       21.242   -7.921   -2.965 H    3 HOH3  0.393
    9 H.2       20.164   -6.792   -3.109 H    3 HOH3  0.389
   10 O.2       17.630   -5.784   -0.984 O.3  4 HOH4 -0.863
   11 H.1       16.704   -5.536   -1.025 H    4 HOH4  0.422
```

### Conclusion:

The main issue is the lack of valid Tripos atom types in the pocket (protein) version of the `mol2` file. Once the oxygens carry a proper type (e.g., `O.3` for water oxygens; plain `H` is already valid for hydrogens), Open Babel should be able to interpret the file correctly.


In [8]:
# Featurize the pocket molecule; molcode=-1 is the same code that
# FeatureGenerator.generate_features passes for the pocket (the ligand gets 1).
pocket_coords, pocket_features = featurizer.get_features(pocket, molcode=-1)

In [9]:
# Import rich's notebook printer under an alias: importing it as `print`
# would replace the builtin `print` for every cell executed afterwards.
from rich.jupyter import print as rich_print

# Pretty-print the pocket coordinates and feature matrix.
rich_print(pocket_coords)
rich_print(pocket_features)

In [12]:
# Show the printed form of the pocket molecule that was read from the mol2 file.
print(pocket)