In [1]:
import MDAnalysis as mda
import pandas as pd
from biopandas.pdb import PandasPdb
import os
import glob
import re
import math
import numpy as np

def pdb_to_dataframe(pdb_file):
    """
    Load a PDB file using MDAnalysis and convert key atom information to a pandas DataFrame.
    """
    u = mda.Universe(pdb_file)
    
    # Extract atom-related data: atom name, residue name, residue ID, and chain ID
    atom_data = {
        'Atom Name': u.atoms.names,
        'Residue Name': u.atoms.resnames,
        'Residue ID': u.atoms.resids,
        'Chain ID': u.atoms.segids,
        'X': u.atoms.positions[:, 0],
        'Y': u.atoms.positions[:, 1],
        'Z': u.atoms.positions[:, 2],
    }
    
    # Create a pandas DataFrame from the atom data
    df = pd.DataFrame(atom_data)
    
    return df

def grid_list(atom_df):
    return list(zip(atom_df['x_coord'], atom_df['y_coord'], atom_df['z_coord']))

def filtering_proteins(atom_df, grid_list, radius=5.0):
    atom_coords = atom_df[['x_coord', 'y_coord', 'z_coord']].values
    filtered_atoms = set()

    for x, y, z in grid_list:
        distances_sq = (atom_coords[:, 0] - x)**2 + (atom_coords[:, 1] - y)**2 + (atom_coords[:, 2] - z)**2
        mask = distances_sq <= radius**2
        filtered_atoms.update(atom_df.index[mask])

    print(f"Total atoms within {radius} Å cutoff: {len(filtered_atoms)}")
    return atom_df.loc[list(filtered_atoms)]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_positive_ligand_atoms(positive_file):
    protein_pdb_df = PandasPdb().read_pdb(positive_file)
    protein_pdb_df.df.keys()
    protein = protein_pdb_df.df['ATOM']
    protein = protein[~protein['atom_name'].str.startswith('H')] # don't use hydrogen
    protein_coords = protein[['x_coord', 'y_coord', 'z_coord']].values
    protein_centroid = protein_coords.mean(axis=0)
    print(set(protein['chain_id']))
    print(positive_file)

    ligand_df = PandasPdb().read_pdb(positive_file)
    ligand_df.df.keys()
    ligand = ligand_df.df['HETATM']
    ligand = ligand[ligand['residue_name']=="CLR"]
    x = list(set(zip(ligand['residue_number'], ligand['chain_id'])))

    #get the most inward residue
    min_distance = float('inf')
    closest_clr = None

    all_ligands = []

    for residue_number, chain_id in x:
        clr_atoms = ligand[(ligand['residue_number'] == residue_number) & (ligand['chain_id'] == chain_id)]
        if clr_atoms.empty:
            continue

        clr_coords = clr_atoms[['x_coord', 'y_coord', 'z_coord']].values
        clr_centroid = clr_coords.mean(axis=0)
        
        distance = np.linalg.norm(protein_centroid - clr_centroid)
        
        if distance < min_distance:
            min_distance = distance
            closest_clr = (residue_number, chain_id)

        grid_list_ = grid_list(clr_atoms)

        all_ligands.append(filtering_proteins(protein, grid_list_))

    ligand_ = ligand[(ligand['residue_number'] == closest_clr[0]) & (ligand['chain_id'] == closest_clr[1])]
    grid_list_ = grid_list(ligand_)

    filtered_atoms = filtering_proteins(protein, grid_list_)

    protein_name = os.path.splitext(os.path.basename(positive_file))[0]

    residue_number, chain_id = closest_clr

    # Save to pdb
    filtered_pdb = PandasPdb()
    filtered_pdb.df['ATOM'] = filtered_atoms
    filtered_pdb_path = f"ivan-pdbs-residue-5A/positive/{protein_name}-{residue_number}-{chain_id}-filtered.pdb"
    os.makedirs(os.path.dirname(filtered_pdb_path), exist_ok=True)
    filtered_pdb.to_pdb(path=filtered_pdb_path, records=None, gz=False, append_newline=True)

    return protein, all_ligands


In [3]:
def get_protein_name(filename):
    basename = os.path.basename(filename)  # Get file name without path
    match = re.match(r'([a-zA-Z0-9]{4})', basename)  # Match the first 4-character PDB ID
    if match:
        return match.group(1).upper()
    else:
        return None
def get_mode_index(filename):
    basename = os.path.basename(filename)
    match = re.search(r'mode_(\d+)', basename)
    if match:
        return int(match.group(1))
    else:
        return None  # or raise ValueError("No mode index found.")

def natural_sort_key(s):
    """Function to sort strings in a natural alphanumeric order."""
    return [int(text) if text.isdigit() else text.lower() for text in re.split(r'(\d+)', s)]


In [4]:
positive_files = glob.glob("../../../Data/UnsplitData/ivanfiles/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

for positive_file in positive_files:
    protein, all_lig_filtered = get_positive_ligand_atoms(positive_file)

{'E', 'G', 'B', 'C', 'A', 'D'}
../../../Data/UnsplitData/ivanfiles/4HQJ.pdb
Total atoms within 5.0 Å cutoff: 55
Total atoms within 5.0 Å cutoff: 43
Total atoms within 5.0 Å cutoff: 29
Total atoms within 5.0 Å cutoff: 60
Total atoms within 5.0 Å cutoff: 29
Total atoms within 5.0 Å cutoff: 29
{'E', 'G', 'B', 'C', 'A', 'D'}
../../../Data/UnsplitData/ivanfiles/4RET.pdb
Total atoms within 5.0 Å cutoff: 56
Total atoms within 5.0 Å cutoff: 51
Total atoms within 5.0 Å cutoff: 58
Total atoms within 5.0 Å cutoff: 55
Total atoms within 5.0 Å cutoff: 55
{'A', 'C'}
../../../Data/UnsplitData/ivanfiles/5OQT.pdb
Total atoms within 5.0 Å cutoff: 51
Total atoms within 5.0 Å cutoff: 51
{'B', 'A', 'D', 'C'}
../../../Data/UnsplitData/ivanfiles/5SY1.pdb
Total atoms within 5.0 Å cutoff: 38
Total atoms within 5.0 Å cutoff: 34
Total atoms within 5.0 Å cutoff: 34
{'B', 'A', 'D'}
../../../Data/UnsplitData/ivanfiles/5WB2.pdb
Total atoms within 5.0 Å cutoff: 38
Total atoms within 5.0 Å cutoff: 25
Total atoms withi

In [4]:
import numpy as np

def compute_inverse_pairwise_distances(df):
    """
    Compute the pairwise Euclidean distances between residues based on their 3D coordinates.

    Parameters:
    df (pd.DataFrame): DataFrame containing 'X', 'Y', 'Z' coordinates and 'NewIndex' as index.

    Returns:
    pd.DataFrame: A DataFrame containing the pairwise distance matrix.
    """
    # Extract the coordinates (X, Y, Z)
    coordinates = df[['X', 'Y', 'Z']].values

    # Calculate pairwise distances using broadcasting
    diff = coordinates[:, np.newaxis, :] - coordinates[np.newaxis, :, :]
    distances = np.sqrt(np.sum(diff ** 2, axis=-1))

    # Compute inverse distance (1/d)
    with np.errstate(divide='ignore'):  # Ignore division by zero warning
        inverse_distances = 1 / distances

    # Set diagonal elements (self-distances) to 1
    np.fill_diagonal(inverse_distances, 1)

    # Cap values at 1
    inverse_distances = np.minimum(inverse_distances, 1)

    return inverse_distances

def pdb_to_dataframe(pdb_file):
    """
    Load a PDB file using MDAnalysis and convert key atom information to a pandas DataFrame.
    """
    u = mda.Universe(pdb_file)
    
    # Extract atom-related data: atom name, residue name, residue ID, and chain ID
    atom_data = {
        'Atom Name': u.atoms.names,
        'Residue Name': u.atoms.resnames,
        'Residue ID': u.atoms.resids,
        'Chain ID': u.atoms.segids,
        'X': u.atoms.positions[:, 0],
        'Y': u.atoms.positions[:, 1],
        'Z': u.atoms.positions[:, 2],
    }
    
    # Create a pandas DataFrame from the atom data
    df = pd.DataFrame(atom_data)
    
    return df

def one_hot_encoding(pdb_df):
    biggest_set = [
        # Carbon (C) subtypes
        'C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3', 'CG', 'CG1', 'CG2', 'CH2', 'CZ', 'CZ2', 'CZ3',

        # Oxygen (O) subtypes
        'O', 'OH', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1', 

        # Nitrogen (N) subtypes
        'N', 'NE', 'NE1', 'NE2', 'ND1', 'ND2', 'NZ', 'NH1', 'NH2', 

        # Sulfur (S) subtypes
        'SD', 'SG'
    ]

    biggest_set.append('UNKNOWN')  # Add an additional column for unknown atom types
    
    # Create a zero matrix with shape (num_rows, num_unique_atoms)
    num_rows = len(pdb_df)
    num_cols = len(biggest_set)
    one_hot_matrix = np.zeros((num_rows, num_cols), dtype=int)

    # Create a mapping from atom name to index
    atom_to_index = {atom: idx for idx, atom in enumerate(biggest_set)}

    # Fill the one-hot matrix
    for i, atom in enumerate(pdb_df['Atom Name']):
        if atom in atom_to_index:
            one_hot_matrix[i, atom_to_index[atom]] = 1
        else:
            one_hot_matrix[i, atom_to_index['UNKNOWN']] = 1
            print(atom, "went to unknown column")

    return one_hot_matrix

def min_max_normalization(matrix):
    """
    Perform Min-Max normalization on a given matrix.

    Parameters:
    matrix (np.ndarray): The input matrix to be normalized.

    Returns:
    np.ndarray: The normalized matrix with values scaled to the range [0, 1].
    """
    # Compute the minimum and maximum values for the matrix
    min_val = np.min(matrix)
    max_val = np.max(matrix)

    # Apply Min-Max normalization formula
    normalized_matrix = (matrix - min_val) / (max_val - min_val)

    return normalized_matrix

In [5]:
positive_files = glob.glob("ivan-pdbs-distinct-5A/positive/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

for file in positive_files:
    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)

    inverse_distance = compute_inverse_pairwise_distances(pdb_df)

    combined_matrix = inverse_distance @ encoded_matrix 
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms, num_features = combined_matrix.shape
    max_atoms = 150
    print(num_atoms)

    if(num_atoms > 150):
        print(num_atoms)
        raise Exception("Too many atoms!")

    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant')

    base_name = os.path.splitext(os.path.basename(file))[0]
    
    output_file = f"ivan-graph-5A/positive/{base_name}_combined_matrix.npy"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    np.save(output_file, combined_matrix)



29
55
51
34
38
22
21
21
45
95
23
25
36
19
65
22
30
24
30
20
30
36
58
55
59
56
58
5
52
18
29
31
56
41
34
54
57
45
50
90
34
46
40
55
77
60
24
24
25
22
33
35
55
52
54
36
26


In [6]:
import numpy as np

# Path to your saved .npy file
file_path = "ivan-graph-5A/positive/4HQJ-filtered_combined_matrix.npy"

matrix = np.load(file_path)

# Confirm type and shape
print("Type:", type(matrix))
print("Shape:", matrix.shape)

# View a small sample
print("First 5 rows:\n", matrix[:5])

Type: <class 'numpy.ndarray'>
Shape: (150, 37)
First 5 rows:
 [[0.23198018 0.29527727 0.23731989 0.         0.06558828 0.14542504
  0.0510653  0.12774413 0.12071827 0.         0.14063832 0.
  0.06919102 0.         0.13207502 0.         0.         0.81198566
  0.14879761 0.         0.         0.08619842 0.         0.16280626
  0.         0.26073687 0.         0.06186458 0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.80861499 0.53721793 0.33779622 0.         0.0714861  0.20541368
  0.06912364 0.14972455 0.15069546 0.         0.20661184 0.
  0.0632363  0.         0.16268444 0.         0.         0.71595322
  0.19129213 0.         0.         0.07164292 0.         0.16400105
  0.         0.6517383  0.         0.05182898 0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.71312743 0.57045407 0.37266872 0.         0.07242136 0.23727995
  0.07851033 0.15563872 0.16110287 0.         0.24715015 0.
  0.06159686 0