In [2]:
import MDAnalysis as mda
import pandas as pd
from biopandas.pdb import PandasPdb
import os
import glob
import re
import math
import numpy as np

def pdb_to_dataframe(pdb_file):
    """
    Read a PDB file with MDAnalysis and tabulate per-atom fields in a pandas DataFrame.

    The resulting frame has one row per atom and the columns 'Atom Name',
    'Residue Name', 'Residue ID', 'Chain ID' (filled from the MDAnalysis
    ``segids`` attribute) and 'X', 'Y', 'Z'.
    """
    universe = mda.Universe(pdb_file)
    atoms = universe.atoms

    # Identity columns first, in the order downstream code expects them.
    columns = {
        'Atom Name': atoms.names,
        'Residue Name': atoms.resnames,
        'Residue ID': atoms.resids,
        'Chain ID': atoms.segids,
    }

    # Coordinates are columns 0, 1 and 2 of the positions array.
    for axis, label in enumerate(('X', 'Y', 'Z')):
        columns[label] = atoms.positions[:, axis]

    return pd.DataFrame(columns)

def grid_list(atom_df):
    """Return the rows of ``atom_df`` as a list of (x_coord, y_coord, z_coord) tuples."""
    axes = (atom_df[axis] for axis in ('x_coord', 'y_coord', 'z_coord'))
    return list(zip(*axes))

def filtering_proteins(atom_df, grid_list, radius=5.0):
    """
    Select the rows of ``atom_df`` that lie within ``radius`` of any grid point.

    Parameters:
    atom_df (pd.DataFrame): Atom table with 'x_coord', 'y_coord', 'z_coord' columns.
    grid_list (iterable): (x, y, z) triples; an atom is kept when it is within
        ``radius`` of at least one of them (boundary included).
    radius (float): Cutoff distance, in the same units as the coordinates.

    Returns:
    pd.DataFrame: The selected rows of ``atom_df`` in their original order.
        Keeping the input order (instead of the iteration order of a set of
        index labels) makes the result, and any PDB written from it,
        deterministic and consistent with the source file.
    """
    atom_coords = atom_df[['x_coord', 'y_coord', 'z_coord']].values
    cutoff_sq = radius ** 2  # compare squared distances; no sqrt needed

    # One flag per row; a row is kept once any grid point is close enough.
    within_cutoff = np.zeros(len(atom_df), dtype=bool)

    for x, y, z in grid_list:
        distances_sq = (
            (atom_coords[:, 0] - x) ** 2
            + (atom_coords[:, 1] - y) ** 2
            + (atom_coords[:, 2] - z) ** 2
        )
        within_cutoff |= distances_sq <= cutoff_sq

    # \u00c5 is the angstrom sign; written as an escape so it survives re-encoding.
    print(f"Total atoms within {radius} \u00c5 cutoff: {int(within_cutoff.sum())}")
    return atom_df.loc[within_cutoff]


In [4]:
def get_positive_ligand_atoms(positive_file, protein_name):
    """
    Collect the protein atoms surrounding each cholesterol (CLR) ligand of a structure.

    The file is parsed once with biopandas. Protein atoms are the ATOM records
    whose atom name does not start with 'H'. For every distinct
    (residue_number, chain_id) CLR residue in the HETATM records, the protein
    atoms within the default cutoff of ``filtering_proteins`` around the
    ligand's atoms are gathered. The pocket of the CLR whose centroid is
    nearest to the protein centroid is additionally written to
    ``filtered-pdbs-distinct-5A/positive/{protein_name}-filtered.pdb``.

    Parameters:
    positive_file (str): Path of the PDB file to read.
    protein_name (str): Identifier used in the output file name.

    Returns:
    tuple: ``(protein, all_ligands)`` where ``protein`` is the hydrogen-free
        ATOM table and ``all_ligands`` is a list with one filtered protein
        DataFrame per CLR residue, in file order.

    Raises:
    ValueError: If no CLR residue could be selected (for example the file has
        no CLR HETATM records). Previously this surfaced as an opaque
        ``TypeError`` on ``None``.
    """
    # Parse once: the ATOM and HETATM tables both come from the same file.
    structure = PandasPdb().read_pdb(positive_file)

    protein = structure.df['ATOM']
    protein = protein[~protein['atom_name'].str.startswith('H')]  # don't use hydrogen
    protein_centroid = protein[['x_coord', 'y_coord', 'z_coord']].values.mean(axis=0)
    print(set(protein['chain_id']))
    print(positive_file)

    hetatm = structure.df['HETATM']
    ligand = hetatm[hetatm['residue_name'] == "CLR"]

    # Distinct (residue_number, chain_id) pairs in file order. A set of tuples
    # containing strings iterates in a hash-dependent order that can change
    # between interpreter runs; drop_duplicates keeps the result reproducible.
    clr_ids = list(
        ligand[['residue_number', 'chain_id']]
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )

    # Track the most inward residue: the CLR closest to the protein centroid.
    min_distance = float('inf')
    closest_pocket = None
    all_ligands = []

    for residue_number, chain_id in clr_ids:
        clr_atoms = ligand[
            (ligand['residue_number'] == residue_number)
            & (ligand['chain_id'] == chain_id)
        ]
        if clr_atoms.empty:
            continue

        pocket = filtering_proteins(protein, grid_list(clr_atoms))
        all_ligands.append(pocket)

        clr_centroid = clr_atoms[['x_coord', 'y_coord', 'z_coord']].values.mean(axis=0)
        distance = np.linalg.norm(protein_centroid - clr_centroid)

        if distance < min_distance:
            min_distance = distance
            # Reuse the pocket just computed instead of filtering a second time.
            closest_pocket = pocket

    if closest_pocket is None:
        raise ValueError(
            f"Could not select a CLR ligand in {positive_file} "
            "(no CLR HETATM records, or no protein heavy atoms)"
        )

    # Save to pdb (a copy, so the frame kept in all_ligands stays independent).
    filtered_pdb = PandasPdb()
    filtered_pdb.df['ATOM'] = closest_pocket.copy()
    filtered_pdb_path = f"filtered-pdbs-distinct-5A/positive/{protein_name}-filtered.pdb"
    os.makedirs(os.path.dirname(filtered_pdb_path), exist_ok=True)
    filtered_pdb.to_pdb(path=filtered_pdb_path, records=None, gz=False, append_newline=True)

    return protein, all_ligands


In [5]:
def check_if_unlabeled_is_positive(positive_df, unlabeled_df):
    """
    Decide whether two atom selections overlap enough to be called the same site.

    Each atom is identified by the key
    ``atom_name_residue_name_residue_number_chain_id``. The similarity is the
    number of shared keys divided by the size of the larger key set; it is
    printed, and the function returns True when it is at least 0.2.

    The input frames are NOT modified: keys are built in temporaries so that
    no helper column ends up in a frame the caller later writes to disk.

    Parameters:
    positive_df (pd.DataFrame): Atom table with 'atom_name', 'residue_name',
        'residue_number' and 'chain_id' columns.
    unlabeled_df (pd.DataFrame): Atom table with the same columns.

    Returns:
    bool: True when similarity >= 0.2, False otherwise (including when both
        selections are empty).
    """
    def _atom_keys(df):
        # Build one identifying string per atom without touching ``df``.
        keys = (
            df['atom_name'].str.strip() + '_' +
            df['residue_name'].str.strip() + '_' +
            df['residue_number'].astype(str) + '_' +
            df['chain_id'].fillna('').astype(str)
        )
        return set(keys)

    keys1 = _atom_keys(positive_df)
    keys2 = _atom_keys(unlabeled_df)

    common_atoms = keys1 & keys2
    total_atoms = max(len(keys1), len(keys2))

    if total_atoms == 0:
        print("Zero total atoms")
        return False

    similarity = len(common_atoms) / total_atoms
    print(similarity)
    return similarity >= 0.2

In [3]:
def get_protein_name(filename):
    """
    Return the upper-cased first four alphanumeric characters of the file's
    base name (the PDB ID), or None when the name does not start with four
    such characters.
    """
    stem = os.path.basename(filename)  # directory part is ignored
    pdb_id = re.match(r'([a-zA-Z0-9]{4})', stem)
    return pdb_id.group(1).upper() if pdb_id else None
def get_mode_index(filename):
    """
    Return the integer following the first 'mode_' in the file's base name,
    or None when the base name contains no such marker.
    """
    found = re.search(r'mode_(\d+)', os.path.basename(filename))
    if found is None:
        return None  # or raise ValueError("No mode index found.")
    return int(found.group(1))

def natural_sort_key(s):
    """Sort key for natural alphanumeric order: digit runs compare as ints, the rest as lower-cased text."""
    key = []
    for chunk in re.split(r'(\d+)', s):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


In [None]:

# Walk the unlabeled fragment files in natural order, keep the protein atoms
# around each fragment, and mark a fragment "-positive" when its atom selection
# overlaps one of the CLR pockets of the matching positive structure.
# Both file lists are sorted the same way, so the positive structure only ever
# needs to advance by one when the protein name changes.
positive_files = glob.glob("CLR-PDB/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

unlabeled_files = glob.glob("CLR-Unlabeled-Distinct/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

positive_index = 0
protein, all_lig_filtered = get_positive_ligand_atoms(positive_files[positive_index], get_protein_name(positive_files[positive_index]))

for unlabeled_file in unlabeled_files:
    positive_name = get_protein_name(positive_files[positive_index])
    unlabeled_name = get_protein_name(unlabeled_file)

    # Only used in the output file name; None when the name has no 'mode_<n>'.
    fragment_index = get_mode_index(unlabeled_file)

    if positive_name != unlabeled_name:
        positive_index += 1
        # Running past the last positive file means nothing can match any more.
        if positive_index >= len(positive_files):
            raise Exception("Proteins Not Matching Up!!!")
        positive_name = get_protein_name(positive_files[positive_index])

        if positive_name != unlabeled_name:
            raise Exception("Proteins Not Matching Up!!!")

        protein, all_lig_filtered = get_positive_ligand_atoms(positive_files[positive_index], positive_name)

    fragment_df = PandasPdb().read_pdb(unlabeled_file)
    fragment = fragment_df.df['HETATM']

    grid_list_ = grid_list(fragment)

    filtered_atoms = filtering_proteins(protein, grid_list_)

    if not filtered_atoms.empty:
        # Recomputed for every fragment so a previous iteration's verdict can
        # never leak in; any() stops at the first matching pocket.
        # The helper receives a defensive copy so that any column it may add
        # to its inputs never reaches the frame saved below.
        is_positive = any(
            check_if_unlabeled_is_positive(lig, filtered_atoms.copy())
            for lig in all_lig_filtered
        )

        # Save to pdb
        filtered_pdb = PandasPdb()
        filtered_pdb.df['ATOM'] = filtered_atoms

        if is_positive:
            filtered_pdb_path = f"filtered-pdbs-distinct-5A/unlabeled/{unlabeled_name}-f{fragment_index}-positive.pdb"
        else:
            filtered_pdb_path = f"filtered-pdbs-distinct-5A/unlabeled/{unlabeled_name}-f{fragment_index}.pdb"
        os.makedirs(os.path.dirname(filtered_pdb_path), exist_ok=True)
        filtered_pdb.to_pdb(path=filtered_pdb_path, records=None, gz=False, append_newline=True)

In [4]:
import numpy as np

def compute_inverse_pairwise_distances(df):
    """
    Build the matrix of inverse pairwise Euclidean distances between the rows of ``df``.

    Parameters:
    df (pd.DataFrame): DataFrame containing 'X', 'Y', 'Z' coordinate columns.

    Returns:
    np.ndarray: Square array whose (i, j) entry is min(1 / d_ij, 1), where
        d_ij is the distance between rows i and j; the diagonal is 1.
    """
    xyz = df[['X', 'Y', 'Z']].values

    # Broadcasting: delta[i, j] is the coordinate difference between rows i and j.
    delta = xyz[:, np.newaxis, :] - xyz[np.newaxis, :, :]
    pair_dist = np.sqrt(np.square(delta).sum(axis=-1))

    # Zero distances become inf here; the warning is silenced on purpose
    # because those entries are overwritten / capped right below.
    with np.errstate(divide='ignore'):
        inv = 1 / pair_dist

    np.fill_diagonal(inv, 1)  # self-distances
    return np.minimum(inv, 1)  # cap values at 1

def pdb_to_dataframe(pdb_file):
    """
    Read a PDB file with MDAnalysis and tabulate per-atom fields in a pandas DataFrame.

    The resulting frame has one row per atom and the columns 'Atom Name',
    'Residue Name', 'Residue ID', 'Chain ID' (filled from the MDAnalysis
    ``segids`` attribute) and 'X', 'Y', 'Z'.

    NOTE: a function with this same name is also defined in the notebook's
    first cell; this definition replaces it once this cell runs. It is kept
    here so the graph-building cells can be executed on their own.
    """
    universe = mda.Universe(pdb_file)
    atoms = universe.atoms

    # Identity columns first, in the order downstream code expects them.
    columns = {
        'Atom Name': atoms.names,
        'Residue Name': atoms.resnames,
        'Residue ID': atoms.resids,
        'Chain ID': atoms.segids,
    }

    # Coordinates are columns 0, 1 and 2 of the positions array.
    for axis, label in enumerate(('X', 'Y', 'Z')):
        columns[label] = atoms.positions[:, axis]

    return pd.DataFrame(columns)

def one_hot_encoding(pdb_df):
    """
    One-hot encode the 'Atom Name' column of ``pdb_df``.

    Returns an integer array with one row per atom and one column per known
    atom name, plus a final 'UNKNOWN' column that receives every name not in
    the vocabulary (each such name is also printed).
    """
    carbon = ['C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3',
              'CG', 'CG1', 'CG2', 'CH2', 'CZ', 'CZ2', 'CZ3']
    oxygen = ['O', 'OH', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1']
    nitrogen = ['N', 'NE', 'NE1', 'NE2', 'ND1', 'ND2', 'NZ', 'NH1', 'NH2']
    sulfur = ['SD', 'SG']

    # Column order: carbons, oxygens, nitrogens, sulfurs, then the catch-all.
    vocabulary = carbon + oxygen + nitrogen + sulfur + ['UNKNOWN']
    column_of = {name: col for col, name in enumerate(vocabulary)}
    unknown_col = column_of['UNKNOWN']

    encoded = np.zeros((len(pdb_df), len(vocabulary)), dtype=int)

    for row, atom in enumerate(pdb_df['Atom Name']):
        if atom in column_of:
            encoded[row, column_of[atom]] = 1
            continue
        encoded[row, unknown_col] = 1
        print(atom, "went to unknown column")

    return encoded

def min_max_normalization(matrix):
    """
    Perform Min-Max normalization on a given matrix.

    The global minimum and maximum of the whole matrix are used (not per-row
    or per-column values).

    Parameters:
    matrix (np.ndarray): The input matrix to be normalized.

    Returns:
    np.ndarray: The normalized matrix with values scaled to the range [0, 1].
        A matrix whose entries are all equal has no range to scale by and is
        returned as all zeros (instead of NaN from a 0/0 division); an empty
        matrix is returned as an empty float array of the same shape.
    """
    matrix = np.asarray(matrix)

    # np.min / np.max raise on empty input, so handle it up front.
    if matrix.size == 0:
        return matrix.astype(float)

    # Compute the minimum and maximum values for the matrix
    min_val = np.min(matrix)
    max_val = np.max(matrix)
    value_range = max_val - min_val

    # Constant matrix: avoid dividing by zero.
    if value_range == 0:
        return np.zeros(matrix.shape, dtype=float)

    # Apply Min-Max normalization formula
    return (matrix - min_val) / value_range

In [None]:
# Build one padded feature matrix per filtered positive PDB and save it as .npy.
# NOTE(review): the captured output stored under this cell lists paths in
# "cholesterol-separate-graphs-5A/..." ending in "_graphs.npy", which this code
# does not produce - the stored output appears to come from a different version.
max_atoms = 150  # every saved matrix is zero-padded to this many rows
output_dir = "cholesterol-graph-5A/positive"
os.makedirs(output_dir, exist_ok=True)

positive_files = glob.glob("filtered-pdbs-distinct-5A/positive/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

for file in positive_files:
    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)
    inverse_distance = compute_inverse_pairwise_distances(pdb_df) # don't need to normalize since gat notebook already does that

    # Row i sums the one-hot rows of all atoms weighted by their inverse
    # distance to atom i; the product is then min-max scaled as a whole.
    combined_matrix = inverse_distance @ encoded_matrix # for gnn
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms = inverse_distance.shape[0]

    # Abort the whole run on the first structure that does not fit the padding size.
    if num_atoms > max_atoms:
        print(f"{file} has {num_atoms} atoms, exceeding the limit of {max_atoms}")
        raise Exception("Too many atoms!")
    
    # Append zero rows only (columns untouched) so every matrix has max_atoms rows.
    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant') # padding for gnn

    # Save to file
    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(output_dir, f"{base_name}_combined_matrix.npy")

    np.save(output_path, combined_matrix)

    print(f"Saved: {output_path}")

Saved: cholesterol-separate-graphs-5A/positive/1LRI-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/1N83-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/1ZHY-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/2RH1-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/2ZXE-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3A3Y-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3AM6-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3D4S-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3GKI-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3N9Y-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3NY8-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3NY9-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3NYA-filtered_graphs.npy
Saved: cholesterol-separate-graphs-5A/positive/3WGU-filtered_gra

In [None]:
# Build one padded feature matrix per filtered unlabeled PDB and save it as .npy.
# Same pipeline as the positive cell, only the input glob and output_dir differ.
# NOTE(review): the captured output stored under this cell lists paths in
# "cholesterol-separate-graphs-5A/..." ending in "_graphs.npy", which this code
# does not produce - the stored output appears to come from a different version.
max_atoms = 150  # every saved matrix is zero-padded to this many rows
output_dir = "cholesterol-graph-5A/unlabeled"
os.makedirs(output_dir, exist_ok=True)

unlabeled_files = glob.glob("filtered-pdbs-distinct-5A/unlabeled/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

for file in unlabeled_files:
    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)
    inverse_distance = compute_inverse_pairwise_distances(pdb_df) # don't need to normalize since gat notebook already does that

    # Row i sums the one-hot rows of all atoms weighted by their inverse
    # distance to atom i; the product is then min-max scaled as a whole.
    combined_matrix = inverse_distance @ encoded_matrix # for gnn
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms = inverse_distance.shape[0]

    # Abort the whole run on the first structure that does not fit the padding size.
    if num_atoms > max_atoms:
        print(f"{file} has {num_atoms} atoms, exceeding the limit of {max_atoms}")
        raise Exception("Too many atoms!")
    
    # Append zero rows only (columns untouched) so every matrix has max_atoms rows.
    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant') # padding for gnn

    # Save to file
    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(output_dir, f"{base_name}_combined_matrix.npy")

    np.save(output_path, combined_matrix)

    print(f"Saved: {output_path}")


Saved: cholesterol-separate-graphs-5A/unlabeled/1LRI-f1_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1LRI-f4_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1N83-f1_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1N83-f2_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1N83-f4_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1ZHY-f1_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1ZHY-f2_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1ZHY-f3_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1ZHY-f4_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/1ZHY-f5_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/2RH1-f1_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/2RH1-f2-positive_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/2RH1-f3-positive_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabeled/2RH1-f4_graphs.npy
Saved: cholesterol-separate-graphs-5A/unlabe