In [None]:
import MDAnalysis as mda
import pandas as pd
import numpy as np

# Allow up to 500 characters per printed line so that wide array rows
# printed further down are not wrapped onto several lines.
np.set_printoptions(linewidth=500)

def pdb_to_dataframe(pdb_file):
    """
    Read a structure file with MDAnalysis and tabulate its atoms.

    Parameters:
    pdb_file: Input handed unchanged to ``mda.Universe`` (in this file, a path to a PDB file).

    Returns:
    pd.DataFrame: One row per atom with the columns 'Atom Name', 'Residue Name',
    'Residue ID', 'Chain ID', 'X', 'Y' and 'Z'.
    """
    atoms = mda.Universe(pdb_file).atoms

    # Fetch the coordinate array once and split it into its three columns.
    xyz = atoms.positions

    return pd.DataFrame({
        'Atom Name': atoms.names,
        'Residue Name': atoms.resnames,
        'Residue ID': atoms.resids,
        # The 'Chain ID' column is populated from the atoms' segids.
        'Chain ID': atoms.segids,
        'X': xyz[:, 0],
        'Y': xyz[:, 1],
        'Z': xyz[:, 2],
    })

def compute_inverse_pairwise_distances(df):
    """
    Build the matrix of inverse pairwise Euclidean distances (1/d), capped at 1.

    Every row of ``df`` is treated as one point. Entry (i, j) of the result is
    1 / distance(point i, point j), except that the diagonal is set to 1 and
    no entry is allowed to exceed 1 (so coincident or very close points also
    end up at 1).

    Parameters:
    df (pd.DataFrame): DataFrame containing 'X', 'Y', 'Z' coordinate columns.

    Returns:
    np.ndarray: Square array with one row and one column per row of ``df``.
    """
    xyz = df[['X', 'Y', 'Z']].to_numpy()

    # Broadcasting a column of points against a row of points gives every
    # pairwise coordinate difference at once.
    delta = xyz[:, None, :] - xyz[None, :, :]
    dist = np.sqrt((delta ** 2).sum(axis=-1))

    # Zero distances turn into inf here; silence the warning because those
    # entries are overwritten / capped right below.
    with np.errstate(divide='ignore'):
        inv = 1 / dist

    # Self-distances are defined as 1.
    np.fill_diagonal(inv, 1)

    # Nothing may be larger than 1.
    return np.minimum(inv, 1)

def pdb_to_dataframe(pdb_file):
    """
    Load a structure with MDAnalysis and return its per-atom data as a table.

    NOTE: a function with this same name is also defined earlier in this file;
    because this definition comes later, it is the one bound at run time.

    Parameters:
    pdb_file: Input handed unchanged to ``mda.Universe`` (in this file, a path to a PDB file).

    Returns:
    pd.DataFrame: One row per atom with the columns 'Atom Name', 'Residue Name',
    'Residue ID', 'Chain ID', 'X', 'Y' and 'Z' (in that order).
    """
    universe = mda.Universe(pdb_file)
    selection = universe.atoms

    # Identity columns first; 'Chain ID' is taken from the atoms' segids.
    columns = {
        'Atom Name': selection.names,
        'Residue Name': selection.resnames,
        'Residue ID': selection.resids,
        'Chain ID': selection.segids,
    }

    # Then one column per coordinate axis.
    for axis, label in enumerate(('X', 'Y', 'Z')):
        columns[label] = selection.positions[:, axis]

    return pd.DataFrame(columns)

def one_hot_encoding(pdb_df):
    """
    One-hot encode the 'Atom Name' column against a fixed list of atom names.

    Each row of the result has exactly one 1. Names that are not in the fixed
    list are assigned to the trailing 'UNKNOWN' column, and a message naming
    the atom is printed for each such row.

    Parameters:
    pdb_df (pd.DataFrame): DataFrame containing an 'Atom Name' column.

    Returns:
    np.ndarray: Integer array with one row per row of ``pdb_df`` and one column
    per known atom name plus the final 'UNKNOWN' column.
    """
    carbons = ['C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3',
               'CG', 'CG1', 'CG2', 'CH2', 'CZ', 'CZ2', 'CZ3']
    oxygens = ['O', 'OH', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1']
    nitrogens = ['N', 'NE', 'NE1', 'NE2', 'ND1', 'ND2', 'NZ', 'NH1', 'NH2']
    sulfurs = ['SD', 'SG']

    # Column order: carbons, oxygens, nitrogens, sulfurs, then the catch-all.
    vocabulary = carbons + oxygens + nitrogens + sulfurs + ['UNKNOWN']
    column_of = {name: col for col, name in enumerate(vocabulary)}
    fallback_col = column_of['UNKNOWN']

    encoded = np.zeros((len(pdb_df), len(vocabulary)), dtype=int)

    for row, name in enumerate(pdb_df['Atom Name']):
        recognised = name in column_of
        encoded[row, column_of[name] if recognised else fallback_col] = 1
        if not recognised:
            print(name, "went to unknown column")

    return encoded

def min_max_normalization(matrix):
    """
    Perform Min-Max normalization on a given matrix.

    The minimum and maximum are taken over the whole matrix (not per row or
    per column), so every entry is rescaled with the same offset and scale.

    Parameters:
    matrix (np.ndarray): The input matrix to be normalized.

    Returns:
    np.ndarray: The normalized matrix with values scaled to the range [0, 1].
    If all entries are equal (max == min) a matrix of zeros is returned
    instead of NaNs, and an empty input yields an empty result of the same
    shape.
    """
    matrix = np.asarray(matrix)

    # np.min / np.max raise on zero-size input; there is nothing to scale, so
    # just hand back an empty floating-point array of the same shape.
    if matrix.size == 0:
        return matrix / 1

    # Compute the minimum and maximum values for the matrix
    min_val = np.min(matrix)
    max_val = np.max(matrix)

    value_range = max_val - min_val
    if value_range == 0:
        # Constant matrix: (matrix - min_val) is already all zeros. Dividing
        # by 1 instead of 0 avoids 0/0 = NaN while keeping the usual float
        # result dtype.
        value_range = 1

    # Apply Min-Max normalization formula
    return (matrix - min_val) / value_range

In [None]:
# Sanity check: rebuild the feature matrix for one PDB file and compare it
# with a matrix that was saved to disk earlier.
pdb_file = "../GNN/filtered-pdbs-distinct-5A/positive/6PS5-filtered.pdb"

pdb_df = pdb_to_dataframe(pdb_file)

encoded_matrix = one_hot_encoding(pdb_df)
# The inverse distances are not normalized here since the gat notebook already does that.
inverse_distance = compute_inverse_pairwise_distances(pdb_df)
# Row i holds, per atom-type column, the sum of inverse distances from atom i
# to all atoms of that type.
combined_matrix = inverse_distance @ encoded_matrix
combined_matrix = min_max_normalization(combined_matrix)

# NOTE(review): the reference file below is named after 3NY9 while the PDB
# loaded above is 6PS5 -- confirm that this pairing is intended.
grid_file = np.load("../../../Data/SplitData/Cholesterol/cholesterol-graph-5A/Test/Positive/3NY9-filtered_combined_matrix.npy")

num_atoms, num_features = combined_matrix.shape
max_atoms = 150
print(num_atoms)

# This prints the shape of the freshly computed matrix (before padding), so
# label it as such.
print(np.shape(combined_matrix), "is shape of combined matrix")

# Use the named limit so the check and the padding below cannot drift apart.
if num_atoms > max_atoms:
    print(num_atoms)
    raise ValueError("Too many atoms!")

# Zero-pad the rows up to max_atoms; the feature columns are left untouched.
combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant')

# Compare shapes first: np.allclose would raise on incompatible shapes and
# would silently broadcast compatible-but-different ones.
if grid_file.shape == combined_matrix.shape and np.allclose(grid_file, combined_matrix, atol=1e-5):
    print("The two matrices are the same")
else:
    print("They are not the same")

# Show the first five rows of each matrix (a second [:5] on the result would
# slice rows again and change nothing).
print(grid_file[:5])
print(combined_matrix[:5])

(52, 37) is shape of encoded matrix
52
(52, 37) is shape of grid file
The two matrices are the same
[[0.13295797 0.24828475 0.60506798 0.05598131 0.37776689 0.17782307 0.         0.10747247 0.13378698 0.04486999 0.2307819  0.05926739 0.26892096 0.0477138  0.15196219 0.05443819 0.04390223 0.74639742 0.06166023 0.         0.         0.         0.         0.09326136 0.         0.16059012 0.0566743  0.06423868 0.         0.         0.         0.         0.         0.05266777 0.         0.06807712 0.        ]
 [0.06494222 0.18121871 0.42154027 0.07807241 0.56906514 0.42845842 0.         0.23094505 0.24873357 0.03434439 0.63699175 0.04858898 0.18946446 0.0349975  0.28583588 0.03838085 0.03330956 0.27906659 0.12125059 0.         0.         0.         0.         0.05413393 0.         0.06806461 0.08338914 0.04411912 0.         0.         0.         0.         0.         0.08244442 0.         0.04770548 0.        ]
 [0.06350296 0.16387543 0.3934491  0.07025034 0.68734961 0.27304877 0.         0