In [29]:
import MDAnalysis as mda
import pandas as pd
from biopandas.pdb import PandasPdb
import os
import glob
import re
import math
import numpy as np

def pdb_to_dataframe(pdb_file):
    """
    Read a PDB file with MDAnalysis and tabulate its atoms.

    Returns a pandas DataFrame with one row per atom and the columns
    'Atom Name', 'Residue Name', 'Residue ID', 'Chain ID' (filled from the
    MDAnalysis segids), 'X', 'Y' and 'Z'.
    """
    atoms = mda.Universe(pdb_file).atoms

    # Per-atom identifiers first, in the column order the table should have.
    columns = {
        'Atom Name': atoms.names,
        'Residue Name': atoms.resnames,
        'Residue ID': atoms.resids,
        'Chain ID': atoms.segids,
    }
    # Then one coordinate column per axis of the positions array.
    for axis, label in enumerate(('X', 'Y', 'Z')):
        columns[label] = atoms.positions[:, axis]

    return pd.DataFrame(columns)

def grid_list(atom_df):
    """Return the rows of `atom_df` as a list of (x_coord, y_coord, z_coord) tuples."""
    xs, ys, zs = (atom_df[axis] for axis in ('x_coord', 'y_coord', 'z_coord'))
    return [(x, y, z) for x, y, z in zip(xs, ys, zs)]

def filtering_proteins(atom_df, grid_list, radius=5.0):
    """
    Select the rows of `atom_df` that lie within `radius` of at least one grid point.

    Parameters:
    atom_df (pd.DataFrame): Atom table with 'x_coord', 'y_coord' and 'z_coord' columns.
    grid_list (iterable): (x, y, z) points to measure from. Note that this parameter
        shadows the module-level grid_list() helper inside this function.
    radius (float): Inclusive distance cutoff, in the same units as the coordinates.

    Returns:
    pd.DataFrame: An independent copy of the matching rows, in the same order as they
    appear in `atom_df` (empty when nothing matches or `grid_list` is empty).
    """
    atom_coords = atom_df[['x_coord', 'y_coord', 'z_coord']].to_numpy()

    # One boolean flag per row; a row is kept as soon as any grid point is close enough.
    # A positional mask (instead of a set of index labels) keeps the source row order.
    within = np.zeros(len(atom_df), dtype=bool)
    cutoff_sq = radius ** 2  # compare squared distances, so no sqrt is needed

    for x, y, z in grid_list:
        distances_sq = (
            (atom_coords[:, 0] - x) ** 2
            + (atom_coords[:, 1] - y) ** 2
            + (atom_coords[:, 2] - z) ** 2
        )
        within |= distances_sq <= cutoff_sq

    print(f"Total atoms within {radius} Å cutoff: {int(within.sum())}")
    # .copy() so callers can add columns to the result without touching atom_df
    return atom_df.loc[within].copy()


In [30]:
def get_positive_ligand_atoms(positive_file, protein_name):
    """
    Extract the protein atoms surrounding each CLR ligand of a PDB file.

    The file is parsed with biopandas. Hydrogens (ATOM rows whose atom name starts
    with 'H') are dropped. For every distinct (residue_number, chain_id) among the
    HETATM rows named "CLR", the protein atoms near that ligand are selected with
    filtering_proteins() at its default radius. The selection belonging to the ligand
    whose centroid is closest to the protein centroid is written to
    "filtered-pdbs-distinct-5A/positive/{protein_name}-filtered.pdb".

    Parameters:
    positive_file (str): Path of the PDB file to read.
    protein_name (str): Name used to build the output file name.

    Returns:
    tuple: (protein, all_ligands) where `protein` is the hydrogen-free ATOM table and
    `all_ligands` is a list with one filtered protein-atom DataFrame per CLR ligand.

    Raises:
    ValueError: If no CLR ligand can be selected from the file.
    """
    coord_cols = ['x_coord', 'y_coord', 'z_coord']

    # Parse the file once; ATOM and HETATM records come from the same structure.
    structure = PandasPdb().read_pdb(positive_file)

    protein = structure.df['ATOM']
    protein = protein[~protein['atom_name'].str.startswith('H')]  # don't use hydrogen
    protein_centroid = protein[coord_cols].values.mean(axis=0)
    print(set(protein['chain_id']))
    print(positive_file)

    ligand = structure.df['HETATM']
    ligand = ligand[ligand['residue_name'] == "CLR"]

    # Distinct ligand identifiers in file order, so the result order is reproducible
    # (iterating a set of tuples containing strings is not stable between runs).
    ligand_ids = ligand[['residue_number', 'chain_id']].drop_duplicates()

    # Track the most inward ligand: the one whose centroid is nearest the protein centroid.
    min_distance = float('inf')
    closest_pocket = None
    all_ligands = []

    for residue_number, chain_id in ligand_ids.itertuples(index=False, name=None):
        clr_atoms = ligand[(ligand['residue_number'] == residue_number) & (ligand['chain_id'] == chain_id)]
        if clr_atoms.empty:
            continue

        clr_centroid = clr_atoms[coord_cols].values.mean(axis=0)
        distance = np.linalg.norm(protein_centroid - clr_centroid)

        pocket = filtering_proteins(protein, grid_list(clr_atoms))
        all_ligands.append(pocket)

        if distance < min_distance:
            min_distance = distance
            # Reuse this selection for the saved file instead of filtering a second time.
            closest_pocket = pocket

    if closest_pocket is None:
        raise ValueError(f"No CLR ligand could be selected from {positive_file}")

    # Save to pdb
    filtered_pdb = PandasPdb()
    filtered_pdb.df['ATOM'] = closest_pocket
    filtered_pdb_path = f"filtered-pdbs-distinct-5A/positive/{protein_name}-filtered.pdb"
    os.makedirs(os.path.dirname(filtered_pdb_path), exist_ok=True)
    filtered_pdb.to_pdb(path=filtered_pdb_path, records=None, gz=False, append_newline=True)

    return protein, all_ligands


In [31]:
def check_if_unlabeled_is_positive(positive_df, unlabeled_df, threshold=0.2):
    """
    Decide whether two atom selections overlap enough to count as the same site.

    Each atom is identified by the key "atom_name_residue_name_residue_number_chain_id"
    (names stripped of whitespace, missing chain ids treated as ''). The similarity is
    the number of shared keys divided by the size of the larger key set.

    Parameters:
    positive_df (pd.DataFrame): Atoms around a known ligand.
    unlabeled_df (pd.DataFrame): Atoms around the candidate fragment.
        Both need 'atom_name', 'residue_name', 'residue_number' and 'chain_id' columns.
    threshold (float): Minimum similarity (inclusive) to return True. Defaults to 0.2.

    Returns:
    bool: True when similarity >= threshold; False otherwise, including when both
    selections are empty. Neither input frame is modified.
    """
    def _atom_keys(atom_df):
        # Build the identifying key per atom without writing a column into atom_df.
        keys = (
            atom_df['atom_name'].str.strip() + '_' +
            atom_df['residue_name'].str.strip() + '_' +
            atom_df['residue_number'].astype(str) + '_' +
            atom_df['chain_id'].fillna('')
        )
        return set(keys)

    keys1 = _atom_keys(positive_df)
    keys2 = _atom_keys(unlabeled_df)

    common_atoms = keys1 & keys2
    total_atoms = max(len(keys1), len(keys2))

    if total_atoms == 0:
        print("Zero total atoms")
        return False

    similarity = len(common_atoms) / total_atoms
    print(similarity)
    return similarity >= threshold

In [None]:
def get_protein_name(filename):
    """
    Return the upper-cased first four alphanumeric characters of the file's base name,
    or None when the base name does not start with four such characters.
    """
    found = re.match(r'([a-zA-Z0-9]{4})', os.path.basename(filename))
    return found.group(1).upper() if found else None
def get_mode_index(filename):
    """
    Return the integer that follows the first 'mode_' in the file's base name,
    or None when the base name contains no 'mode_<digits>' part.
    """
    found = re.search(r'mode_(\d+)', os.path.basename(filename))
    if found is None:
        return None
    return int(found.group(1))

def natural_sort_key(s):
    """Sort key that orders strings naturally: digit runs compare as integers, the rest case-insensitively."""
    key = []
    for piece in re.split(r'(\d+)', s):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece.lower())
    return key

# Input structures, both lists in natural (numeric-aware) order so that files
# sharing a protein name are adjacent and appear in the same sequence in each list.
positive_files = glob.glob("CLR-PDB/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

unlabeled_files = glob.glob("CLR-Unlabeled-Distinct/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

positive_index = 0
protein, all_lig_filtered = get_positive_ligand_atoms(positive_files[positive_index], get_protein_name(positive_files[positive_index]))

for unlabeled_file in unlabeled_files:
    positive_name = get_protein_name(positive_files[positive_index])
    unlabeled_name = get_protein_name(unlabeled_file)

    fragment_index = get_mode_index(unlabeled_file)

    if positive_name != unlabeled_name:
        # A new protein name means the fragments of the current positive are done:
        # step to the next positive file, which must carry the same name.
        positive_index += 1
        if positive_index >= len(positive_files):
            raise Exception("Proteins Not Matching Up!!!")
        positive_name = get_protein_name(positive_files[positive_index])

        if positive_name != unlabeled_name:
            raise Exception("Proteins Not Matching Up!!!")

        protein, all_lig_filtered = get_positive_ligand_atoms(positive_files[positive_index], positive_name)

    fragment_df = PandasPdb().read_pdb(unlabeled_file)
    fragment = fragment_df.df['HETATM']

    grid_list_ = grid_list(fragment)

    filtered_atoms = filtering_proteins(protein, grid_list_)

    if not filtered_atoms.empty:
        # any() stops at the first matching ligand pocket and is False when there are
        # no pockets at all, so the label never leaks over from a previous fragment.
        is_positive = any(
            check_if_unlabeled_is_positive(lig, filtered_atoms)
            for lig in all_lig_filtered
        )

        # Save to pdb
        filtered_pdb = PandasPdb()
        # If the similarity check added its 'atom_key' helper column to this frame,
        # leave it out of the records handed to biopandas.
        filtered_pdb.df['ATOM'] = filtered_atoms.drop(columns=['atom_key'], errors='ignore')

        if is_positive:
            filtered_pdb_path = f"filtered-pdbs-distinct-5A/unlabeled/{unlabeled_name}-f{fragment_index}-positive.pdb"
        else:
            filtered_pdb_path = f"filtered-pdbs-distinct-5A/unlabeled/{unlabeled_name}-f{fragment_index}.pdb"
        os.makedirs(os.path.dirname(filtered_pdb_path), exist_ok=True)
        filtered_pdb.to_pdb(path=filtered_pdb_path, records=None, gz=False, append_newline=True)

In [None]:
import numpy as np

def compute_inverse_pairwise_distances(df):
    """
    Build the capped inverse-distance matrix for the rows of `df`.

    Parameters:
    df (pd.DataFrame): DataFrame with 'X', 'Y' and 'Z' coordinate columns.

    Returns:
    np.ndarray: Square array whose entry (i, j) is min(1 / d_ij, 1), where d_ij is the
    Euclidean distance between rows i and j. Diagonal entries are 1.
    """
    xyz = df[['X', 'Y', 'Z']].values

    # Broadcast every row against every other row: offsets[i, j] = xyz[i] - xyz[j]
    offsets = xyz[:, None, :] - xyz[None, :, :]
    pair_dist = np.sqrt((offsets ** 2).sum(axis=-1))

    # Zero distances produce inf here (warning silenced); the cap below maps them to 1
    with np.errstate(divide='ignore'):
        reciprocal = 1 / pair_dist

    # Cap values at 1, then pin the self-distance entries to 1
    capped = np.minimum(reciprocal, 1)
    np.fill_diagonal(capped, 1)

    return capped

def pdb_to_dataframe(pdb_file):
    """
    Build a per-atom pandas DataFrame from a PDB file parsed by MDAnalysis.

    Columns, in order: 'Atom Name', 'Residue Name', 'Residue ID',
    'Chain ID' (the MDAnalysis segids), 'X', 'Y', 'Z'.
    """
    # NOTE: this notebook also defines pdb_to_dataframe in an earlier cell.
    universe = mda.Universe(pdb_file)
    selection = universe.atoms
    xyz = selection.positions  # columns 0, 1, 2 become X, Y, Z below

    labels = ('Atom Name', 'Residue Name', 'Residue ID', 'Chain ID', 'X', 'Y', 'Z')
    values = (
        selection.names,
        selection.resnames,
        selection.resids,
        selection.segids,
        xyz[:, 0],
        xyz[:, 1],
        xyz[:, 2],
    )

    return pd.DataFrame(dict(zip(labels, values)))

def one_hot_encoding(pdb_df):
    """
    One-hot encode the 'Atom Name' column of `pdb_df`.

    Returns an integer array with one row per atom and one column per known atom
    name, plus a final 'UNKNOWN' column that catches every other name (each such
    name is also printed).
    """
    carbon = ['C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3',
              'CG', 'CG1', 'CG2', 'CH2', 'CZ', 'CZ2', 'CZ3']
    oxygen = ['O', 'OH', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1']
    nitrogen = ['N', 'NE', 'NE1', 'NE2', 'ND1', 'ND2', 'NZ', 'NH1', 'NH2']
    sulfur = ['SD', 'SG']

    # Column order: carbons, oxygens, nitrogens, sulfurs, then the catch-all.
    vocabulary = carbon + oxygen + nitrogen + sulfur + ['UNKNOWN']
    column_of = dict(zip(vocabulary, range(len(vocabulary))))
    unknown_col = column_of['UNKNOWN']

    encoded = np.zeros((len(pdb_df), len(vocabulary)), dtype=int)

    for row, atom_name in enumerate(pdb_df['Atom Name']):
        col = column_of.get(atom_name)
        if col is None:
            col = unknown_col
            print(atom_name, "went to unknown column")
        encoded[row, col] = 1

    return encoded

def min_max_normalization(matrix):
    """
    Perform Min-Max normalization on a given matrix.

    The minimum and maximum are taken over the whole matrix (not per column).

    Parameters:
    matrix (np.ndarray): The input matrix to be normalized.

    Returns:
    np.ndarray: The normalized matrix with values scaled to the range [0, 1].
    A matrix whose values are all equal is returned as all zeros, and an empty
    matrix is returned as an empty float array of the same shape.
    """
    matrix = np.asarray(matrix)

    # np.min / np.max raise on empty input; there is nothing to scale anyway.
    if matrix.size == 0:
        return matrix.astype(float)

    min_val = np.min(matrix)
    max_val = np.max(matrix)
    value_range = max_val - min_val

    # A constant matrix has zero range; dividing would fill the result with NaN.
    if value_range == 0:
        return np.zeros(matrix.shape, dtype=float)

    # Apply Min-Max normalization formula
    return (matrix - min_val) / value_range

In [34]:
positive_files = glob.glob("filtered-pdbs-distinct-5A/positive/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

# Row count every saved matrix is zero-padded to. The size guard below uses the same
# value, so changing it here cannot leave np.pad with a negative pad width.
max_atoms = 150

for file in positive_files:
    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)

    inverse_distance = compute_inverse_pairwise_distances(pdb_df)

    # Weight each atom's one-hot type by capped inverse distance to every atom,
    # then rescale the whole matrix to [0, 1].
    combined_matrix = inverse_distance @ encoded_matrix
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms, num_features = combined_matrix.shape
    print(num_atoms)

    if num_atoms > max_atoms:
        print(num_atoms)
        raise Exception("Too many atoms!")

    # Pad with zero rows at the bottom so every file yields a (max_atoms, num_features) array.
    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant')

    base_name = os.path.splitext(os.path.basename(file))[0]

    output_file = f"cholesterol-graph-5A/positive/{base_name}_combined_matrix.npy"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    np.save(output_file, combined_matrix)


71
75
77
53
40
38
40
52
99
71
53
52
49
48
52
124
129
45
42
59
42
40
42
32
40
65
59
41
39
45
39
39
47
40
38
53
42
44
47
43
35
48
42
43
44
44
42
44
42
44
46
44
42
42
42
44
41
41
41
41
41
43
52
56
54
20
22
20
20
20
47
47
45
45
46
18
45
42
39
39
38
84
20
46
44
47
37
57
73
43
41
47
43
44
45
43
45
40
120
38
42
81
59
28
53
50
74
72
76
38
34
47
43
54
55
42
43
33
40
47
30
35
35
42
31
32
45
44
41
46
51
58
31
39
38
41
26
28
31
45
37
36
39
36
37
21
29
29
50
25
65
53
27
19
28
21
33
62
53
53
54
54
55
50
56
51
39
34
43
26
28
26
36
37
35
78
85
38
31
84
45
73
58
52
58
53
31
25
86
30
30
25
24
40
23
28
9
26
47
73
47
42
21
43
26
28
28
46
30
48
28
39
35
35
34
36
31
24
30
54
56
56
19
24
44
29
44
22
37
20
24
19
27
33
19
30
35
35
41
33
50
39
33
48
30
23
59
40
33
12
23
57
58
32
45
28
55
65
38
32
31
40
36
31
9
29
24
39
73
46
46
30
76
41
47
33
28
28
41
42
26
37
39
55
56
30
42
32
30
35
42
86
23
34
41
71
25
26
44
37
53
64
58
57
41
30
35
33
35
34
52
47
36
37
29
51
61
41
41
40
39
39
37
30
19
33
46
43
39
55
84
43
58


In [35]:
unlabeled_files = glob.glob("filtered-pdbs-distinct-5A/unlabeled/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

# Row count every saved matrix is zero-padded to. The size guard below uses the same
# value, so changing it here cannot leave np.pad with a negative pad width.
max_atoms = 150

for file in unlabeled_files:
    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)

    inverse_distance = compute_inverse_pairwise_distances(pdb_df)

    # Weight each atom's one-hot type by capped inverse distance to every atom,
    # then rescale the whole matrix to [0, 1].
    combined_matrix = inverse_distance @ encoded_matrix
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms, num_features = combined_matrix.shape
    print(num_atoms)

    if num_atoms > max_atoms:
        print(num_atoms)
        raise Exception("Too many atoms!")

    # Pad with zero rows at the bottom so every file yields a (max_atoms, num_features) array.
    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant')

    base_name = os.path.splitext(os.path.basename(file))[0]

    output_file = f"cholesterol-graph-5A/unlabeled/{base_name}_combined_matrix.npy"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    np.save(output_file, combined_matrix)


70
45
75
67
54
75
78
111
88
79
90
34
21
86
68
72
55
85
88
87
97
84
69
66
84
77
62
83
103
61
83
67
108
60
72
59
70
93
75
91
96
66
98
68
87
95
46
44
46
71
54
72
65
66
77
69
68
71
67
58
87
90
75
108
86
84
70
67
74
73
86
86
74
118
87
27
88
85
84
55
27
74
81
67
81
22
77
70
41
78
86
98
47
51
99
70
84
55
35
110
110
78
48
76
46
85
72
52
95
72
32
109
20
47
77
76
42
66
69
43
77
34
70
111
79
42
74
73
26
46
80
79
40
81
96
86
75
72
92
96
66
67
90
56
92
43
75
80
59
75
38
56
42
77
59
50
83
54
77
59
62
99
61
49
101
50
51
90
70
84
96
53
97
59
50
54
50
91
69
36
72
85
56
59
72
43
84
55
54
95
59
134
31
98
129
23
59
86
18
35
64
94
84
92
107
91
79
70
93
72
97
103
102
79
102
87
90
75
80
3
88
109
1
67
124
4
103
104
93
4
116
99
96
87
64
96
37
81
37
81
60
81
83
84
91
65
84
66
76
98
77
82
96
83
74
80
69
67
71
71
69
43
84
76
86
49
37
75
72
85
33
73
77
3
87
41
86
101
68
99
39
84
78
52
85
99
70
101
88
91
69
84
12
31
58
93
96
52
88
8
35
48
102
71
5
2
89
85
76
56
89
70
85
84
78
107
90
79
88
92
70
97
121
75
73
84
67
1