In [1]:
import MDAnalysis as mda
import pandas as pd
from biopandas.pdb import PandasPdb
import os
import glob
import re
import math
import numpy as np

def pdb_to_dataframe(pdb_file):
    """
    Read a structure file with MDAnalysis and tabulate its atoms.

    Parameters:
    pdb_file: Path (or any other input accepted by ``mda.Universe``) of the structure.

    Returns:
    pd.DataFrame: One row per atom with the columns 'Atom Name', 'Residue Name',
    'Residue ID', 'Chain ID' (filled from the atoms' segids), 'X', 'Y' and 'Z'.
    """
    atoms = mda.Universe(pdb_file).atoms

    # Per-atom identity columns first, so the column order stays fixed.
    columns = {
        'Atom Name': atoms.names,
        'Residue Name': atoms.resnames,
        'Residue ID': atoms.resids,
        'Chain ID': atoms.segids,
    }

    # Then one column per coordinate axis.
    for axis, label in enumerate(('X', 'Y', 'Z')):
        columns[label] = atoms.positions[:, axis]

    return pd.DataFrame(columns)

def grid_list(atom_df):
    """
    Return the coordinates in ``atom_df`` as a list of (x, y, z) tuples, one per row.

    Reads the 'x_coord', 'y_coord' and 'z_coord' columns.
    """
    xs = atom_df['x_coord']
    ys = atom_df['y_coord']
    zs = atom_df['z_coord']
    return [(x, y, z) for x, y, z in zip(xs, ys, zs)]

def filtering_proteins(atom_df, grid_list, radius=5.0):
    """
    Keep the rows of ``atom_df`` that lie within ``radius`` of at least one grid point.

    Parameters:
    atom_df (pd.DataFrame): Atom table with 'x_coord', 'y_coord' and 'z_coord' columns.
    grid_list (iterable): (x, y, z) points to measure from.
    radius (float): Inclusive cutoff distance, in the same unit as the coordinates.

    Returns:
    pd.DataFrame: The selected rows, in the same order as they appear in ``atom_df``.
    An empty ``grid_list`` yields an empty frame.
    """
    atom_coords = atom_df[['x_coord', 'y_coord', 'z_coord']].values
    radius_sq = radius ** 2

    # A boolean mask (instead of a set of index labels) keeps the original row
    # order; iterating a set returns labels in hash order, which scrambled atoms.
    within_cutoff = np.zeros(len(atom_df), dtype=bool)

    for x, y, z in grid_list:
        # Squared distances avoid a sqrt per atom; compare against radius squared.
        distances_sq = (
            (atom_coords[:, 0] - x) ** 2
            + (atom_coords[:, 1] - y) ** 2
            + (atom_coords[:, 2] - z) ** 2
        )
        within_cutoff |= distances_sq <= radius_sq

    print(f"Total atoms within {radius} Å cutoff: {int(within_cutoff.sum())}")
    return atom_df.loc[within_cutoff]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _atom_name_from_pdb_line(line: str) -> str:
    # PDB atom name field is columns 13-16 (1-indexed) => [12:16] in Python
    return line[12:16].strip()

def reorder_pdb_text_atom_then_hetatm_o1_last(pdb_path: str, inplace: bool = True) -> str:
    """
    Rewrites a PDB so that:
      - All ATOM lines are written before all HETATM lines
      - Within HETATM, any atom with atom_name == 'O1' is last
    Preserves original atom_number and all other fields of each coordinate line.

    Layout of the output file:
      1. every line that precedes the first coordinate record, unchanged;
      2. all ATOM records, in their original relative order;
      3. all HETATM records, with the 'O1' atoms moved to the end;
      4. the TER/END/MODEL/ENDMDL/CONECT/MASTER lines found at or after the
         first coordinate record. Any other non-coordinate line in that region
         is dropped.
    A file without coordinate records is written back unchanged.

    Parameters:
    pdb_path: Path of the PDB file to read.
    inplace: If True, overwrite ``pdb_path``. Otherwise write next to it as
        ``<name>_reordered.pdb``.

    Returns:
    The path that was written.
    """
    coord_prefixes = ("ATOM  ", "HETATM")
    tail_markers = ("TER", "END", "MODEL", "ENDMDL", "CONECT", "MASTER")

    with open(pdb_path, "r") as f:
        lines = f.readlines()

    # A final line without a newline would fuse with its successor once the
    # records are moved around, so terminate every line explicitly.
    lines = [ln if ln.endswith("\n") else ln + "\n" for ln in lines]

    first_coord_idx = next(
        (i for i, ln in enumerate(lines) if ln.startswith(coord_prefixes)),
        None,
    )

    if first_coord_idx is None:
        # No coordinates at all: nothing to reorder, keep the file as it is.
        out_lines = lines
    else:
        pre = lines[:first_coord_idx]
        body = lines[first_coord_idx:]

        atom_lines = [ln for ln in body if ln.startswith("ATOM  ")]
        het_lines = [ln for ln in body if ln.startswith("HETATM")]

        # Only look for tail markers after the header section; taking them from
        # the whole file wrote header lines such as MODEL a second time.
        tail = [
            ln for ln in body
            if not ln.startswith(coord_prefixes) and ln.startswith(tail_markers)
        ]

        # Reorder HETATM so O1 is last (relative order of all others preserved).
        het_o1 = [ln for ln in het_lines if _atom_name_from_pdb_line(ln) == "O1"]
        het_rest = [ln for ln in het_lines if _atom_name_from_pdb_line(ln) != "O1"]

        out_lines = pre + atom_lines + het_rest + het_o1 + tail

    if inplace:
        out_path = pdb_path
    else:
        # Strip only a trailing ".pdb"; str.replace would also rewrite any
        # ".pdb" that occurs inside a directory name.
        root, ext = os.path.splitext(pdb_path)
        out_path = (root if ext == ".pdb" else pdb_path) + "_reordered.pdb"

    with open(out_path, "w") as f:
        f.writelines(out_lines)

    return out_path

def write_ordered_pdb(path: str, atom_df: pd.DataFrame, het_df: pd.DataFrame):
    """
    Write ``atom_df`` and ``het_df`` to ``path`` as the ATOM and HETATM records of a
    PDB file, then reorder that file in place with
    ``reorder_pdb_text_atom_then_hetatm_o1_last``.
    """
    writer = PandasPdb()
    for record_name, table in (("ATOM", atom_df), ("HETATM", het_df)):
        writer.df[record_name] = table

    writer.to_pdb(
        path=path,
        records=["ATOM", "HETATM"],
        gz=False,
        append_newline=True,
    )

    # Post-process the text that was just written.
    reorder_pdb_text_atom_then_hetatm_o1_last(path, inplace=True)

In [3]:
def get_protein_name(filename):
    """
    Return the first four characters of the file's base name, upper-cased, when
    they are all ASCII letters or digits; otherwise return None.
    """
    basename = os.path.basename(filename)  # drop any directory part
    pdb_id = re.match(r'([a-zA-Z0-9]{4})', basename)
    return pdb_id.group(1).upper() if pdb_id else None
def get_mode_index(filename):
    """
    Return the integer that follows 'mode_' in the file's base name, or None when
    the base name contains no such marker.
    """
    basename = os.path.basename(filename)
    found = re.search(r'mode_(\d+)', basename)
    if found is None:
        return None
    return int(found.group(1))

def natural_sort_key(s):
    """
    Build a sort key that orders strings in natural alphanumeric order.

    The string is split into digit runs and the text between them; digit runs
    become ints and the remaining pieces are lower-cased.
    """
    key = []
    for chunk in re.split(r'(\d+)', s):
        key.append(int(chunk) if chunk.isdigit() else chunk.lower())
    return key


In [4]:
from pathlib import Path

# Cell purpose: for every fragment PDB under anton_data/, keep the protein atoms
# that lie within the default cutoff of filtering_proteins (5.0) of any fragment
# atom and write them, together with the fragment, to
# filtered-anton-with-clr-5A/unlabeled/<stem>.pdb.

# Structure that provides the protein atoms (hard-coded absolute path).
protein_file = "/home/aashish/cholestrol/pz2_cholesterol/big_box/last_frame/anton_last_protein.pdb"

# All fragment files, in natural (numeric-aware) filename order.
unlabeled_files = glob.glob("anton_data/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

protein_pdb_df = PandasPdb().read_pdb(protein_file)
protein_pdb_df.df.keys()  # result is discarded; the call does not feed into anything below
protein = protein_pdb_df.df['ATOM']
protein = protein[~protein['atom_name'].str.startswith('H')] # don't use hydrogen: drops every atom whose name starts with 'H'

for unlabeled_file in unlabeled_files:
    path = Path(unlabeled_file)
    stem = path.stem  # file name without directory and extension

    # NOTE(review): `id` shadows the Python builtin of the same name in the kernel namespace.
    id = stem
    fragment_df = PandasPdb().read_pdb(unlabeled_file)
    fragment_df.df.keys()  # result is discarded, as above
    fragment = fragment_df.df['HETATM']

    # The reference points are taken from the fragment BEFORE its 'H...' atoms are
    # removed below, so those atoms still take part in the distance cutoff.
    grid_list_ = grid_list(fragment)

    filtered_atoms = filtering_proteins(protein, grid_list_)
    
    # Fragments with no protein atom inside the cutoff produce no output file.
    if not filtered_atoms.empty:
        fragment = fragment[~fragment['atom_name'].str.startswith('H')]

        filtered_pdb_path = f"filtered-anton-with-clr-5A/unlabeled/{id}.pdb"
        os.makedirs(os.path.dirname(filtered_pdb_path), exist_ok=True)
        write_ordered_pdb(filtered_pdb_path, filtered_atoms, fragment)

Total atoms within 5.0 Å cutoff: 89
Total atoms within 5.0 Å cutoff: 82
Total atoms within 5.0 Å cutoff: 91
Total atoms within 5.0 Å cutoff: 99
Total atoms within 5.0 Å cutoff: 96
Total atoms within 5.0 Å cutoff: 87
Total atoms within 5.0 Å cutoff: 83
Total atoms within 5.0 Å cutoff: 69
Total atoms within 5.0 Å cutoff: 103
Total atoms within 5.0 Å cutoff: 96
Total atoms within 5.0 Å cutoff: 67
Total atoms within 5.0 Å cutoff: 90
Total atoms within 5.0 Å cutoff: 88
Total atoms within 5.0 Å cutoff: 63
Total atoms within 5.0 Å cutoff: 55
Total atoms within 5.0 Å cutoff: 92
Total atoms within 5.0 Å cutoff: 81
Total atoms within 5.0 Å cutoff: 83
Total atoms within 5.0 Å cutoff: 66
Total atoms within 5.0 Å cutoff: 71
Total atoms within 5.0 Å cutoff: 71
Total atoms within 5.0 Å cutoff: 84
Total atoms within 5.0 Å cutoff: 67
Total atoms within 5.0 Å cutoff: 51
Total atoms within 5.0 Å cutoff: 62
Total atoms within 5.0 Å cutoff: 110
Total atoms within 5.0 Å cutoff: 74
Total atoms within 5.0 Å c

In [5]:
import numpy as np

def compute_inverse_pairwise_distances(df):
    """
    Build the capped inverse-distance matrix for the points in ``df``.

    Parameters:
    df (pd.DataFrame): DataFrame containing 'X', 'Y', 'Z' coordinate columns.

    Returns:
    np.ndarray: Square (N, N) array whose entry (i, j) is min(1 / d_ij, 1), where
    d_ij is the Euclidean distance between rows i and j. The diagonal is 1.
    """
    xyz = df[['X', 'Y', 'Z']].values

    # Broadcasting (N, 1, 3) against (1, N, 3) gives every pairwise difference.
    deltas = xyz[:, None, :] - xyz[None, :, :]
    pair_dist = np.sqrt(np.square(deltas).sum(axis=-1))

    # Zero distances divide to inf; the warning is silenced because those
    # entries are overwritten (diagonal) or capped (coincident points) below.
    with np.errstate(divide='ignore'):
        reciprocal = 1 / pair_dist

    np.fill_diagonal(reciprocal, 1)

    # Anything closer than one distance unit is capped at 1.
    return np.minimum(reciprocal, 1)

def pdb_to_dataframe(pdb_file):
    """
    Load a structure with MDAnalysis and return its atoms as a pandas DataFrame.

    NOTE: the notebook's first cell defines a function with this same name and
    body; once this cell runs, this later definition is the one in effect.

    Parameters:
    pdb_file: Input passed straight to ``mda.Universe``.

    Returns:
    pd.DataFrame: One row per atom with columns 'Atom Name', 'Residue Name',
    'Residue ID', 'Chain ID' (the atoms' segids), 'X', 'Y', 'Z'.
    """
    universe = mda.Universe(pdb_file)
    selection = universe.atoms
    xyz = selection.positions  # array indexed [atom, axis]

    return pd.DataFrame({
        'Atom Name': selection.names,
        'Residue Name': selection.resnames,
        'Residue ID': selection.resids,
        'Chain ID': selection.segids,
        'X': xyz[:, 0],
        'Y': xyz[:, 1],
        'Z': xyz[:, 2],
    })

def one_hot_encoding(pdb_df):
    """
    One-hot encode the 'Atom Name' column of ``pdb_df``.

    Parameters:
    pdb_df (pd.DataFrame): Atom table with an 'Atom Name' column.

    Returns:
    np.ndarray: Integer matrix with one row per atom and one column per known
    atom name plus a final 'UNKNOWN' column (65 columns in total). Names are
    matched exactly as given; any name outside the vocabulary is reported on
    stdout and encoded in the 'UNKNOWN' column.
    """
    carbon_names = ['C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3',
                    'CG', 'CG1', 'CG2', 'CH2', 'CZ', 'CZ2', 'CZ3']
    oxygen_names = ['O', 'OH', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1']
    nitrogen_names = ['N', 'NE', 'NE1', 'NE2', 'ND1', 'ND2', 'NZ', 'NH1', 'NH2']
    sulfur_names = ['SD', 'SG']
    # CLR subtypes: C1 .. C27 followed by O1
    clr_names = [f'C{number}' for number in range(1, 28)] + ['O1']

    # Column order: the groups above in sequence, then the catch-all column.
    vocabulary = (carbon_names + oxygen_names + nitrogen_names
                  + sulfur_names + clr_names + ['UNKNOWN'])

    column_of = {name: column for column, name in enumerate(vocabulary)}
    unknown_column = column_of['UNKNOWN']

    encoded = np.zeros((len(pdb_df), len(vocabulary)), dtype=int)
    for row, name in enumerate(pdb_df['Atom Name']):
        column = column_of.get(name)
        if column is None:
            column = unknown_column
            print(name, "went to unknown column")
        encoded[row, column] = 1

    return encoded

def min_max_normalization(matrix):
    """
    Perform Min-Max normalization on a given matrix.

    The minimum and maximum are taken over the whole matrix (not per column).

    Parameters:
    matrix (np.ndarray): The input matrix to be normalized.

    Returns:
    np.ndarray: The normalized matrix with values scaled to the range [0, 1].
    A matrix whose entries are all equal is returned as all zeros, and an empty
    matrix is returned as an empty float array, instead of producing NaNs or
    raising.
    """
    matrix = np.asarray(matrix)

    # np.min/np.max raise on zero-size input; there is nothing to scale anyway.
    if matrix.size == 0:
        return matrix.astype(float)

    min_val = np.min(matrix)
    max_val = np.max(matrix)

    shifted = matrix - min_val
    span = max_val - min_val

    # Constant input: (x - min) is already all zeros and dividing by the zero
    # span would give 0/0 = NaN. Dividing by 1 keeps the zeros and applies the
    # same dtype promotion as the regular path.
    if span == 0:
        return np.true_divide(shifted, 1)

    # Apply Min-Max normalization formula
    return shifted / span

In [6]:
def create_grid(size=20, resolution=1, num_features=65):
    """
    Allocate an empty cubic voxel grid.

    Parameters:
    size: Edge length of the cube.
    resolution: Voxels per unit of ``size``; the grid has ``int(size * resolution)``
        cells along each axis.
    num_features: Length of the feature vector stored in each voxel. The default
        of 65 matches the atom-name one-hot length used elsewhere in this
        notebook (64 names plus 'UNKNOWN').

    Returns:
    np.ndarray: Zero-filled float array of shape (cells, cells, cells, num_features).
    """
    num_cells = int(size * resolution)
    return np.zeros((num_cells, num_cells, num_cells, num_features))

# Function to apply 3D rotation to atomic coordinates
def rotate_dataframe(df, rotation_matrix, origin='centroid', inplace=False):
    """
    Rotate coordinates in a PDB DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        Must have columns 'X','Y','Z' (float Å).
    rotation_matrix : (3,3) array-like
        Proper rotation matrix.
    origin : {'centroid','mean', array-like of shape (3,), None}
        Point about which to rotate. 'centroid' (same as 'mean') subtracts the
        mean of coordinates before rotating, then adds it back. If an array is
        given, rotate about that fixed point. If None, rotate about (0,0,0).
    inplace : bool
        If True, updates df in place and returns df. Otherwise returns a copy.

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If the coordinate columns are missing, ``rotation_matrix`` is not 3x3,
        or ``origin`` is an unrecognised string or not a 3-vector.
    """
    if not {'X','Y','Z'}.issubset(df.columns):
        raise ValueError("DataFrame must contain columns: 'X','Y','Z'.")

    rotation_matrix = np.asarray(rotation_matrix, dtype=float)
    if rotation_matrix.shape != (3, 3):
        raise ValueError("rotation_matrix must have shape (3, 3).")

    # Choose working frame
    out = df if inplace else df.copy()

    # Extract coordinates (N,3)
    coords = out[['X','Y','Z']].to_numpy(dtype=float)

    # Determine rotation origin. Strings are tested with isinstance first:
    # `origin in ('centroid', 'mean')` compares an ndarray origin elementwise
    # and then fails when the array is used as a truth value.
    if origin is None:
        pivot = np.zeros((1, 3), dtype=float)
    elif isinstance(origin, str):
        if origin not in ('centroid', 'mean'):
            raise ValueError(
                f"Unknown origin {origin!r}; expected 'centroid', 'mean', "
                "a point of shape (3,), or None."
            )
        pivot = coords.mean(axis=0, keepdims=True)  # (1,3)
    else:
        pivot = np.asarray(origin, dtype=float).reshape(1, 3)

    # Rotate about pivot: (coords - pivot) @ R^T + pivot
    rotated = (coords - pivot) @ rotation_matrix.T + pivot

    # Write back
    out[['X','Y','Z']] = rotated
    return out

# Generate a random rotation matrix
def generate_random_rotation_matrix():
    """
    Return a random 3x3 rotation matrix.

    Three Euler angles are drawn independently and uniformly from [0, 360)
    degrees with NumPy's global random state (so ``np.random.seed`` controls the
    result) and combined with the 'xyz' axis sequence.

    Returns:
    np.ndarray: Rotation matrix of shape (3, 3).
    """
    # Imported locally: the notebook only imports `Rotation as R` in a later
    # cell, so relying on that global made this function fail with NameError
    # when called on a fresh kernel before that cell had run.
    from scipy.spatial.transform import Rotation

    # NOTE(review): independently uniform Euler angles presumably do not sample
    # orientations uniformly; SciPy documents Rotation.random() for that.
    # Kept as-is so seeded runs stay reproducible; confirm if uniformity matters.
    angles_deg = np.random.uniform(0, 360, size=3)
    return Rotation.from_euler('xyz', angles_deg, degrees=True).as_matrix()

# Atom-name vocabulary for the per-voxel one-hot features. The position of a
# name in this list is its column in the encoded vector.
BIGGEST_SET = [
    # Carbon (C) subtypes
    'C', 'CA', 'CB', 'CD', 'CD1', 'CD2', 'CE', 'CE1', 'CE2', 'CE3',
    'CG', 'CG1', 'CG2', 'CH2', 'CZ', 'CZ2', 'CZ3',

    # Oxygen (O) subtypes
    'O', 'OH', 'OD1', 'OD2', 'OE1', 'OE2', 'OG', 'OG1',

    # Nitrogen (N) subtypes
    'N', 'NE', 'NE1', 'NE2', 'ND1', 'ND2', 'NZ', 'NH1', 'NH2',

    # Sulfur (S) subtypes
    'SD', 'SG',

    # CLR subtypes: C1 .. C27, then O1
    *(f'C{number}' for number in range(1, 28)),
    'O1',

    # Catch-all slot for atom names that are not listed above
    'UNKNOWN',
]
# Name -> column lookup, and the length of one encoded vector.
ATOM_INDEX = dict(zip(BIGGEST_SET, range(len(BIGGEST_SET))))
ATOM_ONEHOT_DIM = len(BIGGEST_SET)  # 65 with the list above

def atom_one_hot_from_name(atom_name: str) -> np.ndarray:
    """
    One-hot encode a single atom name against ``ATOM_INDEX``.

    The name is stripped and upper-cased before the lookup (None and empty
    values are treated as ""). Names that are not in ``ATOM_INDEX`` are reported
    on stdout and encoded in the 'UNKNOWN' slot.

    Returns:
    np.ndarray: Float vector of length ``ATOM_ONEHOT_DIM`` with a single 1.0.
    """
    normalized = (atom_name or "").strip().upper()
    is_known = normalized in ATOM_INDEX
    slot = ATOM_INDEX[normalized] if is_known else ATOM_INDEX['UNKNOWN']

    encoding = np.zeros(ATOM_ONEHOT_DIM, dtype=float)
    encoding[slot] = 1.0

    if not is_known:
        print(atom_name, "went to unknown column")
    return encoding

# Function to perform one-hot encoding for residue types
def encode_residue_type(residue):
    """
    One-hot encode a residue name into one of nine residue groups.

    Returns:
    np.ndarray: Float vector of length 9 with a 1 in the slot of the group that
    contains ``residue``; all zeros when the residue is in none of the groups.
    """
    # The position of each group in this tuple is its slot in the feature vector.
    residue_groups = (
        ('ASP', 'GLU'),
        ('LYS', 'ARG'),
        ('HIS',),
        ('CYS',),
        ('ASN', 'GLN', 'SER', 'THR'),
        ('GLY',),
        ('PRO',),
        ('PHE', 'TYR', 'TRP'),
        ('ALA', 'ILE', 'LEU', 'MET', 'VAL'),
    )

    features = np.zeros(len(residue_groups))
    for slot, members in enumerate(residue_groups):
        if residue in members:
            features[slot] = 1
            break
    return features

def find_nearest_empty(grid: np.ndarray, gc: np.ndarray, G: int, max_radius: int = None):
    """
    Locate an empty voxel close to ``gc``.

    The cell ``gc`` itself is returned when it lies inside the grid and is empty
    (all of its values are zero). Otherwise cubes of growing half-width
    r = 1 .. max_radius around ``gc``, clipped to the grid, are examined. For
    each r only the cells on the boundary of the clipped cube are considered,
    and the empty one with the smallest squared Euclidean distance to ``gc`` is
    returned (the first one met in x, y, z scan order on ties).

    Parameters:
    grid: Voxel array indexed as grid[x, y, z].
    gc: Target cell; its three components are converted with int().
    G: Number of cells along each grid axis.
    max_radius: Largest half-width to try; defaults to ``G``.

    Returns:
    A tuple (x, y, z), or None if no empty voxel is found within max_radius.
    """
    cx, cy, cz = (int(component) for component in gc)
    radius_limit = G if max_radius is None else max_radius

    # The target itself wins if it is a valid, unoccupied cell.
    target_in_grid = 0 <= cx < G and 0 <= cy < G and 0 <= cz < G
    if target_in_grid and not np.any(grid[cx, cy, cz]):
        return (cx, cy, cz)

    for r in range(1, radius_limit + 1):
        lo_x, hi_x = max(0, cx - r), min(G - 1, cx + r)
        lo_y, hi_y = max(0, cy - r), min(G - 1, cy + r)
        lo_z, hi_z = max(0, cz - r), min(G - 1, cz + r)

        winner = None
        winner_d2 = np.inf

        for x in range(lo_x, hi_x + 1):
            x_on_face = x == lo_x or x == hi_x
            for y in range(lo_y, hi_y + 1):
                xy_on_face = x_on_face or y == lo_y or y == hi_y
                for z in range(lo_z, hi_z + 1):
                    # Skip cells strictly inside the (clipped) cube.
                    if not xy_on_face and z != lo_z and z != hi_z:
                        continue
                    # Skip occupied cells.
                    if np.any(grid[x, y, z]):
                        continue
                    d2 = (x - cx) ** 2 + (y - cy) ** 2 + (z - cz) ** 2
                    if d2 < winner_d2:
                        winner_d2 = d2
                        winner = (x, y, z)

        if winner is not None:
            return winner

    return None

# Map atoms to the grid based on their 3D coordinates
def map_atoms_to_grid(df, grid, grid_center, grid_size=20, resolution=1):
    """
    Write one atom-name one-hot vector per atom into ``grid``.

    Coordinates are shifted so that the smallest X, Y and Z become 0, then
    rounded to the nearest integer voxel. If that voxel is outside the grid or
    already occupied, the nearest empty voxel within 2 cells is used instead
    (see ``find_nearest_empty``).

    Parameters:
    df (pd.DataFrame): Atom table with 'X', 'Y', 'Z' and 'Atom Name' columns. Any
        index is accepted; rows are processed in order.
    grid (np.ndarray): Voxel array indexed as grid[x, y, z]; modified in place.
    grid_center: Accepted for interface compatibility; not used.
    grid_size (int): Number of cells along each grid axis.
    resolution: Accepted for interface compatibility; not used.

    Returns:
    np.ndarray: The same ``grid`` object.

    Raises:
    RuntimeError: If an atom cannot be placed in any empty voxel.
    """
    coords = df[['X','Y','Z']].to_numpy(dtype=float)
    if len(coords) == 0:
        # Nothing to place; np.min would raise on the empty array.
        return grid

    # Move the bounding-box corner to the origin and snap to voxel indices.
    shifted = coords - np.min(coords, axis=0)
    voxels = np.rint(shifted).astype(int)

    # Walk rows by position. Looking coordinates up with the DataFrame's index
    # labels only worked for a default 0..N-1 index.
    for label, atom_name, voxel in zip(df.index, df['Atom Name'], voxels):
        cell = tuple(voxel)

        # Try the rounded cell first
        in_bounds = all(0 <= component < grid_size for component in cell)
        if in_bounds and not np.any(grid[cell]):
            target = cell
        else:
            # Find nearest empty voxel
            target = find_nearest_empty(grid, voxel, grid_size, max_radius=2)

        if target is None:
            raise RuntimeError(f"Atom at df index {label} could not be placed (no empty voxel found).")

        grid[target] = atom_one_hot_from_name(atom_name)

    return grid

# Main function to generate multiple rotated grids
def generate_rotated_grids(grid_center, filtered_pdb_path, num_rotations=20, grid_size=35, resolution=1):
    """
    Build ``num_rotations`` voxel grids of one structure, each under a fresh
    random rotation.

    The structure is loaded once with ``pdb_to_dataframe``. For every rotation a
    new grid is created, a random rotation matrix is drawn, the atoms are
    rotated with ``rotate_dataframe`` (default origin) and mapped into the grid.

    Returns:
    list: The filled grids, in generation order.
    """
    atoms = pdb_to_dataframe(filtered_pdb_path)

    def _one_rotated_grid():
        # Fresh grid first, then the random pose, then the voxelisation.
        blank = create_grid(size=grid_size, resolution=resolution)
        pose_matrix = generate_random_rotation_matrix()
        posed_atoms = rotate_dataframe(atoms, pose_matrix)
        return map_atoms_to_grid(posed_atoms, blank, grid_center, grid_size, resolution)

    return [_one_rotated_grid() for _ in range(num_rotations)]
def saving_features(rotated_grids,output_path,protein_name_):
    """
    Save every grid as ``<output_path>/<protein_name_>_grid_<n>.npy``.

    ``output_path`` is created if needed; ``n`` is the position of the grid in
    ``rotated_grids``. A confirmation line is printed per file. Returns None.
    """
    os.makedirs(output_path, exist_ok=True)

    for position, voxels in enumerate(rotated_grids):
        target_file = f'{output_path}/{protein_name_}_grid_{position}.npy'
        np.save(target_file, voxels)
        print(f"Saved rotated grid {position} successfully.")

    return None

In [7]:
# Cell purpose: for every filtered "positive" PDB, build the atom one-hot matrix
# and the capped inverse-distance matrix, then save them in two forms:
#   * cholesterol-separate-anton-clr/positive/<name>_graphs.npy - both matrices in a dict
#   * cholesterol-anton-clr/positive/<name>_combined_matrix.npy - their product,
#     min-max normalised and zero-padded to max_atoms rows
max_atoms = 200  # row count every combined matrix is padded to; larger structures abort the run
output_dir = "cholesterol-separate-anton-clr/positive" 
os.makedirs(output_dir, exist_ok=True)

positive_files = glob.glob("filtered-anton-with-clr-5A/positive/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

for file in positive_files:
    # Reset on every pass because output_dir is re-pointed further down in the loop body.
    output_dir = "cholesterol-separate-anton-clr/positive" 

    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)
    inverse_distance = compute_inverse_pairwise_distances(pdb_df) # don't need to normalize since gat notebook already does that

    # (N, N) inverse distances times (N, 65) one-hot rows -> (N, 65) features.
    combined_matrix = inverse_distance @ encoded_matrix # for gnn
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms = inverse_distance.shape[0]

    # The size check runs after the matrices above have already been computed.
    if num_atoms > max_atoms:
        print(f"{file} has {num_atoms} atoms, exceeding the limit of {max_atoms}")
        raise Exception("Too many atoms!")
    
    # Append all-zero rows so every file yields max_atoms rows; columns are unchanged.
    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant') # padding for gnn

    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(output_dir, f"{base_name}_graphs.npy")

    # NOTE(review): a dict is passed to np.save, so the file presumably holds a
    # pickled object and needs allow_pickle=True when loaded - confirm with the reader.
    np.save(output_path, { # for gat and gcn
        'inverse_distance': inverse_distance,
        'encoded_matrix': encoded_matrix
    })

    # Save the combined matrix to the second output directory
    output_dir = "cholesterol-anton-clr/positive" 
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(output_dir, f"{base_name}_combined_matrix.npy")

    np.save(output_path, combined_matrix)

    print(f"Saved: {output_path}")

In [8]:
# Cell purpose: same processing as the "positive" cell, applied to the filtered
# "unlabeled" PDB files. Outputs per input file:
#   * cholesterol-separate-anton-clr/unlabeled/<name>_graphs.npy - both matrices in a dict
#   * cholesterol-anton-clr/unlabeled/<name>_combined_matrix.npy - their product,
#     min-max normalised and zero-padded to max_atoms rows
max_atoms = 200  # row count every combined matrix is padded to; larger structures abort the run
output_dir = "cholesterol-separate-anton-clr/unlabeled" 
os.makedirs(output_dir, exist_ok=True)

unlabeled_files = glob.glob("filtered-anton-with-clr-5A/unlabeled/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

for file in unlabeled_files:
    # Reset on every pass because output_dir is re-pointed further down in the loop body.
    output_dir = "cholesterol-separate-anton-clr/unlabeled" 

    pdb_df = pdb_to_dataframe(file)
    encoded_matrix = one_hot_encoding(pdb_df)
    inverse_distance = compute_inverse_pairwise_distances(pdb_df) # don't need to normalize since gat notebook already does that

    # (N, N) inverse distances times (N, 65) one-hot rows -> (N, 65) features.
    combined_matrix = inverse_distance @ encoded_matrix # for gnn
    combined_matrix = min_max_normalization(combined_matrix)

    num_atoms = inverse_distance.shape[0]

    # The size check runs after the matrices above have already been computed.
    if num_atoms > max_atoms:
        print(f"{file} has {num_atoms} atoms, exceeding the limit of {max_atoms}")
        raise Exception("Too many atoms!")
    
    # Append all-zero rows so every file yields max_atoms rows; columns are unchanged.
    combined_matrix = np.pad(combined_matrix, ((0, max_atoms - num_atoms), (0, 0)), mode='constant') # padding for gnn

    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(output_dir, f"{base_name}_graphs.npy")

    # NOTE(review): a dict is passed to np.save, so the file presumably holds a
    # pickled object and needs allow_pickle=True when loaded - confirm with the reader.
    np.save(output_path, { # for gat and gcn
        'inverse_distance': inverse_distance,
        'encoded_matrix': encoded_matrix
    })

    # Save the combined matrix to the second output directory
    output_dir = "cholesterol-anton-clr/unlabeled" 
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(output_dir, f"{base_name}_combined_matrix.npy")

    np.save(output_path, combined_matrix)

    print(f"Saved: {output_path}")

Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_1_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_2_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_3_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_4_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_5_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_6_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_7_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_8_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_9_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_10_combined_matrix.npy
Saved: cholesterol-anton-clr/unlabeled/box1_cholersterol_vina_mode_11_combined_matrix.npy
Saved: cholesterol-

In [9]:
from scipy.spatial.transform import Rotation as R

# Cell purpose: voxelise every filtered "positive" PDB under 5 random rotations
# and save each grid to total-anton-5A-clr/positive/<name>_grid_<n>.npy.
positive_files = glob.glob("filtered-anton-with-clr-5A/positive/*.pdb")
positive_files = sorted(positive_files, key=natural_sort_key)

for file in positive_files:
    positive_output_path = "total-anton-5A-clr/positive"

    # Passed through to map_atoms_to_grid, whose body never reads it.
    grid_center = np.array([0, 0, 0])  # Grid center at origin

    # Generate rotated grids (5 rotations); grid_size is left at the function default of 35
    rotated_grids = generate_rotated_grids(grid_center, file, num_rotations=5)
    base_name = os.path.splitext(os.path.basename(file))[0]
    saving_features(rotated_grids,positive_output_path,base_name)

In [10]:
from scipy.spatial.transform import Rotation as R

# Cell purpose: voxelise every filtered "unlabeled" PDB under 5 random rotations
# and save each grid to total-anton-5A-clr/unlabeled/<name>_grid_<n>.npy.
unlabeled_files = glob.glob("filtered-anton-with-clr-5A/unlabeled/*.pdb")
unlabeled_files = sorted(unlabeled_files, key=natural_sort_key)

for file in unlabeled_files:
    unlabeled_output_path = "total-anton-5A-clr/unlabeled"

    # Passed through to map_atoms_to_grid, whose body never reads it.
    grid_center = np.array([0, 0, 0])  # Grid center at origin

    # Generate rotated grids (5 rotations); grid_size is left at the function default of 35
    rotated_grids = generate_rotated_grids(grid_center, file, num_rotations=5)
    base_name = os.path.splitext(os.path.basename(file))[0]
    saving_features(rotated_grids,unlabeled_output_path,base_name)

Saved rotated grid 0 successfully.
Saved rotated grid 1 successfully.
Saved rotated grid 2 successfully.
Saved rotated grid 3 successfully.
Saved rotated grid 4 successfully.
Saved rotated grid 0 successfully.
Saved rotated grid 1 successfully.
Saved rotated grid 2 successfully.
Saved rotated grid 3 successfully.
Saved rotated grid 4 successfully.
Saved rotated grid 0 successfully.
Saved rotated grid 1 successfully.
Saved rotated grid 2 successfully.
Saved rotated grid 3 successfully.
Saved rotated grid 4 successfully.
Saved rotated grid 0 successfully.
Saved rotated grid 1 successfully.
Saved rotated grid 2 successfully.
Saved rotated grid 3 successfully.
Saved rotated grid 4 successfully.
Saved rotated grid 0 successfully.
Saved rotated grid 1 successfully.
Saved rotated grid 2 successfully.
Saved rotated grid 3 successfully.
Saved rotated grid 4 successfully.
Saved rotated grid 0 successfully.
Saved rotated grid 1 successfully.
Saved rotated grid 2 successfully.
Saved rotated grid 3