<a href="https://colab.research.google.com/github/Jeffateth/AllergenPredict/blob/main/3Ddescriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas mdtraj biopython
#!pip install git+https://github.com/openmm/pdbfixer.git



In [2]:
!rm analyze_protein.py
!touch analyze_protein.py

In [3]:
%%writefile analyze_protein.py
#!/usr/bin/env python3
"""
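Compute global 3D descriptors for a protein structure (PDB).

Descriptors: total SASA, radius of gyration, compactness, contact order,
and secondary structure content (helix/strand/coil fractions). A CA-based
contact map is computed as well.

Outputs, written to the current working directory (<name> is the input
file name without its extension):
    <name>_3d_descriptors.csv, <name>_contact_map.npy, <name>_contact_map.png

Usage:
    python analyze_protein.py <input.pdb>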
"""

import importlib
import sys
import os
import subprocess


def install(pkg):
    """Install *pkg* with pip, using the interpreter that runs this script.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
    # Make the freshly installed package visible to the running interpreter.
    importlib.invalidate_caches()


# Auto-install required packages. This has to happen before any third-party
# import below; otherwise a missing package raises ImportError before it can
# be installed.
for pkg in ("numpy", "pandas", "mdtraj", "matplotlib"):
    try:
        __import__(pkg)
    except ImportError:
        install(pkg)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import mdtraj as md

def compute_contact_map_matrix(traj, cutoff=0.8):
    """Build a binary contact map from CA-CA distances.

    Only atoms named 'CA' are considered and only the first frame of the
    distances returned by mdtraj is used.

    Args:
        traj: mdtraj trajectory holding the structure.
        cutoff: a pair is in contact when its distance is strictly below this
            value. Presumably in nm, the unit mdtraj reports distances in —
            confirm against the mdtraj documentation.

    Returns:
        Symmetric ``(n, n)`` integer array of 0/1 values with a zero diagonal,
        where ``n`` is the number of CA atoms. For ``n < 2`` the all-zero
        array is returned without querying mdtraj.
    """
    atom_indices = [a.index for a in traj.topology.atoms if a.name == 'CA']
    n = len(atom_indices)
    contact_map = np.zeros((n, n), dtype=int)
    if n < 2:
        # No pairs exist; avoid handing mdtraj an empty pair list.
        return contact_map

    ca_traj = traj.atom_slice(atom_indices)
    # Upper-triangle index pairs (i < j), in the same row-major order as a
    # nested "for i / for j > i" loop.
    rows, cols = np.triu_indices(n, k=1)
    pairs = np.column_stack((rows, cols))
    distances, _ = md.compute_contacts(ca_traj, contacts=pairs, scheme='ca')

    close = distances[0] < cutoff
    # Fill both triangles so the map is symmetric.
    contact_map[rows[close], cols[close]] = 1
    contact_map[cols[close], rows[close]] = 1
    return contact_map

def compute_global_descriptors(traj, cutoff=0.8):
    """Compute whole-structure descriptors and the CA contact map.

    Only the first frame of each mdtraj result is used.

    Args:
        traj: mdtraj trajectory holding the structure.
        cutoff: contact threshold forwarded to ``compute_contact_map_matrix``
            (default 0.8, the value previously hard-coded here).

    Returns:
        ``(descriptors, cmap)`` where ``descriptors`` is a dict with keys
        'Total_SASA', 'Radius_of_Gyration', 'Compactness', 'Contact_Order',
        'SS_Helix', 'SS_Strand' and 'SS_Coil', and ``cmap`` is the binary
        contact map.
    """
    sasa = md.shrake_rupley(traj)[0].sum() * 100  # Å²
    rg = md.compute_rg(traj)[0] * 10  # convert to Å
    n_residues = traj.n_residues
    # Residues per Å of radius of gyration; 0 when rg is not positive.
    compactness = n_residues / rg if rg > 0 else 0

    try:
        sec = md.compute_dssp(traj)[0]
        # Fraction of residues assigned to each DSSP code.
        ss_content = {
            code: np.count_nonzero(sec == code) / n_residues
            for code in ('H', 'E', 'C')
        }
    except Exception:
        # Best effort: secondary structure is optional, so fall back to zeros
        # instead of aborting the whole analysis.
        print("Warning: DSSP not found or failed. Skipping secondary structure content.")
        ss_content = {'H': 0, 'E': 0, 'C': 0}

    cmap = compute_contact_map_matrix(traj, cutoff=cutoff)
    # Each contact is counted once, from the upper triangle (row < col).
    rows, cols = np.nonzero(np.triu(cmap, k=1))
    n_contacts = len(rows)
    if n_contacts == 0:
        contact_order = 0
    else:
        seq_sep = cols - rows
        # NOTE(review): n_residues counts every residue in the topology, while
        # the contact map is indexed by CA atoms; the two can differ when the
        # structure contains residues without a CA atom — confirm this is the
        # intended normalisation.
        contact_order = np.sum(seq_sep) / (n_residues * n_contacts)

    return {
        'Total_SASA': sasa,
        'Radius_of_Gyration': rg,
        'Compactness': compactness,
        'Contact_Order': contact_order,
        'SS_Helix': ss_content.get('H', 0),
        'SS_Strand': ss_content.get('E', 0),
        'SS_Coil': ss_content.get('C', 0)
    }, cmap

def plot_contact_map(cmap, output):
    """Draw the contact map as a greyscale image and save it to *output*.

    Args:
        cmap: 2D array to display (the 0/1 contact matrix).
        output: path of the image file to write (saved at 300 dpi).
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    image = ax.imshow(cmap, cmap='Greys', origin='lower')
    ax.set_title("Contact Map")
    ax.set_xlabel("Residue Index")
    ax.set_ylabel("Residue Index")
    fig.colorbar(image, ax=ax, label="Contact (0/1)")
    fig.tight_layout()
    fig.savefig(output, dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def main(pdb_path):
    """Command-line entry point: analyse *pdb_path* and report the files written.

    Delegates the actual work to ``analyze_global`` so that the CLI and the
    programmatic path cannot drift apart.
    """
    base = os.path.splitext(os.path.basename(pdb_path))[0]
    print(f"→ Processing: {pdb_path}")
    analyze_global(pdb_path)
    print(f"→ Saved '{base}_3d_descriptors.csv', '{base}_contact_map.npy', and '{base}_contact_map.png'.")

def analyze_global(pdb_path):
    """Run the global analysis on a PDB file and persist the results.

    Output files are named after the input file's base name (extension
    stripped) and written relative to the current working directory:
    ``<base>_3d_descriptors.csv``, ``<base>_contact_map.npy`` and
    ``<base>_contact_map.png``.

    Args:
        pdb_path: path to the structure file, loaded with ``md.load``.

    Returns:
        ``(global_desc, cmap)``: the descriptor dict and the contact map.
    """
    traj = md.load(pdb_path)
    global_desc, cmap = compute_global_descriptors(traj)

    base = os.path.splitext(os.path.basename(pdb_path))[0]
    pd.Series(global_desc).to_csv(f"{base}_3d_descriptors.csv")
    np.save(f"{base}_contact_map.npy", cmap)
    plot_contact_map(cmap, output=f"{base}_contact_map.png")

    return global_desc, cmap

if __name__ == "__main__":
    if len(sys.argv) != 2:
        # sys.exit with a string prints it to stderr and exits with status 1,
        # so shell callers can detect the usage error.
        sys.exit("Usage: python analyze_protein.py <input.pdb>")
    main(sys.argv[1])

Overwriting analyze_protein.py


In [6]:
# Import the script written by the %%writefile cell (it must be in the
# working directory or on your PYTHONPATH).
import analyze_protein


# Run the analysis programmatically. The empty string is a placeholder: set it
# to the path of your PDB file before running. Besides returning the descriptor
# dict and the contact map, the call also writes CSV, NPY and PNG files.
desc, cmap = analyze_protein.analyze_global("")   # replace with your filename

In [7]:
# Show the descriptor dict as a pandas Series (displayed as the cell output).
import pandas as pd
pd.Series(desc)

Unnamed: 0,0
Total_SASA,62607.410156
Radius_of_Gyration,42.278835
Compactness,33.25541
Contact_Order,0.026836
SS_Helix,0.364865
SS_Strand,0.13798
SS_Coil,0.495733
