In [None]:
# This script contains functions to extract ternary PPI interactions with Schrödinger's protein interaction analysis script and to compare predicted against real structures.
import os
import pickle
import re
import shlex
import subprocess
from collections import defaultdict

import pandas as pd
from Bio import pairwise2
from Bio.PDB import PDBParser, PPBuilder

In [None]:
def extract_sequences(pdb_file):
    """Read per-chain sequences and residue records from a PDB file.

    Only the first model of the structure is inspected. For each chain the
    peptides produced by ``PPBuilder`` are concatenated in order.

    Args:
        pdb_file: Path of the PDB file to parse.

    Returns:
        dict mapping chain id -> ``(sequence, residues)`` where ``sequence`` is
        the concatenated one-letter sequence and ``residues`` is a list of
        ``(residue name, chain id, residue number)`` tuples in the same order.
        Chains that yield no sequence are left out.
    """
    structure = PDBParser(QUIET=True).get_structure("structure", pdb_file)
    builder = PPBuilder()

    chain_data = {}
    for chain in structure[0]:
        sequence_parts = []
        residue_records = []

        for peptide in builder.build_peptides(chain):
            sequence_parts.append(str(peptide.get_sequence()))
            # residue.id[1] is the numeric part of the residue id; the
            # insertion code is not recorded.
            residue_records.extend(
                (residue.get_resname(), chain.id, residue.id[1]) for residue in peptide
            )

        chain_sequence = "".join(sequence_parts)
        if chain_sequence:
            chain_data[chain.id] = (chain_sequence, residue_records)

    return chain_data

def align_sequences(seq1, seq2):
    """Globally align two sequences and return the first alignment found.

    Uses ``pairwise2.align.globalxx`` and returns the first entry of the
    list it produces.
    """
    candidate_alignments = pairwise2.align.globalxx(seq1, seq2)
    return candidate_alignments[0]

def map_residues(alignment, residues1, residues2):
    """Pair up residues of two chains according to a sequence alignment.

    Args:
        alignment: Sequence whose first two items are the gapped strings of
            the first and second sequence ("-" marks a gap).
        residues1: Residue tuples ``(name, chain, number)`` for the first
            sequence, one per ungapped character.
        residues2: Residue tuples for the second sequence, same layout.

    Returns:
        defaultdict mapping ``(number, chain)`` of each residue of the first
        sequence to ``(number, chain)`` of the residue aligned with it.
        Columns containing a gap on either side produce no entry.
    """
    gapped_first, gapped_second = alignment[0], alignment[1]
    mapping = defaultdict(list)

    # 0-based cursors into the residue lists; -1 means "nothing consumed yet".
    cursor_first = -1
    cursor_second = -1

    for symbol_first, symbol_second in zip(gapped_first, gapped_second):
        first_is_residue = symbol_first != "-"
        second_is_residue = symbol_second != "-"

        if first_is_residue:
            cursor_first += 1
        if second_is_residue:
            cursor_second += 1

        if not (first_is_residue and second_is_residue):
            continue

        source_residue = residues1[cursor_first]
        target_residue = residues2[cursor_second]
        source_key = (source_residue[2], source_residue[1])
        mapping[source_key] = (target_residue[2], target_residue[1])

    return mapping

def find_best_chain_match(pred_chain, pred_seq, real_sequences):
    """Find the chain in the real structure whose sequence best matches ``pred_seq``.

    Args:
        pred_chain: Id of the predicted chain. Not used in the comparison; kept
            so existing callers keep working.
        pred_seq: One-letter sequence of the predicted chain.
        real_sequences: dict mapping real chain id -> ``(sequence, residues)``.

    Returns:
        The id of the real chain with the highest global alignment score, or
        ``None`` when no chain scores above 0. On ties the chain encountered
        first wins.
    """
    best_match = None
    best_score = 0

    for real_chain, (real_seq, _) in real_sequences.items():
        # An empty sequence cannot be aligned and cannot score above 0.
        if not pred_seq or not real_seq:
            continue
        # Only the score is needed here, so skip building the alignments.
        score = pairwise2.align.globalxx(pred_seq, real_seq, score_only=True)
        if score > best_score:
            best_score = score
            best_match = real_chain

    return best_match

def _run_in_schrodinger_env(command):
    """Run ``command`` with the SBGrid environment sourced; return True on exit code 0.

    The command is handed to bash explicitly because ``source`` is not
    available in every ``/bin/sh`` (for example dash), which is the shell
    ``os.system`` would use.
    """
    completed = subprocess.run(
        ["bash", "-c", f"source /programs/sbgrid.shrc && {command}"]
    )
    return completed.returncode == 0


def process_protein_protein_mapping(predicted_maegz, real_maegz, output_folder, pdb_id):
    """Find and save residue mapping between predicted and real PDB files.

    Steps performed:
      1. Convert both input structures to PDB with Schrödinger ``structconvert``.
      2. Match predicted chains "A" and "B" to their best-scoring real chains.
      3. Align each matched pair and pickle the residue mapping to
         ``<output_folder>/<pdb_id>_residue_mapping.pkl``.
      4. Run ``protein_interaction_analysis.py`` on both structures, writing
         ``<pdb_id>_predicted_ppi.csv`` and ``<pdb_id>_real_ppi.csv``.

    Args:
        predicted_maegz: Path of the predicted structure file.
        real_maegz: Path of the real (reference) structure file.
        output_folder: Directory for all generated files; created if missing.
        pdb_id: Identifier used as prefix of the generated file names.

    Returns:
        ``{"A": mapping_A, "B": mapping_B}`` on success. ``None`` (after
        printing an error) when a conversion fails, the predicted structure
        lacks chain A or B, a chain has no match, or both predicted chains
        match the same real chain.
    """
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    predicted_pdb = f"{output_folder}/{pdb_id}_predicted.pdb"
    real_pdb = f"{output_folder}/{pdb_id}_real.pdb"
    output_mapping_file = f"{output_folder}/{pdb_id}_residue_mapping.pkl"

    # Paths are shell-quoted so spaces or metacharacters cannot break the command.
    for source_file, converted_pdb in ((predicted_maegz, predicted_pdb), (real_maegz, real_pdb)):
        converted_ok = _run_in_schrodinger_env(
            '"$SCHRODINGER/utilities/structconvert" %s %s'
            % (shlex.quote(str(source_file)), shlex.quote(converted_pdb))
        )
        if not converted_ok:
            print(f"Error: structconvert failed for {source_file}.")
            return

    pred_sequences = extract_sequences(predicted_pdb)
    real_sequences = extract_sequences(real_pdb)

    if "A" not in pred_sequences or "B" not in pred_sequences:
        print("Error: Predicted PDB must contain chains A and B.")
        return

    pred_A_seq, pred_A_res = pred_sequences["A"]
    pred_B_seq, pred_B_res = pred_sequences["B"]

    match_A = find_best_chain_match("A", pred_A_seq, real_sequences)
    match_B = find_best_chain_match("B", pred_B_seq, real_sequences)

    if not match_A or not match_B:
        print("Error: Could not find a match for one or both chains.")
        return
    if match_A == match_B:
        print("Error: Both predicted chains map to the same real chain.")
        return

    print(f"Chain A maps to {match_A}, Chain B maps to {match_B}")

    alignment_A = align_sequences(pred_A_seq, real_sequences[match_A][0])
    alignment_B = align_sequences(pred_B_seq, real_sequences[match_B][0])

    mapping_A = map_residues(alignment_A, pred_A_res, real_sequences[match_A][1])
    mapping_B = map_residues(alignment_B, pred_B_res, real_sequences[match_B][1])

    residue_mapping = {"A": mapping_A, "B": mapping_B}

    with open(output_mapping_file, "wb") as f:
        pickle.dump(residue_mapping, f)

    print(f"Residue mapping saved to {output_mapping_file}")

    print("Running Schrödinger command...")
    # The predicted structure is analysed on chains A/B, the real one on the
    # chains they were matched to.
    analysis_jobs = (
        ("predicted", predicted_pdb, "A", "B"),
        ("real", real_pdb, match_A, match_B),
    )
    for label, pdb_path, first_chain, second_chain in analysis_jobs:
        csv_path = f"{output_folder}/{pdb_id}_{label}_ppi.csv"
        analysis_command = " ".join([
            '"$SCHRODINGER/run"',
            "protein_interaction_analysis.py",
            shlex.quote(pdb_path),
            shlex.quote(f"chain. {first_chain}"),
            shlex.quote(f"chain. {second_chain}"),
            shlex.quote(csv_path),
        ])
        if not _run_in_schrodinger_env(analysis_command):
            # The mapping is already saved, so report the failure and carry on.
            print(f"Error: protein_interaction_analysis.py failed for the {label} structure.")

    return residue_mapping

In [None]:
def _parse_residue_label(residue_str):
    """Extract ``(chain, residue_number)`` from a label such as "A:ASP 55".

    Returns ``(None, None)`` when the label does not match.
    """
    # NOTE(review): only an upper-case letter directly before ":" is accepted
    # as chain id, and `\D*` also consumes a leading "-", so a negative
    # residue number would be read as positive -- confirm neither occurs.
    match = re.search(r"([A-Z]):\D*(\d+)", residue_str)
    if match:
        return match.group(1), int(match.group(2))
    return None, None


def _read_interaction_pairs(ppi_table):
    """Return the set of stripped (Set1, Set2) residue-label pairs in a PPI table."""
    # Rows with an empty residue cell are read as NaN and carry no pair.
    residue_columns = ppi_table[["Set1 Residues", "Set2 Residues"]].dropna()
    return {
        (str(first).strip(), str(second).strip())
        for first, second in zip(
            residue_columns["Set1 Residues"], residue_columns["Set2 Residues"]
        )
    }


def _map_label_to_real(residue_str, residue_mapping):
    """Translate a predicted residue label to "chain:number" of the real structure.

    Returns ``None`` when the label cannot be parsed, its chain has no mapping,
    or the residue itself is not in the mapping.
    """
    chain, resnum = _parse_residue_label(residue_str)
    if not chain:
        return None
    mapped = residue_mapping.get(chain, {}).get((resnum, chain))
    if mapped is None:
        return None
    # Mapping values are stored as (residue number, chain).
    return f"{mapped[1]}:{mapped[0]}"


def comparison_workflow(predicted_ppi_path, real_ppi_path, mapping_path):
    """Compare predicted and real interaction tables and report how many match.

    Predicted residue pairs are translated to the numbering of the real
    structure through the pickled residue mapping, then intersected with the
    residue pairs of the real table.

    Args:
        predicted_ppi_path: CSV of interactions for the predicted structure.
        real_ppi_path: CSV of interactions for the real structure.
        mapping_path: Pickle file holding ``{pred_chain: {(resnum, chain):
            (real_resnum, real_chain)}}``.

    Returns:
        Number of recovered interactions divided by the number of distinct
        predicted interaction pairs; ``0`` when there are no predicted pairs.
    """
    # Load predicted and real PPI data
    predicted_ppi = pd.read_csv(predicted_ppi_path)
    real_ppi = pd.read_csv(real_ppi_path)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load mapping files you generated yourself.
    with open(mapping_path, "rb") as f:
        residue_mapping = pickle.load(f)

    # Column headers may contain line breaks; flatten them before lookup.
    for table in (predicted_ppi, real_ppi):
        table.columns = [col.replace("\n", " ").strip() for col in table.columns]

    predicted_interactions = _read_interaction_pairs(predicted_ppi)
    real_interactions = _read_interaction_pairs(real_ppi)

    # Apply residue mapping (predicted -> real); count what cannot be mapped
    # instead of silently discarding it.
    mapped_interactions = set()
    unmapped_count = 0
    for res1, res2 in predicted_interactions:
        mapped_res1 = _map_label_to_real(res1, residue_mapping)
        mapped_res2 = _map_label_to_real(res2, residue_mapping)
        if mapped_res1 is None or mapped_res2 is None:
            unmapped_count += 1
            continue
        mapped_interactions.add((mapped_res1, mapped_res2))

    # Reduce real labels to "chain:number" so they are comparable.
    real_interactions_cleaned = set()
    for res1, res2 in real_interactions:
        chain1, resnum1 = _parse_residue_label(res1)
        chain2, resnum2 = _parse_residue_label(res2)
        if chain1 and chain2:
            real_interactions_cleaned.add((f"{chain1}:{resnum1}", f"{chain2}:{resnum2}"))

    recovered_interactions = mapped_interactions.intersection(real_interactions_cleaned)
    recovery_rate = len(recovered_interactions) / len(predicted_interactions) if predicted_interactions else 0

    # Print sample outputs for debugging
    print("\n=== Sample Predicted Interactions (Before Mapping) ===")
    print(list(predicted_interactions)[:10])

    print("\n=== Sample Mapped Predicted Interactions (After Mapping) ===")
    print(list(mapped_interactions)[:10])

    print("\n=== Sample Real Interactions (After Cleaning) ===")
    print(list(real_interactions_cleaned)[:10])

    print("\n=== Recovery Summary ===")
    print(f"Total Predicted Interactions: {len(predicted_interactions)}")
    print(f"Unmapped Predicted Interactions: {unmapped_count}")
    print(f"Total Real Interactions: {len(real_interactions_cleaned)}")
    print(f"Recovered Interactions: {len(recovered_interactions)}")
    print(f"Recovery Rate: {recovery_rate:.2%}")
    return recovery_rate



In [None]:
pdb_id = "id_for_structure"
output_folder = "output_dir"
# Placeholder inputs: replace with the predicted and the real structure files.
predicted_pdb = "/path/to/predicted/pdb"
real_pdb = "/path/to/real/pdb"
residue_mapping = process_protein_protein_mapping(predicted_pdb, real_pdb, output_folder, pdb_id)
# process_protein_protein_mapping returns None after printing an error; in that
# case the files needed for the comparison are not guaranteed to exist.
if residue_mapping is not None:
    recovery_rate = comparison_workflow(f"{output_folder}/{pdb_id}_predicted_ppi.csv", f"{output_folder}/{pdb_id}_real_ppi.csv", f"{output_folder}/{pdb_id}_residue_mapping.pkl")