In [None]:
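# This script contains functions to extract protein-ligand interactions from Schrodinger's result files
# and to measure how many interactions of a reference structure are recovered by a predicted structure.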
import pandas as pd
from Bio.PDB import PDBParser, PPBuilder
from Bio import pairwise2
from collections import defaultdict
import re

# Utility functions

def extract_sequence_and_residues(pdb_file, chain_id=None):
    """
    Collect, per chain of the first model in a PDB file, the peptide
    sequence and one record per residue.

    ``PPBuilder`` splits a chain into peptides; the peptides of a chain are
    joined back together in order, so a chain break does not truncate the
    result.

    Args:
        pdb_file: PDB file handed to ``PDBParser.get_structure``.
        chain_id: When truthy, only the chain with this id is processed.

    Returns:
        dict mapping chain id -> ``(sequence, residues)``. ``residues`` is a
        list of ``(resname, chain id, res.id[1], res.id[2])`` tuples, i.e.
        residue number and insertion code in Biopython's residue-id layout.
        Chains whose joined sequence is empty are left out.
    """
    first_model = PDBParser(QUIET=True).get_structure("structure", pdb_file)[0]
    builder = PPBuilder()

    result = {}
    for chain in first_model:
        wanted = not chain_id or chain.id == chain_id
        if not wanted:
            continue

        fragments = builder.build_peptides(chain)
        sequence = "".join(str(fragment.get_sequence()) for fragment in fragments)
        residue_records = [
            (residue.get_resname(), chain.id, residue.id[1], residue.id[2])
            for fragment in fragments
            for residue in fragment
        ]

        # A chain without any peptide (e.g. only hetero atoms) is skipped.
        if sequence:
            result[chain.id] = (sequence, residue_records)

    return result


def align_sequences(seq1, seq2):
    """
    Align two sequences with ``pairwise2.align.globalxx`` and return the
    first alignment of the result list.

    The returned object is indexable: element 0 is the aligned form of
    ``seq1`` and element 1 the aligned form of ``seq2`` (this is how
    ``map_residues`` consumes it).
    """
    return pairwise2.align.globalxx(seq1, seq2)[0]

def map_residues(alignment, residues1, residues2):
    """
    Translate an alignment into a residue-to-residue mapping.

    Args:
        alignment: Indexable whose items 0 and 1 are the two aligned
            sequences, with "-" marking gaps.
        residues1: Residue records for the first sequence, one per non-gap
            character, laid out as ``(resname, chain, number, ...)``.
        residues2: Residue records for the second sequence, same layout.

    Returns:
        defaultdict(list) mapping ``(number, chain)`` of a residue in
        ``residues1`` to the list of ``(number, chain)`` residues in
        ``residues2`` aligned with it. Columns with a gap on either side
        contribute nothing.
    """
    aligned_a, aligned_b = alignment[0], alignment[1]
    pairs = defaultdict(list)

    # Zero-based cursors into the residue lists; -1 means "nothing consumed yet".
    pos_a = -1
    pos_b = -1

    for symbol_a, symbol_b in zip(aligned_a, aligned_b):
        a_present = symbol_a != "-"
        b_present = symbol_b != "-"
        if a_present:
            pos_a += 1
        if b_present:
            pos_b += 1

        if not (a_present and b_present):
            continue

        source = residues1[pos_a]
        target = residues2[pos_b]
        pairs[(source[2], source[1])].append((target[2], target[1]))

    return pairs

def expand_predicted_interactions(predicted_interactions, all_mappings):
    """
    Duplicate every predicted interaction once per reference residue it maps to.

    Args:
        predicted_interactions: DataFrame with at least the columns
            "Residue Number" and "Residue Chain".
        all_mappings: Mapping of ``(residue number, chain)`` in the predicted
            structure to a list of ``(residue number, chain)`` pairs in the
            reference structure.

    Returns:
        DataFrame holding a copy of each input row for every mapped residue,
        extended with the columns "Mapped Residue Number" and "Mapped Chain".
        Rows without any mapping are dropped. The result always carries the
        input columns plus the two mapped columns, even when it has no rows,
        so callers can index those columns unconditionally.
    """
    expanded_rows = []
    for _, row in predicted_interactions.iterrows():
        key = (row["Residue Number"], row["Residue Chain"])
        for mapped_resnum, mapped_chain in all_mappings.get(key, []):
            expanded_row = row.copy()
            expanded_row["Mapped Residue Number"] = mapped_resnum
            expanded_row["Mapped Chain"] = mapped_chain
            expanded_rows.append(expanded_row)

    if not expanded_rows:
        # pd.DataFrame([]) would have no columns at all, which makes later
        # column look-ups raise KeyError; keep the schema explicit instead.
        columns = list(predicted_interactions.columns)
        for extra in ("Mapped Residue Number", "Mapped Chain"):
            if extra not in columns:
                columns.append(extra)
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(expanded_rows)

def analyze_interaction_recovery_with_expansion(predicted_file, true_file, predicted_pdb, true_pdb):
    """
    Compute which interactions of the reference structure are reproduced by
    the predicted structure.

    Every chain of the predicted structure is aligned against every chain of
    the reference structure, giving a one-to-many residue mapping. Predicted
    interactions are expanded through that mapping and compared with the
    reference interactions on interaction type, residue type, residue number
    and chain.

    Args:
        predicted_file: Interaction report (text) for the predicted structure.
        true_file: Interaction report (text) for the reference structure.
        predicted_pdb: PDB file of the predicted structure.
        true_pdb: PDB file of the reference structure.

    Returns:
        ``(recovery_percentage, true_interactions)``. ``true_interactions`` is
        the reference interaction table with two added columns: "Key" and the
        boolean "Recovered". ``recovery_percentage`` is the share of recovered
        rows times 100, or NaN when the reference table has no rows.
    """
    predicted_interactions = refined_extract_interaction_data_with_chain(predicted_file)
    true_interactions = refined_extract_interaction_data_with_chain(true_file)
    pred_seq_data = extract_sequence_and_residues(predicted_pdb)
    true_data = extract_sequence_and_residues(true_pdb)

    # One-to-many mapping: (predicted resnum, chain) -> [(true resnum, chain), ...].
    # extract_sequence_and_residues always returns a dict keyed by chain id,
    # so every predicted chain is aligned against every reference chain.
    all_mappings = defaultdict(list)
    for true_seq, true_reslist in true_data.values():
        for pred_seq, pred_reslist in pred_seq_data.values():
            alignment = align_sequences(pred_seq, true_seq)
            mapping = map_residues(alignment, pred_reslist, true_reslist)
            for key, targets in mapping.items():
                all_mappings[key].extend(targets)

    # Expand predicted interactions with mappings
    predicted_interactions = expand_predicted_interactions(predicted_interactions, all_mappings)

    if predicted_interactions.empty:
        # Nothing mapped onto the reference: no interaction can be recovered.
        predicted_pairs = set()
    else:
        # Create Mapped Key for comparison
        predicted_interactions["Mapped Key"] = (
            predicted_interactions["Interaction Type"] + "_" +
            predicted_interactions["Residue Type"] + "_" +
            predicted_interactions["Mapped Residue Number"].astype(str)
        )
        predicted_pairs = set(zip(
            predicted_interactions["Mapped Key"],
            predicted_interactions["Mapped Chain"],
        ))

    true_interactions["Key"] = (
        true_interactions["Interaction Type"] + "_" +
        true_interactions["Residue Type"] + "_" +
        true_interactions["Residue Number"].astype(str)
    )

    # Recovery check: set membership instead of rescanning every predicted
    # row for each reference row.
    true_interactions["Recovered"] = [
        pair in predicted_pairs
        for pair in zip(true_interactions["Key"], true_interactions["Residue Chain"])
    ]

    # Calculate recovery percentage
    if true_interactions.empty:
        # The ratio is undefined without reference interactions.
        recovery_percentage = float("nan")
    else:
        recovery_percentage = true_interactions["Recovered"].mean() * 100
    print(f"Recovery Percentage: {recovery_percentage:.2f}%")
    return recovery_percentage, true_interactions

def refined_extract_interaction_data_with_chain(file_path):
    """
    Parse the per-residue interaction tables of an interaction report.

    Only text between "Interactions grouped by receptor residue:" and the
    following "Total <word> Contacts interactions:" line is inspected. Within
    it, lines shaped like ``<type> <chain>:<number>(<AAA>) <int> <int>`` are
    turned into records; everything else is ignored.

    Args:
        file_path: Path of the report text file.

    Returns:
        DataFrame with the columns "Interaction Type", "Residue Chain",
        "Residue Type", "Residue Number" (int) and "# of Contacts" (int, the
        second integer on the line).
    """
    with open(file_path, 'r') as file:
        content = file.read()

    interaction_sections = re.findall(
        r"Interactions grouped by receptor residue:\n(.*?)Total \w+ Contacts interactions:",
        content, re.S
    )

    # Compiled once instead of once per line.
    line_pattern = re.compile(r"(\w+)\s+([A-Za-z0-9:]+)\((\w{3})\)\s+\d+\s+(\d+)")
    number_pattern = re.compile(r"\d+")

    records = []
    for section in interaction_sections:
        for line in section.splitlines():
            match = line_pattern.match(line.strip())
            if not match:
                continue
            interaction_type, residue, amino_acid, contacts = match.groups()
            chain_id, separator, after_chain = residue.partition(":")
            # Look for the residue number after the chain separator so that a
            # numeric chain id (e.g. "1:45") is not mistaken for the number.
            # Without a separator the whole token is searched.
            number_match = number_pattern.search(after_chain if separator else residue)
            if number_match is None:
                # No residue number on this line; skip it rather than crash.
                continue
            records.append((
                interaction_type,
                chain_id,
                amino_acid,
                int(number_match.group()),
                int(contacts),
            ))

    return pd.DataFrame({
        "Interaction Type": [record[0] for record in records],
        "Residue Chain": [record[1] for record in records],
        "Residue Type": [record[2] for record in records],
        "Residue Number": [record[3] for record in records],
        "# of Contacts": [record[4] for record in records]
    })

# Visualization function
def visualize_mapping_details(mapping, predicted_reslist, true_reslist):
    """
    Print and return a table describing a residue mapping.

    Two residue layouts are accepted for the keys and values of ``mapping``:

    * ``(number, chain)`` as produced by ``map_residues``; the residue name
      is then looked up in the matching residue list and is ``None`` when it
      cannot be found.
    * ``(name, chain, number, ...)``, which is used as-is.

    Args:
        mapping: Mapping of a predicted residue to a list of true residues.
        predicted_reslist: Residue records ``(name, chain, number, ...)`` of
            the predicted structure, used for name look-ups. May be empty or
            None.
        true_reslist: Same for the true structure.

    Returns:
        DataFrame with one row per (predicted residue, true residue) pair and
        the columns "Predicted Residue Name", "Predicted Chain",
        "Predicted Residue Number", "True Residue Name", "True Chain" and
        "True Residue Number". The table is also printed.
    """
    def name_index(reslist):
        # (number, chain) -> residue name; the first record wins on duplicates.
        index = {}
        for record in reslist or []:
            index.setdefault((record[2], record[1]), record[0])
        return index

    def describe(residue, names):
        # Normalise either layout to (name, chain, number).
        if len(residue) >= 3:
            return residue[0], residue[1], residue[2]
        number, chain = residue
        return names.get((number, chain)), chain, number

    predicted_names = name_index(predicted_reslist)
    true_names = name_index(true_reslist)

    mapping_table = []
    for pred_res, true_res_list in mapping.items():
        pred_name, pred_chain, pred_number = describe(pred_res, predicted_names)
        for true_res in true_res_list:
            true_name, true_chain, true_number = describe(true_res, true_names)
            mapping_table.append({
                "Predicted Residue Name": pred_name,
                "Predicted Chain": pred_chain,
                "Predicted Residue Number": pred_number,
                "True Residue Name": true_name,
                "True Chain": true_chain,
                "True Residue Number": true_number
            })
    mapping_df = pd.DataFrame(mapping_table)
    print("Residue Mapping Details:")
    print(mapping_df)
    return mapping_df







In [None]:
# Example usage: measure how many interactions of a reference structure are
# recovered by a predicted structure. The four paths below are placeholders
# and must be replaced with real files before this cell is run.
predicted_file = "/path/to/txt/file/for/predicted/structure"  # Interaction report (text) of the predicted structure
true_file = "/path/to/txt/file/for/pdb/structure"  # Interaction report (text) of the reference structure
predicted_pdb = "/path/to/predicted/pdb"  # PDB file of the predicted structure
true_pdb = "/path/to/real/pdb"  # PDB file of the reference structure

# Prints the recovery percentage and returns it together with the reference
# interaction table, which gains the "Key" and "Recovered" columns.
recovery_percentage, true_interactions = analyze_interaction_recovery_with_expansion(
    predicted_file, true_file, predicted_pdb, true_pdb
)
