In [1]:
import pandas as pd
import subprocess
import os
import numpy as np
from Bio import SeqIO, pairwise2
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBParser, Superimposer
import warnings
import matplotlib.pyplot as plt
import json
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
warnings.filterwarnings('ignore')

# Functions

In [2]:
#### RMSD ####

    
# Calculate the C-alpha RMSD between two structures using Bio.PDB.
# predicted_pdb: the moving structure that is superimposed onto ref_pdb; it must contain the same number of CA atoms as ref_pdb.
def rmsd_pdb(predicted_pdb, ref_pdb):
    """Superimpose the predicted structure onto the reference and return the RMSD.

    Only atoms named "CA" from the first model of each file are used.

    Parameters
    ----------
    predicted_pdb : path of the moving structure.
    ref_pdb : path of the fixed (reference) structure.

    Returns
    -------
    The ``rms`` attribute of the Bio.PDB ``Superimposer`` after fitting.
    """
    pdb_reader = PDBParser()

    def _ca_atoms(pdb_path):
        # the structure id is simply the file name
        structure = pdb_reader.get_structure(os.path.basename(pdb_path), pdb_path)
        return [atom for atom in structure[0].get_atoms() if atom.name == "CA"]

    # parse the reference first, then the prediction
    reference_ca = _ca_atoms(ref_pdb)
    predicted_ca = _ca_atoms(predicted_pdb)

    superimposer = Superimposer()
    # fit the rotation/translation that maps the moving atoms onto the fixed atoms
    superimposer.set_atoms(reference_ca, predicted_ca)
    # move the predicted CA atoms into the reference frame
    superimposer.apply(predicted_ca)

    return superimposer.rms


def rmsd_point(coordinate1, coordinate2):
    """Return the deviation between two single 3D points.

    For one pair of points the RMSD is the straight-line distance between them.
    Each argument is indexable with 0, 1, 2 (x, y, z), for example
    ``array([ 11.925492,  10.070204, -12.518902], dtype=float32)``.
    """
    dx = coordinate1[0] - coordinate2[0]
    dy = coordinate1[1] - coordinate2[1]
    dz = coordinate1[2] - coordinate2[2]
    return np.sqrt(dx**2 + dy**2 + dz**2)


def rmsd_list(coordinates1, coordinates2):
    """Return the RMSD between two equally long lists of 3D coordinates.

    Points are paired by position: the i-th entry of ``coordinates1`` is
    compared with the i-th entry of ``coordinates2``.

    Parameters
    ----------
    coordinates1, coordinates2 : sequences of points, each indexable with 0, 1, 2.

    Returns
    -------
    sqrt(mean of the squared point-to-point distances).

    Raises
    ------
    ValueError
        If the two lists differ in length (extra points used to be ignored
        silently) or if they are empty (the mean is undefined).
    """
    length = len(coordinates1)
    if length != len(coordinates2):
        raise ValueError(
            f"coordinate lists differ in length: {length} vs {len(coordinates2)}"
        )
    if length == 0:
        raise ValueError("cannot compute an RMSD from empty coordinate lists")

    squared_distances = [
        (p[0] - q[0])**2 + (p[1] - q[1])**2 + (p[2] - q[2])**2
        for p, q in zip(coordinates1, coordinates2)
    ]
    return np.sqrt(sum(squared_distances) / length)



#### Pairwise Sequence Alignment ####
    

# load fasta file to get the sequence
def load_fasta(fasta_file):
    """Return the sequence of the first record in a FASTA file.

    Any further records in the file are ignored.
    """
    with open(fasta_file, 'r') as handle:
        first_record = next(SeqIO.parse(handle, 'fasta'))
    return first_record.seq


# extract common residues from two sequences
def common_residues(seq1, seq2):
    """Find the positions at which two aligned sequences carry the same residue.

    Parameters
    ----------
    seq1, seq2 : aligned sequences of equal length, with '-' marking gaps.

    Returns
    -------
    (common_indices1, common_indices2) : two lists of equal length. Entry i of
    each list is the 1-based position, counted in the ungapped sequence, of the
    i-th column where both sequences hold the same non-gap residue.

    Notes
    -----
    A column that is a gap in both sequences is skipped. It used to satisfy
    ``a == b`` and was therefore recorded as a shared residue, which also
    shifted every later index by one.
    """
    common_indices1 = []
    common_indices2 = []
    res_idx1 = 0
    res_idx2 = 0
    for a, b in zip(seq1, seq2):
        is_gap1 = (a == '-')
        is_gap2 = (b == '-')
        # only real residues advance the ungapped position counters
        if not is_gap1:
            res_idx1 += 1
        if not is_gap2:
            res_idx2 += 1
        if not is_gap1 and not is_gap2 and a == b:
            common_indices1.append(res_idx1)
            common_indices2.append(res_idx2)
    return common_indices1, common_indices2

    
# return common indices for each sequence
def pairwise_SA(moving_fasta, ref_fasta):
    """Align two FASTA sequences and return the positions of their shared residues.

    The first record of each file is aligned with ``pairwise2.align.globalxx``;
    only the first alignment returned is used.

    Returns
    -------
    (indices_moving, indices_ref) : 1-based positions, in the moving and the
    reference sequence respectively, as produced by ``common_residues``.
    """
    moving_sequence = load_fasta(moving_fasta)
    reference_sequence = load_fasta(ref_fasta)

    # keep only the first reported alignment
    first_alignment = pairwise2.align.globalxx(moving_sequence, reference_sequence)[0]
    gapped_moving, gapped_reference = first_alignment[:2]

    moving_positions, reference_positions = common_residues(gapped_moving, gapped_reference)
    return moving_positions, reference_positions


#### read residue types from pairs ####
def residue_type(moving_fasta, ref_fasta):
    """Map each shared residue position (reference numbering) to its residue letter.

    The two sequences are aligned exactly as in ``pairwise_SA``. For every
    shared position the letter is read from both sequences; if the two
    resulting dictionaries differ, a message is printed and ``1`` is returned.

    Returns
    -------
    dict {1-based reference position: residue letter}, or ``1`` on inconsistency.
    """
    seq_moving = load_fasta(moving_fasta)
    seq_ref = load_fasta(ref_fasta)

    # global alignment; only the first reported alignment is used
    top_alignment = pairwise2.align.globalxx(seq_moving, seq_ref)[0]
    indices_moving, indices_ref = common_residues(top_alignment[0], top_alignment[1])

    # positions are 1-based, sequence indexing is 0-based
    letters_from_moving = {
        ref_pos: seq_moving[moving_pos - 1]
        for moving_pos, ref_pos in zip(indices_moving, indices_ref)
    }
    letters_from_ref = {ref_pos: seq_ref[ref_pos - 1] for ref_pos in indices_ref}

    # both dictionaries are keyed by the reference position and should agree
    if letters_from_moving != letters_from_ref:
        print("residue inconsistency!")
        return 1

    return letters_from_ref
    
    
#### RMSD with pair ####
    
# to calculate rmsd per residue using Bio.PDB
def rmsd_pdb_perResidue(moving_pdb, ref_pdb, moving_fasta, ref_fasta):
    """Per-residue CA deviation between two structures after superposition.

    The residues shared by the two sequences are found with ``pairwise_SA``.
    The CA atoms of those residues are selected from the first model of each
    PDB, the moving atoms are superimposed onto the fixed ones, and the
    distance between each fixed/moving CA pair is reported.

    Parameters
    ----------
    moving_pdb, ref_pdb : paths of the moving and the reference PDB files.
    moving_fasta, ref_fasta : FASTA files holding the corresponding sequences.

    Returns
    -------
    dict {residue number of the reference CA atom: deviation}.

    Notes
    -----
    Atoms are selected by comparing the PDB residue number (``full_id[3][1]``)
    with the 1-based sequence positions, and are paired by their order in the
    file.
    NOTE(review): this assumes the PDB numbering equals the FASTA position and
    that no shared residue is missing from either structure; confirm for the
    inputs used.
    """
    # load PDB structures
    parser = PDBParser()
    struct_moving = parser.get_structure(os.path.basename(moving_pdb), moving_pdb)
    struct_ref = parser.get_structure(os.path.basename(ref_pdb), ref_pdb)

    # extract common residues; sets give O(1) membership tests per atom
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)
    wanted_moving = set(indices_moving)
    wanted_ref = set(indices_ref)

    # get CA atoms of the shared residues from both PDBs
    moving = [atom for atom in struct_moving[0].get_atoms()
              if atom.name == "CA" and atom.full_id[3][1] in wanted_moving]
    fixed = [atom for atom in struct_ref[0].get_atoms()
             if atom.name == "CA" and atom.full_id[3][1] in wanted_ref]

    # warn when the selections differ in size
    # (Superimposer.set_atoms is expected to reject such input; TODO confirm)
    if len(moving) != len(fixed):
        print("Two structures have different numbers of residues!")

    coords_fixed = [atom.get_coord() for atom in fixed]

    # superimpose, then read the transformed moving coordinates
    sup = Superimposer()
    sup.set_atoms(fixed, moving)
    sup.apply(moving)
    coords_moving = [atom.get_coord() for atom in moving]

    # deviation per residue (CA), keyed by the reference residue number
    return {
        atom.full_id[3][1]: rmsd_point(coord_fixed, coord_moving)
        for atom, coord_fixed, coord_moving in zip(fixed, coords_fixed, coords_moving)
    }


# to calculate rmsd overall using Bio.PDB
# return rmsd between two structures
# indices consistent with the previous function "rmsd_pdb_perResidue"
def rmsd_pdb_overall(moving_pdb, ref_pdb, moving_fasta, ref_fasta):
    """Overall CA RMSD between two structures over their shared residues.

    The residues shared by the two sequences are found with ``pairwise_SA``.
    The CA atoms of those residues are selected from the first model of each
    PDB, the moving atoms are superimposed onto the fixed ones, and the RMSD
    over all pairs is computed with ``rmsd_list``.

    Parameters
    ----------
    moving_pdb, ref_pdb : paths of the moving and the reference PDB files.
    moving_fasta, ref_fasta : FASTA files holding the corresponding sequences.

    Returns
    -------
    The RMSD between the superimposed moving CA atoms and the fixed CA atoms.

    Notes
    -----
    Atoms are selected by comparing the PDB residue number (``full_id[3][1]``)
    with the 1-based sequence positions, and are paired by their order in the
    file.
    NOTE(review): this assumes the PDB numbering equals the FASTA position and
    that no shared residue is missing from either structure; confirm for the
    inputs used.
    """
    # load PDB structures
    parser = PDBParser()
    struct_moving = parser.get_structure(os.path.basename(moving_pdb), moving_pdb)
    struct_ref = parser.get_structure(os.path.basename(ref_pdb), ref_pdb)

    # extract common residues; sets give O(1) membership tests per atom
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)
    wanted_moving = set(indices_moving)
    wanted_ref = set(indices_ref)

    # get CA atoms of the shared residues from both PDBs
    moving = [atom for atom in struct_moving[0].get_atoms()
              if atom.name == "CA" and atom.full_id[3][1] in wanted_moving]
    fixed = [atom for atom in struct_ref[0].get_atoms()
             if atom.name == "CA" and atom.full_id[3][1] in wanted_ref]

    # warn when the selections differ in size
    # (Superimposer.set_atoms is expected to reject such input; TODO confirm)
    if len(moving) != len(fixed):
        print("Two structures have different numbers of residues!")

    coords_fixed = [atom.get_coord() for atom in fixed]

    # superimpose, then read the transformed moving coordinates
    sup = Superimposer()
    sup.set_atoms(fixed, moving)
    sup.apply(moving)
    coords_moving = [atom.get_coord() for atom in moving]

    return rmsd_list(coords_moving, coords_fixed)



#### delta delta G ####
    
# extract delta G per residue out of sc file
def dG_perResidue(sc_path):
    """Read a per-residue score file into {residue number: score}.

    The first line of the file is skipped. On every other line, whitespace
    separated column 22 (0-based) is parsed as the score, and column 23 is a
    label whose second ``_``-separated part is parsed as the residue number.
    """
    with open(sc_path, 'r') as score_file:
        data_lines = score_file.readlines()[1:]  # drop the header line

    scores_by_residue = {}
    for raw_line in data_lines:
        fields = raw_line.strip("\n").split()
        residue_number = int(fields[23].split("_")[1])
        scores_by_residue[residue_number] = float(fields[22])
    return scores_by_residue

# to calculate ddG within pairs
def ddG_perResidue(moving_sc, ref_sc, moving_fasta, ref_fasta):
    """Per-residue dG of both structures and their difference (moving - reference).

    Parameters
    ----------
    moving_sc, ref_sc : CSV files (not raw .sc files) with the columns
        ``residue_id`` and ``dG``.
    moving_fasta, ref_fasta : FASTA files used to find the shared residues.

    Returns
    -------
    (ddG, dG_moving, dG_reference) : three dicts keyed by the reference
    residue position; ddG is rounded to 3 decimals.
    """
    # shared residue positions of the two sequences
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    # per-residue score tables
    dG_moving = pd.read_csv(moving_sc)
    dG_ref = pd.read_csv(ref_sc)

    ddG_by_residue = {}
    dG_iso_by_residue = {}
    dG_ref_by_residue = {}

    for idx_moving, idx_ref in zip(indices_moving, indices_ref):
        # first row matching each residue id
        score_moving = dG_moving.loc[dG_moving['residue_id'] == idx_moving, 'dG'].iloc[0]
        score_ref = dG_ref.loc[dG_ref['residue_id'] == idx_ref, 'dG'].iloc[0]

        # everything is keyed by the reference position
        dG_iso_by_residue[idx_ref] = score_moving
        dG_ref_by_residue[idx_ref] = score_ref
        ddG_by_residue[idx_ref] = round(score_moving - score_ref, 3)

    return ddG_by_residue, dG_iso_by_residue, dG_ref_by_residue
    

#### extract fa_sol term out of sc files ####
def fa_sol_perResidue(sc_path):
    """Read the fa_sol column of a per-residue score file into {residue number: fa_sol}.

    The first line of the file is skipped. On every other line, whitespace
    separated column 5 (0-based) is parsed as the fa_sol value, and column 23
    is a label whose second ``_``-separated part is parsed as the residue number.
    """
    with open(sc_path, 'r') as score_file:
        data_lines = score_file.readlines()[1:]  # drop the header line

    fa_sol_by_residue = {}
    for raw_line in data_lines:
        fields = raw_line.strip("\n").split()
        residue_number = int(fields[23].split("_")[1])
        fa_sol_by_residue[residue_number] = float(fields[5])
    return fa_sol_by_residue

    
# get fa_sol within pairs
def get_pair_fa_sol(moving_sc, ref_sc, moving_fasta, ref_fasta):
    """Per-residue fa_sol of both structures on their shared residues.

    Parameters
    ----------
    moving_sc, ref_sc : CSV files (not raw .sc files) with the columns
        ``residue_id`` and ``fa_sol``.
    moving_fasta, ref_fasta : FASTA files used to find the shared residues.

    Returns
    -------
    (fa_sol_moving, fa_sol_reference) : two dicts keyed by the reference
    residue position.
    """
    # shared residue positions of the two sequences
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    # per-residue score tables
    fa_sol_moving = pd.read_csv(moving_sc)
    fa_sol_ref = pd.read_csv(ref_sc)

    iso_by_residue = {}
    ref_by_residue = {}

    for idx_moving, idx_ref in zip(indices_moving, indices_ref):
        # first row matching each residue id
        score_moving = fa_sol_moving.loc[fa_sol_moving['residue_id'] == idx_moving, 'fa_sol'].iloc[0]
        score_ref = fa_sol_ref.loc[fa_sol_ref['residue_id'] == idx_ref, 'fa_sol'].iloc[0]

        # both dicts are keyed by the reference position
        iso_by_residue[idx_ref] = score_moving
        ref_by_residue[idx_ref] = score_ref

    return iso_by_residue, ref_by_residue

    
# get tags with the lowest relax score
# here it generates a tag file (basically a txt file)
# that contains 3 ids with lowest score by default

def get_lowestTag(sc_file, tag_file, num = 3):
    """Write the tags of the ``num`` lowest-scoring entries of a relax score file.

    Parameters
    ----------
    sc_file : score file; its first two lines are skipped. On every other
        line, whitespace separated column 1 (0-based) is the score and
        column 23 is the tag.
    tag_file : path of the text file to write (a file name, not a folder);
        one tag per line, lowest score first.
    num : number of tags to write (default 3).

    Returns
    -------
    0 on success, 1 (after printing a message) when ``sc_file`` and
    ``tag_file`` are the same path.

    Notes
    -----
    Blank lines are skipped, and ties keep their order of appearance in the
    file (stable sort).
    """
    if sc_file == tag_file:
        print("score file and tag file should be different!")
        return 1

    # collect all rows first; growing a DataFrame row by row is quadratic
    records = []
    with open(sc_file, "r") as f:
        for count, line in enumerate(f):
            if count < 2:  # excluded the first two lines
                continue
            fields = line.split()
            if not fields:  # ignore blank lines
                continue
            records.append((float(fields[1]), fields[23]))

    scores_and_ids = pd.DataFrame(records, columns = ['score', 'id'])
    scores_and_ids = scores_and_ids.sort_values(by = 'score', ascending = True, kind = 'mergesort')
    lowest_ids = scores_and_ids['id'].head(num)

    with open(tag_file, 'w') as f:
        for tag in lowest_ids:
            f.write(f"{tag}\n")
    return 0


# get the lowest relax score
def get_lowestScore(sc_file):
    """Return the lowest score in a relax score file together with its tag.

    Parameters
    ----------
    sc_file : score file; its first two lines are skipped. On every other
        line, whitespace separated column 1 (0-based) is the score and
        column 23 is the tag.

    Returns
    -------
    (lowest_score, lowest_tag) : the score as a float and the tag as a string.

    Raises
    ------
    ValueError
        If the file contains no score rows.

    Notes
    -----
    Blank lines are skipped, and ties keep their order of appearance in the
    file (stable sort).
    """
    # collect all rows first; growing a DataFrame row by row is quadratic
    records = []
    with open(sc_file, 'r') as f:
        for count, line in enumerate(f):
            if count < 2:  # excluded the first two lines
                continue
            fields = line.split()
            if not fields:  # ignore blank lines
                continue
            records.append((float(fields[1]), fields[23]))

    if not records:
        raise ValueError(f"no score rows found in {sc_file}")

    scores_and_ids = pd.DataFrame(records, columns = ['score', 'id'])
    scores_and_ids = scores_and_ids.sort_values(by = 'score', ascending = True, kind = 'mergesort')

    # .iloc[0] replaces float() on a one-element Series, which pandas deprecates
    lowest_score = float(scores_and_ids['score'].iloc[0])
    lowest_tag = scores_and_ids['id'].iloc[0]
    return lowest_score, lowest_tag
    

#### plddt score (with pair) ####
    
# get plddt score from "ranking_debug.json"
def get_plddt(af_ranking_file):
    """Return the mean of all pLDDT values in a ranking JSON file, to 3 decimals.

    The file must hold a JSON object whose ``'plddts'`` entry maps names to
    numbers (see "ranking_debug.json").
    """
    # the context manager closes the file; a bare open() leaked the handle
    with open(af_ranking_file) as ranking_file:
        af_ranking = json.load(ranking_file)
    plddt_values = list(af_ranking['plddts'].values())
    ave_plddt = format(sum(plddt_values) / len(plddt_values), '.3f')
    return float(ave_plddt)


# get highest plddt score from "ranking_debug.json"
def get_plddt_highest(af_ranking_file):
    """Return the largest value stored under ``'plddts'`` in a ranking JSON file."""
    # the context manager closes the file; a bare open() leaked the handle
    with open(af_ranking_file) as ranking_file:
        af_ranking = json.load(ranking_file)
    plddt = max(af_ranking['plddts'].values())
    return plddt

# get plddt per residue
# based on the original residue id within the corresponding pdb file
def get_plddt_perResidue(result_model_pkl):
    """Return {1-based position: pLDDT} from the ``'plddt'`` entry of a result pickle.

    NOTE(review): the file is unpickled with ``pd.read_pickle``; unpickling
    can execute arbitrary code, so only use files from a trusted source.
    """
    per_position = pd.read_pickle(result_model_pkl)['plddt']
    # entry i of the stored sequence belongs to residue i + 1
    return {position + 1: per_position[position] for position in range(len(per_position))}


# get model name for highest ranking
def get_ranked_0_model(af_ranking_file):
    """Return the first entry of the ``'order'`` list in a ranking JSON file."""
    # the context manager closes the file; a bare open() leaked the handle
    with open(af_ranking_file) as ranking_file:
        af_ranking = json.load(ranking_file)
    model_name = af_ranking['order'][0]
    return model_name


# get plddt per residue, index based on pairwise sequence alignment
def plddt_pair_perResidue(moving_pkl, ref_pkl, moving_fasta, ref_fasta):
    """Per-residue pLDDT of both structures on their shared residues.

    Parameters
    ----------
    moving_pkl, ref_pkl : result pickles read with ``get_plddt_perResidue``.
    moving_fasta, ref_fasta : FASTA files used to find the shared residues.

    Returns
    -------
    (plddt_moving, plddt_reference) : two dicts keyed by the reference
    residue position.
    """
    # shared residue positions of the two sequences
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    # pLDDT keyed by each structure's own 1-based position
    plddt_of_moving = get_plddt_perResidue(moving_pkl)
    plddt_of_ref = get_plddt_perResidue(ref_pkl)

    moving_by_ref_position = {}
    ref_by_ref_position = {}
    for idx_moving, idx_ref in zip(indices_moving, indices_ref):
        value_moving = plddt_of_moving[idx_moving]
        value_ref = plddt_of_ref[idx_ref]
        # re-key both values by the reference position for consistency
        moving_by_ref_position[idx_ref] = value_moving
        ref_by_ref_position[idx_ref] = value_ref

    return moving_by_ref_position, ref_by_ref_position




    
#### RSA from DSSP ####


# get relative accessible surface
def get_dssp(pdb_file):
    """Run DSSP on the first model of a PDB file and return the Bio.PDB ``DSSP`` object.

    The ``mkdssp`` executable is used, so it must be available on the system.
    """
    first_model = PDBParser().get_structure(str(pdb_file), pdb_file)[0]
    return DSSP(first_model, pdb_file, dssp = 'mkdssp')


# change from residue index to dssp index
def fix_index(pdb_file, chain_id, residue_id):
    """Translate a PDB residue number into the matching DSSP index.

    The residue is looked up in the dictionary returned by
    ``dssp_dict_from_pdb_file`` under the key ``(chain_id, (' ', residue_id, ' '))``.

    Returns
    -------
    Element 5 of the matching record (the DSSP index, see
    https://github.com/biopython/biopython/blob/master/Bio/PDB/DSSP.py),
    or ``None`` when the residue is not present.
    """
    residue_records = dssp_dict_from_pdb_file(pdb_file)[0]
    lookup_key = (chain_id, (' ', residue_id, ' '))
    if lookup_key not in residue_records:
        return None
    return residue_records[lookup_key][5]

    
# get single RSA value given the residue ID
def get_single_rsa(pdb_file, chain_id, residue_id):
    """Return the RSA value DSSP reports for one residue, or ``None`` if unavailable.

    The residue number is first translated to a DSSP index with ``fix_index``.
    The DSSP records are then searched for the entry whose element 0 (DSSP
    index) matches, and its element 3 (relative accessible surface, see
    https://biopython.org/docs/1.76/api/Bio.PDB.DSSP.html) is returned.
    """
    dssp_records = get_dssp(pdb_file)
    target_index = fix_index(pdb_file, chain_id, residue_id)

    if target_index is None:
        return None

    matching_rsa = (
        dssp_records[record_key][3]
        for record_key in dssp_records.keys()
        if dssp_records[record_key][0] == target_index
    )
    # first match wins; None when no record carries that DSSP index
    return next(matching_rsa, None)

    
# get RSA per residue, index based on pairwise sequence alignment
def _rsa_lookup(pdb_file, chain_id):
    """Build a ``residue number -> RSA or None`` function for one structure.

    The DSSP object and the DSSP dictionary are each built once here instead
    of once per residue. The lookup goes residue number -> DSSP index
    (element 5 of the dictionary record) -> RSA (element 3 of the DSSP record
    whose element 0 is that index).
    """
    dssp_data = get_dssp(pdb_file)
    dssp_dict = dssp_dict_from_pdb_file(pdb_file)[0]

    # DSSP index -> RSA; setdefault keeps the first record for an index
    rsa_by_dssp_index = {}
    for record_key in dssp_data.keys():
        record = dssp_data[record_key]
        rsa_by_dssp_index.setdefault(record[0], record[3])

    def rsa_for(residue_id):
        lookup_key = (chain_id, (' ', residue_id, ' '))
        if lookup_key not in dssp_dict:
            return None
        return rsa_by_dssp_index.get(dssp_dict[lookup_key][5])

    return rsa_for


def get_pair_rsa(moving_pdb, ref_pdb, moving_fasta, ref_fasta, moving_chain_id, ref_chain_id):
    """Per-residue RSA of both structures on their shared residues.

    Parameters
    ----------
    moving_pdb, ref_pdb : PDB files of the moving and the reference structure.
    moving_fasta, ref_fasta : FASTA files used to find the shared residues.
    moving_chain_id, ref_chain_id : chain identifiers used for the DSSP lookup.

    Returns
    -------
    (rsa_moving, rsa_reference) : two dicts keyed by the reference residue
    position. A residue for which no RSA is found is left out of the
    corresponding dict.
    """
    # extract common residues
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    # one DSSP run per structure rather than one per residue
    moving_rsa_for = _rsa_lookup(moving_pdb, moving_chain_id)
    ref_rsa_for = _rsa_lookup(ref_pdb, ref_chain_id)

    moving_rsa_perResidue = {}
    ref_rsa_perResidue = {}

    # index based on reference sequence
    for idx_moving, idx_ref in zip(indices_moving, indices_ref):
        rsa_moving = moving_rsa_for(idx_moving)
        if rsa_moving is not None:
            moving_rsa_perResidue[idx_ref] = rsa_moving  # use reference index for consistency

        rsa_ref = ref_rsa_for(idx_ref)
        if rsa_ref is not None:
            ref_rsa_perResidue[idx_ref] = rsa_ref

    return moving_rsa_perResidue, ref_rsa_perResidue

# Define pair IDs

In [3]:
# define pair IDs: pair number (as a string) -> quality label
# pairs 0-9 are labelled 'good'; pairs 10, 11 and 13 are labelled 'bad'
pair_ids = {str(pair_number): 'good' for pair_number in range(10)}
pair_ids.update({pair_number: 'bad' for pair_number in ('10', '11', '13')})

# Preprocessing

In [4]:
# extract lowest.tag
# For every result folder, write the tag of the lowest-scoring relax entry
# into a small text file next to its relax.sc.

af_dir = "alphafold_res"
tag_filename = "lowest.tag" # the text file that stores the tag of the relax structure with the lowest score

for result_folder in os.listdir(af_dir):
    if result_folder.startswith('.'):
        continue  # skip hidden entries
    print(result_folder)
    tag_path = os.path.join(af_dir, result_folder, tag_filename)
    relax_sc_path = os.path.join(af_dir, result_folder, "relax.sc")
    get_lowestTag(relax_sc_path, tag_path, num = 1)

bad_11_iso
good_1_iso
good_9_iso
good_4_ref
good_4_iso
good_6_ref
bad_10_iso
good_5_iso
good_0_ref
good_0_iso
good_6_iso
bad_11_ref
good_7_ref
bad_13_ref
good_2_iso
good_9_ref
good_3_ref
good_5_ref
good_8_iso
good_3_iso
good_7_iso
bad_10_ref
good_2_ref
good_1_ref
good_8_ref
bad_13_iso


In [5]:
# extract delta G from sc files
def _export_dG_csv(structure_id, af_root = "alphafold_res", out_dir = "dG_perResidue"):
    """Convert the per-residue score file of one structure into a CSV.

    The relax tag is read from ``<af_root>/<structure_id>/lowest.tag``. The
    file ``<tag>_perRes.sc`` in the same folder is parsed with
    ``dG_perResidue`` and written to ``<out_dir>/<structure_id>_relax.csv``
    with the columns ``residue_id`` and ``dG``. Returns the CSV path.
    """
    tag_path = os.path.join(af_root, structure_id, "lowest.tag")
    with open(tag_path, 'r') as f:
        relax_name = f.readline().strip()
    sc_path = os.path.join(af_root, structure_id, relax_name + "_perRes.sc")

    # read sc file into a two-column table
    scores = pd.DataFrame.from_dict(dG_perResidue(sc_path), orient = 'index', columns = ['dG'])
    scores = scores.reset_index().rename(columns = {'index': 'residue_id'})

    # save csv file
    csv_path = os.path.join(out_dir, structure_id + "_relax.csv")
    scores.to_csv(csv_path, index = False)
    return csv_path


# make sure the output folder exists so the cell also works on a fresh checkout
os.makedirs("dG_perResidue", exist_ok = True)

for key, value in pair_ids.items():
    # isoform first, then reference (ids look like "good_0_iso" / "good_0_ref")
    for suffix in ("_iso", "_ref"):
        _export_dG_csv(value + "_" + key + suffix)

# Calculate metrics

In [11]:
# loop through pairs
# Build one per-residue table per isoform/reference pair and save it as CSV.

tag_filename = "lowest.tag"
dG_dir = "dG_perResidue"   # csv files that store delta G score information
pairs_dir = "pairs_csv"    # output folder for the per-pair tables


def _lowest_relax_pdb(structure_id):
    """Path of the relaxed PDB named in the structure's tag file (AF + Relax)."""
    tag_path = os.path.join("alphafold_res", structure_id, tag_filename)
    with open(tag_path, 'r') as f:
        relax_name = f.readline().strip()
    return os.path.join("alphafold_res", structure_id, relax_name + ".pdb")


def _ranked_0_pkl(structure_id):
    """Path of the result pickle of the first model listed in ranking_debug.json."""
    model_name = get_ranked_0_model(os.path.join("alphafold_res", structure_id, "ranking_debug.json"))
    return os.path.join("alphafold_res", structure_id, "result_" + model_name + ".pkl")


# make sure the output folder exists so the cell also works on a fresh checkout
os.makedirs(pairs_dir, exist_ok = True)

for key, value in pair_ids.items():
    # get ids
    iso_id = value + "_" + key + "_iso"
    ref_id = value + "_" + key + "_ref"

    # fasta files
    iso_fasta_path = os.path.join("aa_seq", iso_id + ".fasta")
    ref_fasta_path = os.path.join("aa_seq", ref_id + ".fasta")

    # pdb files (AF + Relax)
    iso_pdb_path = _lowest_relax_pdb(iso_id)
    ref_pdb_path = _lowest_relax_pdb(ref_id)

    # csv files that store delta G score information
    iso_sc_path = os.path.join(dG_dir, iso_id + "_relax.csv")
    ref_sc_path = os.path.join(dG_dir, ref_id + "_relax.csv")

    #### indices for isoform ####
    indices_iso, indices_ref = pairwise_SA(iso_fasta_path, ref_fasta_path)
    indices_dict = dict(zip(indices_ref, indices_iso)) # use reference indices as keys
    # initialize the dataframe (dict views are materialised as lists)
    pair_df = pd.DataFrame({
        'Residue': list(indices_dict.keys()),
        'Residue_isoform': list(indices_dict.values())
    })

    #### add residue types ####
    residues = residue_type(iso_fasta_path, ref_fasta_path)
    pair_df['ResidueType'] = pair_df['Residue'].map(residues)

    #### RMSD ####
    rmsd = rmsd_pdb_perResidue(iso_pdb_path, ref_pdb_path, iso_fasta_path, ref_fasta_path)
    pair_df['RMSD'] = pair_df['Residue'].map(rmsd)

    #### dG and ddG ####
    ddG, dG_iso, dG_ref = ddG_perResidue(iso_sc_path, ref_sc_path, iso_fasta_path, ref_fasta_path) #ddG=iso-ref
    pair_df['ddG'] = pair_df['Residue'].map(ddG)
    pair_df['dG_iso'] = pair_df['Residue'].map(dG_iso)
    pair_df['dG_ref'] = pair_df['Residue'].map(dG_ref)

    #### plddt per residue ####
    iso_pkl_path = _ranked_0_pkl(iso_id)
    ref_pkl_path = _ranked_0_pkl(ref_id)
    iso_plddt, ref_plddt = plddt_pair_perResidue(iso_pkl_path, ref_pkl_path, iso_fasta_path, ref_fasta_path)
    pair_df['plddt_iso'] = pair_df['Residue'].map(iso_plddt)
    pair_df['plddt_ref'] = pair_df['Residue'].map(ref_plddt)

    #### RSA per residue ####
    rsa_iso, rsa_ref = get_pair_rsa(iso_pdb_path, ref_pdb_path, iso_fasta_path, ref_fasta_path, "A", "A")
    pair_df['rsa_iso'] = pair_df['Residue'].map(rsa_iso)
    pair_df['rsa_ref'] = pair_df['Residue'].map(rsa_ref)

    # print process
    print(key)

    # save to csv
    csv_name = "pair_" + value + "_" + key + ".csv"
    csv_path = os.path.join(pairs_dir, csv_name)
    pair_df.to_csv(csv_path, index = False)

0
1
2
3
4
5
6
7
8
9
10
11
13


In [7]:
#### metrics for pairs ####
# One summary row per pair, collected into metrics.csv.

# define hydrophilic & hydrophobic
hydrophobic = ['W', 'Y', 'F', 'I', 'L', 'M', 'V', 'A', 'C']
neutral = ['T', 'H', 'G', 'S', 'Q']
hydrophilic = ['R', 'E', 'N', 'K', 'P', 'D']


def _median_rsa(pair_table, column, residue_types = None):
    """Median of an RSA column, optionally restricted to the given residue letters."""
    if residue_types is None:
        values = pair_table[column]
    else:
        values = pair_table.loc[pair_table['ResidueType'].isin(residue_types), column]
    return values.dropna().median()


# save everything in a list
rows = []

# loop through pairs
for key, value in pair_ids.items():
    # pair id
    pair_id = key + "_" + value

    # get ids
    iso_id = value + "_" + key + "_iso"
    ref_id = value + "_" + key + "_ref"

    # relax score files of both structures
    sc_relax_iso = os.path.join("alphafold_res", iso_id, "relax.sc")
    sc_relax_ref = os.path.join("alphafold_res", ref_id, "relax.sc")

    # per-residue table of the current pair
    csv_name = "pair_" + value + "_" + key + ".csv"
    csv_path = os.path.join("pairs_csv", csv_name)
    df = pd.read_csv(csv_path)

    # lowest relax score (and its tag) for both structures
    dG_iso, tag_iso = get_lowestScore(sc_relax_iso)
    dG_ref, tag_ref = get_lowestScore(sc_relax_ref)

    # highest pLDDT listed in "ranking_debug.json" for both structures
    json_iso = os.path.join("alphafold_res", iso_id, "ranking_debug.json")
    json_ref = os.path.join("alphafold_res", ref_id, "ranking_debug.json")
    plddt_iso = get_plddt_highest(json_iso)
    plddt_ref = get_plddt_highest(json_ref)

    # pdb path for both iso and ref (structure with the lowest relax score)
    pdb_iso_path = os.path.join("alphafold_res", iso_id, tag_iso + ".pdb")
    pdb_ref_path = os.path.join("alphafold_res", ref_id, tag_ref + ".pdb")

    # fasta files for both iso and ref
    iso_fasta_path = os.path.join("aa_seq", iso_id + ".fasta")
    ref_fasta_path = os.path.join("aa_seq", ref_id + ".fasta")

    # overall RMSD
    rmsd = rmsd_pdb_overall(pdb_iso_path, pdb_ref_path, iso_fasta_path, ref_fasta_path)

    #### integrate everything into a dictionary (insertion order = column order)
    row = {'pair_id': pair_id,
           'rmsd': rmsd,
           'dG_iso': dG_iso,
           'dG_ref': dG_ref}

    # 10th and 90th percentiles of the per-residue delta G values
    for suffix in ('iso', 'ref'):
        row[f'dG_10q_low_{suffix}'] = df[f'dG_{suffix}'].quantile(0.1)
        row[f'dG_10q_high_{suffix}'] = df[f'dG_{suffix}'].quantile(0.9)

    row['plddt_iso'] = plddt_iso
    row['plddt_ref'] = plddt_ref

    # 10th and 90th percentiles of the per-residue plddt scores
    for suffix in ('iso', 'ref'):
        row[f'plddt_10q_low_{suffix}'] = df[f'plddt_{suffix}'].quantile(0.1)
        row[f'plddt_10q_high_{suffix}'] = df[f'plddt_{suffix}'].quantile(0.9)

    # median RSA over all residues, then per hydropathy class
    row['rsa_iso'] = _median_rsa(df, 'rsa_iso')
    row['rsa_ref'] = _median_rsa(df, 'rsa_ref')
    row['rsa_hydrophilic_iso'] = _median_rsa(df, 'rsa_iso', hydrophilic)
    row['rsa_hydrophilic_ref'] = _median_rsa(df, 'rsa_ref', hydrophilic)
    row['rsa_hydrophobic_iso'] = _median_rsa(df, 'rsa_iso', hydrophobic)
    row['rsa_hydrophobic_ref'] = _median_rsa(df, 'rsa_ref', hydrophobic)

    # median RSA of each residue type (one-letter codes), in the order
    # hydrophobic, neutral, hydrophilic
    for residue_letter in hydrophobic + neutral + hydrophilic:
        row[f'rsa_{residue_letter}_iso'] = _median_rsa(df, 'rsa_iso', [residue_letter])
        row[f'rsa_{residue_letter}_ref'] = _median_rsa(df, 'rsa_ref', [residue_letter])

    rows.append(row) # save in list


metrics_df = pd.DataFrame(rows)

metrics_df.to_csv('metrics.csv', index = False)

# Test

In [13]:
# Spot check: lowest relax score and its tag for one structure.
get_lowestScore("alphafold_res/bad_10_iso/relax.sc")

(-50.998, 'ranked_1_0005')

In [19]:
# Spot check: show the raw content of one ranking file.
# The context manager closes the file; a bare open() leaked the handle.
with open("alphafold_res/bad_10_ref/ranking_debug.json") as ranking_file:
    af_ranking = json.load(ranking_file)
af_ranking

{'plddts': {'model_1_pred_0': 95.97107870012965,
  'model_2_pred_0': 96.15198577210786,
  'model_3_pred_0': 95.92231012818021,
  'model_4_pred_0': 95.66374663187953,
  'model_5_pred_0': 95.65247339759524},
 'order': ['model_2_pred_0',
  'model_1_pred_0',
  'model_3_pred_0',
  'model_4_pred_0',
  'model_5_pred_0']}

In [23]:
# Spot check: the value returned should be the largest entry under 'plddts' in this file.
get_plddt_highest("alphafold_res/bad_10_ref/ranking_debug.json")

96.15198577210786

In [10]:
# Spot check: overall CA RMSD for pair good_0.
# Signature: rmsd_pdb_overall(moving_pdb, ref_pdb, moving_fasta, ref_fasta)
rmsd_pdb_overall("alphafold_res/good_0_iso/ranked_4_0001.pdb", "alphafold_res/good_0_ref/ranked_1_0001.pdb", "aa_seq/good_0_iso.fasta", "aa_seq/good_0_ref.fasta")

27.046520133671045

In [11]:
# Spot check: number of residues shared by the two good_0 sequences.
indices_moving, indices_ref = pairwise_SA("aa_seq/good_0_iso.fasta", "aa_seq/good_0_ref.fasta")
len(indices_moving)

744

In [15]:
# Spot check: number of distinct values in the ResidueType column of pair bad_10.
pair_bad_10_df = pd.read_csv("pairs_csv/pair_bad_10.csv")
len(pair_bad_10_df['ResidueType'].unique())

16