In [6]:
import pandas as pd
import subprocess
import os
import numpy as np
from Bio import SeqIO, pairwise2
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBParser, Superimposer
import warnings
import matplotlib.pyplot as plt
import json
from Bio.PDB.DSSP import dssp_dict_from_pdb_file
warnings.filterwarnings('ignore')

# Preprocess and select

In [3]:
# Candidate chain pairs: each entry names a PDB entry ("pdb_id"), the chain to extract
# ("chain"), and a "pair" number shared by the two members of a pair.
# Commented-out entries are pairs that are currently not selected.
# NOTE(review): the saved output of this cell lists rows (e.g. 1cul, and chain "UA" for 7mpi)
# that do not match this list, so it appears to come from an earlier version — re-run to refresh.
chains = [
    {"pdb_id": "4i43", "chain": "A", "pair": 1},
    {"pdb_id": "4ilg", "chain": "A", "pair": 1},
    #{"pdb_id": "1cul", "chain": "C", "pair": 2},
    #{"pdb_id": "1cs4", "chain": "C", "pair": 2},
    {"pdb_id": "7mpi", "chain": "L", "pair": 3},
    {"pdb_id": "7b7d", "chain": "L", "pair": 3},
    #{"pdb_id": "1efm", "chain": "A", "pair": 4},
    #{"pdb_id": "1efu", "chain": "A", "pair": 4},
    {"pdb_id": "6hoo", "chain": "A", "pair": 5},
    {"pdb_id": "6hb8", "chain": "A", "pair": 5},
    {"pdb_id": "6tac", "chain": "A", "pair": 6},
    {"pdb_id": "6ta2", "chain": "A", "pair": 6}, #only 2 AA indel
    #{"pdb_id": "5ec9", "chain": "A", "pair": 7},
    #{"pdb_id": "4zsh", "chain": "A", "pair": 7},
    {"pdb_id": "2z2b", "chain": "A", "pair": 8},
    {"pdb_id": "2z2a", "chain": "A", "pair": 8},
    #{"pdb_id": "4b1c", "chain": "A", "pair": 9},
    #{"pdb_id": "4b0q", "chain": "A", "pair": 9},
    #{"pdb_id": "8u1u", "chain": "E", "pair": 10},
    #{"pdb_id": "8thl", "chain": "F", "pair": 10},
    #{"pdb_id": "7b74", "chain": "A", "pair": 11},
    #{"pdb_id": "7dy0", "chain": "A", "pair": 11},
    {"pdb_id": "8sar", "chain": "B", "pair": 12},
    {"pdb_id": "8saq", "chain": "B", "pair": 12},
    #{"pdb_id": "3stc", "chain": "A", "pair": 13},
    #{"pdb_id": "3qq1", "chain": "A", "pair": 13},
    {"pdb_id": "1c8s", "chain": "A", "pair": 14},
    {"pdb_id": "1c8r", "chain": "A", "pair": 14},
    {"pdb_id": "4awh", "chain": "A", "pair": 15},
    {"pdb_id": "4e5e", "chain": "A", "pair": 15},
    {"pdb_id": "6n4c", "chain": "F", "pair": 16},
    {"pdb_id": "6k4y", "chain": "F", "pair": 16},
    #{"pdb_id": "1wcm", "chain": "D", "pair": 17},
    #{"pdb_id": "1y14", "chain": "A", "pair": 17},
    {"pdb_id": "8e31", "chain": "C", "pair": 18},
    {"pdb_id": "8e3b", "chain": "C", "pair": 18}, #only 3 AA indel
    {"pdb_id": "6bhl", "chain": "A", "pair": 19},
    {"pdb_id": "6b2f", "chain": "A", "pair": 19},
    #{"pdb_id": "8hj1", "chain": "A", "pair": 20},
    #{"pdb_id": "8hj0", "chain": "A", "pair": 20},
    {"pdb_id": "1yae", "chain": "A", "pair": 21},
    {"pdb_id": "1vso", "chain": "A", "pair": 21},
    {"pdb_id": "5fxj", "chain": "A", "pair": 22},
    {"pdb_id": "4pe5", "chain": "A", "pair": 22},
    #{"pdb_id": "3gic", "chain": "B", "pair": 23},
    #{"pdb_id": "3f68", "chain": "H", "pair": 23},
    {"pdb_id": "5l1h", "chain": "A", "pair": 24},
    {"pdb_id": "5vot", "chain": "A", "pair": 24}
]

# Tabulate the selected chains (one row per entry) and preview the first rows.
chains_df = pd.DataFrame(chains)
chains_df.head()

Unnamed: 0,pdb_id,chain,pair
0,4i43,A,1
1,4ilg,A,1
2,1cul,C,2
3,1cs4,C,2
4,7mpi,UA,3


In [9]:
# Extract a clean pdb file and a sequence file for every selected chain by running
# Rosetta's clean_pdb.py on the corresponding file in original_files/.
# Output goes to the current working directory; do not run again.

# NOTE(review): absolute, cluster-specific path — adjust when running on another machine.
clean_pdb_script = "/scicore/home/zavolan/zhu0006/3D_structure/Rosetta/rosetta.binary.linux.release-371/main/tools/protein_tools/scripts/clean_pdb.py"

# NOTE(review): `dir` shadows the built-in dir(); the name is kept in case later cells use it.
dir = "original_files"

for index, row in chains_df.iterrows():
    pdb_ori = row['pdb_id'] + ".pdb"
    pdb_ori_path = os.path.join(dir, pdb_ori)
    chain = row['chain']
    if not os.path.exists(pdb_ori_path):
        # entries without a downloaded file are skipped silently
        continue
    # Arguments are passed as a list (no shell), so paths and chain IDs are never
    # interpreted by a shell.
    result = subprocess.run([clean_pdb_script, pdb_ori_path, chain])
    if result.returncode != 0:
        # report failures instead of ignoring them; processing continues with the next chain
        print(f"clean_pdb.py failed for {pdb_ori_path} chain {chain} (exit code {result.returncode})")

Found existing PDB file at original_files/4i43.pdb
4i43 A   334 --- --- MOD --- OK
>4i43_A
NTVPFTSAPIEVTIGIDQYSFNVKENQPFHGIKDIPIGHVHVIHFQHADNSSMRYGYWFDCRMGNFYIQYDPKDGLYKMMEERDGAKFENIRQMMVSYPKIDEDDTWYNLTEFVQMDKIRKIVRKDENQFSYVDSSMTTVQENELLKSSLEAKNEDDPAHSLNYTVINFKSREAIRPGHEMEDFLDKSYYLNTVMLQGIFKNSSNYFGELQFAFLNAMFFGNYGSSLQWHAMIELICSSATVPKHMLDKLDEILYYQIKTLPEQYSDILLNERVWNICLYSSFQKNSLHNTEKIMENKYPELLGKDNDDALIYGDDEDDEHNPTIVGGLYYQRP
Found existing PDB file at original_files/4ilg.pdb
4ilg A   321 ALT --- MOD --- OK
>4ilg_A
MNTVPFTSAPIEVTIGIDQYSFNVKENQPFHGIKDIPIGHVHVIHFQHADNSSMRYGYWFDCRMGNFYIQYDPKDGLYKMMEERDGAKFENIVHNFKERQMMVSYPKIDEDDTWYNLTEFVQMDKIRKIVRKDENQFSYVDSSMTTVQENELSSSSSDPAHSLNYTVINFKSREAIRPGHEMEDFLDKSYYLNTVMLQGIFKNSSNYFGELQFAFLNAMFFGNYGSSLQWHAMIELICSSATVPKHMLDKLDEILYYQIKTLPEQYSDILLNERVWNICLYSSFQKNSLHNTEKIMENKYPELLDDEHNPTIVGGLYYQRP
Found existing PDB file at original_files/1cul.pdb
1cul C   328 --- --- MOD --- OK
>1cul_C
ATHRLLLLGAGESGKSTIVKQMRILHVGEKATKVQDIKNNLKEAIETIVAAMSNLVPPVELANPENQFRV

In [10]:
# Print every selected PDB ID that has no file in chain_files/ yet.
# The PDB ID of a file is taken to be the part of its name before the first "_".
# A comprehension is used so that no loop variable named `id`/`file` is left behind
# to shadow the built-ins for the rest of the notebook.
ids = [chain_file.split('_')[0] for chain_file in os.listdir("chain_files")]

extracted_ids = set(ids)  # constant-time membership test
for pdb_id in chains_df['pdb_id']:
    if pdb_id not in extracted_ids:
        print(pdb_id)

7mpi
7b7d
1efm
8thl
7b74
4w5w
3f68


In [None]:
# Resolved the missing entries listed above: they were caused by typos in the IDs,
# chain IDs that had to be replaced, and pdb files stored in gz format.
# I checked each pair using pairwise sequence alignment
# and found that, in some examples, the indels present in the sequence files
# are not present in the fasta files extracted from the pdb files.
# I excluded these examples, which leaves 14 pairs.

# Useful functions

In [7]:
# to trim predicted wildtype pdb file
# only for deletion mutants paper
def trim_wildtype(wt_pdb, del_pos, del_folder):
    """Write a copy of a PDB file without the atoms of one residue.

    Only ATOM records are copied; every other record type is left out of the output.
    The output file is named "wt_del<del_pos>_<basename of wt_pdb>" and is written
    into ``del_folder``.

    Parameters
    ----------
    wt_pdb : str
        Path of the PDB file to trim.
    del_pos : int
        Residue sequence number whose ATOM records are dropped.
    del_folder : str
        Existing folder that receives the trimmed file.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If an ATOM record has no integer in the residue-number columns.
    """
    trimmed_wt_filename = "wt_del" + str(del_pos) + "_" + os.path.basename(wt_pdb)
    trimmed_wt_filepath = os.path.join(del_folder, trimmed_wt_filename)
    with open(wt_pdb, "r") as source, open(trimmed_wt_filepath, "w") as target:
        for line in source:
            if not line.startswith("ATOM"):
                continue
            # PDB is a fixed-column format: the residue sequence number occupies
            # columns 23-26. Splitting on whitespace breaks as soon as two fields
            # touch (e.g. chain "A" + residue 1000 -> "A1000") or the chain ID is blank.
            current_res = int(line[22:26])
            if current_res != del_pos:
                target.write(line)
    return 0


#### RMSD ####

    
# to calculate rmsd using Bio.PDB
# predicted_pdb: moving pdb comparing to reference, need to have the same number of CA atoms as the ref_pdb
def rmsd_pdb(predicted_pdb, ref_pdb):
    """Return the CA RMSD after superimposing ``predicted_pdb`` onto ``ref_pdb``.

    All CA atoms of the first model of each file are used, paired in file order,
    so both files must contain the same number of CA atoms.
    """
    pdb_parser = PDBParser()

    def _ca_atoms(pdb_path):
        # CA atoms of the first model, in file order
        structure = pdb_parser.get_structure(os.path.basename(pdb_path), pdb_path)
        return [atom for atom in structure[0].get_atoms() if atom.name == "CA"]

    fixed = _ca_atoms(ref_pdb)
    moving = _ca_atoms(predicted_pdb)

    superimposer = Superimposer()
    # find the rotation/translation that best maps the moving atoms onto the fixed atoms
    superimposer.set_atoms(fixed, moving)
    # move the predicted CA atoms onto the reference
    superimposer.apply(moving)

    return superimposer.rms


def rmsd_point(coordinate1, coordinate2):
    """Distance between two 3D points (the "RMSD" of a single point).

    Each argument is an indexable triple (x, y, z),
    for example array([ 11.925492,  10.070204, -12.518902], dtype=float32).
    """
    dx = coordinate1[0] - coordinate2[0]
    dy = coordinate1[1] - coordinate2[1]
    dz = coordinate1[2] - coordinate2[2]
    return np.sqrt(dx**2 + dy**2 + dz**2)


def rmsd_list(coordinates1, coordinates2):
    """RMSD between two equally long lists of 3D coordinates.

    Points are paired by position; the result is the square root of the mean
    squared distance over the ``len(coordinates1)`` pairs.
    """
    n_points = len(coordinates1)
    squared_total = 0
    for position in range(n_points):
        first = coordinates1[position]
        second = coordinates2[position]
        squared_total += (first[0]-second[0])**2 + (first[1]-second[1])**2 + (first[2]-second[2])**2
    return np.sqrt(squared_total/n_points)



#### Pairwise Sequence Alignment ####
    

# load fasta file to get the sequence
def load_fasta(fasta_file):
    """Return the sequence of the first record in ``fasta_file``."""
    with open(fasta_file, 'r') as handle:
        first_record = next(SeqIO.parse(handle, 'fasta'))
        return first_record.seq


# extract common residues from two sequences
def common_residues(seq1, seq2):
    """Return the positions of identical residues in two aligned sequences.

    Parameters
    ----------
    seq1, seq2 : str or sequence of single characters
        Aligned sequences including gaps ('-'); they should have the same length.

    Returns
    -------
    (list of int, list of int)
        1-based residue indices in the original (ungapped) sequences, one entry per
        alignment column where both sequences carry the same residue. Both lists
        always have the same length.
    """
    common_indices1 = []
    common_indices2 = []
    res_idx1 = 0
    res_idx2 = 0
    for a, b in zip(seq1, seq2):
        has_residue1 = a != '-'
        has_residue2 = b != '-'
        # a gap does not advance the residue counter of its sequence
        if has_residue1:
            res_idx1 += 1
        if has_residue2:
            res_idx2 += 1
        # a column with a gap in both sequences is not a common residue,
        # even though the two characters compare equal
        if has_residue1 and has_residue2 and a == b:
            common_indices1.append(res_idx1)
            common_indices2.append(res_idx2)
    return common_indices1, common_indices2

    
# return common indices for each sequence
def pairwise_SA(moving_fasta, ref_fasta):
    """Align the two FASTA sequences globally and return their common residue indices.

    The first alignment returned by ``pairwise2.align.globalxx`` is used. The result
    is ``(indices_moving, indices_ref)`` as produced by ``common_residues``.
    """
    moving_sequence = load_fasta(moving_fasta)
    reference_sequence = load_fasta(ref_fasta)

    # keep only the first reported alignment
    first_alignment = pairwise2.align.globalxx(moving_sequence, reference_sequence)[0]
    gapped_moving, gapped_ref = first_alignment[:2]

    indices_moving, indices_ref = common_residues(gapped_moving, gapped_ref)
    return indices_moving, indices_ref


#### read residue types from pairs ####
def residue_type(moving_fasta, ref_fasta):
    """Map each common residue (reference index) to its residue letter.

    Returns a dictionary {reference index: residue letter}. If the letters read
    from the two sequences disagree, "residue inconsistency!" is printed and 1 is
    returned instead.
    """
    seq_moving = load_fasta(moving_fasta)
    seq_ref = load_fasta(ref_fasta)

    # first global alignment, then the positions where both sequences agree
    top_alignment = pairwise2.align.globalxx(seq_moving, seq_ref)[0]
    indices_moving, indices_ref = common_residues(top_alignment[0], top_alignment[1])

    residue_moving = {}
    residue_ref = {}
    for position, idx_ref in enumerate(indices_ref):
        idx_moving = indices_moving[position]
        # indices are 1-based, sequences are 0-based; both dictionaries use the reference index as key
        residue_moving[idx_ref] = seq_moving[idx_moving-1]
        residue_ref[idx_ref] = seq_ref[idx_ref-1]

    if residue_moving != residue_ref:
        print("residue inconsistency!")
        return 1

    # both dictionaries are equal here, so one is enough
    return residue_ref
    
    
#### RMSD with pair ####
    
# shared preparation for rmsd_pdb_perResidue and rmsd_pdb_overall
def _superimposed_ca_atoms(moving_pdb, ref_pdb, moving_fasta, ref_fasta):
    """Select the CA atoms of the common residues and superimpose moving onto reference.

    Common residues come from ``pairwise_SA``; an atom is selected when its residue
    number is one of the common indices, so the residue numbers in the PDB files have
    to match the 1-based positions in the FASTA sequences.

    Returns ``(moving, fixed)``: two lists of CA atoms paired by position. The moving
    atoms have already been transformed onto the fixed ones.
    """
    parser = PDBParser()
    struct_moving = parser.get_structure(os.path.basename(moving_pdb), moving_pdb)
    struct_ref = parser.get_structure(os.path.basename(ref_pdb), ref_pdb)

    # extract common residues; sets give constant-time membership tests per atom
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)
    wanted_moving = set(indices_moving)
    wanted_ref = set(indices_ref)

    # get CA atoms from PDBs (full_id[3][1] is the residue number)
    moving = [atom for atom in struct_moving[0].get_atoms()
              if (atom.name == "CA") and (atom.full_id[3][1] in wanted_moving)]
    fixed = [atom for atom in struct_ref[0].get_atoms()
             if (atom.name == "CA") and (atom.full_id[3][1] in wanted_ref)]

    # Superimposer.set_atoms rejects lists of different length; the message printed here
    # names the cause before that error is raised
    if len(moving) != len(fixed):
        print("Two structures have different numbers of residues!")

    sup = Superimposer()
    sup.set_atoms(fixed, moving)
    sup.apply(moving)
    return moving, fixed


# to calculate rmsd per residue using Bio.PDB
def rmsd_pdb_perResidue(moving_pdb, ref_pdb, moving_fasta, ref_fasta):
    """Per-residue CA deviation between two structures after superposition.

    Returns a dictionary {reference residue number: distance between the paired
    CA atoms} covering the common residues of the two sequences.
    """
    moving, fixed = _superimposed_ca_atoms(moving_pdb, ref_pdb, moving_fasta, ref_fasta)

    # calculate rmsd per residue (CA), keyed by the residue number of the reference atom
    rmsd_perResidue = {}
    for atom_fixed, atom_moving in zip(fixed, moving):
        residue_id = atom_fixed.full_id[3][1]
        rmsd_perResidue[residue_id] = rmsd_point(atom_fixed.get_coord(), atom_moving.get_coord())

    return rmsd_perResidue


# to calculate rmsd overall using Bio.PDB
# return rmsd between two structures
# indices consistent with the previous function "rmsd_pdb_perResidue"
def rmsd_pdb_overall(moving_pdb, ref_pdb, moving_fasta, ref_fasta):
    """Overall CA RMSD between two structures over their common residues."""
    moving, fixed = _superimposed_ca_atoms(moving_pdb, ref_pdb, moving_fasta, ref_fasta)

    coords_moving = [atom.get_coord() for atom in moving]
    coords_fixed = [atom.get_coord() for atom in fixed]

    rmsd = rmsd_list(coords_moving, coords_fixed)

    return rmsd



#### delta delta G ####
    
# extract delta G per residue out of sc file
def dG_perResidue(sc_path):
    """Read a per-residue score file into {residue id: score}.

    The first line is skipped. On every other line, whitespace-separated column 22
    holds the score and column 23 holds a label whose second "_"-separated part is
    the residue id.
    """
    with open(sc_path, 'r') as handle:
        next(handle, None)  # skip the first line
        rows = (raw_line.split() for raw_line in handle)
        return {int(fields[23].split("_")[1]): float(fields[22]) for fields in rows}

# to calculate ddG within pairs
def ddG_perResidue(moving_sc, ref_sc, moving_fasta, ref_fasta):
    """Per-residue dG of both structures and their difference for the common residues.

    ``moving_sc`` and ``ref_sc`` are csv files with the columns 'residue_id' and 'dG'.
    Returns three dictionaries keyed by the reference residue index:
    ddG (moving - reference, rounded to 3 decimals), dG of the moving structure and
    dG of the reference structure.
    """
    #### input csv files !!! ####

    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    table_moving = pd.read_csv(moving_sc)
    table_ref = pd.read_csv(ref_sc)

    def _first_dG(table, residue_id):
        # first 'dG' entry recorded for this residue id
        return table.loc[table['residue_id'] == residue_id, 'dG'].iloc[0]

    dG_iso_by_residue = {}
    dG_ref_by_residue = {}
    ddG_by_residue = {}

    for position, idx_ref in enumerate(indices_ref):
        score_moving = _first_dG(table_moving, indices_moving[position])
        score_ref = _first_dG(table_ref, idx_ref)

        # use index for reference as residue id
        dG_iso_by_residue[idx_ref] = score_moving
        dG_ref_by_residue[idx_ref] = score_ref
        ddG_by_residue[idx_ref] = round(score_moving - score_ref, 3)

    return ddG_by_residue, dG_iso_by_residue, dG_ref_by_residue
    

#### extract fa_sol term out of sc files ####
def fa_sol_perResidue(sc_path):
    """Read the fa_sol column of a per-residue score file into {residue id: fa_sol}.

    The first line is skipped. On every other line, whitespace-separated column 5
    holds fa_sol and column 23 holds a label whose second "_"-separated part is
    the residue id.
    """
    with open(sc_path, 'r') as handle:
        next(handle, None)  # skip the first line
        rows = (raw_line.split() for raw_line in handle)
        return {int(fields[23].split("_")[1]): float(fields[5]) for fields in rows}

    
# get fa_sol within pairs
def get_pair_fa_sol(moving_sc, ref_sc, moving_fasta, ref_fasta):
    """Per-residue fa_sol of both structures for the common residues.

    ``moving_sc`` and ``ref_sc`` are csv files with the columns 'residue_id' and
    'fa_sol'. Returns two dictionaries keyed by the reference residue index:
    fa_sol of the moving structure and fa_sol of the reference structure.
    """
    #### input csv files !!! ####

    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    table_moving = pd.read_csv(moving_sc)
    table_ref = pd.read_csv(ref_sc)

    def _first_fa_sol(table, residue_id):
        # first 'fa_sol' entry recorded for this residue id
        return table.loc[table['residue_id'] == residue_id, 'fa_sol'].iloc[0]

    fa_sol_iso_by_residue = {}
    fa_sol_ref_by_residue = {}

    for position, idx_ref in enumerate(indices_ref):
        score_moving = _first_fa_sol(table_moving, indices_moving[position])
        score_ref = _first_fa_sol(table_ref, idx_ref)

        # use index for reference as residue id
        fa_sol_iso_by_residue[idx_ref] = score_moving
        fa_sol_ref_by_residue[idx_ref] = score_ref

    return fa_sol_iso_by_residue, fa_sol_ref_by_residue

    
# get tags with the lowest relax score
# here it generates a tag file (basically a txt file)
# that contains 3 ids with lowest score by default

def get_lowestTag(sc_file, tag_file, num = 3):
    """Write the ids of the lowest-scoring entries of a score file to a tag file.

    Parameters
    ----------
    sc_file : str
        Score file generated by Rosetta Relax. The first two lines are skipped; on
        every other non-blank line, whitespace-separated column 1 is the score and
        column 23 is the id.
    tag_file : str
        Path of the file to write (a file path, not a folder); one id per line,
        lowest score first. Must differ from ``sc_file``.
    num : int, optional
        Number of ids to write (default 3).

    Returns
    -------
    int
        0 on success, 1 if ``sc_file`` and ``tag_file`` are the same path.
    """
    if sc_file==tag_file:
        print("score file and tag file should be different!")
        return 1

    scored_ids = []
    with open(sc_file, "r") as f:
        for line_number, line in enumerate(f):
            if line_number < 2:
                continue  # the first two lines carry no scores
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines, e.g. at the end of the file
            scored_ids.append((float(fields[1]), fields[23]))

    # sort by score only; list.sort is stable, so entries with equal scores keep file order
    scored_ids.sort(key = lambda scored: scored[0])
    lowest_ids = [entry_id for _, entry_id in scored_ids[:num]]

    with open(tag_file, 'w') as f:
        for entry_id in lowest_ids:
            f.write(f"{entry_id}\n")
    return 0



#### plddt score (with pair) ####
    
# get plddt score from "ranking_debug.json"
def get_plddt(af_ranking_file):
    """Return the mean of the 'plddts' values in a ranking json file, to 3 decimals.

    Raises ZeroDivisionError if the 'plddts' mapping is empty.
    """
    # the context manager closes the file again (a bare open() left the handle open)
    with open(af_ranking_file) as handle:
        af_ranking = json.load(handle)
    plddts = af_ranking['plddts'].values()
    ave_plddt = format(sum(plddts) / len(plddts), '.3f')
    return float(ave_plddt)


# get plddt per residue
# based on the original residue id within the corresponding pdb file
def get_plddt_perResidue(result_model_pkl):
    """Return {1-based residue position: plddt} from the 'plddt' entry of a result pickle.

    NOTE: the file is unpickled, so only load pickles from a trusted source.
    """
    scores = pd.read_pickle(result_model_pkl)['plddt']
    # position i in the array belongs to residue i+1
    return {position + 1: scores[position] for position in range(len(scores))}


# get model name for highest ranking
def get_ranked_0_model(af_ranking_file):
    """Return the first model name in the 'order' list of a ranking json file."""
    # the context manager closes the file again (a bare open() left the handle open)
    with open(af_ranking_file) as handle:
        af_ranking = json.load(handle)
    model_name = af_ranking['order'][0]
    return model_name


# get plddt per residue, index based on pairwise sequence alignment
def plddt_pair_perResidue(moving_pkl, ref_pkl, moving_fasta, ref_fasta):
    """Per-residue plddt of both structures for the common residues.

    ``moving_pkl`` and ``ref_pkl`` are result pickle files read with
    ``get_plddt_perResidue``. Returns two dictionaries keyed by the reference
    residue index: plddt of the moving structure and plddt of the reference.
    """
    ### input pkl files

    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    plddt_of_moving = get_plddt_perResidue(moving_pkl)
    plddt_of_ref = get_plddt_perResidue(ref_pkl)

    moving_by_ref_index = {}
    ref_by_ref_index = {}

    # re-key both score sets by the reference index for consistency
    for position, idx_ref in enumerate(indices_ref):
        score_moving = plddt_of_moving[indices_moving[position]]
        score_ref = plddt_of_ref[idx_ref]
        moving_by_ref_index[idx_ref] = score_moving
        ref_by_ref_index[idx_ref] = score_ref

    return moving_by_ref_index, ref_by_ref_index



#### RSA from DSSP ####


# get relative accessible surface
def get_dssp(pdb_file):
    """Run DSSP (executable 'mkdssp') on the first model of ``pdb_file``.

    Returns the Bio.PDB ``DSSP`` object for that model.
    Requires ``DSSP`` and ``PDBParser`` from Bio.PDB.
    """
    first_model = PDBParser().get_structure(str(pdb_file), pdb_file)[0]
    return DSSP(first_model, pdb_file, dssp = 'mkdssp')


# change from residue index to dssp index
def fix_index(pdb_file, chain_id, residue_id):
    """Return the DSSP index of a residue, or None if DSSP has no entry for it.

    The residue is looked up as (chain_id, (' ', residue_id, ' ')) in the dictionary
    returned by ``dssp_dict_from_pdb_file``; element 5 of its entry is the DSSP index
    (reference: https://github.com/biopython/biopython/blob/master/Bio/PDB/DSSP.py).
    """
    residue_table = dssp_dict_from_pdb_file(pdb_file)[0]
    lookup_key = (chain_id, (' ', residue_id, ' '))
    if lookup_key not in residue_table.keys():
        return None
    return residue_table[lookup_key][5]
    
# get single RSA value given the residue ID
def get_single_rsa(pdb_file, chain_id, residue_id):
    """Return the relative accessible surface of one residue, or None if unavailable.

    The residue id is first translated to a DSSP index with ``fix_index``; the value
    is then taken from the first DSSP entry carrying that index
    (entry[0] is the DSSP index and entry[3] the value returned, see
    https://biopython.org/docs/1.76/api/Bio.PDB.DSSP.html).
    """
    dssp_data = get_dssp(pdb_file)
    dssp_id = fix_index(pdb_file, chain_id, residue_id)

    if dssp_id is None:
        return None

    matching_values = (
        dssp_data[key][3]
        for key in dssp_data.keys()
        if dssp_data[key][0] == dssp_id
    )
    return next(matching_values, None)

    
# collect the RSA of several residues of one structure with a single DSSP pass
def _rsa_for_residues(pdb_file, chain_id, residue_ids):
    """Return {residue id: RSA} for the given residues of one chain.

    Residues without a DSSP entry are left out. The lookups are the same as in
    ``fix_index`` and ``get_single_rsa``, but DSSP is executed once per structure
    instead of once per residue.
    """
    dssp_data = get_dssp(pdb_file)
    residue_table = dssp_dict_from_pdb_file(pdb_file)[0]

    # DSSP index -> RSA; setdefault keeps the first entry found for an index
    rsa_by_dssp_index = {}
    for key in dssp_data.keys():
        entry = dssp_data[key]
        rsa_by_dssp_index.setdefault(entry[0], entry[3])

    rsa_by_residue = {}
    for residue_id in residue_ids:
        lookup_key = (chain_id, (' ', residue_id, ' '))
        if lookup_key not in residue_table:
            continue
        rsa = rsa_by_dssp_index.get(residue_table[lookup_key][5])
        if rsa is not None:
            rsa_by_residue[residue_id] = rsa
    return rsa_by_residue


# get RSA per residue, index based on pairwise sequence alignment
def get_pair_rsa(moving_pdb, ref_pdb, moving_fasta, ref_fasta, moving_chain_id, ref_chain_id):
    """Per-residue RSA of both structures for the common residues.

    Returns two dictionaries keyed by the reference residue index: RSA of the moving
    structure and RSA of the reference structure. A residue for which DSSP reports
    nothing is missing from the corresponding dictionary.

    Requires ``SeqIO``/``pairwise2`` from Bio and ``DSSP``/``PDBParser`` from Bio.PDB.
    """
    # extract common residues
    indices_moving, indices_ref = pairwise_SA(moving_fasta, ref_fasta)

    # one DSSP pass per structure
    rsa_moving_by_id = _rsa_for_residues(moving_pdb, moving_chain_id, indices_moving)
    rsa_ref_by_id = _rsa_for_residues(ref_pdb, ref_chain_id, indices_ref)

    moving_rsa_perResidue = {}
    ref_rsa_perResidue = {}

    # index based on reference sequence, for consistency between the two dictionaries
    for idx_moving, idx_ref in zip(indices_moving, indices_ref):
        if idx_moving in rsa_moving_by_id:
            moving_rsa_perResidue[idx_ref] = rsa_moving_by_id[idx_moving]
        if idx_ref in rsa_ref_by_id:
            ref_rsa_perResidue[idx_ref] = rsa_ref_by_id[idx_ref]

    return moving_rsa_perResidue, ref_rsa_perResidue

# Final examples to use

In [8]:
# renumber the isoform and the canonical
# we define the one with higher plddt score as the canonical
#
# Keys of each entry:
#   pdb_id, chain : PDB entry and chain ID
#   pair          : pair number (1-14), shared by the two members of a pair
#   isoform       : 1 for the isoform, 0 for the canonical (reference) structure
#   insertion, indel_num : presumably which member carries the inserted residues and the
#                          number of indels in the pair — TODO confirm

examples = [
    {"pdb_id": "4i43", "chain": "A", "pair": 1, "isoform": 1, "insertion": 1, "indel_num": 3},
    {"pdb_id": "4ilg", "chain": "A", "pair": 1, "isoform": 0, "insertion": 0, "indel_num": 3},
    {"pdb_id": "7mpi", "chain": "L", "pair": 2, "isoform": 0, "insertion": 0, "indel_num": 1},
    {"pdb_id": "7b7d", "chain": "L", "pair": 2, "isoform": 1, "insertion": 1, "indel_num": 1},
    {"pdb_id": "6hoo", "chain": "A", "pair": 3, "isoform": 1, "insertion": 1, "indel_num": 1},
    {"pdb_id": "6hb8", "chain": "A", "pair": 3, "isoform": 0, "insertion": 0, "indel_num": 1},
    {"pdb_id": "6tac", "chain": "A", "pair": 4, "isoform": 0, "insertion": 0, "indel_num": 1},
    {"pdb_id": "6ta2", "chain": "A", "pair": 4, "isoform": 1, "insertion": 1, "indel_num": 1}, #only 2 AA indel
    {"pdb_id": "2z2b", "chain": "A", "pair": 5, "isoform": 0, "insertion": 0, "indel_num": 2},
    {"pdb_id": "2z2a", "chain": "A", "pair": 5, "isoform": 1, "insertion": 1, "indel_num": 2},
    {"pdb_id": "8sar", "chain": "B", "pair": 6, "isoform": 1, "insertion": 0, "indel_num": 1},
    {"pdb_id": "8saq", "chain": "B", "pair": 6, "isoform": 0, "insertion": 1, "indel_num": 1},
    {"pdb_id": "1c8s", "chain": "A", "pair": 7, "isoform": 1, "insertion": 0, "indel_num": 1},
    {"pdb_id": "1c8r", "chain": "A", "pair": 7, "isoform": 0, "insertion": 1, "indel_num": 1},
    {"pdb_id": "4awh", "chain": "A", "pair": 8, "isoform": 1, "insertion": 1, "indel_num": 1},
    {"pdb_id": "4e5e", "chain": "A", "pair": 8, "isoform": 0, "insertion": 0, "indel_num": 1},
    {"pdb_id": "6n4c", "chain": "F", "pair": 9, "isoform": 1, "insertion": 0, "indel_num": 4},
    {"pdb_id": "6k4y", "chain": "F", "pair": 9, "isoform": 0, "insertion": 1, "indel_num": 4},
    {"pdb_id": "8e31", "chain": "C", "pair": 10, "isoform": 0, "insertion": 0, "indel_num": 1},
    {"pdb_id": "8e3b", "chain": "C", "pair": 10, "isoform": 1, "insertion": 1, "indel_num": 1}, #only 3 AA indel
    {"pdb_id": "6bhl", "chain": "A", "pair": 11, "isoform": 1, "insertion": 0, "indel_num": 5},
    {"pdb_id": "6b2f", "chain": "A", "pair": 11, "isoform": 0, "insertion": 1, "indel_num": 5},
    {"pdb_id": "1yae", "chain": "A", "pair": 12, "isoform": 1, "insertion": 1, "indel_num": 3},
    {"pdb_id": "1vso", "chain": "A", "pair": 12, "isoform": 0, "insertion": 0, "indel_num": 3},
    {"pdb_id": "5fxj", "chain": "A", "pair": 13, "isoform": 0, "insertion": 1, "indel_num": 6},
    {"pdb_id": "4pe5", "chain": "A", "pair": 13, "isoform": 1, "insertion": 0, "indel_num": 6},
    {"pdb_id": "5l1h", "chain": "A", "pair": 14, "isoform": 1, "insertion": 0, "indel_num": 3},
    {"pdb_id": "5vot", "chain": "A", "pair": 14, "isoform": 0, "insertion": 1, "indel_num": 3}
]

# one row per structure; the later cells select rows by 'pair' and 'isoform'
df = pd.DataFrame(examples)

In [47]:
# test: chain ID of the isoform member of pair 1
is_pair1_isoform = (df['pair'] == 1) & (df['isoform'] == 1)
df.loc[is_pair1_isoform, 'chain'].iloc[0]

'A'

# Extract tag files

In [5]:
# write a tag file next to each relax.sc:
# the id of the single lowest-scoring entry of every folder in alphafold_res

af_dir = "alphafold_res"
tag_filename = "lowest.tag"

for folder in os.listdir(af_dir):
    # hidden entries (names starting with '.') are not result folders
    if folder.startswith('.'):
        continue
    print(folder)
    result_dir = os.path.join(af_dir, folder)
    relax_sc_path = os.path.join(result_dir, "relax.sc")
    tag_path = os.path.join(result_dir, tag_filename)
    get_lowestTag(relax_sc_path, tag_path, num = 1)

4pe5_A
7mpi_L
1vso_A
5fxj_A
6k4y_F
6tac_A
8e31_C
1yae_A
2z2a_A
1c8r_A
8e3b_C
6b2f_A
5vot_A
5l1h_A
6hoo_A
4awh_A
4ilg_A
6hb8_A
6ta2_A
2z2b_A
8sar_B
7b7d_L
4i43_A
8saq_B
1c8s_A
6n4c_F
6bhl_A
4e5e_A


# Extract dG from sc files

In [43]:
# Convert the per-residue score files of every example into csv files with the
# columns 'residue_id' and 'dG'. Three structures are handled per example:
#   original : chain_files/<example>_perRes.sc
#   af       : alphafold_res/<example>/ranked_0_perRes.sc
#   relax    : alphafold_res/<example>/<tag>_perRes.sc, <tag> being the first line of lowest.tag
dG_out_dir = "dG_perResidue"
os.makedirs(dG_out_dir, exist_ok = True)  # a fresh run must not fail on a missing output folder

for index, row in df.iterrows():
    example = row['pdb_id'] + "_" + row['chain']
    af_example_dir = os.path.join("alphafold_res", example)

    with open(os.path.join(af_example_dir, "lowest.tag"), 'r') as f:
        relax_name = f.readline().strip()

    sc_paths = {
        "original": os.path.join("chain_files", example + "_perRes.sc"),
        "af": os.path.join(af_example_dir, "ranked_0_perRes.sc"),
        "relax": os.path.join(af_example_dir, relax_name + "_perRes.sc"),
    }

    # read all three score files first, so nothing is written if one of them cannot be read
    tables = {}
    for label, sc_path in sc_paths.items():
        scores = dG_perResidue(sc_path)
        table = pd.DataFrame.from_dict(scores, orient = 'index', columns = ['dG'])
        tables[label] = table.reset_index().rename(columns = {'index': 'residue_id'})

    # save the files as <example>_<label>.csv
    for label, table in tables.items():
        csv_path = os.path.join(dG_out_dir, example + "_" + label + ".csv")
        table.to_csv(csv_path, index = False)

In [9]:
# extract fa_sol score from sc files
# Writes one csv file (columns 'residue_id' and 'fa_sol') per example and structure:
#   original : chain_files/<example>_perRes.sc
#   af       : alphafold_res/<example>/ranked_0_perRes.sc
#   relax    : alphafold_res/<example>/<tag>_perRes.sc, <tag> being the first line of lowest.tag
fa_sol_out_dir = "fa_sol_perResidue"
os.makedirs(fa_sol_out_dir, exist_ok = True)  # a fresh run must not fail on a missing output folder

for index, row in df.iterrows():
    example = row['pdb_id'] + "_" + row['chain']
    af_example_dir = os.path.join("alphafold_res", example)

    with open(os.path.join(af_example_dir, "lowest.tag"), 'r') as f:
        relax_name = f.readline().strip()

    sc_paths = {
        "original": os.path.join("chain_files", example + "_perRes.sc"),
        "af": os.path.join(af_example_dir, "ranked_0_perRes.sc"),
        "relax": os.path.join(af_example_dir, relax_name + "_perRes.sc"),
    }

    # read all three score files first, so nothing is written if one of them cannot be read
    tables = {}
    for label, sc_path in sc_paths.items():
        scores = fa_sol_perResidue(sc_path)
        table = pd.DataFrame.from_dict(scores, orient = 'index', columns = ['fa_sol'])
        tables[label] = table.reset_index().rename(columns = {'index': 'residue_id'})

    # save the files as <example>_<label>.csv
    for label, table in tables.items():
        csv_path = os.path.join(fa_sol_out_dir, example + "_" + label + ".csv")
        table.to_csv(csv_path, index = False)

# RMSD, ddG, and plddt

In [41]:
# loop through pairs

# function definition: rmsd_pdb_perResidue(moving_pdb, ref_pdb, moving_fasta, ref_fasta)
# function definition: ddG_perResidue(moving_sc, ref_sc, moving_fasta, ref_fasta)

# Residue ID are based on the reference (canonical ones)

tag_filename = "lowest.tag" # can be changed

for i in range(1, 15):
    # get isoform name
    iso_pdb_id = df[(df['pair']==i)&(df['isoform']==1)]['pdb_id'].iloc[0]
    iso_chain = df[(df['pair']==i)&(df['isoform']==1)]['chain'].iloc[0]
    iso_name = iso_pdb_id + "_" + iso_chain

    # get reference (canonical) name
    ref_pdb_id = df[(df['pair']==i)&(df['isoform']==0)]['pdb_id'].iloc[0]
    ref_chain = df[(df['pair']==i)&(df['isoform']==0)]['chain'].iloc[0]
    ref_name = ref_pdb_id + "_" + ref_chain

    # fasta files directory
    iso_fasta_name = iso_name + ".fasta"
    iso_fasta_path = os.path.join("chain_files", iso_fasta_name)
    ref_fasta_name = ref_name + ".fasta"
    ref_fasta_path = os.path.join("chain_files", ref_fasta_name)

    # original pdb files directory
    iso_pdb_name = iso_name + ".pdb"
    iso_pdb_path = os.path.join("chain_files", iso_pdb_name)
    ref_pdb_name = ref_name + ".pdb"
    ref_pdb_path = os.path.join("chain_files", ref_pdb_name)

    
    # AlphaFold pdb files directory, use rank 0 only
    iso_af_pdb_path = os.path.join("alphafold_res", iso_name, "ranked_0.pdb")
    ref_af_pdb_path = os.path.join("alphafold_res", ref_name, "ranked_0.pdb")


    # relax pdb files directory, use the lowest only
    iso_tag_path = os.path.join("alphafold_res", iso_name, tag_filename)
    with open(iso_tag_path, 'r') as f:
        iso_relax_name = f.readline().strip()
        iso_relax_pdb_name = iso_relax_name + ".pdb"
    iso_relax_pdb_path = os.path.join("alphafold_res", iso_name, iso_relax_pdb_name)
    
    ref_tag_path = os.path.join("alphafold_res", ref_name, tag_filename)
    with open(ref_tag_path, 'r') as f:
        ref_relax_name = f.readline().strip()
        ref_relax_pdb_name = ref_relax_name + ".pdb"
    ref_relax_pdb_path = os.path.join("alphafold_res", ref_name, ref_relax_pdb_name)


    # csv files that store delta G score information
    dG_dir = "dG_perResidue"
    
    iso_sc_name = iso_name + "_original.csv"
    iso_sc_path = os.path.join(dG_dir, iso_sc_name)
    ref_sc_name = ref_name + "_original.csv"
    ref_sc_path = os.path.join(dG_dir, ref_sc_name)

    iso_af_sc_name = iso_name + "_af.csv"
    iso_af_sc_path = os.path.join(dG_dir, iso_af_sc_name)
    ref_af_sc_name = ref_name + "_af.csv"
    ref_af_sc_path = os.path.join(dG_dir, ref_af_sc_name)

    iso_relax_sc_name = iso_name + "_relax.csv"
    iso_relax_sc_path = os.path.join(dG_dir, iso_relax_sc_name)
    ref_relax_sc_name = ref_name + "_relax.csv"
    ref_relax_sc_path = os.path.join(dG_dir, ref_relax_sc_name)

    
    #### indices for isoform ####
    # pairwise_SA(moving_fasta, ref_fasta)
    # return indices_moving, indices_ref
    indices_iso, indices_ref = pairwise_SA(iso_fasta_path, ref_fasta_path)
    indices_dict = dict(zip(indices_ref, indices_iso)) # use reference indices as keys
    # initialize the dataframe
    pair_df = pd.DataFrame({
        'Residue': indices_dict.keys(),
        'Residue_isoform': indices_dict.values()
    })


    #### add residue types ####
    residues = residue_type(iso_fasta_path, ref_fasta_path)
    # combine into dataframe
    pair_df['ResidueType'] = pair_df['Residue'].map(residues)
    
    
    #### RMSD ####
    # original pdb
    rmsd_original = rmsd_pdb_perResidue(iso_pdb_path, ref_pdb_path, iso_fasta_path, ref_fasta_path)
    # pdb after AlphaFold
    rmsd_af = rmsd_pdb_perResidue(iso_af_pdb_path, ref_af_pdb_path, iso_fasta_path, ref_fasta_path)
    # pdb after AlphaFold + Rosetta
    rmsd_relax = rmsd_pdb_perResidue(iso_relax_pdb_path, ref_relax_pdb_path, iso_fasta_path, ref_fasta_path)

    # combine these into the dataframe
    # theoretically they should have same keys
    pair_df['RMSD_original'] = pair_df['Residue'].map(rmsd_original)
    pair_df['RMSD_af'] = pair_df['Residue'].map(rmsd_af)
    pair_df['RMSD_relax'] = pair_df['Residue'].map(rmsd_relax)

    

    #### dG and ddG ####
    # ddG_perResidue(moving_sc, ref_sc, moving_fasta, ref_fasta)
    # return ddG_perResidue, dG_iso_perResidue, dG_ref_perResidue 
    
    # original pdb
    ddG_original, dG_iso_original, dG_ref_original = ddG_perResidue(iso_sc_path, ref_sc_path, iso_fasta_path, ref_fasta_path)
    # pdb after AlphaFold
    ddG_af, dG_iso_af, dG_ref_af = ddG_perResidue(iso_af_sc_path, ref_af_sc_path, iso_fasta_path, ref_fasta_path)
    # pdb after AlphaFold + Rosetta
    ddG_relax, dG_iso_relax, dG_ref_relax = ddG_perResidue(iso_relax_sc_path, ref_relax_sc_path, iso_fasta_path, ref_fasta_path)

    # combine these into dataframe
    # theoretically they should have same keys
    pair_df['ddG_original'] = pair_df['Residue'].map(ddG_original)
    pair_df['ddG_af'] = pair_df['Residue'].map(ddG_af)
    pair_df['ddG_relax'] = pair_df['Residue'].map(ddG_relax)

    pair_df['dG_iso_original'] = pair_df['Residue'].map(dG_iso_original)
    pair_df['dG_ref_original'] = pair_df['Residue'].map(dG_ref_original)
    pair_df['dG_iso_af'] = pair_df['Residue'].map(dG_iso_af)
    pair_df['dG_ref_af'] = pair_df['Residue'].map(dG_ref_af)
    pair_df['dG_iso_relax'] = pair_df['Residue'].map(dG_iso_relax)
    pair_df['dG_ref_relax'] = pair_df['Residue'].map(dG_ref_relax)


    #### plddt per residue ####
    # get plddt per residue for both isoform and reference (canonical)
    # plddt_pair_perResidue(moving_pkl, ref_pkl, moving_fasta, ref_fasta)
    iso_model_name = get_ranked_0_model(os.path.join("alphafold_res", iso_name, "ranking_debug.json"))
    iso_pkl_name = "result_" + iso_model_name + ".pkl"
    iso_pkl_path = os.path.join("alphafold_res", iso_name, iso_pkl_name)

    ref_model_name = get_ranked_0_model(os.path.join("alphafold_res", ref_name, "ranking_debug.json"))
    ref_pkl_name = "result_" + ref_model_name + ".pkl"
    ref_pkl_path = os.path.join("alphafold_res", ref_name, ref_pkl_name)

    iso_plddt, ref_plddt = plddt_pair_perResidue(iso_pkl_path, ref_pkl_path, iso_fasta_path, ref_fasta_path)

    # combine into the dataframe
    pair_df['plddt_iso'] = pair_df['Residue'].map(iso_plddt)
    pair_df['plddt_ref'] = pair_df['Residue'].map(ref_plddt)


    #### RSA per residue ####
    # get_pair_rsa(moving_pdb, ref_pdb, moving_fasta, ref_fasta, moving_chain_id, ref_chain_id)
    
    # original pdb
    rsa_iso_original, rsa_ref_original = get_pair_rsa(iso_pdb_path, ref_pdb_path, iso_fasta_path, ref_fasta_path, iso_chain, ref_chain)
    # pdb after AlphaFold
    rsa_iso_af, rsa_ref_af = get_pair_rsa(iso_af_pdb_path, ref_af_pdb_path, iso_fasta_path, ref_fasta_path, "A", "A")
    # pdb after AlphaFold + Rosetta
    rsa_iso_relax, rsa_ref_relax = get_pair_rsa(iso_relax_pdb_path, ref_relax_pdb_path, iso_fasta_path, ref_fasta_path, "A", "A")

    # after AlphaFold / Relax the chain id will automatically become "A"...

    # combine into the dataframe
    pair_df['rsa_iso_original'] = pair_df['Residue'].map(rsa_iso_original)
    pair_df['rsa_ref_original'] = pair_df['Residue'].map(rsa_ref_original)
    pair_df['rsa_iso_af'] = pair_df['Residue'].map(rsa_iso_af)
    pair_df['rsa_ref_af'] = pair_df['Residue'].map(rsa_ref_af)
    pair_df['rsa_iso_relax'] = pair_df['Residue'].map(rsa_iso_relax)
    pair_df['rsa_ref_relax'] = pair_df['Residue'].map(rsa_ref_relax)

    # print process
    print(i)

    # save to csv
    csv_name = "pair_" + str(i) + ".csv"
    csv_path = os.path.join("pairs_csv", csv_name)
    pair_df.to_csv(csv_path, index = False)

1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [42]:
# Stack every per-pair table found in "pairs_csv" into one long table.
pair_all = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined file reproducible from run to run.
for file in sorted(os.listdir("pairs_csv")):
    # 'pair_' (singular) matches pair_<n>.csv but not the combined pairs_all.csv
    # written below, so a re-run does not ingest its own output; the extension
    # check skips stray non-csv entries.
    if file.startswith('pair_') and file.endswith('.csv'):
        current_pair_df = pd.read_csv(os.path.join("pairs_csv", file))
        # label each row with the file it came from
        current_pair_df['Pair'] = file
        pair_all.append(current_pair_df)

pair_all_df = pd.concat(pair_all, ignore_index = True)
pair_all_df.to_csv("pairs_csv/pairs_all.csv", index = False)

In [11]:
# add fa_sol to the csv files
# For every pair, read pairs_csv/pair_<i>.csv, append the per-residue fa_sol of
# the isoform and of the reference for the three structure sources
# (original, af, relax) and write the result to pairs_csv_fa_sol/pair_<i>.csv.

tag_filename = "lowest.tag" # can be changed

# csv files that store fa_sol score information
fa_sol_dir = "fa_sol_perResidue"

# to_csv does not create missing folders, so make sure the output one exists
os.makedirs("pairs_csv_fa_sol", exist_ok = True)

for i in range(1, 15):
    # filter the pair table once per role instead of once per field
    iso_rows = df[(df['pair']==i)&(df['isoform']==1)]
    ref_rows = df[(df['pair']==i)&(df['isoform']==0)]

    # get isoform name
    iso_pdb_id = iso_rows['pdb_id'].iloc[0]
    iso_chain = iso_rows['chain'].iloc[0]
    iso_name = iso_pdb_id + "_" + iso_chain

    # get reference (canonical) name
    ref_pdb_id = ref_rows['pdb_id'].iloc[0]
    ref_chain = ref_rows['chain'].iloc[0]
    ref_name = ref_pdb_id + "_" + ref_chain

    # fasta files directory
    iso_fasta_path = os.path.join("chain_files", iso_name + ".fasta")
    ref_fasta_path = os.path.join("chain_files", ref_name + ".fasta")

    # per-residue table produced earlier for this pair
    csv_name = "pair_" + str(i) + ".csv"
    pair_df = pd.read_csv(os.path.join("pairs_csv", csv_name))

    #### get fa_sol scores ####
    # get_pair_fa_sol(moving_sc, ref_sc, moving_fasta, ref_fasta)
    # return fa_sol_iso_perResidue, fa_sol_ref_perResidue
    # Score files are named <name>_<source>.csv; the same order as before
    # (original, af, relax) keeps the column order of the output unchanged.
    for source in ("original", "af", "relax"):
        iso_sc_path = os.path.join(fa_sol_dir, iso_name + "_" + source + ".csv")
        ref_sc_path = os.path.join(fa_sol_dir, ref_name + "_" + source + ".csv")
        fa_sol_iso, fa_sol_ref = get_pair_fa_sol(iso_sc_path, ref_sc_path, iso_fasta_path, ref_fasta_path)
        # look the scores up by the 'Residue' column of the pair table
        pair_df['fa_sol_iso_' + source] = pair_df['Residue'].map(fa_sol_iso)
        pair_df['fa_sol_ref_' + source] = pair_df['Residue'].map(fa_sol_ref)

    # define saving path
    csv_path = os.path.join("pairs_csv_fa_sol", csv_name)
    pair_df.to_csv(csv_path, index = False)




# Stack every per-pair table found in "pairs_csv_fa_sol" into one long table.
pair_all = []

# os.listdir returns entries in arbitrary order; sorting makes the row order of
# the combined file reproducible from run to run.
for file in sorted(os.listdir("pairs_csv_fa_sol")):
    # 'pair_' (singular) matches pair_<n>.csv but not the combined pairs_all.csv
    # written below, so a re-run does not ingest its own output; the extension
    # check skips stray non-csv entries.
    if file.startswith('pair_') and file.endswith('.csv'):
        current_pair_df = pd.read_csv(os.path.join("pairs_csv_fa_sol", file))
        # label each row with the file it came from
        current_pair_df['Pair'] = file
        pair_all.append(current_pair_df)

pair_all_df = pd.concat(pair_all, ignore_index = True)
pair_all_df.to_csv("pairs_csv_fa_sol/pairs_all.csv", index = False)

In [8]:
# calculate correlations, and other things
# rmsd
# One summary row per pair: Kendall correlation of the per-residue RMSD of the
# original structures with the AlphaFold / relaxed ones, number of indels,
# overall RMSD of the original structures and the plDDT of both AlphaFold runs.
corr_rmsd_af_original = {}
corr_rmsd_relax_original = {}
rmsd = {}
plddt_iso = {}
plddt_ref = {}
indel_num = {}

for i in range(1, 15):
    # per-residue table of this pair
    csv_name = "pair_" + str(i) + ".csv"
    csv_path = os.path.join("pairs_csv", csv_name)
    df_pairs = pd.read_csv(csv_path)
    corr_rmsd_af_original[i] = round(df_pairs['RMSD_original'].corr(df_pairs['RMSD_af'], method = "kendall"), 3)
    corr_rmsd_relax_original[i] = round(df_pairs['RMSD_original'].corr(df_pairs['RMSD_relax'], method = "kendall"), 3)

    # filter the pair table once per role (isoform == 1 / reference == 0)
    # instead of once per field
    iso_rows = df[(df['pair']==i)&(df['isoform']==1)]
    ref_rows = df[(df['pair']==i)&(df['isoform']==0)]

    #### number of indels ####
    indel_num[i] = iso_rows['indel_num'].iloc[0]

    #### RMSD within pairs (original) ####
    # get isoform name
    iso_pdb_id = iso_rows['pdb_id'].iloc[0]
    iso_chain = iso_rows['chain'].iloc[0]
    iso_name = iso_pdb_id + "_" + iso_chain

    # get reference (canonical) name
    ref_pdb_id = ref_rows['pdb_id'].iloc[0]
    ref_chain = ref_rows['chain'].iloc[0]
    ref_name = ref_pdb_id + "_" + ref_chain

    # fasta files directory
    iso_fasta_path = os.path.join("chain_files", iso_name + ".fasta")
    ref_fasta_path = os.path.join("chain_files", ref_name + ".fasta")

    # original pdb files directory
    iso_pdb_path = os.path.join("chain_files", iso_name + ".pdb")
    ref_pdb_path = os.path.join("chain_files", ref_name + ".pdb")

    # RMSD
    rmsd[i] = round(rmsd_pdb_overall(iso_pdb_path, ref_pdb_path, iso_fasta_path, ref_fasta_path), 3)

    #### plDDT ####
    # get_plddt is given the ranking_debug.json of each AlphaFold run
    iso_ranking_path = os.path.join("alphafold_res", iso_name, "ranking_debug.json")
    plddt_iso[i] = get_plddt(iso_ranking_path)

    ref_ranking_path = os.path.join("alphafold_res", ref_name, "ranking_debug.json")
    plddt_ref[i] = get_plddt(ref_ranking_path)


print(corr_rmsd_af_original)
print(corr_rmsd_relax_original)
print(rmsd)
print(plddt_iso)
print(plddt_ref)

# Flatten the per-pair dicts into aligned columns.
# NOTE: rmsd, plddt_iso and plddt_ref are rebound from dicts to lists here,
# exactly as before, so the names hold lists once this cell has run.
keys = list(corr_rmsd_af_original.keys())
indel = [indel_num[key] for key in keys]
corr1 = [corr_rmsd_af_original[key] for key in keys]
corr2 = [corr_rmsd_relax_original[key] for key in keys]
rmsd = [rmsd[key] for key in keys]
plddt_iso = [plddt_iso[key] for key in keys]
plddt_ref = [plddt_ref[key] for key in keys]
df_corr = pd.DataFrame({'pair': keys, 'indel_num': indel, 'corr_af': corr1, 'corr_relax': corr2, 'rmsd': rmsd, 'plddt_iso': plddt_iso, 'plddt_ref': plddt_ref})
# to_csv does not create missing folders, so make sure the output one exists
os.makedirs("correlations_csv", exist_ok = True)
df_corr.to_csv("correlations_csv/rmsd.csv", index = False)

{1: 0.361, 2: 0.136, 3: 0.119, 4: 0.018, 5: 0.488, 6: 0.064, 7: 0.254, 8: 0.158, 9: 0.517, 10: 0.17, 11: -0.132, 12: 0.227, 13: 0.273, 14: 0.056}
{1: 0.273, 2: 0.192, 3: 0.159, 4: 0.143, 5: 0.147, 6: 0.225, 7: 0.402, 8: 0.273, 9: 0.231, 10: -0.072, 11: -0.005, 12: 0.361, 13: 0.209, 14: 0.067}
{1: 2.046, 2: 0.764, 3: 1.501, 4: 0.624, 5: 0.689, 6: 2.085, 7: 0.535, 8: 2.201, 9: 14.263, 10: 5.157, 11: 1.254, 12: 2.295, 13: 3.592, 14: 3.847}
{1: 89.189, 2: 92.704, 3: 94.913, 4: 96.919, 5: 96.889, 6: 81.493, 7: 95.59, 8: 92.492, 9: 84.026, 10: 89.602, 11: 84.656, 12: 92.383, 13: 86.718, 14: 87.272}
{1: 91.27, 2: 94.837, 3: 97.145, 4: 97.042, 5: 97.696, 6: 85.564, 7: 96.079, 8: 93.347, 9: 87.489, 10: 90.374, 11: 95.343, 12: 95.425, 13: 87.078, 14: 90.334}


In [9]:
# calculate correlations, and other things
# ddG
# One summary row per pair: Kendall correlation of the per-residue ddG of the
# original structures with the AlphaFold / relaxed ones, number of indels,
# overall RMSD of the original structures and the plDDT of both AlphaFold runs.
corr_ddG_af_original = {}
corr_ddG_relax_original = {}
rmsd = {}
plddt_iso = {}
plddt_ref = {}
indel_num = {}

for i in range(1, 15):
    # per-residue table of this pair
    csv_name = "pair_" + str(i) + ".csv"
    csv_path = os.path.join("pairs_csv", csv_name)
    df_pairs = pd.read_csv(csv_path)
    corr_ddG_af_original[i] = round(df_pairs['ddG_original'].corr(df_pairs['ddG_af'], method = "kendall"), 3)
    corr_ddG_relax_original[i] = round(df_pairs['ddG_original'].corr(df_pairs['ddG_relax'], method = "kendall"), 3)

    # filter the pair table once per role (isoform == 1 / reference == 0)
    # instead of once per field
    iso_rows = df[(df['pair']==i)&(df['isoform']==1)]
    ref_rows = df[(df['pair']==i)&(df['isoform']==0)]

    #### number of indels ####
    indel_num[i] = iso_rows['indel_num'].iloc[0]

    #### RMSD within pairs (original) ####
    # get isoform name
    iso_pdb_id = iso_rows['pdb_id'].iloc[0]
    iso_chain = iso_rows['chain'].iloc[0]
    iso_name = iso_pdb_id + "_" + iso_chain

    # get reference (canonical) name
    ref_pdb_id = ref_rows['pdb_id'].iloc[0]
    ref_chain = ref_rows['chain'].iloc[0]
    ref_name = ref_pdb_id + "_" + ref_chain

    # fasta files directory
    iso_fasta_path = os.path.join("chain_files", iso_name + ".fasta")
    ref_fasta_path = os.path.join("chain_files", ref_name + ".fasta")

    # original pdb files directory
    iso_pdb_path = os.path.join("chain_files", iso_name + ".pdb")
    ref_pdb_path = os.path.join("chain_files", ref_name + ".pdb")

    # RMSD
    rmsd[i] = round(rmsd_pdb_overall(iso_pdb_path, ref_pdb_path, iso_fasta_path, ref_fasta_path), 3)

    #### plDDT ####
    # get_plddt is given the ranking_debug.json of each AlphaFold run
    iso_ranking_path = os.path.join("alphafold_res", iso_name, "ranking_debug.json")
    plddt_iso[i] = get_plddt(iso_ranking_path)

    ref_ranking_path = os.path.join("alphafold_res", ref_name, "ranking_debug.json")
    plddt_ref[i] = get_plddt(ref_ranking_path)


print(corr_ddG_af_original)
print(corr_ddG_relax_original)
print(rmsd)
print(plddt_iso)
print(plddt_ref)

# Flatten the per-pair dicts into aligned columns.
# NOTE: rmsd, plddt_iso and plddt_ref are rebound from dicts to lists here,
# exactly as before, so the names hold lists once this cell has run.
keys = list(corr_ddG_af_original.keys())
indel = [indel_num[key] for key in keys]
corr1 = [corr_ddG_af_original[key] for key in keys]
corr2 = [corr_ddG_relax_original[key] for key in keys]
rmsd = [rmsd[key] for key in keys]
plddt_iso = [plddt_iso[key] for key in keys]
plddt_ref = [plddt_ref[key] for key in keys]
df_corr = pd.DataFrame({'pair': keys, 'indel_num': indel, 'corr_af': corr1, 'corr_relax': corr2, 'rmsd': rmsd, 'plddt_iso': plddt_iso, 'plddt_ref': plddt_ref})
# to_csv does not create missing folders, so make sure the output one exists
os.makedirs("correlations_csv", exist_ok = True)
df_corr.to_csv("correlations_csv/ddG.csv", index = False)

{1: 0.078, 2: -0.024, 3: 0.077, 4: 0.073, 5: 0.062, 6: 0.034, 7: 0.131, 8: 0.119, 9: 0.067, 10: -0.007, 11: 0.231, 12: 0.046, 13: 0.039, 14: 0.007}
{1: -0.02, 2: 0.019, 3: -0.035, 4: 0.025, 5: 0.001, 6: 0.019, 7: 0.189, 8: 0.126, 9: 0.023, 10: 0.06, 11: 0.159, 12: -0.003, 13: 0.029, 14: 0.059}
{1: 2.046, 2: 0.764, 3: 1.501, 4: 0.624, 5: 0.689, 6: 2.085, 7: 0.535, 8: 2.201, 9: 14.263, 10: 5.157, 11: 1.254, 12: 2.295, 13: 3.592, 14: 3.847}
{1: 89.189, 2: 92.704, 3: 94.913, 4: 96.919, 5: 96.889, 6: 81.493, 7: 95.59, 8: 92.492, 9: 84.026, 10: 89.602, 11: 84.656, 12: 92.383, 13: 86.718, 14: 87.272}
{1: 91.27, 2: 94.837, 3: 97.145, 4: 97.042, 5: 97.696, 6: 85.564, 7: 96.079, 8: 93.347, 9: 87.489, 10: 90.374, 11: 95.343, 12: 95.425, 13: 87.078, 14: 90.334}


# test

In [7]:
# Sanity check of the pairwise alignment on the real pair 1c8s_A / 1c8r_A.
seq1 = load_fasta("test/test_pair/1c8s_A.fasta")
seq2 = load_fasta("test/test_pair/1c8r_A.fasta")
# globalxx: global alignment that only rewards identical characters (no
# mismatch or gap penalties); only the first alignment returned is inspected
aln = pairwise2.align.globalxx(seq1, seq2)[0]
# the first two fields of an alignment are the two gapped sequences
aligned_seq1, aligned_seq2 = aln[:2]
# display both gapped sequences
aligned_seq1, aligned_seq2

('TGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKKFYAITTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYWARYADWLFTTPLLLLNLALLVDADQGTILALVGADGIMIGTGLVGALTKVYSYRFVWWAISTAAMLYILYVLF-----------------NVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLI---------',
 'TGRPEWIWLALGTALMGLGTLYFLVKGMGVSDPDAKKFYAITTLVPAIAFTMYLSMLLGYGLTMVPFGGEQNPIYWARYADWLFTTPLLLLNLALLVDADQGTILALVGADGIMIGTGLVGALTKVYSYRFVWWAISTAAMLYILYVLFFGFSMRPEVASTFKVLRNVTVVLWSAYPVVWLIGSEGAGIVPLNIETLLFMVLDVSAKVGFGLILLRSRAIFG')

In [6]:
# Same alignment check on the small toy pair, which has gaps on both sides.
seq1 = load_fasta("test/test_pair/toy1.fasta")
seq2 = load_fasta("test/test_pair/toy2.fasta")
# globalxx: global alignment that only rewards identical characters (no
# mismatch or gap penalties); only the first alignment returned is inspected
aln = pairwise2.align.globalxx(seq1, seq2)[0]
# the first two fields of an alignment are the two gapped sequences
aligned_seq1, aligned_seq2 = aln[:2]
# display both gapped sequences
aligned_seq1, aligned_seq2

('TG-PEWIWL--AL', 'TGRPE-IWLLLAL')

In [21]:
# Residue letters of the toy pair. In the recorded output the keys skip 3, 9 and 10,
# which are the positions of the second sequence that face a gap in the toy alignment.
residue_type("test/test_pair/toy1.fasta", "test/test_pair/toy2.fasta")

{1: 'T', 2: 'G', 4: 'P', 5: 'E', 6: 'I', 7: 'W', 8: 'L', 11: 'A', 12: 'L'}

In [47]:
# Positions shared by both sequences of an alignment.
# NOTE(review): aligned_seq1 / aligned_seq2 are whatever the last alignment cell
# left in the kernel; the recorded output corresponds to the toy alignment.
common1, common2 = common_residues(aligned_seq1, aligned_seq2)
# show the positions reported for the first sequence
common1

[1, 2, 3, 4, 6, 7, 8, 9, 10]

In [9]:
# Per-residue RMSD of the toy pair (arguments: first pdb, second pdb, first fasta, second fasta).
# The recorded result is a dict with nine entries.
rmsd_test = rmsd_pdb_perResidue("test/test_pair/toy1.pdb", "test/test_pair/toy2.pdb", "test/test_pair/toy1.fasta", "test/test_pair/toy2.fasta")
rmsd_test

{1: 2.3832724928168103,
 2: 1.8278304086460582,
 4: 2.7925501284101837,
 5: 3.0679377546196474,
 6: 1.4460801179972724,
 7: 1.2449680631494484,
 8: 1.6106359400560664,
 11: 4.371540604549284,
 12: 4.554878544200179}

In [19]:
# Manual cross-check of the overall RMSD: square root of the mean of the
# squared per-residue values held in rmsd_test (a dict at this point).
# rmsd_list is a dict of squared values despite its name.
rmsd_list = {key: value**2 for key, value in rmsd_test.items()}
# divide by the actual number of residues rather than a hard-coded 9, so the
# check stays valid for any pair
np.sqrt(sum(rmsd_list.values()) / len(rmsd_list))

2.8347850631957674

In [11]:
# Overall RMSD of the toy pair; note that rmsd_test is rebound from the
# per-residue dict to a single number here.
rmsd_test = rmsd_pdb_overall("test/test_pair/toy1.pdb", "test/test_pair/toy2.pdb", "test/test_pair/toy1.fasta", "test/test_pair/toy2.fasta")
rmsd_test # consistent with the manual check above

2.8347850631957674

In [27]:
# Per-residue dG of both toy structures, restricted to the shared residues.
# NOTE(review): depends on common1 / common2 from the common_residues cell, and
# passes a second argument that another dG_perResidue call in this notebook
# omits — confirm against the current signature before re-running.
dG_test1 = dG_perResidue("test/test_pair/toy1.sc", common1)
dG_test2 = dG_perResidue("test/test_pair/toy2.sc", common2)
print(dG_test1)
print(dG_test2)

{1: 0.39, 2: 0.311, 3: -0.641, 4: -2.13, 5: -2.935, 6: -1.075, 7: -3.307, 8: -7.437, 9: -3.794}
{1: 0.289, 2: 0.334, 3: -2.16, 4: -5.501, 5: -2.9, 6: -1.175, 7: -2.99, 8: -3.124, 9: -2.838}


In [22]:
# Toy check: key-wise subtraction of two dictionaries that share the same keys.
test1 = dict(((1, 0.5), (2, 0.1)))
test2 = dict(((1, 0.3), (2, 0.05)))
{key: minuend - test2[key] for key, minuend in test1.items()}

{1: 0.2, 2: 0.05}

In [52]:
# Turn the per-residue dG of toy2 into a two-column table (residue_id, dG)
# and save it as csv.
sc_test = (
    pd.DataFrame.from_dict(
        dG_perResidue("test/test_pair/toy2.sc"),
        orient = 'index',
        columns = ['dG'],
    )
    # move the dict keys out of the index into a regular column
    .reset_index()
    .rename(columns = {'index': 'residue_id'})
)
sc_test.to_csv("test/test_pair/toy2.csv", index = False)

In [2]:
# Per-residue fa_sol read from the toy2 score file; the recorded result is a
# dict keyed 1..12. Rebinds sc_test, which another cell uses for a DataFrame.
sc_test = fa_sol_perResidue("test/test_pair/toy2.sc")
sc_test

{1: 0.762,
 2: 0.572,
 3: 6.787,
 4: 2.18,
 5: 6.752,
 6: 3.376,
 7: 1.876,
 8: 3.012,
 9: 3.275,
 10: 2.543,
 11: 2.879,
 12: 4.0}

In [55]:
# Read the exported toy2 table back and pick the dG stored for residue_id 1.
toy2_sc = pd.read_csv("test/test_pair/toy2.csv")
toy2_sc.loc[toy2_sc['residue_id'] == 1, 'dG'].iloc[0]

0.289

In [14]:
# ddG of the toy pair from the two exported csv score tables.
# The recorded result is a 3-tuple of dicts: the difference, then the dG of the
# first structure, then the dG of the second one.
ddG_test = ddG_perResidue("test/test_pair/toy1.csv", "test/test_pair/toy2.csv", "test/test_pair/toy1.fasta", "test/test_pair/toy2.fasta")
ddG_test

({1: 0.101,
  2: -0.023,
  4: 1.519,
  5: 3.371,
  6: -0.035,
  7: 0.1,
  8: -0.317,
  11: -4.313,
  12: -0.956},
 {1: 0.39,
  2: 0.311,
  4: -0.641,
  5: -2.13,
  6: -2.935,
  7: -1.075,
  8: -3.307,
  11: -7.437,
  12: -3.794},
 {1: 0.289,
  2: 0.334,
  4: -2.16,
  5: -5.501,
  6: -2.9,
  7: -1.175,
  8: -2.99,
  11: -3.124,
  12: -2.838})

In [17]:
# Per-residue RMSD on the real pair 1c8s_A / 1c8r_A; only the number of
# residues that received a value is checked.
rmsd_test = rmsd_pdb_perResidue("test/test_pair/1c8s_A.pdb", "test/test_pair/1c8r_A.pdb", "test/test_pair/1c8s_A.fasta", "test/test_pair/1c8r_A.fasta")
len(rmsd_test) # consistent with what I got in NCBI BlastP

196

In [15]:
# Peek at a raw AlphaFold result pickle (7b7d_L, model_2_pred_0).
# pd.read_pickle unpickles arbitrary objects, so only use it on trusted files.
# NOTE(review): despite the "_df" name the object is indexed by the key 'plddt'
# and yields a numpy array — presumably a plain dict; confirm.
plddt_df_test = pd.read_pickle("alphafold_res/7b7d_L/result_model_2_pred_0.pkl")
# first 20 plDDT values
plddt_df_test['plddt'][0:20]

array([82.75529757, 90.11974773, 94.06391428, 96.80744335, 96.35257968,
       96.3565961 , 96.20805604, 96.76853942, 96.84631438, 97.45464079,
       96.21296917, 96.43939875, 93.88745398, 97.52955247, 97.70038517,
       97.31614692, 97.77972968, 97.48317298, 97.07363158, 97.49932168])

In [33]:
# Entry 1 of the per-residue plDDT of one 1c8r_A model
# (presumably the first residue if keys are 1-based residue numbers — TODO confirm).
get_plddt_perResidue("alphafold_res/1c8r_A/result_model_1_pred_0.pkl")[1]

59.723875642084735

In [27]:
# Model name read from ranking_debug.json; elsewhere in this notebook the value
# is wrapped as "result_" + name + ".pkl" to locate the matching pickle.
get_ranked_0_model("alphafold_res/1c8r_A/ranking_debug.json")

'model_2_pred_0'

In [13]:
# Per-residue plDDT of both members of the pair 1c8s_A / 1c8r_A
# (arguments: first pkl, second pkl, first fasta, second fasta).
test_plddt1, test_plddt2 = plddt_pair_perResidue("alphafold_res/1c8s_A/result_model_2_pred_0.pkl", "alphafold_res/1c8r_A/result_model_2_pred_0.pkl", "test/test_pair/1c8s_A.fasta", "test/test_pair/1c8r_A.fasta")
# both results should have the same number of entries
print(len(test_plddt1), len(test_plddt2))

196 196


In [16]:
# RSA of the toy pair; the recorded result is a pair of dicts (first structure, second structure).
# NOTE(review): other get_pair_rsa calls in this notebook also pass two chain
# ids after the fasta paths; this call omits them — confirm it still runs with
# the current signature.
rsa_test = get_pair_rsa("test/test_pair/toy1.pdb", "test/test_pair/toy2.pdb", "test/test_pair/toy1.fasta", "test/test_pair/toy2.fasta")
rsa_test

({1: 0.8309859154929577,
  2: 0.9523809523809523,
  4: 0.592741935483871,
  5: 0.7794117647058824,
  6: 0.45374449339207046,
  7: 0.47337278106508873,
  8: 0.9118942731277533,
  11: 0.7378048780487805,
  12: 0.9528301886792453},
 {1: 0.8591549295774648,
  2: 0.9404761904761905,
  4: 0.7867647058823529,
  5: 0.4175257731958763,
  6: 0.4933920704845815,
  7: 0.40236686390532544,
  8: 0.7533039647577092,
  11: 0.774390243902439,
  12: 1.0})

In [4]:
# Inspect a single DSSP record of 1c8r_A.
dssp_test = get_dssp("test/rsa_visualization/1c8r_A.pdb")
#len(dssp_test.keys())
# keys() has to be indexable here (plain dict views are not) — presumably
# get_dssp returns a Biopython DSSP object; TODO confirm
a_key = dssp_test.keys()[221]
# full record stored under that key
dssp_test[a_key]

(223,
 'G',
 '-',
 0.9404761904761905,
 -60.4,
 360.0,
 -2,
 -0.3,
 -1,
 -0.1,
 0,
 0.0,
 -191,
 -0.0)

In [61]:
# RSA lookup for one residue of 7mpi_L.
# NOTE(review): whether 103 is a PDB residue number or a sequential index is not
# visible here — confirm against get_single_rsa.
dssp_test = get_dssp("chain_files/7mpi_L.pdb")
get_single_rsa(dssp_test, 103)

0.7925531914893617

In [45]:
# RSA lookup for one residue of 6hoo_A.
# NOTE(review): whether 47 is a PDB residue number or a sequential index is not
# visible here — confirm against get_single_rsa.
dssp_test = get_dssp("test/test_pair/6hoo_A.pdb")
get_single_rsa(dssp_test, 47)

0.036585365853658534

In [40]:
# RSA of the pair 6hoo_A / 6hb8_A, both read as chain "A".
# NOTE(review): the recorded output shows different lengths for the two dicts
# (240 vs 241), so one residue has an RSA in only one structure; 6hoo_A.pdb is
# also reported by the dssp length check in this notebook — investigate.
rsa_test = get_pair_rsa("test/test_pair/6hoo_A.pdb", "test/test_pair/6hb8_A.pdb", "test/test_pair/6hoo_A.fasta", "test/test_pair/6hb8_A.fasta", "A", "A")
print(len(rsa_test[0]), len(rsa_test[1]))

240 241


In [19]:
# Toy check: zipping two equal-length lists pairs them position by position,
# the first list giving the keys and the second the values.
a = list(range(1, 4))
b = list(range(4, 7))
{key: value for key, value in zip(a, b)}

{1: 4, 2: 5, 3: 6}

In [13]:
# test dssp length
# Run get_dssp on every .pdb file in chain_files; the recorded output lists the
# files whose dssp length differs from the pdb.
for file in os.listdir("chain_files"):
    # anything that is not a pdb file is ignored
    if not file.endswith(".pdb"):
        continue
    get_dssp(os.path.join("chain_files", file))

6hoo_A.pdb dssp length different from pdb!
6n4c_F.pdb dssp length different from pdb!


In [38]:
# Cross-check with Biopython's lower-level DSSP helper, which runs DSSP on the pdb file.
dssp_tuple = dssp_dict_from_pdb_file("test/rsa_visualization/1c8r_A.pdb")
# the first element of the returned tuple is the dictionary of records
dssp_dict = dssp_tuple[0]
#dssp_dict.keys()
# record of residue 153 in chain A; keys are (chain id, residue id)
# NOTE(review): the printed record contains 154 — presumably the sequential DSSP
# index of that residue; TODO confirm
print(dssp_dict['A',(' ', 153, ' ')])

('S', '-', 186, 360.0, 81.5, 154, 2, -0.1, 2, -0.2, 3, -0.0, 0, 0.0)


In [27]:
# fix_index returns 154 for residue 153 of 1c8r_A in the recorded output
# (presumably it maps a PDB residue number to its DSSP index — TODO confirm).
fix_index("test/rsa_visualization/1c8r_A.pdb", 153)

154