In [6]:
from Bio.PDB import PDBParser, Superimposer
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json

import warnings
warnings.filterwarnings("ignore")

# These are useful functions.

In [2]:
# to trim predicted wildtype pdb file
# only for deletion mutants paper
def trim_wildtype(wt_pdb, del_pos, del_folder):
    """Write a copy of a wildtype PDB file with one residue removed.

    Only ``ATOM`` records are copied. Every other record type is dropped,
    and ATOM records whose residue sequence number equals ``del_pos`` are
    skipped. The copy is written to
    ``<del_folder>/wt_del<del_pos>_<basename of wt_pdb>``.

    Parameters
    ----------
    wt_pdb : str
        Path to the wildtype PDB file.
    del_pos : int
        Residue sequence number to remove.
    del_folder : str
        Folder for the trimmed file; created if it does not exist.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If an ATOM record has no integer in the residue-number columns.
    """
    trimmed_wt_filename = "wt_del" + str(del_pos) + "_" + os.path.basename(wt_pdb)
    trimmed_wt_filepath = os.path.join(del_folder, trimmed_wt_filename)
    # open(..., "w") does not create missing folders; an empty folder name
    # means "current directory" and needs no creation.
    if del_folder:
        os.makedirs(del_folder, exist_ok=True)
    with open(wt_pdb, "r") as source, open(trimmed_wt_filepath, "w") as target:
        for line in source:
            if not line.startswith("ATOM"):
                continue
            # PDB is a fixed-column format: the residue sequence number sits in
            # columns 23-26. Splitting on whitespace picks the wrong field when
            # the chain ID is blank or neighbouring fields run together.
            current_res = int(line[22:26])
            if current_res != del_pos:
                target.write(line)
    return 0


# to calculate rmsd using Bio.PDB
# predicted_pdb: moving pdb comparing to reference, need to have the same number of CA atoms as the ref_pdb
def rmsd_pdb(predicted_pdb, ref_pdb):
    """Superimpose the CA atoms of ``predicted_pdb`` onto ``ref_pdb`` and return the RMSD.

    Both files are parsed with Bio.PDB and only the CA atoms of the first
    model are used. The two files need the same number of CA atoms.

    Parameters
    ----------
    predicted_pdb : str
        Path of the moving structure.
    ref_pdb : str
        Path of the fixed (reference) structure.

    Returns
    -------
    The ``rms`` attribute of the Superimposer after fitting.
    """
    pdb_parser = PDBParser()

    def _ca_atoms(pdb_path):
        # CA atoms of model 0, in the order Bio.PDB yields them
        structure = pdb_parser.get_structure(os.path.basename(pdb_path), pdb_path)
        return [atom for atom in structure[0].get_atoms() if atom.name == "CA"]

    reference_ca = _ca_atoms(ref_pdb)
    predicted_ca = _ca_atoms(predicted_pdb)

    superimposer = Superimposer()
    # fit: find the rotation/translation that best places the moving atoms on the fixed ones
    superimposer.set_atoms(reference_ca, predicted_ca)
    # move the in-memory CA atoms of the predicted structure onto the reference
    superimposer.apply(predicted_ca)

    return superimposer.rms


def rmsd_point(coordinate1, coordinate2):
    """Return the straight-line distance between two 3D points.

    For a single pair of points the "RMSD" reduces to the Euclidean distance.
    Each argument is indexable with x, y, z at positions 0, 1, 2, for example
    ``array([ 11.925492,  10.070204, -12.518902], dtype=float32)``.
    """
    dx = coordinate1[0] - coordinate2[0]
    dy = coordinate1[1] - coordinate2[1]
    dz = coordinate1[2] - coordinate2[2]
    return np.sqrt(dx**2 + dy**2 + dz**2)


def rmsd_list(coordinates1, coordinates2):
    """Return the RMSD between two equally long lists of 3D coordinates.

    Points are paired by position; the result is the square root of the mean
    squared distance over ``len(coordinates1)`` pairs.
    """
    n_points = len(coordinates1)
    squared_distances = []
    for idx in range(n_points):
        p = coordinates1[idx]
        q = coordinates2[idx]
        squared_distances.append((p[0]-q[0])**2 + (p[1]-q[1])**2 + (p[2]-q[2])**2)
    return np.sqrt(sum(squared_distances)/n_points)


# to calculate rmsd per residue using Bio.PDB
def rmsd_pdb_perResidue(moving_pdb, ref_pdb):
    """Return the per-residue CA deviation after superimposing ``moving_pdb`` onto ``ref_pdb``.

    Both files are parsed with Bio.PDB and only the CA atoms of the first
    model are used; the two files should have the same number of CA atoms.

    Returns
    -------
    dict
        Maps ``full_id[3][1]`` of each reference CA atom (its residue id) to the
        distance between that atom and the CA atom at the same position in the
        superimposed moving structure.
    """
    pdb_parser = PDBParser()
    moving_structure = pdb_parser.get_structure(os.path.basename(moving_pdb), moving_pdb)
    ref_structure = pdb_parser.get_structure(os.path.basename(ref_pdb), ref_pdb)
    moving_ca = [atom for atom in moving_structure[0].get_atoms() if atom.name == "CA"]
    ref_ca = [atom for atom in ref_structure[0].get_atoms() if atom.name == "CA"]

    # coordinates of the fixed (reference) atoms
    ref_coords = [atom.get_coord() for atom in ref_ca]

    # fit the moving atoms onto the reference, then read their new coordinates
    superimposer = Superimposer()
    superimposer.set_atoms(ref_ca, moving_ca)
    superimposer.apply(moving_ca)
    moved_coords = [atom.get_coord() for atom in moving_ca]

    # one distance per reference CA atom, keyed by residue id
    per_residue = {}
    for idx, ref_atom in enumerate(ref_ca):
        per_residue[ref_atom.full_id[3][1]] = rmsd_point(ref_coords[idx], moved_coords[idx])

    return per_residue


# get tags with the lowest relax score
# here it generates a tag file (basically a txt file)
# that contains 3 ids with lowest score by default

def get_lowestTag(sc_file, tag_file, num = 3):
    """Write the ids of the ``num`` lowest-scoring entries of a score file to a tag file.

    Parameters
    ----------
    sc_file : str
        Score file generated by Rosetta Relax. The first two lines are skipped;
        on every other line, whitespace-separated column 1 is the score and
        column 23 is the id.
    tag_file : str
        Path of the tag file to write (a file path, not a folder). One id per
        line, lowest score first.
    num : int, default 3
        Number of ids to write.

    Returns
    -------
    int
        1 if ``sc_file`` and ``tag_file`` are the same path (nothing is
        written), otherwise 0.
    """
    if sc_file==tag_file:
        print("score file and tag file should be different!")
        return 1

    # Collect plain tuples and build the DataFrame once; appending rows with
    # .loc[len(df)] re-allocates the frame on every line.
    records = []
    with open(sc_file, "r") as f:
        for count, line in enumerate(f):
            if count < 2:
                continue
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            records.append((float(fields[1]), fields[23]))

    scores_and_ids = pd.DataFrame(records, columns = ['score', 'id'])
    # mergesort is stable, so entries with equal scores keep their file order
    scores_and_ids = scores_and_ids.sort_values(by = 'score', ascending = True, kind = 'mergesort')
    lowest_ids = scores_and_ids['id'].head(num)
    with open(tag_file, 'w') as f:
        for tag in lowest_ids:
            f.write(f"{tag}\n")
    return 0


# write a function to get a dictionary including the deleted point
# out of a score file per residue
def sc_perResidue(del_pos, sc_path):
    """Read a per-residue score file into a dict that also holds the deleted position.

    The first line of ``sc_path`` is skipped. On every other line,
    whitespace-separated column 22 is the score and column 23 has the form
    ``<prefix>_<index>``. Indices below ``del_pos`` are kept as they are;
    indices at or above ``del_pos`` are shifted up by one, so the keys leave a
    gap at the deleted position. ``del_pos`` itself is stored with score 0.
    """
    per_residue = {del_pos: 0}
    with open(sc_path, "r") as handle:
        data_rows = handle.readlines()[1:]

    for row in data_rows:
        fields = row.strip("\n").split()
        residue_score = float(fields[22])
        mutant_index = int(fields[23].split("_")[1])
        # residues from the deletion onwards move one position up
        shifted_index = mutant_index if mutant_index < del_pos else mutant_index + 1
        per_residue[shifted_index] = residue_score

    return per_residue

# These are tests.

In [14]:
# test trim_wildtype function: remove residue 2 from the wildtype model and
# write the trimmed copy into the "test" folder (the function returns 0)
trim_wildtype("alphafold_res/BMRB51377/ranked_0.pdb", 2, "test")

0

In [43]:
# test rmsd_pdb function
# first argument is the moving (predicted) structure, second is the reference

#test1 = "/scicore/home/zavolan/zhu0006/3D_structure/test/insertion_examples/alphafold_res/2Z2A_1/ranked_0_trimmed.pdb"
#test2 = "/scicore/home/zavolan/zhu0006/3D_structure/test/insertion_examples/pdb_data/2z2a/2z2a_A.pdb"
test1 = "alphafold_res/del10/ranked_0.pdb"
test2 = "test/wt_del10.pdb"

rmsd_pdb(test1, test2)

2.8830959599645034

In [38]:
# test rmsd_point function
# NOTE: coords1_rotran and coords2 are created by the "test Superimposer" cell
# further down, so that cell has to be run before this one.
rmsd_per_point = []
for i in range(len(coords1_rotran)):
    current_rmsd = rmsd_point(coords1_rotran[i], coords2[i])
    rmsd_per_point.append(current_rmsd)

In [42]:
# test rmsd_list function
# NOTE: coords1_rotran and coords2 are created by the "test Superimposer" cell
# further down, so that cell has to be run before this one.
rmsd_list(coords1_rotran, coords2)

2.8830960945699724

In [31]:
# test Superimposer
# Builds the CA coordinate lists (coords1, coords2, coords1_rotran) that the
# rmsd_point / rmsd_list test cells use.

#test1 = "/scicore/home/zavolan/zhu0006/3D_structure/test/insertion_examples/alphafold_res/2Z2A_1/ranked_0_trimmed.pdb"
#test2 = "/scicore/home/zavolan/zhu0006/3D_structure/test/insertion_examples/pdb_data/2z2a/2z2a_A.pdb"
test1 = "alphafold_res/del10/ranked_0.pdb"
test2 = "test/wt_del10.pdb"

parser = PDBParser()
struct1 = parser.get_structure("predicted", test1)
struct2 = parser.get_structure("ref", test2)

# CA atoms of the predicted structure and their coordinates before fitting
atoms1 = [atom for atom in struct1[0].get_atoms() if atom.name == "CA"]
print(len(atoms1))
coords1 = [atom.get_coord() for atom in atoms1]

# CA atoms of the reference structure
atoms2 = [atom for atom in struct2[0].get_atoms() if atom.name == "CA"]
coords2 = [atom.get_coord() for atom in atoms2]

# fit the predicted atoms onto the reference, then read their moved coordinates
sup = Superimposer()
sup.set_atoms(atoms2, atoms1)
sup.apply(atoms1)
coords1_rotran = [atom.get_coord() for atom in atoms1]

71


In [57]:
# get the residue id from atom object and check its type
# (atoms2 comes from the "test Superimposer" cell; rmsd_pdb_perResidue uses the
# same full_id[3][1] lookup for its dictionary keys)
type(atoms2[0].full_id[3][1])

int

In [18]:
# test rmsd_pdb_perResidue function
# first argument is the moving (deletion mutant) structure, second is the
# trimmed wildtype reference; the result is reused by the csp dataframe test

#test1 = "/scicore/home/zavolan/zhu0006/3D_structure/test/insertion_examples/alphafold_res/2Z2A_1/ranked_0_trimmed.pdb"
#test2 = "/scicore/home/zavolan/zhu0006/3D_structure/test/insertion_examples/pdb_data/2z2a/2z2a_A.pdb"
test1 = "alphafold_res/del2/ranked_0.pdb"
test2 = "alphafold_res/BMRB51377/trimmed/wt_del2_ranked_0.pdb"

rmsd_perResidue_test = rmsd_pdb_perResidue(test1, test2)
rmsd_perResidue_test

{1: 3.97251192977329,
 3: 1.7567367282868729,
 4: 0.6191483236482239,
 5: 0.951982701124678,
 6: 0.41334614948094317,
 7: 0.3318938430077309,
 8: 0.1534110660325296,
 9: 0.177780806187189,
 10: 0.15060371156259855,
 11: 0.10290612079089577,
 12: 0.09565603354704968,
 13: 0.1307741853387249,
 14: 0.11497936471779963,
 15: 0.10002455563535399,
 16: 0.11898385940727053,
 17: 0.09632220617134535,
 18: 0.11724291107606999,
 19: 0.14160813283555737,
 20: 0.09471287105069345,
 21: 0.09758657078245164,
 22: 0.1023370865720697,
 23: 0.06067063564931613,
 24: 0.04972084203421075,
 25: 0.06531495660228426,
 26: 0.04344971234148392,
 27: 0.04542309214291578,
 28: 0.08672203645695822,
 29: 0.09139718168919334,
 30: 0.11543246769614125,
 31: 0.13145539129952571,
 32: 0.11459670530062156,
 33: 0.1474590977054926,
 34: 0.12227391006919291,
 35: 0.07718420425162145,
 36: 0.10769414312683225,
 37: 0.1115258316567638,
 38: 0.09012119976508692,
 39: 0.0998329616711246,
 40: 0.09274242156795698,
 41: 0.177

To generate a dataframe from the csp files

In [21]:
# test for del2, for example
# rmsd_perResidue_test comes from the rmsd_pdb_perResidue test cell
csp_del2 = "del_csp_data/del2_cssp_updated.csv"
df = (
    pd.read_csv(csp_del2, sep = "\t", names = ['residue_id', 'CSP'])
    # residues missing from the rmsd dictionary become NaN here ...
    .assign(af_rmsd = lambda frame: frame['residue_id'].map(rmsd_perResidue_test))
    # ... and are set to 0 together with any other missing value
    .fillna(0)
)
df

Unnamed: 0,residue_id,CSP,af_rmsd
0,1,0.000000,3.972512
1,2,0.000000,0.000000
2,3,0.000000,1.756737
3,4,0.000000,0.619148
4,5,0.000000,0.951983
...,...,...,...
67,68,0.013085,0.121153
68,69,0.037998,0.285306
69,70,0.043463,0.307882
70,71,0.011390,0.304856


# Analysis starts from here.

In [4]:
# residue positions of the deletion mutants analysed below (15 mutants);
# the folders and files of each mutant are named "del<position>"
del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

## Get trimmed pdb files

In [22]:
# get trimmed pdb files
# here for AF only structures
# For each of the five ranked wildtype models, write one trimmed copy per
# deletion position into the "trimmed_af" subfolder.
wt_folder = "alphafold_res/BMRB51377"
trimmed_folder = os.path.join(wt_folder, "trimmed_af")
# the trimmed files are opened for writing, which needs an existing folder
os.makedirs(trimmed_folder, exist_ok=True)
for i in range(0,5):
    current_pdb = "ranked_" + str(i) + ".pdb"
    # the wildtype models live in wt_folder (the name "wt_path" was never defined)
    current_pdb_path = os.path.join(wt_folder, current_pdb)
    for j in del_positions:
        trim_wildtype(current_pdb_path, j, trimmed_folder)

In [4]:
# generate tag file with the lowest scoring mutant
# one "lowest.tag" per deletion mutant folder, holding a single id (num = 1)
directory = "alphafold_res"
tag_filename = "lowest.tag"
for i in del_positions:
    current_folder = f"del{i}"
    mutant_dir = os.path.join(directory, current_folder)
    sc_path = os.path.join(mutant_dir, f"af_del{i}.sc")
    tag_path = os.path.join(mutant_dir, tag_filename)
    get_lowestTag(sc_path, tag_path, num = 1)

In [None]:
# extract pdb files using tag file
# this should be done using the bash script
# "extract_pdbs.sh"

## Get delta G for wildtype (AF + Rosetta)

In [5]:
# get relax scores per residue
# test if the sum of per residue is the same as in the af_wt.sc

with open("alphafold_res/BMRB51377/ranked_3_0006_perRes.sc", "r") as f:
    # skip the header line; whitespace-separated column 22 holds the score
    scores = [float(row.strip("\n").split()[22]) for row in f.readlines()[1:]]

print(scores)
print(len(scores))
print(sum(scores))

# they are basically the same

[1.157, -0.275, -2.022, -0.727, 2.955, -1.053, -5.941, -5.309, 0.315, -3.441, -5.464, -8.187, -0.95, -1.578, -2.254, -1.249, -3.982, 0.277, 0.386, -7.003, -2.019, -4.093, -2.27, -7.459, -2.116, -0.788, -1.567, -4.02, -5.25, -3.456, -2.574, -3.307, -0.713, -2.375, -4.967, -1.908, -0.563, -5.305, -4.814, -1.403, -2.432, -6.781, -7.167, -2.424, -1.158, -3.489, -2.676, -5.227, -0.221, -1.196, 1.59, -0.929, -3.524, -6.024, -2.631, -3.374, -7.674, -3.395, -2.073, -3.541, -5.446, -1.891, -2.178, -3.747, -3.651, -1.223, -2.907, -2.505, -0.267, -1.425, -1.217, -1.967]
72
-198.082


In [8]:
# get delta G score from wildtype
# using the average of the lowest 3 models

wt_sc_dir = "alphafold_res/BMRB51377"
sc_lists = ["ranked_0_0001_perRes.sc", "ranked_0_0008_perRes.sc", "ranked_3_0006_perRes.sc"]


def _read_perres_scores(sc_path):
    """Return the scores (whitespace-separated column 22) of a per-residue score file, header line skipped."""
    with open(sc_path, "r") as f:
        return [float(line.strip("\n").split()[22]) for count, line in enumerate(f) if count != 0]


# one list per file in sc_lists, in the same order
scores_1, scores_2, scores_3 = [
    _read_perres_scores(os.path.join(wt_sc_dir, sc_name)) for sc_name in sc_lists
]

print(scores_1)
print(scores_2)
print(scores_3)

# residue id (1-based position in the file) -> score of the third model;
# the per-residue analysis cell maps this dictionary into 'delta_G_wt'
scores_3_dic = {i+1: value for i, value in enumerate(scores_3)}
print(scores_3_dic)

# residue id -> mean score over the three models (zip stops at the shortest list)
average_wt_scores = [(a + b + c) / 3 for a, b, c in zip(scores_1, scores_2, scores_3)]
average_wt_scores = {i+1: value for i, value in enumerate(average_wt_scores)}

[1.541, -0.785, 0.969, -0.575, 0.873, -2.255, -6.156, -5.338, -0.051, -4.594, -5.524, -8.195, -0.773, -1.555, -2.482, -1.395, -3.906, 0.24, 0.291, -7.233, -2.092, -4.147, -2.554, -7.359, -2.041, -0.748, -1.806, -4.131, -4.81, -3.232, -2.309, -4.267, -1.023, -2.11, -4.983, -2.194, -0.605, -5.445, -4.305, -1.875, -2.272, -5.713, -6.911, -2.596, -0.676, -3.357, -2.777, -5.651, -0.238, -1.391, 1.362, -0.828, -3.957, -6.303, -2.8, -3.283, -7.637, -3.527, -2.281, -3.309, -5.206, -1.931, -2.229, -3.976, -3.583, -1.533, -3.394, -1.765, -0.899, -1.093, -0.389, 1.095]
[1.813, -0.789, 0.715, -0.575, 0.887, -2.261, -6.15, -5.35, -0.046, -4.57, -5.555, -8.188, -0.78, -1.555, -2.507, -1.41, -3.896, 0.247, 0.301, -7.239, -2.091, -4.143, -2.53, -7.38, -2.065, -0.742, -1.769, -4.131, -4.903, -3.232, -2.302, -4.282, -1.038, -2.089, -4.975, -2.182, -0.607, -5.426, -4.31, -1.876, -2.259, -5.694, -6.904, -2.641, -0.636, -3.319, -2.784, -5.658, -0.235, -1.398, 1.352, -0.83, -3.958, -6.297, -2.799, -3.283, -

In [None]:
# calculate delta G per residue for each deletion mutants
# this should be done using the bash script
# "score_perResidue.sh"

## Get CSP, RMSD, dG and ddG

In [9]:
# get csp, rmsd, dG and ddG
### rmsd 1: AlphaFold, wt vs del
### rmsd 2: AlphaFold & Rosetta del vs AF-only wt
### rmsd 3: AlphaFold & Rosetta, wt vs del
#
# Writes one csv per deletion mutant into results_csv_dir.
# Needs del_positions and scores_3_dic from earlier cells.


# set directory
csp_dir = "del_csp_data"
af_dir = "alphafold_res"
trimmed_af_dir = "alphafold_res/BMRB51377/trimmed_af"
trimmed_relax_dir = "alphafold_res/BMRB51377/trimmed_relax"
results_csv_dir = "perResidue_info"

# to_csv does not create missing folders
os.makedirs(results_csv_dir, exist_ok=True)

# loop in 15 deletion mutants
for i in del_positions:
    # set current deletion mutant name and folder
    del_name = "del" + str(i)
    mutant_dir = os.path.join(af_dir, del_name)

    # get csp data and create dataframe
    csp_path = os.path.join(csp_dir, del_name + "_cssp_updated.csv")
    df = pd.read_csv(csp_path, sep = "\t", names = ['residue_id', 'CSP'])

    # get af-only trimmed wildtype structure, use ranked_0
    trimmed_af_path = os.path.join(trimmed_af_dir, "wt_" + del_name + "_ranked_0.pdb")

    # get af+rosetta trimmed wildtype structure, use the one with lowest score
    trimmed_relax_path = os.path.join(trimmed_relax_dir, "wt_" + del_name + "_ranked_3_0006.pdb")

    # rmsd 1: AlphaFold, wt vs del
    af_path = os.path.join(mutant_dir, "ranked_0.pdb")
    af_rmsd = rmsd_pdb_perResidue(af_path, trimmed_af_path)
    df['af_rmsd'] = df['residue_id'].map(af_rmsd)

    # The first line of lowest.tag names the lowest-scoring relaxed model.
    # Both the relaxed pdb and its per-residue score file are derived from
    # it, so the file is read once.
    tag_path = os.path.join(mutant_dir, "lowest.tag")
    with open(tag_path, 'r') as f:
        lowest_tag = f.readline().strip()

    # rmsd 2: AlphaFold & Rosetta del vs AF-only wt
    # rmsd 3: AlphaFold & Rosetta, wt vs del
    relax_path = os.path.join(mutant_dir, lowest_tag + ".pdb")
    relax_rmsd_2 = rmsd_pdb_perResidue(relax_path, trimmed_af_path) # rmsd 2
    relax_rmsd_3 = rmsd_pdb_perResidue(relax_path, trimmed_relax_path) # rmsd 3
    df['relax_rmsd_wtAF'] = df['residue_id'].map(relax_rmsd_2)
    df['relax_rmsd_wtRelax'] = df['residue_id'].map(relax_rmsd_3)

    # get dG for mutants (AF + Rosetta)
    # ddG between mutants and wt (AF + Rosetta)
    sc_path = os.path.join(mutant_dir, lowest_tag + "_perRes.sc")
    dG = sc_perResidue(i, sc_path)
    df['delta_G'] = df['residue_id'].map(dG)
    df['delta_G_wt'] = df['residue_id'].map(scores_3_dic) # this is from the previous chunk
    df['ddG'] = df['delta_G'] - df['delta_G_wt']
    df['absddG'] = df['ddG'].abs()

    # fill NA value (residues missing from any of the mapped dictionaries)
    df = df.fillna(0)

    # save csv files
    csv_path = os.path.join(results_csv_dir, del_name + ".csv")
    df.to_csv(csv_path, index = False)

## Correlation Coef, CSP

In [15]:
# calculate correlation coefficient, pearson
### CSP vs RMSD

### rmsd 1: AlphaFold, wt vs del
### rmsd 2: AlphaFold & Rosetta del vs AF-only wt
### rmsd 3: AlphaFold & Rosetta, wt vs del

# folder with the per-residue csv files (one per deletion mutant) and output file
results_csv_dir = "perResidue_info"
output_csv = "correlation_csv/pearson.csv"
corr_method = "pearson"

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# output column -> per-residue column that is correlated with CSP
# (the order here is the column order of the output csv)
corr_columns = {
    'af_corr_CSPvsRMSD': 'af_rmsd',                        # rmsd 1
    'relax_corr_wtAF_CSPvsRMSD': 'relax_rmsd_wtAF',        # rmsd 2
    'relax_corr_wtRelax_CSPvsRMSD': 'relax_rmsd_wtRelax',  # rmsd 3
    'relax_corr_CSPvsddG': 'absddG',                       # abs(ddG), AlphaFold + Rosetta
}

corr_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)

    # read csv file as dataframe
    df = pd.read_csv(os.path.join(results_csv_dir, del_name + '.csv'))

    # filter out 0 CSP values and the deletion point
    df = df[(df['CSP'] != 0) & (df['residue_id'] != del_pos)]

    row = {'del_names': del_name}
    for out_col, value_col in corr_columns.items():
        row[out_col] = df['CSP'].corr(df[value_col], method = corr_method)
    corr_rows.append(row)

df_corr = pd.DataFrame(corr_rows, columns = ['del_names'] + list(corr_columns))

# to_csv does not create missing folders
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_corr.to_csv(output_csv, index = False)

In [16]:
# calculate correlation coefficient, kendall
### CSP vs RMSD

### rmsd 1: AlphaFold, wt vs del
### rmsd 2: AlphaFold & Rosetta del vs AF-only wt
### rmsd 3: AlphaFold & Rosetta, wt vs del

# folder with the per-residue csv files (one per deletion mutant) and output file
results_csv_dir = "perResidue_info"
output_csv = "correlation_csv/kendall.csv"
corr_method = "kendall"

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# output column -> per-residue column that is correlated with CSP
# (the order here is the column order of the output csv)
corr_columns = {
    'af_corr_CSPvsRMSD': 'af_rmsd',                        # rmsd 1
    'relax_corr_wtAF_CSPvsRMSD': 'relax_rmsd_wtAF',        # rmsd 2
    'relax_corr_wtRelax_CSPvsRMSD': 'relax_rmsd_wtRelax',  # rmsd 3
    'relax_corr_CSPvsddG': 'absddG',                       # abs(ddG), AlphaFold + Rosetta
}

corr_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)

    # read csv file as dataframe
    df = pd.read_csv(os.path.join(results_csv_dir, del_name + '.csv'))

    # filter out 0 CSP values and the deletion point
    df = df[(df['CSP'] != 0) & (df['residue_id'] != del_pos)]

    row = {'del_names': del_name}
    for out_col, value_col in corr_columns.items():
        row[out_col] = df['CSP'].corr(df[value_col], method = corr_method)
    corr_rows.append(row)

df_corr = pd.DataFrame(corr_rows, columns = ['del_names'] + list(corr_columns))

# to_csv does not create missing folders
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_corr.to_csv(output_csv, index = False)

In [17]:
# calculate correlation coefficient, spearman
### CSP vs RMSD

### rmsd 1: AlphaFold, wt vs del
### rmsd 2: AlphaFold & Rosetta del vs AF-only wt
### rmsd 3: AlphaFold & Rosetta, wt vs del

# folder with the per-residue csv files (one per deletion mutant) and output file
results_csv_dir = "perResidue_info"
output_csv = "correlation_csv/spearman.csv"
corr_method = "spearman"

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# output column -> per-residue column that is correlated with CSP
# (the order here is the column order of the output csv)
corr_columns = {
    'af_corr_CSPvsRMSD': 'af_rmsd',                        # rmsd 1
    'relax_corr_wtAF_CSPvsRMSD': 'relax_rmsd_wtAF',        # rmsd 2
    'relax_corr_wtRelax_CSPvsRMSD': 'relax_rmsd_wtRelax',  # rmsd 3
    'relax_corr_CSPvsddG': 'absddG',                       # abs(ddG), AlphaFold + Rosetta
}

corr_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)

    # read csv file as dataframe
    df = pd.read_csv(os.path.join(results_csv_dir, del_name + '.csv'))

    # filter out 0 CSP values and the deletion point
    df = df[(df['CSP'] != 0) & (df['residue_id'] != del_pos)]

    row = {'del_names': del_name}
    for out_col, value_col in corr_columns.items():
        row[out_col] = df['CSP'].corr(df[value_col], method = corr_method)
    corr_rows.append(row)

df_corr = pd.DataFrame(corr_rows, columns = ['del_names'] + list(corr_columns))

# to_csv does not create missing folders
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_corr.to_csv(output_csv, index = False)

## Correlation Coef, abs(ddG) vs RMSD

In [19]:
# calculate correlation coefficient, pearson
# abs(ddG) vs RMSD

# folder with the per-residue csv files (one per deletion mutant) and output file
results_csv_dir = "perResidue_info"
output_csv = "correlation_csv/pearson_absddGvsRMSD.csv"
corr_method = "pearson"

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# output column -> per-residue RMSD column that is correlated with abs(ddG)
# (the order here is the column order of the output csv)
corr_columns = {
    'af_corr_absddGvsRMSD': 'af_rmsd',                        # rmsd 1: AlphaFold, wt vs del
    'relax_corr_wtAF_absddGvsRMSD': 'relax_rmsd_wtAF',        # rmsd 2: AlphaFold & Rosetta del vs AF-only wt
    'relax_corr_wtRelax_absddGvsRMSD': 'relax_rmsd_wtRelax',  # rmsd 3: AlphaFold & Rosetta, wt vs del
}

corr_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)

    # read csv file as dataframe
    df = pd.read_csv(os.path.join(results_csv_dir, del_name + '.csv'))

    # filter out the deletion point
    df = df[df['residue_id'] != del_pos]

    row = {'del_names': del_name}
    for out_col, value_col in corr_columns.items():
        row[out_col] = df['absddG'].corr(df[value_col], method = corr_method)
    corr_rows.append(row)

df_corr = pd.DataFrame(corr_rows, columns = ['del_names'] + list(corr_columns))

# to_csv does not create missing folders
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_corr.to_csv(output_csv, index = False)

In [20]:
# calculate correlation coefficient, kendall
# abs(ddG) vs RMSD

# folder with the per-residue csv files (one per deletion mutant) and output file
results_csv_dir = "perResidue_info"
output_csv = "correlation_csv/kendall_absddGvsRMSD.csv"
corr_method = "kendall"

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# output column -> per-residue RMSD column that is correlated with abs(ddG)
# (the order here is the column order of the output csv)
corr_columns = {
    'af_corr_absddGvsRMSD': 'af_rmsd',                        # rmsd 1: AlphaFold, wt vs del
    'relax_corr_wtAF_absddGvsRMSD': 'relax_rmsd_wtAF',        # rmsd 2: AlphaFold & Rosetta del vs AF-only wt
    'relax_corr_wtRelax_absddGvsRMSD': 'relax_rmsd_wtRelax',  # rmsd 3: AlphaFold & Rosetta, wt vs del
}

corr_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)

    # read csv file as dataframe
    df = pd.read_csv(os.path.join(results_csv_dir, del_name + '.csv'))

    # filter out the deletion point
    df = df[df['residue_id'] != del_pos]

    row = {'del_names': del_name}
    for out_col, value_col in corr_columns.items():
        row[out_col] = df['absddG'].corr(df[value_col], method = corr_method)
    corr_rows.append(row)

df_corr = pd.DataFrame(corr_rows, columns = ['del_names'] + list(corr_columns))

# to_csv does not create missing folders
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_corr.to_csv(output_csv, index = False)

In [21]:
# calculate correlation coefficient, spearman
# abs(ddG) vs RMSD

# folder with the per-residue csv files (one per deletion mutant) and output file
results_csv_dir = "perResidue_info"
output_csv = "correlation_csv/spearman_absddGvsRMSD.csv"
corr_method = "spearman"

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# output column -> per-residue RMSD column that is correlated with abs(ddG)
# (the order here is the column order of the output csv)
corr_columns = {
    'af_corr_absddGvsRMSD': 'af_rmsd',                        # rmsd 1: AlphaFold, wt vs del
    'relax_corr_wtAF_absddGvsRMSD': 'relax_rmsd_wtAF',        # rmsd 2: AlphaFold & Rosetta del vs AF-only wt
    'relax_corr_wtRelax_absddGvsRMSD': 'relax_rmsd_wtRelax',  # rmsd 3: AlphaFold & Rosetta, wt vs del
}

corr_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)

    # read csv file as dataframe
    df = pd.read_csv(os.path.join(results_csv_dir, del_name + '.csv'))

    # filter out the deletion point
    df = df[df['residue_id'] != del_pos]

    row = {'del_names': del_name}
    for out_col, value_col in corr_columns.items():
        row[out_col] = df['absddG'].corr(df[value_col], method = corr_method)
    corr_rows.append(row)

df_corr = pd.DataFrame(corr_rows, columns = ['del_names'] + list(corr_columns))

# to_csv does not create missing folders
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
df_corr.to_csv(output_csv, index = False)

# Extract pLDDT and deltaG

In [13]:
# get plddt score, define function

def get_plddt(af_ranking_file):
    """Return the highest pLDDT value stored in an AlphaFold ranking JSON file.

    Parameters
    ----------
    af_ranking_file : str
        Path to a JSON file whose top-level ``"plddts"`` entry maps names to
        numeric scores.

    Returns
    -------
    float
        The largest value under ``"plddts"``, rounded to three decimals.
    """
    # the context manager closes the file; json.load(open(...)) left it open
    with open(af_ranking_file) as handle:
        af_ranking = json.load(handle)
    plddt = format(max(af_ranking['plddts'].values()), '.3f')
    return float(plddt)

In [14]:
# get plddt score and delta G
# One row per deletion mutant: its best pLDDT and the score of the relaxed
# model named in lowest.tag.

del_positions = [2, 3, 5, 50, 51, 52, 62, 64, 66, 67, 68, 69, 70, 71, 72]

# define directory
af_dir = "alphafold_res"

# extract and save plddt score and delta G
score_rows = []
for del_pos in del_positions:
    # mutant names are derived from del_positions so the two cannot drift apart
    del_name = "del" + str(del_pos)
    mutant_dir = os.path.join(af_dir, del_name)

    plddt = get_plddt(os.path.join(mutant_dir, "ranking_debug.json"))

    # first line of lowest.tag is the id of the lowest-scoring relaxed model
    with open(os.path.join(mutant_dir, "lowest.tag"), 'r') as f:
        relax_filename = f.readline().strip()

    # look that id up in the score file (first two lines are skipped;
    # column 1 is the score, column 23 the id)
    sc_path = os.path.join(mutant_dir, "af_del" + str(del_pos) + ".sc")
    deltaG = 0.0  # stays 0.0 when the id is not found in the score file
    with open(sc_path, "r") as f:
        for count, line in enumerate(f):
            if count < 2:
                continue
            fields = line.split()
            if not fields:
                # tolerate blank lines
                continue
            if fields[23]==relax_filename:
                # store a number, not the raw text, so the column is numeric
                deltaG = float(fields[1])
                break

    score_rows.append({'del_names': del_name, 'plddt': plddt, 'deltaG': deltaG})

df_scores = pd.DataFrame(score_rows, columns = ['del_names', 'plddt', 'deltaG'])
df_scores

Unnamed: 0,del_names,plddt,deltaG
0,del2,88.676,-202.959
1,del3,88.69,-203.504
2,del5,90.004,-208.114
3,del50,88.136,-211.722
4,del51,87.616,-203.912
5,del52,87.908,-200.888
6,del62,91.865,-214.146
7,del64,89.206,-209.198
8,del66,88.225,-203.411
9,del67,89.073,-207.568
