In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

def filter_results(input_csv_path, sc_value_threshold=0.6, pred_tm_score_threshold=0.7,
                       average_pae_threshold=15, average_plddt_threshold=0.8,
                       dG_separated_threshold=-65, rmsd_threshold=2.5, save_results=True):
    """
    Plot the score distributions of an AlphaFold results table and filter it.

    The CSV is read with its first column as the index and sorted by 'rmsd'.
    A 3x2 grid of histograms (12 bins each) is shown for the six score columns,
    then only the rows that pass every threshold are kept.

    Args:
        input_csv_path (str): Path to the results CSV.
        sc_value_threshold (float): Keep rows with 'sc_value' strictly above this.
        pred_tm_score_threshold (float): Keep rows with 'pred_tm_score' strictly above this.
        average_pae_threshold (float): Keep rows with 'average_pae' strictly below this.
        average_plddt_threshold (float): Keep rows with 'average_plddt' strictly above this.
            NOTE(review): the default of 0.8 looks like a 0-1 scale, while the notebook
            calls this function with 75 -- confirm which scale the column uses.
        dG_separated_threshold (float): Keep rows with 'dG_separated' strictly below this.
        rmsd_threshold (float): Keep rows with 'rmsd' strictly below this.
        save_results (bool): When True, write the filtered table (without its index)
            to 'filtered_AF_results.csv' one directory above the input file's directory.

    Returns:
        pd.DataFrame: The rows that passed all six filters.
    """
    df = pd.read_csv(input_csv_path, index_col=0).sort_values(by=['rmsd'])

    # (column, bar colour, panel title, x-axis label), listed in row-major panel order.
    histogram_panels = [
        ('sc_value', 'blue', 'SC Value Histogram', 'SC Value'),
        ('pred_tm_score', 'green', 'Predicted TM Score Histogram', 'Predicted TM Score'),
        ('average_plddt', 'orange', 'Average pLDDT Histogram', 'Average pLDDT'),
        ('average_pae', 'red', 'Average PAE Histogram', 'Average PAE'),
        ('rmsd', 'purple', 'RMSD Histogram', 'RMSD'),
        ('dG_separated', 'cyan', 'Rosetta Energy Histogram', 'Rosetta Energy'),
    ]

    fig, axs = plt.subplots(3, 2, figsize=(12, 15))
    for panel, (column, colour, title, x_label) in zip(axs.flat, histogram_panels):
        panel.hist(df[column], bins=12, alpha=0.5, color=colour)
        panel.set_title(title)
        panel.set_xlabel(x_label)
        panel.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    # Build the row mask one criterion at a time; a row must satisfy all of them.
    passes = df['sc_value'] > sc_value_threshold
    passes &= df['pred_tm_score'] > pred_tm_score_threshold
    passes &= df['average_pae'] < average_pae_threshold
    passes &= df['average_plddt'] > average_plddt_threshold
    passes &= df['dG_separated'] < dG_separated_threshold
    passes &= df['rmsd'] < rmsd_threshold
    filtered_df = df[passes]

    # The count reported is the number of distinct IDs among the surviving rows.
    print(f"The total number of geometries that passed the filters are: {len(filtered_df.ID.unique())}")
    if not save_results:
        return filtered_df

    output_filtered_path = os.path.join(os.path.dirname(input_csv_path), '../filtered_AF_results.csv')
    print(f"Saving the filtered results to {output_filtered_path}")
    filtered_df.to_csv(output_filtered_path, index=False)
    return filtered_df

In [3]:
import shutil

# Symmetry order and radius identifying the geometry set processed by this cell.
nsym = 11
radius = 140

mmgbsa_input_folder='MMGBSA_template_scripts'

# # For round 1
# af_results_path = f'../4_Alphafold_Predictions/{nsym}mer/{nsym}_r{radius}/Finished_AF_predictions/AF_top_results.csv'
# pdb_input_folder = f"../3_MPNN_Design/{nsym}mer/{nsym}_r{radius}/SelectedDesigns"
# final_selection_folder = f'../8_Final_Selection/{nsym}mer/{nsym}_r{radius}/Round1'

# For round 2
af_results_path = f'../7_Alphafold_Predictions/{nsym}mer/{nsym}_r{radius}/Finished_AF_predictions/AF_top_results.csv'
pdb_input_folder = f"../6_GenerateDesigns/{nsym}mer/{nsym}_r{radius}/SelectedDesigns"
final_selection_folder = f'../8_Final_Selection/{nsym}mer/{nsym}_r{radius}/Round2'

# Filter thresholds passed to filter_results
sc_value_threshold = 0.60
pred_tm_score_threshold = 0.7
average_pae_threshold = 16
average_plddt_threshold = 75
dG_separated_threshold = -40
rmsd_threshold = 2.5

# Single switch for every write in this cell (filtered CSV, PDB copies, merged CSV).
# Keep it False for a dry run that only plots and filters.
save_results = False
filtered_df = filter_results(af_results_path, sc_value_threshold, pred_tm_score_threshold, average_pae_threshold, average_plddt_threshold, dG_separated_threshold, rmsd_threshold, save_results)

if save_results:
    os.makedirs(final_selection_folder, exist_ok=True)
    # An ID can appear on several rows (one per AF rank); copy each PDB only once.
    for pdb_id in filtered_df['ID'].unique():
        pdb_file = os.path.join(pdb_input_folder, f'{pdb_id}.pdb')
        if not os.path.isfile(pdb_file):
            # Stay best-effort like the original `cp`, but make the miss visible.
            print(f'WARNING: {pdb_file} not found, skipping')
            continue
        print(f'Copying {pdb_id}.pdb to {final_selection_folder}')
        # shutil.copy avoids building a shell command from interpolated paths.
        shutil.copy(pdb_file, final_selection_folder)

    # Append this round's selection to the per-oligomer summary, dropping exact duplicates.
    output_path = f'../8_Final_Selection/{nsym}mer/FINAL_AF_RESULTS.csv'
    if os.path.exists(output_path):
        existing_df = pd.read_csv(output_path)
        filtered_df = pd.concat([existing_df, filtered_df]).drop_duplicates().reset_index(drop=True)
    filtered_df.to_csv(output_path, index=False)
            
# if save_results:
#     os.makedirs(final_selection_folder, exist_ok=True)

#     for pdb_id in filtered_df['ID']:
#         print(f'Copying {pdb_id}.pdb to {final_selection_folder}')
#         pdb_file = f'{pdb_input_folder}/{pdb_id}.pdb'
#         destination = f'{final_selection_folder}/'
#         os.system(f'cp {pdb_file} {destination}')
#         os.makedirs(f'../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run1/', exist_ok=True)
#         os.makedirs(f'../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run2/', exist_ok=True)
#         os.makedirs(f'../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run3/', exist_ok=True)
#         os.system(f'cp {pdb_input_folder}/{pdb_id}.pdb ../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run1/')
#         os.system(f'cp {pdb_input_folder}/{pdb_id}.pdb ../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run2/')
#         os.system(f'cp {pdb_input_folder}/{pdb_id}.pdb ../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run3/')
#         # copy the MMGBSA input scripts
#         os.system(f'cp {mmgbsa_input_folder}/* ../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run1/')
#         os.system(f'cp {mmgbsa_input_folder}/* ../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run2/')
#         os.system(f'cp {mmgbsa_input_folder}/* ../MMGBSA/{nsym}mer/{nsym}_r{radius}/{pdb_id}/run3/')


FileNotFoundError: [Errno 2] No such file or directory: '../7_Alphafold_Predictions/11mer/11_r140/Finished_AF_predictions/AF_top_results.csv'

In [7]:
# Show full cell contents (no "..." truncation) so the long design IDs are readable.
pd.set_option('display.max_colwidth', None) 


In [8]:
# Display the rows that passed the filters (the whole frame is rendered).
filtered_df

Unnamed: 0,ID,AF_rank,average_plddt,pred_tm_score,average_pae,rmsd,geometry,dG_separated,sc_value,dSASA_int,hydropathy_score,hydrophobic_percentage,cluster_label,geometry_score,n_contacts_geometry,total_geometry_score
1669,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544_rank391_0001,1.0,84.594174,0.74,12.554917,1.937469,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544,,,,,,,,,
1677,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544_rank391_0001,3.0,86.092341,0.74,13.202417,1.962423,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544,,,,,,,,,
138,N271_xtal_11_r140_rot181.38_163.59_1.61_score0.482_rank70_0001,1.0,85.732871,0.71,15.805982,2.069047,N271_xtal_11_r140_rot181.38_163.59_1.61_score0.482,,,,,,,,,
1724,N271_xtal_11_r140_rot207.99_332.54_1.19_score0.472_rank341_0001,,85.532754,0.77,15.330835,2.079173,N271_xtal_11_r140_rot207.99_332.54_1.19_score0.472,,,,,,,,,
1676,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544_rank391_0001,4.0,84.697669,0.72,12.928449,2.084201,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544,,,,,,,,,
1728,N271_xtal_11_r140_rot207.99_332.54_1.19_score0.472_rank341_0001,,88.581801,0.8,14.161425,2.324693,N271_xtal_11_r140_rot207.99_332.54_1.19_score0.472,,,,,,,,,
1673,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544_rank391_0001,2.0,84.131494,0.74,12.853384,2.333346,N271_xtal_11_r140_rot207.27_2.35_181.87_score0.544,,,,,,,,,


1669    N271_xtal_11_r140_rot207.27_2.35_181.87_score0...
1677    N271_xtal_11_r140_rot207.27_2.35_181.87_score0...
138     N271_xtal_11_r140_rot181.38_163.59_1.61_score0...
1724    N271_xtal_11_r140_rot207.99_332.54_1.19_score0...
1676    N271_xtal_11_r140_rot207.27_2.35_181.87_score0...
1728    N271_xtal_11_r140_rot207.99_332.54_1.19_score0...
1673    N271_xtal_11_r140_rot207.27_2.35_181.87_score0...
Name: ID, dtype: object

Other things

In [182]:
import os
import shutil

# Collect every .pdb found exactly three directory levels below `src` and copy it
# to dst/<level 1>/<level 2>/, i.e. the third directory level is flattened away.
src = '../8_Final_Selection'
dst = '../9_Final_Analysis'

for level1 in os.listdir(src):
    level1_path = os.path.join(src, level1)
    if not os.path.isdir(level1_path):
        continue
    for level2 in os.listdir(level1_path):
        level2_path = os.path.join(level1_path, level2)
        if not os.path.isdir(level2_path):
            continue
        target_dir = os.path.join(dst, level1, level2)
        for level3 in os.listdir(level2_path):
            level3_path = os.path.join(level2_path, level3)
            if not os.path.isdir(level3_path):
                continue
            for entry in os.listdir(level3_path):
                if not entry.endswith('.pdb'):
                    continue
                # The target directory is only created once there is a file to put in it.
                os.makedirs(target_dir, exist_ok=True)
                shutil.copy(os.path.join(level3_path, entry), os.path.join(target_dir, entry))

In [1]:
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.PDB import PDBParser, PPBuilder
from Bio.Align import substitution_matrices
import pandas as pd
import numpy as np

def get_fasta(pdb_file_path):
    """
    Build the polypeptides found in a PDB file.

    Despite its name, this returns the objects produced by
    PPBuilder.build_peptides, not FASTA text.

    Args:
        pdb_file_path (str): Path to the PDB file.

    Returns:
        list: Polypeptides built from the parsed structure.

    Raises:
        FileNotFoundError: If nothing exists at pdb_file_path.
    """
    if os.path.exists(pdb_file_path):
        # QUIET=True keeps the parser from emitting warnings while reading.
        structure = PDBParser(QUIET=True).get_structure('protein', pdb_file_path)
        return PPBuilder().build_peptides(structure)

    raise FileNotFoundError(f"PDB file not found at {pdb_file_path}")


def hamming_distance(seq1, seq2):
    """
    Count the positions at which two equal-length sequences differ.

    The comparison is case-sensitive.

    Args:
        seq1 (str): First sequence.
        seq2 (str): Second sequence.

    Returns:
        int: Number of mismatching positions.

    Raises:
        ValueError: If the sequences have different lengths.
    """
    if len(seq1) == len(seq2):
        mismatches = 0
        for left, right in zip(seq1, seq2):
            if left != right:
                mismatches += 1
        return mismatches

    raise ValueError("Sequences must be of equal length for Hamming distance calculation.")

def weighted_distance(seq1, seq2, distance_matrix):
    """
    Sum per-position substitution weights over two equal-length sequences.

    Residues are upper-cased before comparison. Identical residues contribute
    nothing. For a mismatch the weight is looked up as (res1, res2), then as
    (res2, res1), and falls back to 0 when neither key is in the matrix.

    Args:
        seq1 (str): First sequence.
        seq2 (str): Second sequence.
        distance_matrix (dict): Maps (residue, residue) tuples to a numeric weight.

    Returns:
        float: Accumulated weight over all mismatching positions.

    Raises:
        ValueError: If the sequences have different lengths.
    """
    if len(seq1) != len(seq2):
        raise ValueError("Sequences must be of equal length for weighted distance calculation.")

    accumulated = 0.0
    upper_pairs = zip((res.upper() for res in seq1), (res.upper() for res in seq2))
    for res_a, res_b in upper_pairs:
        if res_a == res_b:
            continue
        reverse_weight = distance_matrix.get((res_b, res_a), 0)
        accumulated += distance_matrix.get((res_a, res_b), reverse_weight)
    return accumulated

def compare_seqs(original_seq, new_seq_list):
    """
    Merge the substitutions found in several designed sequences into one sequence.

    Each position of the original sequence is compared (case-sensitively) with the
    same position of every new sequence:

    * no new sequence differs      -> the original residue, upper case;
    * the new sequences that differ all carry the same residue
                                   -> that residue, lower case;
    * the new sequences disagree on the replacement -> ValueError.

    The returned sequence always has the same length as the original. Earlier
    versions emitted nothing for a position where both new sequences differed
    from the original, which shortened the result and shifted later residues.

    Args:
        original_seq (str): The original sequence.
        new_seq_list (list): New sequences to compare against the original. Any
            number of sequences is accepted; with an empty list the original is
            returned upper-cased.

    Returns:
        str: The unified sequence with substituted positions in lower case.

    Raises:
        ValueError: If a new sequence has a different length than the original,
            or if two new sequences propose different residues for one position.
    """
    for new_seq in new_seq_list:
        if len(original_seq) != len(new_seq):
            raise ValueError('The original and new sequences are not the same length.')

    unified_residues = []
    for i, original_res in enumerate(original_seq):
        # Distinct replacement residues proposed at this position, in list order.
        substitutions = []
        for new_seq in new_seq_list:
            new_res = new_seq[i]
            if new_res != original_res and new_res not in substitutions:
                substitutions.append(new_res)

        if not substitutions:
            unified_residues.append(original_res.upper())
        elif len(substitutions) == 1:
            unified_residues.append(substitutions[0].lower())
        else:
            # Conflicting designs cannot be represented by a single residue.
            raise ValueError(
                f'Position {i} is different between the original and new sequences: '
                f'{original_res} vs {" and ".join(substitutions)}'
            )
    return ''.join(unified_residues)

def get_designed_sequence(designed_pdb_path):
    """
    Read the sequence of every chain in a PDB file.

    Each chain is split into peptides by PPBuilder and the peptide sequences are
    concatenated in order. If the structure holds several models, a chain id seen
    again in a later model replaces the earlier entry.

    Args:
        designed_pdb_path (str): Path to the PDB file.

    Returns:
        dict: Maps chain id to that chain's concatenated sequence string.
    """
    structure = PDBParser().get_structure("protein", designed_pdb_path)
    peptide_builder = PPBuilder()  # Polypeptide builder used to extract sequences
    return {
        chain.id: "".join(
            str(peptide.get_sequence()) for peptide in peptide_builder.build_peptides(chain)
        )
        for model in structure
        for chain in model
    }

def get_new_unified_sequences(original_seq, designed_seq):
    """
    Build the unified A and B sequences from a four-chain design.

    Designed chains A and C are both compared against the original
    'N271_design_A' sequence, and designed chains B and D against
    'N271_design_B', using compare_seqs.

    Args:
        original_seq (dict): Must contain the keys 'N271_design_A' and 'N271_design_B'.
        designed_seq (dict): Must contain the chain ids 'A', 'B', 'C' and 'D'.

    Returns:
        dict: {'A': unified sequence for A/C, 'B': unified sequence for B/D}.
    """
    # (output label, key of the original sequence, designed chains compared against it)
    chain_layout = (
        ('A', 'N271_design_A', ('A', 'C')),
        ('B', 'N271_design_B', ('B', 'D')),
    )

    unified_sequence = {}
    for label, original_key, designed_chain_ids in chain_layout:
        print(f'Checking chain {label}')
        reference = original_seq[original_key]
        candidates = [designed_seq[chain_id] for chain_id in designed_chain_ids]
        unified_sequence[label] = compare_seqs(reference, candidates)
    return unified_sequence

def compare_sequences_from_list(comparison_sequences_paths, reference_path):
    """
    Score every comparison FASTA file against a reference FASTA file.

    For each comparison file and each reference record, the first record with
    the same id is scored with the Hamming distance and with weighted_distance
    under the three module-level matrices. Reference records without a matching
    id in a file produce no row for that file.

    Args:
        comparison_sequences_paths (iterable): Paths of the FASTA files to compare.
        reference_path (str): Path to the reference FASTA file.

    Returns:
        pd.DataFrame: One row per (file, matched record id) with the columns
        'PDB File', 'Chain ID', 'Hamming Distance', 'Polar-Apolar Distance',
        'BLOSUM62 Score' and 'Weighted BLOSUM62 Score'.
    """
    reference_records = list(SeqIO.parse(reference_path, "fasta"))
    rows = []

    for fasta_path in comparison_sequences_paths:
        candidate_records = list(SeqIO.parse(fasta_path, "fasta"))
        for reference in reference_records:
            # Only the first candidate sharing the reference id is scored.
            match = next((rec for rec in candidate_records if reference.id == rec.id), None)
            if match is None:
                continue

            scores = (
                hamming_distance(reference.seq, match.seq),
                weighted_distance(reference.seq, match.seq, distance_matrix_polar_apolar),
                weighted_distance(reference.seq, match.seq, blosum62_matrix),
                weighted_distance(reference.seq, match.seq, distance_matrix_polar_apolar_blosum62),
            )
            rows.append({
                "PDB File": os.path.basename(fasta_path),
                "Chain ID": reference.id,
                "Hamming Distance": scores[0],
                "Polar-Apolar Distance": scores[1],
                "BLOSUM62 Score": scores[2],
                "Weighted BLOSUM62 Score": scores[3],
            })

    return pd.DataFrame(rows)

# Polar and nonpolar amino acid custom distance matrix
polar_aa = {'D', 'E', 'K', 'R', 'Q', 'N', 'H', 'S', 'T', 'Y'}
apolar_aa = {'A', 'V', 'L', 'I', 'M', 'F', 'W', 'G', 'P', 'C'}

# 0.0 when both residues fall in the same class (polar/polar or apolar/apolar),
# 1.0 when the substitution crosses classes. The two sets are disjoint, so
# "same class" is the same as "equal membership in polar_aa".
distance_matrix_polar_apolar = {
    (aa1, aa2): 0.0 if (aa1 in polar_aa) == (aa2 in polar_aa) else 1.0
    for aa1 in polar_aa | apolar_aa
    for aa2 in polar_aa | apolar_aa
}

# BLOSUM62 as a plain dict keyed by residue pair, stored in both orientations.
blosum62 = substitution_matrices.load('BLOSUM62') 
blosum62_matrix = {}
for (aa1, aa2), score in blosum62.items():
    blosum62_matrix[(aa1, aa2)] = score
    blosum62_matrix[(aa2, aa1)] = score

# BLOSUM62 score kept only for class-crossing substitutions (same-class pairs
# are multiplied by 0.0).
distance_matrix_polar_apolar_blosum62 = {}
for pair, polar_apolar_value in distance_matrix_polar_apolar.items():
    # blosum62_matrix already holds both orientations, so one lookup suffices.
    # Fail loudly instead of reusing the score of the previous pair when a
    # residue pair is missing from BLOSUM62.
    if pair not in blosum62_matrix:
        raise KeyError(f'BLOSUM62 has no entry for residue pair {pair}')
    distance_matrix_polar_apolar_blosum62[pair] = polar_apolar_value * blosum62_matrix[pair]

In [186]:
# Write one FASTA file per designed PDB in the selected folder, containing the
# unified A and B sequences (substituted positions in lower case).
# NOTE(review): this is an absolute, machine-specific path -- consider deriving it
# from a configurable base directory.
selected_designed_sequences_folder = '/nfs/turbo/umms-ajitj/General_Geometry_Scan/Overview/9_Final_Analysis/13mer/13_r175'
os.makedirs(os.path.join(selected_designed_sequences_folder, 'Sequences'), exist_ok=True)
original_seq_path = 'original_N271_sequence.fa'
records = list(SeqIO.parse(original_seq_path, "fasta"))
# Only the first two records of the original FASTA are used; fewer than two raises IndexError.
original_seq = {records[0].id: records[0].seq, records[1].id: records[1].seq}

for filename in os.listdir(selected_designed_sequences_folder):
    if filename.endswith('.pdb'):
        print(f'Analyzing {filename}')
        designed_pdb_path = os.path.join(selected_designed_sequences_folder, filename)
        designed_seq = get_designed_sequence(designed_pdb_path)
        unified_sequence = get_new_unified_sequences(original_seq, designed_seq)
        # Output goes to <folder>/Sequences/<pdb name with .pdb replaced by .fa>.
        with open(os.path.join(selected_designed_sequences_folder, 'Sequences', filename.replace('.pdb', '.fa')), 'w') as f:
            for chain in unified_sequence:
                # One FASTA record per unified chain, headed by the chain label.
                f.write('>{}\n'.format(chain))
                f.write(unified_sequence[chain] + '\n')

Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank94.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank52.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank3.pdb
Checking chain A
Checking chain B




Analyzing N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank258.pdb
Checking chain A
Checking chain B




Analyzing N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank229.pdb
Checking chain A
Checking chain B




Analyzing N271_xtal_13_r175_rot205.68_180.58_353.66_score0.513_rank59_rank55.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank118.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank149_rank40.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot205.68_180.58_353.66_score0.513_rank115_rank94.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank41.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot357.94_219.84_196.18_score0.556_rank312_rank251.pdb
Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank258_rank24.pdb
Checking chain A
Checking chain B




Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank149_rank83.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank240_rank216.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot205.68_180.58_353.66_score0.513_rank115_rank317.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot205.68_180.58_353.66_score0.513_rank59_rank21.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank258_rank292.pdb
Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot357.94_219.84_196.18_score0.556_rank312_rank228.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank149_rank6.pdb




Checking chain A
Checking chain B
Analyzing N271_xtal_13_r175_rot30.39_46.12_3.83_score0.41_rank149_rank61.pdb
Checking chain A
Checking chain B




In [196]:
import os
import re
import pandas as pd
from Bio import SeqIO

def process_group(group):
    """
    Compare the designs of one geometry against that geometry's reference design.

    The rows are sorted by ascending 'rmsd' and reduced to one row per 'ID'
    (the lowest-rmsd row of each ID is the one kept). When more than one design
    remains, the design with the highest 'dG_separated' becomes the reference;
    nsym and radius are parsed from its ID to locate the FASTA files under
    '../9_Final_Analysis', and every other design is scored against it with
    compare_sequences_from_list.

    Args:
        group (pd.DataFrame): Rows of a single geometry; needs the columns
            'ID', 'rmsd' and 'dG_separated'.

    Returns:
        pd.DataFrame: For several designs, the de-duplicated rows plus a
        'PDB File' column, left-merged with the comparison scores. For a single
        design, the de-duplicated rows unchanged (no 'PDB File' column).
    """
    designs = group.sort_values(by='rmsd', ascending=True)
    print(f"First row rmsd: {designs.iloc[0]['rmsd']}")
    designs = designs.drop_duplicates(subset='ID')

    if len(designs) == 1:
        return designs
    if len(designs) < 1:
        return None

    reference_row = designs.loc[designs['dG_separated'].idxmax()]
    ref_id = reference_row['ID']

    # nsym is the first 1-2 digit field enclosed by underscores, radius follows "_r".
    nsym_match = re.search(r'_(\d{1,2})_', ref_id)
    radius_match = re.search(r'_r(\d+)', ref_id)
    sequences_dir = (
        f'../9_Final_Analysis/{nsym_match.group(1)}mer/'
        f'{nsym_match.group(1)}_r{radius_match.group(1)}/Sequences'
    )

    ref_pdb_path = f'{sequences_dir}/{ref_id}.fa'
    other_ids = designs.loc[designs['ID'] != ref_id, 'ID']
    comparison_pdbs = [f'{sequences_dir}/{design_id}.fa' for design_id in other_ids]

    comparison_results_df = compare_sequences_from_list(comparison_pdbs, ref_pdb_path)
    # 'PDB File' is the merge key: it matches the file basenames in the comparison table.
    designs['PDB File'] = designs['ID'] + '.fa'
    return designs.merge(comparison_results_df, on='PDB File', how='left')

# Run process_group on every geometry of the 13mer results and save each
# resulting table as ./ComparisonScores/<geometry>.csv.
scores_df_path = '../9_Final_Analysis/13mer/FINAL_AF_RESULTS.csv'
scores_df = pd.read_csv(scores_df_path)
scores_df = scores_df.sort_values(by='dG_separated', ascending=False)
scores_df_grouped = scores_df.groupby('geometry')

# geometry name -> table returned by process_group (kept for later inspection)
comparison_results_dict = {}
for geometry, group in scores_df_grouped:
    print(f"Processing Geometry: {geometry}")
    print(len(group))
    comparison_results = process_group(group)
    comparison_results_dict[geometry] = comparison_results
    # process_group only returns None when it falls through both of its branches.
    if comparison_results is not None:
        output_dir = './ComparisonScores'
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{geometry}.csv')
        comparison_results.to_csv(output_path, index=False)
        print(f"Saved results for Geometry: {geometry} to {output_path}")


Processing Geometry: N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451
4
First row rmsd: 1.2797678312617755
Saved results for Geometry: N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451 to ./ComparisonScores/N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451.csv
Processing Geometry: N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457
21
First row rmsd: 1.1017908802201046
Saved results for Geometry: N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457 to ./ComparisonScores/N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457.csv
Processing Geometry: N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43
1
First row rmsd: 2.0229487839564286
Saved results for Geometry: N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43 to ./ComparisonScores/N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43.csv
Processing Geometry: N271_xtal_13_r135_rot215.76_13.44_190.17_score0.548
4
First row rmsd: 2.372493098331497
Saved results for Geometry: N271_xtal_13_r135_rot215.76_13.44_190.17_score0.548 to ./Compari

In [188]:
# First non-null value of every column for each geometry group.
scores_df_grouped.first()

Unnamed: 0_level_0,ID,AF_rank,average_plddt,pred_tm_score,average_pae,rmsd,dG_separated,sc_value,dSASA_int,hydropathy_score,hydrophobic_percentage,cluster_label,geometry_score,n_contacts_geometry,total_geometry_score
geometry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451,N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451_rank223,2.0,83.681695,0.78,15.90002,1.455096,-53.59,0.672,2638.544,-0.92973,0.50967,4.0,0.451197,64.0,28.876618
N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457,N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457_rank198,1.0,83.109089,0.73,14.753313,1.433125,-58.935,0.626,2781.099,-0.268919,0.598216,8.0,0.457159,61.0,27.886694
N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43,N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43_rank231,2.0,82.651674,0.73,14.930838,2.022949,-57.784,0.629,2739.5,-0.794286,0.574574,4.0,0.429597,61.0,26.205403
N271_xtal_13_r135_rot215.76_13.44_190.17_score0.548,N271_xtal_13_r135_rot215.76_13.44_190.17_score0.548_rank222_rank147,4.0,85.877945,0.8,9.999456,2.446804,-90.061,0.644,3232.156,0.139474,0.694323,11.0,0.548143,60.0,32.888579
N271_xtal_13_r135_rot216.34_21.4_188.31_score0.501,N271_xtal_13_r135_rot216.34_21.4_188.31_score0.501_rank127,8.0,84.796081,0.72,11.687254,2.437338,-81.981,0.648,3098.408,-0.144186,0.701768,7.0,0.501261,84.0,42.105936
N271_xtal_13_r135_rot218.03_202.82_349.86_score0.527,N271_xtal_13_r135_rot218.03_202.82_349.86_score0.527_rank217_rank167,2.0,88.643104,0.82,14.331595,2.442734,-95.82,0.655,3431.444,-0.728395,0.58973,2.0,0.527089,65.0,34.260798
N271_xtal_13_r135_rot30.42_45.63_218.03_score0.502,N271_xtal_13_r135_rot30.42_45.63_218.03_score0.502_rank136,1.0,81.881144,0.73,14.819751,1.220681,-62.282,0.612,2741.44,-0.35,0.629521,5.0,0.501609,55.0,27.588469
N271_xtal_13_r135_rot33.35_22.9_355.82_score0.506,N271_xtal_13_r135_rot33.35_22.9_355.82_score0.506_rank169,1.0,84.514142,0.71,15.733737,2.116536,-54.377,0.606,2684.638,-1.28125,0.513026,4.0,0.505557,58.0,29.322316
N271_xtal_13_r135_rot33.69_171.76_1.63_score0.504,N271_xtal_13_r135_rot33.69_171.76_1.63_score0.504_rank189,1.0,87.067299,0.79,9.873461,2.334628,-67.091,0.62,3337.162,-1.274359,0.531772,11.0,0.50406,69.0,34.780131
N271_xtal_13_r135_rot35.49_344.79_172.39_score0.528,N271_xtal_13_r135_rot35.49_344.79_172.39_score0.528_rank65,1.0,85.503962,0.8,15.684667,1.832051,-77.081,0.632,3451.277,-0.28375,0.661203,2.0,0.528397,79.0,41.743375


In [195]:
def save_table_as_image(df, output_path):
    """
    Render a DataFrame as a matplotlib table and save it as an image.

    The figure is sized from the table shape (1.2 units of width per column,
    0.4 units of height per row), the axes are hidden, and the figure is closed
    after saving.

    Args:
        df (pd.DataFrame): Table to render; column names become the header row.
        output_path (str): Destination image path, saved at 300 dpi with a tight bounding box.
    """
    n_columns = len(df.columns)
    fig, ax = plt.subplots(figsize=(n_columns * 1.2, len(df) * 0.4))
    ax.axis('off')

    table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')
    # Fix the font size instead of letting matplotlib shrink it to fit.
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.auto_set_column_width(col=list(range(n_columns)))

    fig.savefig(output_path, bbox_inches='tight', dpi=300)
    plt.close(fig)

# Same per-geometry comparison as the CSV export, but each table is rendered to
# ../9_Final_Analysis/13mer/<geometry>.png with save_table_as_image.
scores_df_path = '../9_Final_Analysis/13mer/FINAL_AF_RESULTS.csv'
scores_df = pd.read_csv(scores_df_path)
scores_df = scores_df.sort_values(by='dG_separated', ascending=False)
scores_df_grouped = scores_df.groupby('geometry')

# geometry name -> table returned by process_group
comparison_results_dict = {}
for geometry, group in scores_df_grouped:
    print(f"Processing Geometry: {geometry}")
    print(len(group))
    comparison_results = process_group(group)
    comparison_results_dict[geometry] = comparison_results

    # Save them as images
    if comparison_results is not None:
        output_dir = '../9_Final_Analysis/13mer/'
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, f'{geometry}.png')
        save_table_as_image(comparison_results, output_path)
        print(f"Saved table image for Geometry: {geometry} to {output_path}")

Processing Geometry: N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451
4
First row rmsd: 1.2797678312617755
Saved table image for Geometry: N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451 to ../9_Final_Analysis/13mer/N271_xtal_13_r135_rot111.82_74.3_87.78_score0.451.png
Processing Geometry: N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457
21
First row rmsd: 1.1017908802201046
Saved table image for Geometry: N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457 to ../9_Final_Analysis/13mer/N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457.png
Processing Geometry: N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43
1
First row rmsd: 2.0229487839564286
Saved table image for Geometry: N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43 to ../9_Final_Analysis/13mer/N271_xtal_13_r135_rot213.06_161.95_178.24_score0.43.png
Processing Geometry: N271_xtal_13_r135_rot215.76_13.44_190.17_score0.548
4
First row rmsd: 2.372493098331497
Saved table image for Geometry: N271_xtal_13_r135_rot215.76

In [151]:
# Grab the first entry of comparison_results_dict that holds a table and keep it in `x`.
for key in comparison_results_dict.keys():
    try:
        print(len(comparison_results_dict[key]))
        x=comparison_results_dict[key]
        break
    except TypeError:
        # The entry has no length (e.g. process_group returned None): report and try the next key.
        # Only TypeError is caught so that real errors and Ctrl-C are not swallowed.
        print(1)

3


In [160]:
# Show full cell contents (no "..." truncation) when displaying DataFrames.
pd.set_option('display.max_colwidth', None) 

In [60]:
# Rebuild comparison_results_dict, this time storing and printing only the
# geometries for which process_group returned a table.
comparison_results_dict = {}
for geometry, group in scores_df_grouped:
    print(f"Processing Geometry: {geometry}")
    comparison_results = process_group(group)
    if comparison_results is not None:
        comparison_results_dict[geometry] = comparison_results
        print(comparison_results)

Processing Geometry: N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457
                                            PDB File Chain ID  \
0  N271_xtal_13_r135_rot210.71_158.22_13.76_score...        A   
1  N271_xtal_13_r135_rot210.71_158.22_13.76_score...        B   

   Hamming Distance  Polar-Apolar Distance  BLOSUM62 Score  \
0                26                   13.0           -14.0   
1                21                    7.0            11.0   

   Weighted BLOSUM62 Score  
0                    -23.0  
1                     -6.0  
Processing Geometry: N271_xtal_13_r135_rot216.34_21.4_188.31_score0.501
Processing Geometry: N271_xtal_13_r135_rot33.69_171.76_1.63_score0.504
Processing Geometry: N271_xtal_13_r135_rot35.49_344.79_172.39_score0.528
Processing Geometry: N271_xtal_13_r135_rot35.96_165.73_9.79_score0.527
Processing Geometry: N271_xtal_13_r135_rot60.85_121.69_40.56_score0.512
Processing Geometry: N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473
                        

In [41]:
# Scratch cell: load the 13mer scores, group them by geometry, and sketch the
# per-geometry comparison that process_group implements.
# Read df
scores_df_path = '../9_Final_Analysis/13mer/FINAL_AF_RESULTS.csv'
scores_df = pd.read_csv(scores_df_path)
scores_df = scores_df.sort_values(by='dG_separated', ascending=False)
scores_df_grouped = scores_df.groupby('geometry')

# Plan: for each geometry do the following, written as a function that receives the group.
for geometry, group in scores_df_grouped:
    print(f"Geometry: {geometry}")
    print(group)
# 1. Drop duplicates based on ID. If more than one row is left, take the ID of the row
#    with the highest dG_separated; that design is the reference.
# 2. nsym is the first number in the ID that is followed by "_"; radius is the number
#    between "_r" and the next "_".
# 3. Reference path: ../9_Final_Analysis/{nsym}mer/{nsym}_r{radius}/Sequences/{ID}.fa
# 4. Build the paths of the remaining rows the same way; these are the comparison files.
# 5. Call compare_sequences_from_list on them to get comparison_results_df.

# NOTE(review): comparison_pdbs is not defined in this cell; it relies on a value left
# in the kernel by another cell, so this fails on a fresh Restart & Run All.
# Get the per-chain sequences for the given PDB files
total_unified_sequences = {}
for pdb_path in comparison_pdbs:
    print(f'Analyzing {pdb_path}')
    designed_seq = get_designed_sequence(pdb_path)
    total_unified_sequences[pdb_path] = designed_seq

# Compare sequences
# NOTE(review): compare_sequences_from_list iterates its first argument (here the dict's
# keys, i.e. PDB paths) and parses each path, and the reference path, as FASTA. The
# sequences stored as dict values are never used -- confirm this call is still wanted.
reference_pdb_path = comparison_pdbs[0]
comparison_results = compare_sequences_from_list(total_unified_sequences, reference_pdb_path)
comparison_results.to_csv('comparison_results_specific.csv', index=False)
print(comparison_results)

In [54]:
# Display the group left in `group` by the most recently run groupby loop.
group

Unnamed: 0,ID,AF_rank,average_plddt,pred_tm_score,average_pae,rmsd,geometry,dG_separated,sc_value,dSASA_int,hydropathy_score,hydrophobic_percentage,cluster_label
0,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,3.0,84.069534,0.81,14.850354,1.43178,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,-82.52,0.652,3055.366,-0.35,0.635118,8.0
1,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,3.0,86.156801,0.8,14.957955,1.688193,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,-72.394,0.668,2874.989,-0.385714,0.592373,8.0
2,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,1.0,86.973358,0.81,14.627886,1.748605,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,-72.394,0.668,2874.989,-0.385714,0.592373,8.0
4,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,2.0,86.164142,0.8,14.91011,1.78828,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,-72.394,0.668,2874.989,-0.385714,0.592373,8.0
3,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,5.0,85.879555,0.78,14.652394,1.763339,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,-72.394,0.668,2874.989,-0.385714,0.592373,8.0
5,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,6.0,85.378136,0.78,14.852485,1.845395,N271_xtal_13_r135_rot210.71_158.22_13.76_score...,-72.394,0.668,2874.989,-0.385714,0.592373,8.0


In [47]:
# Print only the first geometry group; the break stops after one iteration.
for geometry, group in scores_df_grouped:
    print(f"Geometry: {geometry}")
    print(group)
    break

Geometry: N271_xtal_13_r135_rot210.71_158.22_13.76_score0.457
                                                  ID  AF_rank  average_plddt  \
0  N271_xtal_13_r135_rot210.71_158.22_13.76_score...      3.0      84.069534   
1  N271_xtal_13_r135_rot210.71_158.22_13.76_score...      3.0      86.156801   
2  N271_xtal_13_r135_rot210.71_158.22_13.76_score...      1.0      86.973358   
4  N271_xtal_13_r135_rot210.71_158.22_13.76_score...      2.0      86.164142   
3  N271_xtal_13_r135_rot210.71_158.22_13.76_score...      5.0      85.879555   
5  N271_xtal_13_r135_rot210.71_158.22_13.76_score...      6.0      85.378136   

   pred_tm_score  average_pae      rmsd  \
0           0.81    14.850354  1.431780   
1           0.80    14.957955  1.688193   
2           0.81    14.627886  1.748605   
4           0.80    14.910110  1.788280   
3           0.78    14.652394  1.763339   
5           0.78    14.852485  1.845395   

                                            geometry  dG_separated  sc_valu

In [43]:
# Displays only the GroupBy object's repr, not the grouped data.
scores_df_grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1495d2b4d2e0>

In [None]:
# Step 2: Compare specific sequences
# Three designs of the same 13_r175 geometry; the first one is used as the reference.
comparison_pdbs = [
    './13mer/13_r175/N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank118.pdb',
    './13mer/13_r175/N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank258.pdb',
    './13mer/13_r175/N271_xtal_13_r175_rot172.39_304.23_202.82_score0.473_rank229.pdb'
]

# Get the per-chain sequences for the given PDB files
total_unified_sequences = {}
for pdb_path in comparison_pdbs:
    print(f'Analyzing {pdb_path}')
    designed_seq = get_designed_sequence(pdb_path)
    total_unified_sequences[pdb_path] = designed_seq

# Compare sequences
# NOTE(review): compare_sequences_from_list iterates its first argument (here the dict's
# keys, i.e. the .pdb paths) and parses each path, and the reference path, as FASTA. The
# sequences stored as dict values are never used -- confirm this call is still wanted.
reference_pdb_path = comparison_pdbs[0]
comparison_results = compare_sequences_from_list(total_unified_sequences, reference_pdb_path)
comparison_results.to_csv('comparison_results_specific.csv', index=False)
print(comparison_results)

In [9]:
import os
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

def load_filtered_results(base_paths, filename="filtered_AF_results.csv"):
    """
    Load every filtered-results CSV found below the given directories.

    The files are read without promoting the first column to the index:
    filter_results writes them with index=False, so the first column is 'ID',
    and reading with index_col=0 followed by concat(ignore_index=True) would
    drop it. Stray index columns from files that were saved with an index
    (named 'Unnamed: ...' by pandas) are removed instead.

    Args:
        base_paths (list): Directories to search recursively.
        filename (str): Name of the CSV file to collect.

    Returns:
        list: One DataFrame per file found, in os.walk order. Empty when no
        file is found or a base path does not exist.
    """
    frames = []
    for base_path in base_paths:
        for root, _dirs, files in os.walk(base_path):
            if filename not in files:
                continue
            frame = pd.read_csv(os.path.join(root, filename))
            stray_index_columns = [col for col in frame.columns if str(col).startswith('Unnamed:')]
            frames.append(frame.drop(columns=stray_index_columns))
    return frames


base_paths = ["../7_Alphafold_Predictions", "../4_Alphafold_Predictions"]
all_data = load_filtered_results(base_paths)

if all_data:
    merged_df = pd.concat(all_data, ignore_index=True)
    merged_df.to_csv("../8_Final_Selection/FINAL_AF_RESULTS.csv", index=False)
else:
    # pd.concat raises on an empty list; report the real problem instead.
    print(f"No filtered_AF_results.csv found under {base_paths}; nothing was written.")