# Imports #
Import all packages and project code needed to complete the analyses.

In [1]:
import os
import re
import sys
import requests
import pandas as pd
from time import time, sleep
# %matplotlib inline
import seaborn as sns
import matplotlib as mpl
from matplotlib import lines
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
import xml.etree.ElementTree as XMLET
from Bio import Entrez
from Bio.Phylo.TreeConstruction import DistanceMatrix
# Load global variables from project file
from dotenv import find_dotenv, load_dotenv
try:
    dotenv_path = find_dotenv(raise_error_if_not_found=True)
except IOError:
    dotenv_path = find_dotenv(raise_error_if_not_found=True, usecwd=True)
load_dotenv(dotenv_path)
# Add the project path to the python path so the required classes can be imported
sys.path.append(os.path.join(os.environ.get('PROJECT_PATH'), 'src'))
sys.path.append(os.path.join(os.environ.get('PROJECT_PATH'), 'src', 'SupportingClasses'))
# Import lab internal packages
from SupportingClasses.SeqAlignment import SeqAlignment
from SupportingClasses.PhylogeneticTree import PhylogeneticTree
from SupportingClasses.DCAWrapper import DCAWrapper
from SupportingClasses.EVCouplingsWrapper import EVCouplingsWrapper
from Evaluation.ContactScorer import ContactScorer
from Evaluation.SinglePositionScorer import SinglePositionScorer
from EvolutionaryTrace import EvolutionaryTrace
# from SupportingClasses.DataSetGenerator import parse_uniref_lineage
# Set the e-mail variable for Entrez for parsing later on
Entrez.email = os.environ.get('EMAIL')
output_dir = '/media/daniel/ExtraDrive1/Results/CovET_Final_Results'

# Visualizing Trees #
For the three proteins analyzed in depth, the phylogenetic tree computed for the CovET predictions is loaded and colored either by species or by molecular classification.

In [None]:
# Define function to annotate the molecular classifications of GPCRs
def get_uniprot_prints_annotations(acc_id):
    """
    Retrieve the PRINTS cross-references recorded for a sequence in UniProtKB.

    Args:
        acc_id (str): Sequence identifier. Only the text after the final '_' is used as the query term
        (e.g. 'UniRef90_P14416' is queried as 'P14416').
    Returns:
        list: One (PRINTS ID, PRINTS entry name) tuple for every 'entry name' property found under a PRINTS
        dbReference of an entry in the response. Empty if the response holds no such references.
    Raises:
        ConnectionError: If the HTTP response does not report success.
        xml.etree.ElementTree.ParseError: If the response body cannot be parsed as XML.
    """
    acc = acc_id.split('_')[-1]
    BASE = 'https://www.uniprot.org'
    KB_ENDPOINT = '/uniprot/'
    payload = {'query': acc, 'format': 'xml'}
    result = requests.get(BASE + KB_ENDPOINT, params=payload)
    if not result.ok:
        # Previously this path fell through to the return statement with print_annotations unbound.
        raise ConnectionError(f'Did not get a successful response (status code {result.status_code}).')
    xml_root = XMLET.fromstring(result.text)
    prints_elements = xml_root.findall("{http://uniprot.org/uniprot}entry/{http://uniprot.org/uniprot}dbReference[@type='PRINTS']")
    print_annotations = []
    for p_e in prints_elements:
        print_id = p_e.get('id')
        entry_names = p_e.findall("{http://uniprot.org/uniprot}property[@type='entry name']")
        for entry_name in entry_names:
            print_annotations.append((print_id, entry_name.get('value')))
    return print_annotations

def get_uniparc_prints_annotations(acc_id):
    """
    Retrieve the PRINTS signature matches recorded for a sequence in UniParc.

    Args:
        acc_id (str): Sequence identifier. Only the text after the final '_' is used as the query term.
    Returns:
        list: One (PRINTS ID, InterPro name) tuple for every 'ipr' element found under a PRINTS
        signatureSequenceMatch of an entry in the response. Empty if the response holds no such matches.
    Raises:
        ConnectionError: If the HTTP response does not report success.
        xml.etree.ElementTree.ParseError: If the response body cannot be parsed as XML.
    """
    acc = acc_id.split('_')[-1]
    BASE = 'https://www.uniprot.org'
    KB_ENDPOINT = '/uniparc/'
    payload = {'query': acc, 'format': 'xml'}
    result = requests.get(BASE + KB_ENDPOINT, params=payload)
    if not result.ok:
        # Previously this path fell through to the return statement with print_annotations unbound.
        raise ConnectionError(f'Did not get a successful response (status code {result.status_code}).')
    xml_root = XMLET.fromstring(result.text)
    prints_elements = xml_root.findall("{http://uniprot.org/uniparc}entry/{http://uniprot.org/uniparc}signatureSequenceMatch[@database='PRINTS']")
    print_annotations = []
    for p_e in prints_elements:
        print_id = p_e.get('id')
        entry_names = p_e.findall("{http://uniprot.org/uniparc}ipr")
        for entry_name in entry_names:
            print_annotations.append((print_id, entry_name.get('name')))
    return print_annotations

# D2R
d2r_tree_dir = os.path.join(output_dir, 'Trees', 'D2R')
os.makedirs(d2r_tree_dir, exist_ok=True)

query = '2ddr'
d2r_aln_file_path = '/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Domain_Specific_Min_ID_40/Output/ID40/U90/2ddr/ET-MEMER/Non-Gapped_Alignment.fa'
d2r_aln = SeqAlignment(file_name=d2r_aln_file_path, query_id=query)
d2r_aln.import_alignment()

# Attempt to annotate each of the sequences in the alignment, reusing any annotations saved by a previous run
d2r_annotations_path = os.path.join(d2r_tree_dir, 'Saved_Classification_Annotations.tsv')

if os.path.isfile(d2r_annotations_path):
    d2r_annotations_df = pd.read_csv(d2r_annotations_path, header=0, index_col=None, sep='\t')
    d2r_annotations = d2r_annotations_df.to_dict('series')
    d2r_annotations = {k: list(v) for k, v in d2r_annotations.items()}
    print(f"Loaded: {len(set(d2r_annotations['Sequence ID']))} ID annotations")
else:
    d2r_annotations = {'Sequence ID': [], 'PRINTS ID': [], 'PRINTS Name': []}
counter = 0
start = time()
to_annotate = set(d2r_aln.seq_order) - set(d2r_annotations['Sequence ID'])
print(to_annotate)
for s_id in to_annotate:
    print('SEQ ID: ', s_id)
    if counter % 10 == 0:
        # Checkpoint the annotations gathered so far so an interrupted run can be resumed
        print(f'Seq ID {counter}: {time() - start}sec')
        pd.DataFrame(d2r_annotations).to_csv(d2r_annotations_path, header=True, index=False, sep='\t')
    counter += 1
    # The query is named by its PDB ID in the alignment, so it is looked up by accession instead but
    # always recorded under its alignment ID (s_id).
    lookup_id = 'P14416' if s_id == query else s_id
    # Retry until one of the lookups yields a result. Recording happens only after the loop so that a
    # dropped connection can never record stale or undefined annotations.
    curr_annotations = None
    while curr_annotations is None:
        try:
            curr_annotations = get_uniprot_prints_annotations(lookup_id)
        except ConnectionResetError:
            sleep(1)
        except XMLET.ParseError:
            # UniProtKB returned nothing parseable, fall back to UniParc
            try:
                curr_annotations = get_uniparc_prints_annotations(lookup_id)
            except XMLET.ParseError:
                curr_annotations = [('UNKNOWN', None)]
            except ConnectionResetError:
                sleep(1)
    if len(curr_annotations) == 0:
        curr_annotations = [('UNKNOWN', None)]
    for annot in curr_annotations:
        d2r_annotations['Sequence ID'].append(s_id)
        d2r_annotations['PRINTS ID'].append(annot[0])
        d2r_annotations['PRINTS Name'].append(annot[1])
d2r_annotations_df = pd.DataFrame(d2r_annotations)
if len(to_annotate) > 0:
    d2r_annotations_df.to_csv(d2r_annotations_path, header=True, index=False, sep='\t')
    
# Sequences for which no PRINTS classification could be retrieved
unannotated_seq_ids = d2r_annotations_df.loc[d2r_annotations_df['PRINTS ID'] == 'UNKNOWN', 'Sequence ID'].unique()
unannotated_dict = {x: 'UNKNOWN' for x in unannotated_seq_ids}

# Copy the selection so the helper column is added to an independent frame rather than to a slice of
# d2r_annotations_df (which triggers pandas' chained-assignment warning).
annotated_seq_ids = d2r_annotations_df.loc[d2r_annotations_df['PRINTS ID'] != 'UNKNOWN', :].copy()
# Numeric part of the PRINTS ID (the text after its first two characters), used to order the annotations
annotated_seq_ids['ID Integer'] = annotated_seq_ids['PRINTS ID'].apply(lambda x: int(x[2:]))

groups = annotated_seq_ids.sort_values(by='ID Integer').groupby('Sequence ID')
# Level i maps each sequence to its (i + 1)-th PRINTS ID in ascending numeric order. Sequences with
# fewer annotations are simply absent from the higher levels.
annotation_levels = {x: {} for x in range(3)}
annotation_counts = {x: {} for x in range(3)}
for group, curr_group in groups:
    for i, label in enumerate(curr_group['PRINTS ID'].iloc[:3]):
        annotation_levels[i][group] = label
        annotation_counts[i][label] = annotation_counts[i].get(label, 0) + 1

# Load the tree used in the D2R analyses
d2r_tree_file_path = '/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Domain_Specific_Min_ID_40/Output/ID40/U90/2ddr/ET-MEMER/2ddr_ET_blosum62_dist_et_tree.nhx'
d2r_tree = PhylogeneticTree(tree_building_method='custom', tree_building_args={'tree_path': d2r_tree_file_path})
d2r_tree.construct_tree(dm=DistanceMatrix(names=d2r_aln.seq_order))

# --- Least specific classification: complete the annotations, tally them, render the tree, save the colors ---
completed_least_specific_annotations, completed_least_specific_annotations_df = d2r_tree.annotate_incomplete_tree(annotation_levels[0])
completed_least_specific_annotations_df.to_csv(os.path.join(d2r_tree_dir, 'Least_Specific_Annotation_Completion.tsv'),
                                               sep='\t', header=True, index=False)
curr_annotation_counts = {}
for seq_id in completed_least_specific_annotations:
    label = completed_least_specific_annotations[seq_id]
    curr_annotation_counts[label] = curr_annotation_counts.get(label, 0) + 1
_, color_dict1 = d2r_tree.visualize_tree(query='2ddr', out_dir=d2r_tree_dir, id_categories=completed_least_specific_annotations,
                                         filename='Least_Specific_Classification.png')
# color_lists1[0] holds the categories and color_lists1[1] the matching colors
color_lists1 = [list(x) for x in zip(*color_dict1.items())]
least_specific_coloring_df = pd.DataFrame({'Category': color_lists1[0], 'Colors': color_lists1[1],
                                           'Counts': [curr_annotation_counts[x] for x in color_lists1[0]]})
least_specific_coloring_df.to_csv(os.path.join(d2r_tree_dir, 'Least_Specific_Coloring.tsv'), sep='\t', header=True, index=False)

# --- Intermediate specificity classification ---
completed_middle_specific_annotations, completed_middle_specific_annotations_df = d2r_tree.annotate_incomplete_tree(annotation_levels[1])
completed_middle_specific_annotations_df.to_csv(os.path.join(d2r_tree_dir, 'Middle_Specific_Annotation_Completion.tsv'),
                                                sep='\t', header=True, index=False)
curr_annotation_counts = {}
for seq_id in completed_middle_specific_annotations:
    label = completed_middle_specific_annotations[seq_id]
    curr_annotation_counts[label] = curr_annotation_counts.get(label, 0) + 1
_, color_dict2 = d2r_tree.visualize_tree(query='2ddr', out_dir=d2r_tree_dir, id_categories=completed_middle_specific_annotations,
                                         filename='Middle_Specific_Classification.png')
color_lists2 = [list(x) for x in zip(*color_dict2.items())]
middle_specific_coloring_df = pd.DataFrame({'Category': color_lists2[0], 'Colors': color_lists2[1],
                                            'Counts': [curr_annotation_counts[x] for x in color_lists2[0]]})
middle_specific_coloring_df.to_csv(os.path.join(d2r_tree_dir, 'Middle_Specific_Coloring.tsv'), sep='\t', header=True, index=False)

# --- Most specific classification (its colors and counts are left in scope after this section) ---
completed_most_specific_annotations, completed_most_specific_annotations_df = d2r_tree.annotate_incomplete_tree(annotation_levels[2])
completed_most_specific_annotations_df.to_csv(os.path.join(d2r_tree_dir, 'Most_Specific_Annotation_Completion.tsv'),
                                              sep='\t', header=True, index=False)
curr_annotation_counts = {}
for seq_id in completed_most_specific_annotations:
    label = completed_most_specific_annotations[seq_id]
    curr_annotation_counts[label] = curr_annotation_counts.get(label, 0) + 1
_, color_dict3 = d2r_tree.visualize_tree(query='2ddr', out_dir=d2r_tree_dir, id_categories=completed_most_specific_annotations,
                                         filename='Most_Specific_Classification.png')
color_lists3 = [list(x) for x in zip(*color_dict3.items())]
most_specific_coloring_df = pd.DataFrame({'Category': color_lists3[0], 'Colors': color_lists3[1],
                                          'Counts': [curr_annotation_counts[x] for x in color_lists3[0]]})
most_specific_coloring_df.to_csv(os.path.join(d2r_tree_dir, 'Most_Specific_Coloring.tsv'), sep='\t', header=True, index=False)
# Render a legend to go with this tree
def f(m, c):
    # Empty plot whose only purpose is to provide a legend handle with marker m and color c
    return plt.plot([], [], marker=m, color=c, ls="none")[0]


handles = [f("s", color) for color in color_lists3[1]]
# Each label is the category followed by the number of sequences assigned to it
labels = [f'{category}({curr_annotation_counts[category]})' for category in color_lists3[0]]
# Handles and labels are passed explicitly: a single positional argument is interpreted by matplotlib
# as a list of labels, not handles.
legend = plt.legend(handles=handles, labels=labels, loc=3, framealpha=1, frameon=True, ncol=4, edgecolor='black',
                    facecolor='none')
fig = legend.figure
plt.axis('off')
fig.canvas.draw()
fig.savefig(os.path.join(d2r_tree_dir, 'Most_Specific_Legend.png'), dpi=500, bbox_inches='tight', transparent=True)

In [None]:
# Define function to retrieve, lookup, and annotate the lineage of each sequence ID
def retrieve_and_parse_lineage_ncbi(lineage_id):
    """
    Fetch a record from the NCBI taxonomy database and summarize its lineage.

    Args:
        lineage_id: The NCBI taxonomy ID to fetch (passed to Entrez.efetch as 'id').
    Returns:
        list: Scientific names of the taxa in the extended lineage (in document order) followed by the
        scientific name of the fetched taxon itself.
        dict: Maps each rank to a (TaxId, scientific name) tuple, or to a list of such tuples when
        several taxa in the lineage share the same rank. The fetched taxon is included under its own rank.
    Raises:
        ValueError: If an entry already stored for a rank is neither a tuple nor a list.
    """
    handle = Entrez.efetch(db='taxonomy', rettype='gp', retmode='xml', id=lineage_id)
    try:
        taxa_set = XMLET.parse(handle).getroot()
    finally:
        handle.close()
    taxa_desc = {}
    taxa_list = []
    for taxon in taxa_set.findall("Taxon/LineageEx/Taxon"):
        sci_name = taxon.find('ScientificName').text
        rank = taxon.find('Rank').text
        _record_taxon(taxa_desc, rank, taxon.find('TaxId').text, sci_name)
        taxa_list.append(sci_name)
    # The fetched taxon itself is not part of LineageEx, so it is added last using its own text values.
    final_taxon = taxa_set.find('Taxon')
    if final_taxon is not None:
        final_sci_name = final_taxon.findtext('ScientificName')
        _record_taxon(taxa_desc, final_taxon.findtext('Rank'), final_taxon.findtext('TaxId'), final_sci_name)
        taxa_list.append(final_sci_name)
    return taxa_list, taxa_desc


def _record_taxon(taxa_desc, rank, tax_id, sci_name):
    """Store (tax_id, sci_name) under rank in taxa_desc, collecting repeated ranks in a list."""
    entry = (tax_id, sci_name)
    if rank not in taxa_desc:
        taxa_desc[rank] = entry
    elif isinstance(taxa_desc[rank], tuple):
        taxa_desc[rank] = [taxa_desc[rank], entry]
    elif isinstance(taxa_desc[rank], list):
        taxa_desc[rank].append(entry)
    else:
        raise ValueError('Strange value encountered.')

def get_uniprot_lineage(acc_id):
    """
    Look up a sequence in UniProtKB and return the lineage of the organism it is assigned to.

    Args:
        acc_id (str): Sequence identifier. Only the text after the final '_' is used as the query term.
    Returns:
        The value returned by retrieve_and_parse_lineage_ncbi for the NCBI taxonomy ID found in the response.
    Raises:
        ConnectionError: If the HTTP response does not report success.
        AssertionError: If the response does not contain exactly one NCBI Taxonomy reference.
    """
    accession = acc_id.split('_')[-1]
    response = requests.get('https://www.uniprot.org/uniprot/', params={'query': accession, 'format': 'xml'})
    if not response.ok:
        raise ConnectionError('Did not get a successful response.')
    ns = '{http://uniprot.org/uniprot}'
    taxonomy_refs = XMLET.fromstring(response.text).findall(
        f"{ns}entry/{ns}organism/{ns}dbReference[@type='NCBI Taxonomy']")
    assert len(taxonomy_refs) == 1
    return retrieve_and_parse_lineage_ncbi(taxonomy_refs[0].get('id'))
        
def get_uniparc_lineage(acc_id):
    """
    Look up a sequence in UniParc and return the lineage of the first organism referenced for it.

    Args:
        acc_id (str): Sequence identifier. Only the text after the final '_' is used as the query term.
    Returns:
        The value returned by retrieve_and_parse_lineage_ncbi for the first 'NCBI_taxonomy_id' property
        found in the response.
    Raises:
        ConnectionError: If the HTTP response does not report success.
    """
    accession = acc_id.split('_')[-1]
    response = requests.get('https://www.uniprot.org/uniparc/', params={'query': accession, 'format': 'xml'})
    if not response.ok:
        raise ConnectionError('Did not get a successful response.')
    ns = '{http://uniprot.org/uniparc}'
    taxonomy_property = XMLET.fromstring(response.text).find(
        f"{ns}entry/{ns}dbReference/{ns}property[@type='NCBI_taxonomy_id']")
    return retrieve_and_parse_lineage_ncbi(taxonomy_property.get('value'))


# For each protein: annotate every sequence in its alignment with its taxonomic lineage (reusing
# annotations saved by a previous run), then render the tree colored by each sufficiently complete level.
for protein, query, query_uniprot in [('WW', '1yap', 'UniRef90_P46937'), ('RRM', '2rrm', 'UniRef90_P04147'), ('D2R', '2ddr', 'UniRef90_P14416')]:
    print('QUERY: ', query)
    tree_dir = os.path.join(output_dir, 'Trees', protein)
    os.makedirs(tree_dir, exist_ok=True)

    aln_file_path = f'/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Domain_Specific_Min_ID_40/Output/ID40/U90/{query}/ET-MEMER/Non-Gapped_Alignment.fa'
    aln = SeqAlignment(file_name=aln_file_path, query_id=query)
    aln.import_alignment()

    # Attempt to annotate each of the sequences in the alignment
    annotations_path = os.path.join(tree_dir, 'Saved_Annotations.tsv')

    if os.path.isfile(annotations_path):
        annotations_df = pd.read_csv(annotations_path, header=0, index_col=None, sep='\t')
        annotations = annotations_df.to_dict('series')
        annotations = {k: list(v) for k, v in annotations.items()}
        print(f"Loaded: {len(set(annotations['Sequence ID']))} ID annotations")
    else:
        annotations = {'Sequence ID': []}

    counter = 0
    start = time()
    to_annotate = set(aln.seq_order) - set(annotations['Sequence ID'])
    # Taxonomic levels (columns) encountered so far
    max_desc = set(annotations.keys()) - {'Sequence ID'}
    for s_id in to_annotate:
        if counter % 10 == 0:
            # Checkpoint the annotations gathered so far so an interrupted run can be resumed
            print(f'Seq ID {counter}: {time() - start}sec')
            pd.DataFrame(annotations).to_csv(annotations_path, header=True, index=False, sep='\t')
        counter += 1
        # The query is named by its PDB ID in the alignment, so it is looked up by its UniRef ID instead
        if s_id == query:
            s_id = query_uniprot
        try:
            _, lin_desc = get_uniprot_lineage(s_id)
        except XMLET.ParseError:
            try:
                _, lin_desc = get_uniparc_lineage(s_id)
            except XMLET.ParseError:
                lin_desc = {}
        curr_desc_levels = set(lin_desc.keys())
        missing_levels = curr_desc_levels - max_desc
        # Register every level not seen before (even a single one) and back-fill the sequences already
        # annotated with None so that all columns keep the same length.
        for desc_level in missing_levels:
            annotations[desc_level] = [None] * len(annotations['Sequence ID'])
        max_desc |= missing_levels
        for desc_level in max_desc:
            level_value = lin_desc.get(desc_level)
            if isinstance(level_value, tuple):
                annotations[desc_level].append(level_value[1])
            elif isinstance(level_value, list):
                # Several taxa share this rank, keep all of their names
                annotations[desc_level].append(','.join([x[1] for x in level_value]))
            else:
                # Level absent for this sequence (or of an unexpected type): keep the column aligned
                annotations[desc_level].append(None)
        annotations['Sequence ID'].append(s_id)
        if s_id == query_uniprot:
            # Record the same lineage under the query's alignment ID as well
            annotations['Sequence ID'].append(query)
            for desc_level in max_desc:
                annotations[desc_level].append(annotations[desc_level][-1])
    print(f'The max number of descriptive levels is: {len(max_desc)}')
    annotations_df = pd.DataFrame(annotations)
    if len(to_annotate) > 0:
        annotations_df.to_csv(annotations_path, header=True, index=False, sep='\t')

    tree_file_path = f'/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Domain_Specific_Min_ID_40/Output/ID40/U90/{query}/ET-MEMER/{query}_ET_blosum62_dist_et_tree.nhx'
    tree = PhylogeneticTree(tree_building_method='custom', tree_building_args={'tree_path': tree_file_path})
    tree.construct_tree(dm=DistanceMatrix(names=aln.seq_order))

    # Only levels annotated for more than 75% of the sequences are rendered. The columns are selected
    # with a list because pandas does not accept a set as an indexer.
    level_columns = [c for c in annotations_df.columns if c not in ('Sequence ID', 'no rank')]
    percent_complete = 1 - (annotations_df[level_columns].isna().sum() / len(annotations_df))
    columns_to_render = percent_complete.index[percent_complete > 0.75]
    final_annotations_df = annotations_df.fillna('UNKNOWN')

    annotation_dict = {c: {} for c in columns_to_render}
    counts = {c: {} for c in columns_to_render}
    for ind, row in final_annotations_df.iterrows():
        for col in columns_to_render:
            # A comma marks a sequence whose lineage holds several taxa at this level
            category = 'AMBIGUOUS' if ',' in row[col] else row[col]
            annotation_dict[col][row['Sequence ID']] = category
            counts[col][category] = counts[col].get(category, 0) + 1

    for taxa_level in columns_to_render:
        _, color_dict = tree.visualize_tree(query=query, out_dir=tree_dir, id_categories=annotation_dict[taxa_level], filename=f'Taxa_Level_{taxa_level}.png')
        color_lists = [list(x) for x in zip(*color_dict.items())]
        pd.DataFrame({'Category': color_lists[0], 'Colors': color_lists[1],
                      'Counts': [counts[taxa_level][x] for x in color_lists[0]]}).to_csv(
            os.path.join(tree_dir, f'Taxa_Level_{taxa_level}.tsv'), header=True, index=False, sep='\t')
        if protein in ['RRM', 'WW'] and taxa_level == 'phylum':
            # Render a legend to go with this tree
            f = lambda m, c: plt.plot([], [], marker=m, color=c, ls="none")[0]
            handles = [f("s", color) for color in color_lists[1]]
            labels = [f'{category}({counts[taxa_level][category]})' for category in color_lists[0]]
            # Handles and labels are passed explicitly: a single positional argument is interpreted by
            # matplotlib as a list of labels, not handles.
            legend = plt.legend(handles=handles, labels=labels, loc=3, framealpha=1, frameon=True, ncol=4,
                                edgecolor='black', facecolor='none')
            fig = legend.figure
            plt.axis('off')
            fig.canvas.draw()
            fig.savefig(os.path.join(tree_dir, f'Taxa_Level_{taxa_level}_Legend.png'), dpi=500, bbox_inches='tight', transparent=True)

# Setting variables:
## For all proteins analyzed in this study
Tracking not just the query and PDB but also the location for the input data.

In [2]:
small_data_set_dir = '/media/daniel/ExtraDrive1/Results/Newest_Covariation/Input'
# Each entry is (query / PDB ID, chain). The PDB file of an entry lives in a sub-directory named after the
# two middle characters of its ID.
# 2ysd is a duplicate of 1yap but with an alignment based on the structure instead of the domain reference
# sequence, so it has been removed from the analyses. 2zxe has not been completed. Neither is listed.
small_data_set_chains = [
    ('1c0k', 'A'), ('7hvp', 'A'), ('2b59', 'B'), ('206l', 'A'), ('1bol', 'A'),
    ('1jwl', 'A'), ('3q05', 'A'), ('2z0e', 'A'), ('2rh1', 'A'), ('4lli', 'A'),
    ('1h1v', 'G'), ('1c17', 'A'), ('1a26', 'A'), ('135l', 'A'), ('3b6v', 'A'),
    ('1hck', 'A'), ('3tnu', 'A'), ('2wer', 'A'), ('1axb', 'A'), ('4ycu', 'A'),
]
small_data_set = {
    pdb_id: {'query': pdb_id, 'pdb': pdb_id, 'chain': chain,
             'pdb_path': os.path.join(small_data_set_dir, 'PDB', pdb_id[1:3], f'pdb{pdb_id}.ent'),
             'aln_path': os.path.join(small_data_set_dir, 'Final_Alignments', f'{pdb_id}.fasta')}
    for pdb_id, chain in small_data_set_chains
}
allosteric_pdb_dir = '/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Domain_Specific_Min_ID_40/Data_Sets/Shared/PDBs/'
# Each entry is (query ID, PDB ID of the structure used for it); all of them use chain A
allosteric_query_structures = [('2ddr', '6cm4'), ('2rrm', '4f02'), ('1yap', '4rex')]
allosteric_data_set = {
    query_id: {'query': query_id, 'pdb': pdb_id, 'chain': 'A',
               'pdb_path': os.path.join(allosteric_pdb_dir, f'{pdb_id}.pdb'),
               'aln_path': f'/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Output/U90/{query_id}/ET-MEMER/Non-Gapped_Alignment.fa'}
    for query_id, pdb_id in allosteric_query_structures
}
# Combined view over both data sets (the per-protein dictionaries are shared, not copied)
all_data = {**small_data_set, **allosteric_data_set}

## For all methods applied in this study
Tracking the classes needed and the parameters required to execute each one.

In [3]:
# Settings shared by every EvolutionaryTrace based method
general_et_settings = {'polymer_type': 'Protein', 'et_distance': True, 'distance_model': 'blosum62', 'tree_building_method': 'et',
                       'tree_building_options': {}, 'position_type': 'pair', 'gap_correction': None,
                       'output_files': {'original_aln', 'non_gap_aln', 'tree', 'scores'}, 'processors': 10, 'low_memory': True}
# Scoring metric used by each EvolutionaryTrace based method, in the order the methods are registered
et_scoring_metrics = {
    'ET-MIp': 'filtered_average_product_corrected_mutual_information',

    'ET-MCM-': 'match_count',
    'ET-M-MC': 'mismatch_count',
    'ET-MCMCR': 'match_mismatch_count_ratio',
    'ET-MCMCA': 'match_mismatch_count_angle',

    'ET-MEM-': 'match_entropy',
    'ET-M-ME': 'mismatch_entropy',
    'ET-MEMER': 'match_mismatch_entropy_ratio',
    'ET-MEMEA': 'match_mismatch_entropy_angle',

    'ET-MDM-': 'match_diversity',
    'ET-M-MD': 'mismatch_diversity',
    'ET-MDMDR': 'match_mismatch_diversity_ratio',
    'ET-MDMDA': 'match_mismatch_diversity_angle',

    'ET-MDMER': 'match_diversity_mismatch_entropy_ratio',
    'ET-MDMEA': 'match_diversity_mismatch_entropy_angle',

    'rvET': 'plain_entropy',
}
# The 'EVC Mean Field' variant (EVCouplingsWrapper with protocol 'mean_field') is intentionally not registered
method_dict = {'DCA': {'Predictor': DCAWrapper, 'Settings': {}, 'RankType': 'max', 'Scoring Params': {}},
               'EVC Standard': {'Predictor': EVCouplingsWrapper, 'Settings': {'protocol': 'standard'}, 'RankType': 'max',
                                'Scoring Params': {'cores': 10}}}
for et_method_name, et_scoring_metric in et_scoring_metrics.items():
    et_method_settings = {'ranks': None, 'scoring_metric': et_scoring_metric}
    et_method_settings.update(general_et_settings)
    method_dict[et_method_name] = {'Predictor': EvolutionaryTrace, 'RankType': 'max', 'Scoring Params': {},
                                   'Settings': et_method_settings}

# rvET is the only method scoring single positions instead of pairs
method_dict['rvET']['Settings']['position_type'] = 'single'
method_dict['rvET']['Settings']['gap_correction'] = 0.6

## For plotting
All variables needed to order the data correctly for plotting and to control the plotting parameters.

In [4]:
# Order in which proteins, methods, and sequence separations are processed and plotted
full_protein_order = [
    '1c0k', '7hvp', '2b59', '206l', '1bol', '1jwl', '3q05', '2z0e', '2rh1', '4lli', '1h1v',
    '1c17', '1a26', '135l', '3b6v', '1yap', '1hck', '3tnu', '2wer', '1axb', '4ycu', '2ddr', '2rrm',
]
method_order = ['DCA', 'EVC Standard', 'ET-MIp', 'ET-M-MD']
# Display names, position-matched to method_order
method_labels = ['DCA', 'EVC', 'ET-MIp', 'CovET']
et_method_order = ['rvET', *method_order]
sequence_separation_order = ['Long', 'Medium', 'Short']
# Directories holding the results of each data set
small_data_out_dir = '/media/daniel/ExtraDrive1/Results/Newest_Covariation/Output/SmallTestSet'
allosteric_data_out_dir = '/media/daniel/ExtraDrive1/Results/Allosteric_Data_Set/Domain_Specific_Min_ID_40/Output/ID40/U90'
# Global matplotlib line widths for the axes frame and the major ticks
mpl.rcParams.update({
    'axes.linewidth': 3,
    'xtick.major.width': 2,
    'ytick.major.width': 2,
})
# Keyword arguments shared by the box plots: unfilled boxes drawn entirely in black
box_kwargs = dict(
    boxprops={'facecolor': 'none', 'edgecolor': 'black'},
    medianprops={'color': 'black'},
    whiskerprops={'color': 'black'},
    capprops={'color': 'black'},
    linewidth=3,
    # whis is set to the standard 1.5 x the IQR (an arbitrarily high value such as 100 would instead
    # keep any point from being treated as an outlier)
    whis=1.5,
)

# 'boxprops':{'facecolor':'none', 'edgecolor':'black'},
#     'medianprops':{'color':'black'},
#     'whiskerprops':{'color':'black'},
#     'capprops':{'color':'black'}

def determine_significance_level(p_val):
    """Map a p-value to the significance label drawn on the plots.

    Args:
        p_val: The p-value to classify.

    Returns:
        str: '***' if p_val < 0.001, '**' if p_val < 0.01, '*' if p_val < 0.05, 'ns' if
        p_val >= 0.05, and 'error' if none of those comparisons holds (e.g. a NaN p-value).
    """
    if p_val < 0.001:
        return '***'
    if p_val < 0.01:
        return '**'
    if p_val < 0.05:
        return '*'
    if p_val >= 0.05:
        return 'ns'
    # Only reachable when every comparison above is False, which is the case for NaN.
    # Previously a misspelled variable name made this path raise UnboundLocalError.
    return 'error'

def compute_wilcoxon_stats(data_type, df, data_dir):
    """Compare every pair of methods with a Wilcoxon test, caching the result table on disk.

    If '<data_dir>/<data_type>_Wilcoxon_Test_Comparison.tsv' already exists it is read and returned
    unchanged. Otherwise each unordered pair of the unique values of df['Method'] is passed to
    scipy.stats.wilcoxon using the data_type column (separately for every value of
    'Sequence Separation' when df has that column), and the resulting table is written to that path.

    Args:
        data_type: Name of the df column holding the values to compare; also the file name prefix.
        df: DataFrame with a 'Method' column, a data_type column and, optionally, a
            'Sequence Separation' column.
        data_dir: Directory in which the result table is looked up and saved.

    Returns:
        pandas.DataFrame: One row per comparison with the columns 'Method 1', 'Method 2',
        'Sequence Separation' (only when df has that column), 'Statistic', 'P-Value' and
        'Significance Label'.
    """
    results_path = os.path.join(data_dir, f'{data_type}_Wilcoxon_Test_Comparison.tsv')
    if os.path.isfile(results_path):
        # Previously saved statistics are reused as they are
        return pd.read_csv(results_path, header=0, index_col=None, sep='\t')

    split_by_separation = 'Sequence Separation' in df.columns
    column_names = ['Method 1', 'Method 2']
    if split_by_separation:
        column_names.append('Sequence Separation')
    column_names.extend(['Statistic', 'P-Value', 'Significance Label'])
    results = {name: [] for name in column_names}

    # A single placeholder group stands in when the data are not split by sequence separation
    groups = df['Sequence Separation'].unique() if split_by_separation else [None]
    method_names = df['Method'].unique()
    for group in groups:
        for first_pos, first_method in enumerate(method_names):
            for second_method in method_names[first_pos + 1:]:
                first_rows = df['Method'] == first_method
                second_rows = df['Method'] == second_method
                if split_by_separation:
                    first_rows = first_rows & (df['Sequence Separation'] == group)
                    second_rows = second_rows & (df['Sequence Separation'] == group)
                # NOTE(review): the two samples are handed over in row order, so this presumably
                # relies on both methods listing their proteins in the same order - confirm.
                statistic, p_value = wilcoxon(df.loc[first_rows, data_type], df.loc[second_rows, data_type])
                if split_by_separation:
                    results['Sequence Separation'].append(group)
                results['Method 1'].append(first_method)
                results['Method 2'].append(second_method)
                results['Statistic'].append(statistic)
                results['P-Value'].append(p_value)
                results['Significance Label'].append(determine_significance_level(p_value))

    stat_data_df = pd.DataFrame(results)
    stat_data_df.to_csv(results_path, header=True, index=False, sep='\t')
    return stat_data_df

def plot_main_methods_box_and_whisker_plots_for_each_separation(metric_of_success, df, stat_df, out_dir, sep, y_min=0.0, y_max=1.0, y_ticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
                                                                significance_offset=0.025, significance_interval=0.075, significance_cap=0.02, font_size=32, stat_font_size=24):
    """Draw a box plot with an overlaid swarm plot of one metric for the main methods and save it.

    Methods are drawn in the order given by the global method_order and labelled with the global
    method_labels. Significance brackets are added between 'ET-M-MD' and each of 'DCA',
    'EVC Standard' and 'ET-MIp', using the labels found in stat_df. The figure is written as a
    PNG into out_dir and then closed.

    Args:
        metric_of_success: Name of the df column to plot; also used to build the file name.
        df: DataFrame with 'Method', 'DataSet' and metric_of_success columns, plus
            'Sequence Separation' when sep is given.
        stat_df: DataFrame with 'Method 1', 'Method 2' and 'Significance Label' columns, plus
            'Sequence Separation' when sep is given.
        out_dir: Directory in which the figure is saved.
        sep: Sequence separation to plot, or None to plot every row that has a method.
        y_min: Lower limit of the y-axis.
        y_max: Upper limit of the y-axis.
        y_ticks: Tick positions (and labels) of the y-axis; this list is only read, never modified.
        significance_offset: Distance between the largest plotted value and the lowest bracket.
        significance_interval: Vertical distance between consecutive brackets.
        significance_cap: Height of the vertical end pieces of each bracket.
        font_size: Font size of the axis labels and tick labels.
        stat_font_size: Font size of the significance labels.
    """
    # Choose the rows to plot and the matching rows of the statistics table
    if sep is not None:
        indices = df["Sequence Separation"] == sep
        stat_indices = stat_df["Sequence Separation"] == sep
    else:
        indices = df['Method'].notna()
        stat_indices = stat_df['Method 1'].notna()
    plot_data = df.loc[indices, :]

    _, ax = plt.subplots(figsize=(1 + len(method_order), 1 + len(y_ticks)))
    sns.boxplot(x='Method', y=metric_of_success, order=method_order, data=plot_data, ax=ax, **box_kwargs)
    sns.swarmplot(x='Method', y=metric_of_success, order=method_order, data=plot_data, ax=ax, dodge=True,
                  hue='DataSet', hue_order=['SmallDataSet', 'AllostericDataSet'], palette={'SmallDataSet': 'dimgrey', 'AllostericDataSet': 'red'})

    # Axis labelling
    ax.set_xticklabels(method_labels, rotation=90, fontsize=font_size)
    ax.set_xlabel('Method', fontsize=font_size)
    ax.set(ylim=(y_min, y_max))
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticks, fontsize=font_size)
    ax.set_ylabel(ax.get_ylabel(), fontsize=font_size)

    # Add statistics to plot: one bracket per comparison against ET-M-MD, stacked upwards.
    # The bracket of each method starts at that method's column (first column: 0) and ends at
    # column 3, the column of 'ET-M-MD'.
    max_value = plot_data[metric_of_success].max()
    et_m_md_column = 3
    for level, compared_method in enumerate(('DCA', 'EVC Standard', 'ET-MIp')):
        comparison_rows = stat_indices & (stat_df['Method 1'] == compared_method) & (stat_df['Method 2'] == 'ET-M-MD')
        significance_label = stat_df.loc[comparison_rows, 'Significance Label'].values[0]
        bracket_base = max_value + (significance_offset + (level * significance_interval))
        bracket_top = bracket_base + significance_cap
        bracket = lines.Line2D([level, level, et_m_md_column, et_m_md_column],
                               [bracket_base, bracket_top, bracket_top, bracket_base], lw=3, c='k', clip_on=False)
        ax.add_line(bracket)
        # The label is anchored at the left end of the bracket
        plt.text(level, bracket_top, significance_label, ha='center', va='bottom', color='k', fontsize=stat_font_size)

    # Remove right and top borders so there is no conflict with significance markers
    for side in ('right', 'top'):
        ax.spines[side].set_visible(False)
    # Remove legend
    ax.legend_.remove()

    figure_path = os.path.join(out_dir, f'{metric_of_success.replace(" ", "_").replace("%", "")}_Main_Methods_{"_" + sep if sep else ""}.png')
    ax.get_figure().savefig(figure_path, bbox_inches='tight', transparent=True, dpi=500)

    plt.close()

# Evaluate each method according to the different metrics of success:

## AUROC
This measures the True Positive Rate versus the False Positive Rate of the predictions and can be considered a measure of their accuracy. It can be strongly influenced by the class imbalance that is present when predicting structural contacts, since there are many fewer contacts than non-contacts. A True Positive is a pair of amino acids whose C-beta atoms are within 8.0 Angstroms of one another (as is done in the CASP competitions).

In [None]:
# Create directory for AUROC results
auroc_dir = os.path.join(output_dir, 'AUROC_Analyses')
os.makedirs(auroc_dir, exist_ok=True)
auroc_data_path = os.path.join(auroc_dir, 'Full_AUROC_Data.tsv')
if os.path.isfile(auroc_data_path):
    # Load previously saved AUROC data so that scoring continues from where it stopped
    auroc_data_df = pd.read_csv(auroc_data_path, header=0, index_col=None, sep='\t')
    auroc_data_dict = {k: list(v) for k, v in auroc_data_df.to_dict('series').items()}
    print(f"Loaded: {len(set(auroc_data_dict['Protein']))} AUROCs")
else:
    # Create dictionary to hold AUROC data
    auroc_data_dict = {'Protein': [], 'DataSet': [], 'Method': [], 'Sequence Separation': [], 'AUROC': []}

# Proteins already present in the saved data are skipped; the set of finished proteins is built
# once instead of once per candidate protein
scored_proteins = set(auroc_data_dict['Protein'])
to_score = [x for x in full_protein_order if x not in scored_proteins]
print(f'{len(to_score)} proteins left to score')
for protein in to_score:
    # The output directory, data set label, and input settings depend only on the protein, so
    # they are resolved once here rather than once per method
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
        data_set_label = 'SmallDataSet'
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
        data_set_label = 'AllostericDataSet'
    else:
        raise ValueError(f'Bad protein: {protein}')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}

    for method in method_order:
        # Set the result directory for this method: every method in the first list uses the
        # 'ET-MEMER' directory and both MIp variants use the 'ET-MIp' directory
        if method in ['ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                      'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA']:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in ['ET-MIp', 'cET-MIp']:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        assert os.path.isdir(method_dir), f'Missing result directory: {method_dir}'
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory first, then the
        # protein inputs, then the method settings (later updates win on shared keys)
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        total_time = predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        # Score the predictions against the structure using C-beta distances and an 8.0 cutoff
        cb_contact_scorer = ContactScorer(query=all_data[protein]['query'], seq_alignment=os.path.join(method_dir, 'Non-Gapped_Alignment.fa'), pdb_reference=all_data[protein]['pdb_path'], cutoff=8.0, chain=all_data[protein]['chain'])
        cb_contact_scorer.fit()
        cb_contact_scorer.measure_distance(method='CB')
        cb_contact_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        for sep in sequence_separation_order:
            _, _, curr_auroc = cb_contact_scorer.score_auc(category=sep)
            auroc_data_dict['Protein'].append(protein)
            auroc_data_dict['DataSet'].append(data_set_label)
            auroc_data_dict['Method'].append(method)
            auroc_data_dict['Sequence Separation'].append(sep)
            auroc_data_dict['AUROC'].append(curr_auroc)
    # After each protein is scored write results to file so they can be loaded if scoring is interrupted
    auroc_data_df = pd.DataFrame(auroc_data_dict)
    auroc_data_df.to_csv(auroc_data_path, header=True, index=False, sep='\t')


auroc_stat_df = compute_wilcoxon_stats('AUROC', auroc_data_df, auroc_dir)
# Plot box and whisker plots comparing the AUROC for the main methods (DCA, EVCouplings, ET-MIp, and ET-M-MD) to each other at different sequence separations.
for sep in sequence_separation_order:
    plot_main_methods_box_and_whisker_plots_for_each_separation('AUROC', auroc_data_df, auroc_stat_df, auroc_dir, sep)

## AUPRC
This measures the Precision versus the Recall of the predictions and can be considered a measure of their accuracy. It is less strongly influenced by the class imbalance that is present when predicting structural contacts, since there are many fewer contacts than non-contacts. A True Positive is a pair of amino acids whose C-beta atoms are within 8.0 Angstroms of one another (as is done in the CASP competitions).

In [None]:
# Create directory for AUPRC results
auprc_dir = os.path.join(output_dir, 'AUPRC_Analyses')
os.makedirs(auprc_dir, exist_ok=True)
auprc_data_path = os.path.join(auprc_dir, 'Full_AUPRC_Data.tsv')
if os.path.isfile(auprc_data_path):
    # Load previously saved AUPRC data so that scoring continues from where it stopped
    auprc_data_df = pd.read_csv(auprc_data_path, header=0, index_col=None, sep='\t')
    auprc_data_dict = {k: list(v) for k, v in auprc_data_df.to_dict('series').items()}
    print(f"Loaded: {len(set(auprc_data_dict['Protein']))} AUPRCs")
else:
    # Create dictionary to hold AUPRC data
    auprc_data_dict = {'Protein': [], 'DataSet': [], 'Method': [], 'Sequence Separation': [], 'AUPRC': []}

# Proteins already present in the saved data are skipped; the set of finished proteins is built
# once instead of once per candidate protein
scored_proteins = set(auprc_data_dict['Protein'])
to_score = [x for x in full_protein_order if x not in scored_proteins]
print(f'{len(to_score)} proteins left to score')
for protein in to_score:
    # The output directory, data set label, and input settings depend only on the protein, so
    # they are resolved once here rather than once per method
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
        data_set_label = 'SmallDataSet'
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
        data_set_label = 'AllostericDataSet'
    else:
        raise ValueError(f'Bad protein: {protein}')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}

    for method in method_order:
        # Set the result directory for this method: every method in the first list uses the
        # 'ET-MEMER' directory and both MIp variants use the 'ET-MIp' directory
        if method in ['ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                      'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA']:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in ['ET-MIp', 'cET-MIp']:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        assert os.path.isdir(method_dir), f'Missing result directory: {method_dir}'
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory first, then the
        # protein inputs, then the method settings (later updates win on shared keys)
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        total_time = predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        # Score the predictions against the structure using C-beta distances and an 8.0 cutoff
        cb_contact_scorer = ContactScorer(query=all_data[protein]['query'], seq_alignment=os.path.join(method_dir, 'Non-Gapped_Alignment.fa'), pdb_reference=all_data[protein]['pdb_path'], cutoff=8.0, chain=all_data[protein]['chain'])
        cb_contact_scorer.fit()
        cb_contact_scorer.measure_distance(method='CB')
        cb_contact_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        for sep in sequence_separation_order:
            _, _, curr_auprc = cb_contact_scorer.score_precision_recall(category=sep)
            auprc_data_dict['Protein'].append(protein)
            auprc_data_dict['DataSet'].append(data_set_label)
            auprc_data_dict['Method'].append(method)
            auprc_data_dict['Sequence Separation'].append(sep)
            auprc_data_dict['AUPRC'].append(curr_auprc)
    # After each protein is scored write results to file so they can be loaded if scoring is interrupted
    auprc_data_df = pd.DataFrame(auprc_data_dict)
    auprc_data_df.to_csv(auprc_data_path, header=True, index=False, sep='\t')


auprc_stat_df = compute_wilcoxon_stats('AUPRC', auprc_data_df, auprc_dir)
# Plot box and whisker plots comparing the AUPRC for the main methods (DCA, EVCouplings, ET-MIp, and ET-M-MD) to each other at different sequence separations.
for sep in sequence_separation_order:
    plot_main_methods_box_and_whisker_plots_for_each_separation('AUPRC', auprc_data_df, auprc_stat_df, auprc_dir, sep)

## Selection Clustering Weighting Z-Score
This measures whether highly ranked residues are randomly distributed over the protein structure or whether they cluster in a meaningful way. In previous work, high clustering z-scores have been shown to correspond with key structural and functional sites. Two types of clustering are explored here: unbiased clustering, which looks just at the position of highly ranked residues on the protein structure, and biased clustering, which weights the spatial clustering by the sequence separation of residues.

In [None]:
# Create directory for SCW Z-Score results
scw_dir = os.path.join(output_dir, 'SCW_Z-Score_Analyses')
os.makedirs(scw_dir, exist_ok=True)
scw_data_path = os.path.join(scw_dir, 'Full_SCW_Z-Score_Data.tsv')
if os.path.isfile(scw_data_path):
    # Load previously saved SCW data so that scoring continues from where it stopped
    scw_data_df = pd.read_csv(scw_data_path, header=0, index_col=None, sep='\t')
    scw_data_dict = {k: list(v) for k, v in scw_data_df.to_dict('series').items()}
    print(f"Loaded: {len(set(scw_data_dict['Protein']))} SCW Z-Scores")
else:
    # Create dictionary to hold SCW data: one column per measure and coverage percentile
    scw_data_dict = {'Protein': [], 'DataSet': [], 'Method': [],
                     'Biased SCW Z-Score (2%)': [], 'Biased SCW Z-Score (2.5%)': [], 'Biased SCW Z-Score (4%)': [], 'Biased SCW Z-Score (5%)': [], 'Biased SCW Z-Score (6%)': [], 'Biased SCW Z-Score (7.5%)': [],
                     'Biased SCW Z-Score (8%)': [], 'Biased SCW Z-Score (10%)': [], 'Biased SCW Z-Score (15%)': [], 'Biased SCW Z-Score (20%)': [], 'Biased SCW Z-Score (30%)': [],
                     'Unbiased SCW Z-Score (2%)': [], 'Unbiased SCW Z-Score (2.5%)': [], 'Unbiased SCW Z-Score (4%)': [], 'Unbiased SCW Z-Score (5%)': [], 'Unbiased SCW Z-Score (6%)': [], 'Unbiased SCW Z-Score (7.5%)': [],
                     'Unbiased SCW Z-Score (8%)': [], 'Unbiased SCW Z-Score (10%)': [], 'Unbiased SCW Z-Score (15%)': [], 'Unbiased SCW Z-Score (20%)': [], 'Unbiased SCW Z-Score (30%)': [],
                     'Average Sequence Separation (2%)': [], 'Average Sequence Separation (2.5%)': [], 'Average Sequence Separation (4%)': [], 'Average Sequence Separation (5%)': [],
                     'Average Sequence Separation (6%)': [], 'Average Sequence Separation (7.5%)': [], 'Average Sequence Separation (8%)': [], 'Average Sequence Separation (10%)': [],
                     'Average Sequence Separation (15%)': [], 'Average Sequence Separation (20%)': [], 'Average Sequence Separation (30%)': []}

# Proteins already present in the saved data are skipped; the set of finished proteins is built
# once instead of once per candidate protein
scored_proteins = set(scw_data_dict['Protein'])
to_score = [x for x in full_protein_order if x not in scored_proteins]
print(f'{len(to_score)} proteins left to score')
for protein in to_score:
    # The scorers returned for one method are passed back in for the next method of the same protein
    unbiased_scw_scorer = None
    biased_scw_scorer = None
    # The output directory, data set label, and input settings depend only on the protein, so
    # they are resolved once here rather than once per method
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
        data_set_label = 'SmallDataSet'
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
        data_set_label = 'AllostericDataSet'
    else:
        raise ValueError(f'Bad protein: {protein}')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}

    for method in method_order:
        # Set the result directory for this method: every method in the first list uses the
        # 'ET-MEMER' directory and both MIp variants use the 'ET-MIp' directory
        if method in ['ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                      'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA']:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in ['ET-MIp', 'cET-MIp']:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        assert os.path.isdir(method_dir), f'Missing result directory: {method_dir}'
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory first, then the
        # protein inputs, then the method settings (later updates win on shared keys)
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        total_time = predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        print('Creating Scorer')
        any_contact_scorer = ContactScorer(query=all_data[protein]['query'], seq_alignment=os.path.join(method_dir, 'Non-Gapped_Alignment.fa'), pdb_reference=all_data[protein]['pdb_path'], cutoff=8.0, chain=all_data[protein]['chain'])
        print('Fitting Scorer')
        any_contact_scorer.fit()
        print('Measuring Distance')
        any_contact_scorer.measure_distance(method='Any')
        print('Mapping Predictions to Scorer')
        any_contact_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        print('Scoring SCW Z-Scores')
        z_score_biased, biased_scw_scorer, _ = any_contact_scorer.score_clustering_of_contact_predictions(
            biased=True, file_path=os.path.join(scw_dir, f'{protein}_{method}_Biased_SCW_Z-Score.tsv'), scw_scorer=biased_scw_scorer, processes=10)
        z_score_unbiased, unbiased_scw_scorer, _ = any_contact_scorer.score_clustering_of_contact_predictions(
            biased=False, file_path=os.path.join(scw_dir, f'{protein}_{method}_Unbiased_SCW_Z-Score.tsv'), scw_scorer=unbiased_scw_scorer, processes=10)
        for percentile in [2, 2.5, 4, 5, 6, 7.5, 8, 10, 15, 20, 30]:
            percentage = percentile / 100.0
            # Both biased values are computed before either is appended, so a failure can never
            # leave the two columns with different lengths. The separation is computed directly
            # from the two residue columns instead of assigning a new column onto a slice.
            try:
                top_biased_preds = z_score_biased.loc[z_score_biased['Residue Coverage'] <= percentage, ['Res_i', 'Res_j', 'Z-Score']]
                biased_z_score = top_biased_preds['Z-Score'].iloc[-1]
                average_seq_sep = (top_biased_preds['Res_j'] - top_biased_preds['Res_i']).mean()
            except IndexError:
                # No row falls within this coverage, so there is no last Z-Score to report
                biased_z_score, average_seq_sep = None, None
            scw_data_dict[f'Biased SCW Z-Score ({percentile}%)'].append(biased_z_score)
            scw_data_dict[f'Average Sequence Separation ({percentile}%)'].append(average_seq_sep)
            try:
                unbiased_z_score = z_score_unbiased.loc[z_score_unbiased['Residue Coverage'] <= percentage, 'Z-Score'].iloc[-1]
            except IndexError:
                unbiased_z_score = None
            scw_data_dict[f'Unbiased SCW Z-Score ({percentile}%)'].append(unbiased_z_score)

        scw_data_dict['Protein'].append(protein)
        scw_data_dict['DataSet'].append(data_set_label)
        scw_data_dict['Method'].append(method)
    # After each protein is scored write results to file so they can be loaded if scoring is interrupted
    scw_data_df = pd.DataFrame(scw_data_dict)
    scw_data_df.to_csv(scw_data_path, header=True, index=False, sep='\t')


# Report the range of the plotted columns (useful when choosing the y-axis limits below)
print(scw_data_df[['Biased SCW Z-Score (30%)', 'Unbiased SCW Z-Score (30%)', 'Average Sequence Separation (30%)']].min())
print(scw_data_df[['Biased SCW Z-Score (30%)', 'Unbiased SCW Z-Score (30%)', 'Average Sequence Separation (30%)']].max())

biased_scw_stat_df = compute_wilcoxon_stats('Biased SCW Z-Score (30%)', scw_data_df, scw_dir)
unbiased_scw_stat_df = compute_wilcoxon_stats('Unbiased SCW Z-Score (30%)', scw_data_df, scw_dir)
seq_sep_stat_df = compute_wilcoxon_stats('Average Sequence Separation (30%)', scw_data_df, scw_dir)
# Plot box and whisker plots comparing the SCW measures for the main methods (DCA, EVCouplings, ET-MIp, and ET-M-MD) to each other.
plot_main_methods_box_and_whisker_plots_for_each_separation('Biased SCW Z-Score (30%)', scw_data_df, biased_scw_stat_df, scw_dir, None, y_min=-4, y_max=22, y_ticks=[-4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
                                                            significance_offset=0.25, significance_interval=0.75, significance_cap=0.2)
plot_main_methods_box_and_whisker_plots_for_each_separation('Unbiased SCW Z-Score (30%)', scw_data_df, unbiased_scw_stat_df, scw_dir, None, y_min=-4, y_max=22, y_ticks=[-4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22],
                                                            significance_offset=0.25, significance_interval=0.75, significance_cap=0.2)
plot_main_methods_box_and_whisker_plots_for_each_separation('Average Sequence Separation (30%)', scw_data_df, seq_sep_stat_df, scw_dir, None, y_min=0, y_max=140,
                                                            y_ticks=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140],
                                                            significance_offset=2.5, significance_interval=7.5, significance_cap=2)

# In depth analysis of rvET overlap, key residue overlap, covariation networks, and allosteric interactions

## Visualizing residues from rvET and covariation predictions and measuring their overlap.

In [None]:
# Create directory for rvET and covariation residue visualization and overlap measurement
residue_viz_dir = os.path.join(output_dir, 'Structural_Residue_Visualization')
os.makedirs(residue_viz_dir, exist_ok=True)

for protein in ['1axb']: # allosteric_data_set:
    # Top rvET residues per coverage cutoff, filled while processing 'rvET' and read by every later method
    top_rvET_residues = {}
    rvET_overlap_data = {'rvET Residues': [], 'Method': [], 'Covariation Residues': [], 'Coverage Cutoff': [],
                         'X (Sample Successes)': [], 'M (Population Size)': [], 'n (Population Successes)': [], 'N (Sample_size)': [], 'Hypergeometric P-Value': []}

    # The output directory, input settings, and coverage cutoffs depend only on the protein, so
    # they are resolved once here rather than once per method
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
    else:
        raise ValueError(f'Bad protein: {protein}')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}
    if protein == '2ddr':
        # 2ddr is examined on a finer grid of coverage cutoffs
        coverage_cutoffs = [0.3, 0.28, 0.26, 0.24, 0.22, 0.2, 0.18, 0.16, 0.15, 0.14, 0.12, 0.1, 0.08, 0.075, 0.06, 0.05, 0.04, 0.025, 0.02]
    else:
        coverage_cutoffs = [0.3, 0.2, 0.15, 0.1]

    # 'rvET' is the first entry of et_method_order, so its residues are available to the other methods
    for method in et_method_order:
        # Set the result directory for this method: every method in the first list uses the
        # 'ET-MEMER' directory and both MIp variants use the 'ET-MIp' directory
        if method in ['ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                      'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA']:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in ['ET-MIp', 'cET-MIp']:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        print(method_dir)
        assert os.path.isdir(method_dir), f'Missing result directory: {method_dir}'
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory first, then the
        # protein inputs, then the method settings (later updates win on shared keys)
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        total_time = predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        # rvET is scored with the single position scorer, every other method with the contact scorer
        if method == 'rvET':
            prediction_scorer = SinglePositionScorer(query=all_data[protein]['query'], seq_alignment=os.path.join(method_dir, 'Non-Gapped_Alignment.fa'), pdb_reference=all_data[protein]['pdb_path'], chain=all_data[protein]['chain'])
        else:
            prediction_scorer = ContactScorer(query=all_data[protein]['query'], seq_alignment=os.path.join(method_dir, 'Non-Gapped_Alignment.fa'), pdb_reference=all_data[protein]['pdb_path'], cutoff=8.0,
                                              chain=all_data[protein]['chain'])
        prediction_scorer.fit()
        prediction_scorer.measure_distance(method='Any')
        prediction_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        for cov_cutoff in coverage_cutoffs:
            # category='Any' set by default for ContactScorer, not needed for SinglePositionScorer
            _, _, top_res = prediction_scorer.select_and_color_residues(out_dir=residue_viz_dir, n=None, k=None, residue_coverage=cov_cutoff,
                                                                        fn=f'{protein}_{method}_residues_Chain_{prediction_scorer.query_pdb_mapper.best_chain}_threshold_{cov_cutoff}')
            if method == 'rvET':
                top_rvET_residues[cov_cutoff] = top_res
            else:
                # Record the overlap between this method's residues and the rvET residues at the same cutoff
                rvET_hyper_geom_test = prediction_scorer.score_pdb_residue_identification(pdb_residues=top_rvET_residues[cov_cutoff], coverage_cutoff=cov_cutoff)
                rvET_overlap_data['rvET Residues'].append('select resi ' + '+'.join([str(x) for x in top_rvET_residues[cov_cutoff]]))
                rvET_overlap_data['Method'].append(method)
                rvET_overlap_data['Covariation Residues'].append('select resi ' + '+'.join([str(x) for x in top_res]))
                rvET_overlap_data['Coverage Cutoff'].append(cov_cutoff)
                rvET_overlap_data['X (Sample Successes)'].append(rvET_hyper_geom_test[0])
                rvET_overlap_data['M (Population Size)'].append(rvET_hyper_geom_test[1])
                rvET_overlap_data['n (Population Successes)'].append(rvET_hyper_geom_test[2])
                rvET_overlap_data['N (Sample_size)'].append(rvET_hyper_geom_test[3])
                rvET_overlap_data['Hypergeometric P-Value'].append(rvET_hyper_geom_test[4])
    # One overlap table is written per protein
    rvET_overlap_df = pd.DataFrame(rvET_overlap_data)
    rvET_overlap_df.to_csv(os.path.join(residue_viz_dir, f'{protein}_rvET_vs_Covariation_Coverage_Overlap_Significance.tsv'), header=True, index=False, sep='\t')

## Visualizing networks from covariation predictions

In [None]:
# Create directory for covariation network visualization
struct_viz_dir = os.path.join(output_dir, 'Structural_Network_Visualization')
os.makedirs(struct_viz_dir, exist_ok=True)

for protein in ['1axb']: # allosteric_data_set:
    # The output directory, input settings, and coverage cutoffs depend only on the protein, so
    # they are resolved once here rather than once per method
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
    else:
        raise ValueError(f'Bad protein: {protein}')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}
    if protein == '2ddr':
        # 2ddr is examined on a finer grid of coverage cutoffs
        coverage_cutoffs = [0.3, 0.28, 0.26, 0.24, 0.22, 0.2, 0.18, 0.16, 0.15, 0.14, 0.12, 0.1, 0.08, 0.075, 0.06, 0.05, 0.04, 0.025, 0.02]
    else:
        coverage_cutoffs = [0.3, 0.2, 0.15, 0.1]

    for method in method_order:
        # Set the result directory for this method: every method in the first list uses the
        # 'ET-MEMER' directory and both MIp variants use the 'ET-MIp' directory
        if method in ['ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                      'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA']:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in ['ET-MIp', 'cET-MIp']:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        assert os.path.isdir(method_dir), f'Missing result directory: {method_dir}'
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory first, then the
        # protein inputs, then the method settings (later updates win on shared keys)
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        total_time = predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        any_contact_scorer = ContactScorer(query=all_data[protein]['query'], seq_alignment=os.path.join(method_dir, 'Non-Gapped_Alignment.fa'), pdb_reference=all_data[protein]['pdb_path'], cutoff=8.0,
                                           chain=all_data[protein]['chain'])
        any_contact_scorer.fit()
        any_contact_scorer.measure_distance(method='Any')
        any_contact_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        # Write one pair selection per coverage cutoff into the visualization directory
        for cov_cutoff in coverage_cutoffs:
            print(f'Coverage Cutoff: {cov_cutoff}')
            any_contact_scorer.select_and_display_pairs(out_dir=struct_viz_dir, category='Any', n=None, k=None, residue_coverage=cov_cutoff,
                                                        fn=f'{protein}_{method}_pairs_Chain_{any_contact_scorer.query_pdb_mapper.best_chain}_threshold_{cov_cutoff}')

## Measure the overlap of predictions with known key sites

In [5]:
# Key motifs, functional, and structural residues.
# Each protein maps selection names to lists of residue numbers; the 'all_*' selections are the
# sorted union of the individual selections listed before them.
def _pooled_sites(*site_lists):
    """Return the sorted, de-duplicated union of the given residue number lists."""
    return sorted(set().union(*site_lists))


_sites_2ddr = {
    'water_channel': [52, 76, 80, 418, 422, 423],
    'tm2-7_link': [80, 422],
    'ligand_binding': [41, 83, 117, 118, 193, 199, 205, 379, 414, 419],
    'stabilize_inactive': [122, 201, 382, 418],
    'beta_ionone_binding': [122, 201, 386],
    'dry': [131, 132, 133],
    'hhm': [125, 382, 378, 379],
    'ionic_lock': [132, 368],
    'tm3-5_link': [132, 209],
    'tm5-6_link': [198, 386],
    'broken_ionic_lock': [209, 426],
    'ms_tm6': [382],
    'cwxp': [385, 386, 388],
    'npxxyx_f': [422, 423, 426, 433],
    'tm7_cterm_link': [426, 433],
}

_sites_2rrm = {
    'rnp1': list(range(138, 146)),
    'rnp2': list(range(101, 107)),
}

_sites_1yap = {
    'ww': [177, 199],
    'binding': [188, 190, 192, 196, 197, 199],
    'hydrophobic_cluster_1': [174, 177, 189, 191, 198, 202],
    'hydrogen_bonds': [177, 193, 194, 196],
    'hydropobic_patch': [173, 174, 189, 202],
    'hydrophobic_cluster_2': [178, 180, 188, 190, 192, 197, 199],
    'hydrophobic_pocket': [190, 192, 195],
}
_insensitive_1yap = {
    'insensitive_n_term': list(range(164, 173)),
    'insensitive_turns': [175, 183, 184, 193, 200],
    'insensitive_c_term': list(range(204, 208)),
}

_sites_1axb = {
    'binding_cat': [70, 130, 132, 166, 170, 234, 235, 236, 237],
    '5A_binding_cat': [68, 69, 71, 72, 73, 74, 103, 104, 105, 106, 125, 126, 127, 128, 129, 131, 133, 134, 135,
                       136, 164, 165, 167, 168, 169, 171, 172, 211, 214, 216, 217, 220, 232, 233, 238, 240, 243,
                       244, 245, 246, 247, 272, 276],
    'co-varying core': [72, 76, 80, 138, 142, 148],
    'disulfide bridge': [77, 123],
    'disulfide bridge in Sme-1': [69, 238],
    'folding determinate': [229, 259, 290],
}

# Dict unpacking keeps the selections in their listed order, with each aggregate placed after its members
key_residues = {
    '2ddr': {**_sites_2ddr, 'all_key_res': _pooled_sites(*_sites_2ddr.values())},
    '2rrm': {**_sites_2rrm, 'all_key_res': _pooled_sites(*_sites_2rrm.values())},
    '1yap': {**_sites_1yap, 'all_key_res': _pooled_sites(*_sites_1yap.values()),
             **_insensitive_1yap, 'all_insensitive_res': _pooled_sites(*_insensitive_1yap.values())},
    '1axb': {**_sites_1axb, 'all_key_res': _pooled_sites(*_sites_1axb.values())},
}

In [None]:
# Create the directory that receives the key site overlap (hypergeometric test) tables
overlap_dir = os.path.join(output_dir, 'Key_Site_Overlap')
os.makedirs(overlap_dir, exist_ok=True)

# Methods whose results are read from the shared 'ET-MEMER' directory, and the MIp variants that share 'ET-MIp'
memer_dir_methods = {'ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                     'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA'}
mip_dir_methods = {'ET-MIp', 'cET-MIp'}
# Output columns filled, in order, from elements 0-4 of the value returned by score_pdb_residue_identification
hypergeom_columns = ['X (Sample Successes)', 'M (Population Size)', 'n (Population Successes)', 'N (Sample_size)', 'Hypergeometric P-Value']

for protein in key_residues:  # ['1axb']: #  allosteric_data_set:
    # Everything up to the method loop depends only on the protein, so it is resolved once per protein
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
    else:
        raise ValueError('Bad protein')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}

    # 2ddr is evaluated over a finer grid of coverage cutoffs than the other proteins
    if protein == '2ddr':
        coverage_cutoffs = [0.3, 0.28, 0.26, 0.24, 0.22, 0.2, 0.18, 0.16, 0.15, 0.14, 0.12, 0.1, 0.08, 0.075, 0.06, 0.05, 0.04, 0.025, 0.02]
    else:
        coverage_cutoffs = [0.3, 0.2, 0.15, 0.1]

    for method in et_method_order:
        # Set the result directory for this method
        if method in memer_dir_methods:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in mip_dir_methods:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        # Fail explicitly instead of using assert, which is skipped when Python runs with -O
        if not os.path.isdir(method_dir):
            raise FileNotFoundError(f'Missing result directory for {method}: {method_dir}')
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory, protein inputs, then method settings
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        # The return value is not needed here; the call populates the predictor attributes used below
        predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        # rvET is scored with the single position scorer; every other method uses the contact scorer
        scorer_kwargs = {'query': all_data[protein]['query'],
                         'seq_alignment': os.path.join(method_dir, 'Non-Gapped_Alignment.fa'),
                         'pdb_reference': all_data[protein]['pdb_path'],
                         'chain': all_data[protein]['chain']}
        if method == 'rvET':
            pred_scorer = SinglePositionScorer(**scorer_kwargs)
        else:
            pred_scorer = ContactScorer(cutoff=8.0, **scorer_kwargs)
        pred_scorer.fit()
        pred_scorer.measure_distance(method='Any')
        pred_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        # One table per coverage cutoff, with one row per key residue selection
        for cov_cutoff in coverage_cutoffs:
            print(f'Coverage cutoff: {cov_cutoff}')
            overlap_data = {'Selection': []}
            overlap_data.update({column: [] for column in hypergeom_columns})
            for sel, sel_residues in key_residues[protein].items():
                print(f'\tSelection: {sel}')
                hyper_geom_test = pred_scorer.score_pdb_residue_identification(pdb_residues=sel_residues, coverage_cutoff=cov_cutoff)
                overlap_data['Selection'].append(sel)
                for position, column in enumerate(hypergeom_columns):
                    overlap_data[column].append(hyper_geom_test[position])
            overlap_df = pd.DataFrame(overlap_data)
            overlap_df.to_csv(os.path.join(overlap_dir, f'{method}_overlap_key_{protein}_residues_hypergemetric_test_{cov_cutoff}_coverage.tsv'), header=True, index=False, sep='\t')

In [10]:
# Create the directory that receives the key site AUROC/AUPRC plots and summary table
auc_dir = os.path.join(output_dir, 'Key_Site_AUCs')
os.makedirs(auc_dir, exist_ok=True)

# Methods whose results are read from the shared 'ET-MEMER' directory, and the MIp variants that share 'ET-MIp'
memer_dir_methods = {'ET-MCM-', 'ET-M-MC', 'ET-MCMCR', 'ET-MCMCA', 'ET-MEM-', 'ET-M-ME', 'ET-MEMER', 'ET-MEMEA',
                     'ET-MDM-', 'ET-M-MD', 'ET-MDMDR', 'ET-MDMDA', 'ET-MDMER', 'ET-MDMEA'}
mip_dir_methods = {'ET-MIp', 'cET-MIp'}

# Accumulates one row per (protein, method, selection) across all loops below
recovery_data = {'Protein': [], 'Method': [], 'Selection': [], 'AUROC': [], 'AUPRC': []}

for protein in key_residues:  # ['1axb']: #  allosteric_data_set:
    # Everything up to the method loop depends only on the protein, so it is resolved once per protein
    if protein in small_data_set:
        protein_dir = os.path.join(small_data_out_dir, protein)
    elif protein in allosteric_data_set:
        protein_dir = os.path.join(allosteric_data_out_dir, protein)
    else:
        raise ValueError('Bad protein')
    protein_settings = {'query': all_data[protein]['query'], 'aln_file': all_data[protein]['aln_path']}

    for method in et_method_order:
        # Set the result directory for this method
        if method in memer_dir_methods:
            method_dir = os.path.join(protein_dir, 'ET-MEMER')
        elif method in mip_dir_methods:
            method_dir = os.path.join(protein_dir, 'ET-MIp')
        else:
            method_dir = os.path.join(protein_dir, method)
        # Fail explicitly instead of using assert, which is skipped when Python runs with -O
        if not os.path.isdir(method_dir):
            raise FileNotFoundError(f'Missing result directory for {method}: {method_dir}')
        print(f'Attempting to load {method} covariance for: {protein}')
        # Compile the complete settings for this predictor: output directory, protein inputs, then method settings
        curr_settings = {'out_dir': method_dir}
        curr_settings.update(protein_settings)
        curr_settings.update(method_dict[method]['Settings'])
        predictor = method_dict[method]['Predictor'](**curr_settings)
        # The return value is not needed here; the call populates the predictor attributes used below
        predictor.calculate_scores(**method_dict[method]['Scoring Params'])
        print(f'Successfully loaded {method} covariance for: {protein}')

        # rvET is scored with the single position scorer; every other method uses the contact scorer
        scorer_kwargs = {'query': all_data[protein]['query'],
                         'seq_alignment': os.path.join(method_dir, 'Non-Gapped_Alignment.fa'),
                         'pdb_reference': all_data[protein]['pdb_path'],
                         'chain': all_data[protein]['chain']}
        if method == 'rvET':
            pred_scorer = SinglePositionScorer(**scorer_kwargs)
        else:
            pred_scorer = ContactScorer(cutoff=8.0, **scorer_kwargs)
        pred_scorer.fit()
        pred_scorer.measure_distance(method='Any')
        pred_scorer.map_predictions_to_pdb(ranks=predictor.rankings, predictions=predictor.scores, coverages=predictor.coverages, threshold=0.5)

        # Plot both curves for every key residue selection and record the two summary areas
        for sel, sel_residues in key_residues[protein].items():
            print(f'\tSelection: {sel}')
            tpr, fpr, auroc, precision, recall, auprc = pred_scorer.recovery_of_pdb_residues(sel_residues)
            pred_scorer.plot_auc(auc_data=(tpr, fpr, auroc), title=f'{protein}_{method}_{sel}_AUROC', file_name=f'{protein}_{method}_{sel}_AUROC.png', output_dir=auc_dir)
            pred_scorer.plot_auprc(auprc_data=(precision, recall, auprc), title=f'{protein}_{method}_{sel}_AUPRC', file_name=f'{protein}_{method}_{sel}_AUPRC.png', output_dir=auc_dir)
            recovery_data['Protein'].append(protein)
            recovery_data['Method'].append(method)
            recovery_data['Selection'].append(sel)
            recovery_data['AUROC'].append(auroc)
            recovery_data['AUPRC'].append(auprc)

# Write a single summary table covering every protein, method and selection
recovery_df = pd.DataFrame(recovery_data)
recovery_df.to_csv(os.path.join(auc_dir, 'Overlap_Key_Residues_auroc_auprc.tsv'), header=True, index=False, sep='\t')

Attempting to load rvET covariance for: 2ddr
Removing gaps took 0.0006339351336161295 min
Evolutionary Trace analysis with the same parameters already saved to this location.
0.046736955642700195
Successfully loaded rvET covariance for: 2ddr
Importing the PDB file took 0.0007770140965779622 min
MDPLNLSWYDDDLERQNWSRPFNGSDGKADRPHYNYYATLLTLLIAVIVFGNVLVCMAVSREKALQTTTNYLIVSLAVADLLVATLVMPWVVYLEVVGEWKFSRIHCDIFVTLDVMMCTASILNLCAISIDRYTAVAMPMLYNTRYSSKRRVTVMISIVWVLSFTISCPLLFGLNNADQNECIIANPAFVVYSSIVSFYVPFIVTLLVYIKIYIVLRRRRKRVNTKRSSRAFRAHLRAPLKGNCTHPEDMKLCTVIMKSNGSFPVNRRRVEAARRAQELEMEMLSSTSPPERTRYSPIPPSHHQLTLPDPSHHGLHSTPDSPAKPEKNGHAKDHPKIAKIFEIQTMPNGKTRTSLKTMSRRKLSQQKEKKATQMLAIVLGVFIICWLPFFITHILNIHCDCNIPPVLYSAFTWLGYVNSAVNPIIYTTFNIEFRKAFLKILHC
                                  |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||.|||||||||||||||||    |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||                                      



Computing the distance matrix based on the PDB file took 0.006886466344197591 min
	Selection: rnp1
	Selection: rnp2
	Selection: all_key_res
Attempting to load DCA covariance for: 2rrm
Removing gaps took 0.0002004861831665039 min
7.448775291442871
Successfully loaded DCA covariance for: 2rrm
Importing the PDB file took 0.0013613382975260417 min
-----------------------------------------------------------------------------------------GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVA---------PHLS
                                                                                         ||||||||...|||||||||||.||.|||.|...|||| |||.||||||...||..||...||||||.....|.         ..|.
MASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDMITRRSLGYAYVNFQQPADAERALDTMNFDVIKGKPVRIMWSQRDPSLRKSGVGNIFIKNLDKSIDNKALYDTFSAFGNILSCKVVCDENG-SKGYGFVHFETQEAAERAIEKMNGMLLNDRKVFVGRFKSRKEREAELG
  Score=189

Mapping query sequence and pdb took 0.0019469022750854491 min
Mapping query sequence and pdb took 0.0019472440



Computing the distance matrix based on the PDB file took 0.00689005454381307 min
	Selection: rnp1
	Selection: rnp2
	Selection: all_key_res
Attempting to load EVC Standard covariance for: 2rrm
Removing gaps took 0.0002058704694112142 min
22.07924485206604
Successfully loaded EVC Standard covariance for: 2rrm
Importing the PDB file took 0.001338215668996175 min
-----------------------------------------------------------------------------------------GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVA---------PHLS
                                                                                         ||||||||...|||||||||||.||.|||.|...|||| |||.||||||...||..||...||||||.....|.         ..|.
MASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDMITRRSLGYAYVNFQQPADAERALDTMNFDVIKGKPVRIMWSQRDPSLRKSGVGNIFIKNLDKSIDNKALYDTFSAFGNILSCKVVCDENG-SKGYGFVHFETQEAAERAIEKMNGMLLNDRKVFVGRFKSRKEREAELG
  Score=189

Mapping query sequence and pdb took 0.0018725951512654623 min
Mapping query sequence and pdb t



Computing the distance matrix based on the PDB file took 0.006988743940989177 min
	Selection: rnp1
	Selection: rnp2
	Selection: all_key_res
Attempting to load ET-MIp covariance for: 2rrm
Removing gaps took 0.00020953814188639323 min
Evolutionary Trace analysis with the same parameters already saved to this location.
26.911238431930542
Successfully loaded ET-MIp covariance for: 2rrm
Importing the PDB file took 0.001346715291341146 min
-----------------------------------------------------------------------------------------GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVA---------PHLS
                                                                                         ||||||||...|||||||||||.||.|||.|...|||| |||.||||||...||..||...||||||.....|.         ..|.
MASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDMITRRSLGYAYVNFQQPADAERALDTMNFDVIKGKPVRIMWSQRDPSLRKSGVGNIFIKNLDKSIDNKALYDTFSAFGNILSCKVVCDENG-SKGYGFVHFETQEAAERAIEKMNGMLLNDRKVFVGRFKSRKEREAELG
  Score=189

Mapping query sequ



Computing the distance matrix based on the PDB file took 0.006738130251566569 min
	Selection: rnp1
	Selection: rnp2
	Selection: all_key_res
Attempting to load ET-M-MD covariance for: 2rrm
Removing gaps took 0.00020546515782674153 min
Evolutionary Trace analysis with the same parameters already saved to this location.
31.297022581100464
Successfully loaded ET-M-MD covariance for: 2rrm
Importing the PDB file took 0.0013657768567403158 min
-----------------------------------------------------------------------------------------GNIFIKNLHPDIDNKALYDTFSVFGDILSSKIATDENGKSKGFGFVHFEEEGAAKEAIDALNGMLLNGQEIYVA---------PHLS
                                                                                         ||||||||...|||||||||||.||.|||.|...|||| |||.||||||...||..||...||||||.....|.         ..|.
MASLYVGDLHPDVTEAMLYEKFSPAGPILSIRVCRDMITRRSLGYAYVNFQQPADAERALDTMNFDVIKGKPVRIMWSQRDPSLRKSGVGNIFIKNLDKSIDNKALYDTFSAFGNILSCKVVCDENG-SKGYGFVHFETQEAAERAIEKMNGMLLNDRKVFVGRFKSRKEREAELG
  Score=189

Mapping query s



Computing the distance matrix based on the PDB file took 0.006735551357269287 min
	Selection: rnp1
	Selection: rnp2
	Selection: all_key_res
Attempting to load rvET covariance for: 1yap
Removing gaps took 5.121231079101563e-05 min
Evolutionary Trace analysis with the same parameters already saved to this location.
2.2046566009521484
Successfully loaded rvET covariance for: 1yap
Importing the PDB file took 0.00026706059773763023 min
---------DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRK----
         |||||||||||||||||||||||||||||||||||    
GAMGFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLS
  Score=171.5

Mapping query sequence and pdb took 0.0004572312037150065 min
Mapping query sequence and pdb took 0.00045773983001708987 min
Constructing internal representation took 1.3963381449381511e-05 min
Computing the distance matrix based on the PDB file took 0.0008368333180745443 min
	Selection: ww
	Selection: binding
	Selection: hydrophobic_cluster_1
	Selection: hydrogen_bonds
	Selection: hydropobic_patch
	

  res_values = method(rvalues)


	Selection: hydrophobic_pocket
	Selection: all_key_res
	Selection: insensitive_n_term
	Selection: insensitive_turns
	Selection: insensitive_c_term
	Selection: all_insensitive_res
Attempting to load DCA covariance for: 1yap
Removing gaps took 5.261898040771484e-05 min
6.000110864639282
Successfully loaded DCA covariance for: 1yap
Importing the PDB file took 0.000278162956237793 min
---------DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRK----
         |||||||||||||||||||||||||||||||||||    
GAMGFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLS
  Score=171.5

Mapping query sequence and pdb took 0.00047824382781982424 min
Mapping query sequence and pdb took 0.00048265854517618817 min
Constructing internal representation took 6.502072016398111e-05 min
Computing the distance matrix based on the PDB file took 0.0008151769638061523 min
	Selection: ww


  res_values = method(rvalues)


	Selection: binding
	Selection: hydrophobic_cluster_1
	Selection: hydrogen_bonds
	Selection: hydropobic_patch
	Selection: hydrophobic_cluster_2
	Selection: hydrophobic_pocket
	Selection: all_key_res
	Selection: insensitive_n_term
	Selection: insensitive_turns
	Selection: insensitive_c_term
	Selection: all_insensitive_res
Attempting to load EVC Standard covariance for: 1yap
Removing gaps took 5.352099736531575e-05 min
3.787893533706665
Successfully loaded EVC Standard covariance for: 1yap
Importing the PDB file took 0.00026946067810058595 min
---------DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRK----
         |||||||||||||||||||||||||||||||||||    
GAMGFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLS
  Score=171.5

Mapping query sequence and pdb took 0.0004707654317220052 min
Mapping query sequence and pdb took 0.00047108332316080727 min
Constructing internal representation took 6.738503774007161e-05 min
Computing the distance matrix based on the PDB file took 0.0008122364679972331 min
	Selection: ww

  res_values = method(rvalues)


	Selection: binding
	Selection: hydrophobic_cluster_1
	Selection: hydrogen_bonds
	Selection: hydropobic_patch
	Selection: hydrophobic_cluster_2
	Selection: hydrophobic_pocket
	Selection: all_key_res
	Selection: insensitive_n_term
	Selection: insensitive_turns
	Selection: insensitive_c_term
	Selection: all_insensitive_res
Attempting to load ET-MIp covariance for: 1yap
Removing gaps took 5.296866099039713e-05 min
Evolutionary Trace analysis with the same parameters already saved to this location.
3.661250591278076
Successfully loaded ET-MIp covariance for: 1yap
Importing the PDB file took 0.00026466051737467446 min
---------DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRK----
         |||||||||||||||||||||||||||||||||||    
GAMGFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLS
  Score=171.5

Mapping query sequence and pdb took 0.00046503543853759766 min
Mapping query sequence and pdb took 0.00046536922454833987 min
Constructing internal representation took 6.277163823445639e-05 min
Computing the distance 

  res_values = method(rvalues)


	Selection: binding
	Selection: hydrophobic_cluster_1
	Selection: hydrogen_bonds
	Selection: hydropobic_patch
	Selection: hydrophobic_cluster_2
	Selection: hydrophobic_pocket
	Selection: all_key_res
	Selection: insensitive_n_term
	Selection: insensitive_turns
	Selection: insensitive_c_term
	Selection: all_insensitive_res
Attempting to load ET-M-MD covariance for: 1yap
Removing gaps took 5.441506703694661e-05 min
Evolutionary Trace analysis with the same parameters already saved to this location.
5.340250492095947
Successfully loaded ET-M-MD covariance for: 1yap
Importing the PDB file took 0.0002672155698140462 min
---------DVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRK----
         |||||||||||||||||||||||||||||||||||    
GAMGFEIPDDVPLPAGWEMAKTSSGQRYFLNHIDQTTTWQDPRKAMLS
  Score=171.5

Mapping query sequence and pdb took 0.00045729875564575194 min
Mapping query sequence and pdb took 0.0004578669865926107 min
Constructing internal representation took 6.303787231445312e-05 min
Computing the distance 

  res_values = method(rvalues)


	Selection: binding
	Selection: hydrophobic_cluster_1
	Selection: hydrogen_bonds
	Selection: hydropobic_patch
	Selection: hydrophobic_cluster_2
	Selection: hydrophobic_pocket
	Selection: all_key_res
	Selection: insensitive_n_term
	Selection: insensitive_turns
	Selection: insensitive_c_term
	Selection: all_insensitive_res
Attempting to load rvET covariance for: 1axb
Removing gaps took 0.034744449456532794 min
Evolutionary Trace analysis with the same parameters already saved to this location.
8.709298610687256
Successfully loaded rvET covariance for: 1axb
Importing the PDB file took 0.0004657745361328125 min
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

  res_values = method(rvalues)


	Selection: co-varying core
	Selection: disulfide bridge
	Selection: disulfide bridge in Sme-1
	Selection: folding determinate
	Selection: all_key_res
Attempting to load DCA covariance for: 1axb
Removing gaps took 0.04411388635635376 min
12.136721134185791
Successfully loaded DCA covariance for: 1axb
Importing the PDB file took 0.00048357248306274414 min
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLT

  res_values = method(rvalues)


	Selection: binding_cat
	Selection: 5A_binding_cat
	Selection: co-varying core
	Selection: disulfide bridge
	Selection: disulfide bridge in Sme-1
	Selection: folding determinate
	Selection: all_key_res
Attempting to load EVC Standard covariance for: 1axb
Removing gaps took 0.021958700815836587 min
165.5119504928589
Successfully loaded EVC Standard covariance for: 1axb
Importing the PDB file took 0.00047312180201212567 min
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMST

  res_values = method(rvalues)


	Selection: binding_cat
	Selection: 5A_binding_cat
	Selection: co-varying core
	Selection: disulfide bridge
	Selection: disulfide bridge in Sme-1
	Selection: folding determinate
	Selection: all_key_res
Attempting to load ET-MIp covariance for: 1axb
Removing gaps took 0.03677264849344889 min
Evolutionary Trace analysis with the same parameters already saved to this location.
9.642625331878662
Successfully loaded ET-MIp covariance for: 1axb
Importing the PDB file took 0.00047542651494344074 min
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

  res_values = method(rvalues)


	Selection: binding_cat
	Selection: 5A_binding_cat
	Selection: co-varying core
	Selection: disulfide bridge
	Selection: disulfide bridge in Sme-1
	Selection: folding determinate
	Selection: all_key_res
Attempting to load ET-M-MD covariance for: 1axb
Removing gaps took 0.044185928503672284 min
Evolutionary Trace analysis with the same parameters already saved to this location.
8.163577795028687
Successfully loaded ET-M-MD covariance for: 1axb
Importing the PDB file took 0.0004847923914591471 min
HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRIDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWEPELNEAIPNDERDTTMPVAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

  res_values = method(rvalues)


	Selection: binding_cat
	Selection: 5A_binding_cat
	Selection: co-varying core
	Selection: disulfide bridge
	Selection: disulfide bridge in Sme-1
	Selection: folding determinate
	Selection: all_key_res
