# Small Test Set #
## Primary Goal ##
Check if I can use the data generated by Benu and I before my code overhaul to reproduce the figures and trends we had shown before.
## Secondary Goal ##
The goal of this test set is to perform proof of concept testing on a small number of proteins with a wide range of sizes and available homologs, orthologs, and paralogs. By doing so it should be possible to test the best parameterization for this tool as well as identifying the strengths and weaknesses of the tool using various measurments as end points.
## Warning ##
Before attempting to use this notebook make sure that your .env file has been properly setup to reflect the correct locations of command line tools and the location of files and directories needed for execution.
### Initial Import###
This first cell performs the necessary imports required to begin this notebook.

In [1]:
from dotenv import find_dotenv, load_dotenv
try:
    dotenv_path = find_dotenv(raise_error_if_not_found=True)
except IOError:
    dotenv_path = find_dotenv(raise_error_if_not_found=True, usecwd=True)
load_dotenv(dotenv_path)
import os
import sys
sys.path.append(os.path.join(os.environ.get('PROJECT_PATH'), 'src'))
sys.path.append(os.path.join(os.environ.get('PROJECT_PATH'), 'src', 'SupportingClasses'))
input_dir = os.environ.get('INPUT_PATH')

## Data Set Construction ##
The first task required to test the data set is to download the required data and construct any necessary input files for all down stream analyses.
In this case that means:
* Downloading PDB files for the proteins in our small test set.
* Extracting a query sequence from each PDB file.
* Searching for paralogs, homologs, and orthologs in a custom BLAST database built by filtering the Uniref90 database.
* Filtering the hits from the BLAST search to meet minimum and maximum length requirements, as well as minimum and maximum identity requirements.
* Building alignments using CLUSTALW in both the fasta and msf formats since some of the tools which will be used for comparison need different formats.
* Filtering the alignment for maximum identity similarity between seqeunces.
* Re-aligning the filtered sequences using CLUSTALW.
This is all handeled by the DataSetGenerator class found in the src/SupportingClasses folder

In [2]:
import pandas as pd
from time import time
from Bio.Align.Applications import ClustalwCommandline
from DataSetGenerator import DataSetGenerator
from SeqAlignment import SeqAlignment
clustalw_path = os.environ.get('CLUSTALW_PATH')
protein_list_dir = os.path.join(input_dir, 'ProteinLists')
if not os.path.isdir(protein_list_dir):
    os.makedirs(protein_list_dir)
small_list_fn = os.path.join(protein_list_dir, 'SmallDataSetOld.txt')
proteins_of_interest = ['2ysdA', '1c17A', '3tnuA', '7hvpA', '135lA', '206lA', '2werA', '1bolA', '3q05A', '1axbA',
                        '2rh1A', '1hckA', '3b6vA', '2z0eA', '1jwlA', '1a26A', '1c0kA', '4lliA', '4ycuA', '2iopA',
                        '2zxeA', '2b59A', '1h1vA']
if not os.path.isfile(small_list_fn):
    with open(small_list_fn, 'w') as small_list_handle:
        for p_id in proteins_of_interest:
            small_list_handle.write('{}\n'.format(p_id))
generator = DataSetGenerator(input_dir)
start = time()
generator.pdb_path = os.path.join(input_dir, 'Old_Data')
generator.sequence_path = None
generator.blast_path = None
generator.filtered_blast_path = None
generator.alignment_path = None
generator.filtered_alignment_path = None
generator.final_alingment_path = generator.pdb_path
generator.protein_data = {}
summary = {'Protein_ID': [], 'Accession': [], 'BLAST_Hits': [], 'Filtered_BLAST': [],
           'Length': [], 'Filtered_Alignment': [], 'Total_Size': []}
for p_id in proteins_of_interest:
    generator.protein_data[p_id] = {}
    if p_id == '2rh1A':
        generator.protein_data[p_id]['PDB'] = os.path.join(input_dir, 'Old_Data', '{}_NEW.pdb'.format(p_id))
        generator.protein_data[p_id]['Final_FA_Aln'] = os.path.join(input_dir, 'Old_Data', '{}_NEW.fa'.format(p_id))
    else:
        generator.protein_data[p_id]['PDB'] = os.path.join(input_dir, 'Old_Data', 'query_{}.pdb'.format(p_id))
        generator.protein_data[p_id]['Final_FA_Aln'] = os.path.join(input_dir, 'Old_Data', '{}.fa'.format(p_id))
    generator.protein_data[p_id]['Chain'] = 'A'
    seq_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id)
    seq_aln.import_alignment()
    seq_aln_non_gap = seq_aln.remove_gaps()
    generator.protein_data[p_id]['Sequence'] = seq_aln_non_gap.query_sequence
    generator.protein_data[p_id]['Length'] = seq_aln_non_gap.seq_length
    generator.protein_data[p_id]['Seq_Fasta'] = None
    generator.protein_data[p_id]['Accession'] = None
    generator.protein_data[p_id]['Filter_Count'] = None
    generator.protein_data[p_id]['Filtered_BLAST'] = None
    generator.protein_data[p_id]['MSF_Aln'] = None
    generator.protein_data[p_id]['FA_Aln'] = None
    generator.protein_data[p_id]['Final_Count'] = seq_aln_non_gap.size
    generator.protein_data[p_id]['Filtered_Alignment'] = None
    generator.protein_data[p_id]['Final_MSF_Aln'] = os.path.splitext(generator.protein_data[p_id]['Final_FA_Aln'])[0] + '.msf'
    if not os.path.isfile(generator.protein_data[p_id]['Final_MSF_Aln']):
        msf_cline = ClustalwCommandline(clustalw_path, infile=generator.protein_data[p_id]['Final_FA_Aln'], convert=True,
                                        outfile=generator.protein_data[p_id]['Final_MSF_Aln'], output='GCG')
        print(msf_cline)
        stdout, stderr = msf_cline()
        print(stdout)
        print(stderr)
    summary['Protein_ID'].append(p_id)
    summary['Accession'].append(None)
    summary['BLAST_Hits'].append(None)
    summary['Filtered_BLAST'].append(None)
    summary['Length'].append(seq_aln_non_gap.seq_length)
    summary['Filtered_Alignment'].append(seq_aln_non_gap.size)
    summary['Total_Size'].append(seq_aln_non_gap.seq_length * seq_aln_non_gap.size)
summary = pd.DataFrame(summary)
summary.sort_values(by=['Filtered_Alignment', 'Length'], axis=0, inplace=True)
summary_columns = ['Protein_ID', 'Accession', 'BLAST_Hits', 'Filtered_BLAST', 'Filtered_Alignment', 'Length',
                   'Total_Size']
print(summary[summary_columns])
end = time()
print('It took {} min to generate the data set.'.format((end - start) / 60.0))
summary.to_csv(os.path.join(input_dir, 'small_data_set_old_summary.tsv'), sep='\t', index=False, header=True,
               columns=summary_columns)

Removing gaps took 0.003695507844289144 min
Removing gaps took 1.5934308369954426e-05 min
Removing gaps took 0.006346674760182699 min
Removing gaps took 0.00010115305582682291 min
Removing gaps took 0.008964864412943523 min
Removing gaps took 0.00042479832967122396 min
Removing gaps took 0.025551732381184897 min
Removing gaps took 0.003337192535400391 min
Removing gaps took 7.469256718953451e-05 min
Removing gaps took 0.009037490685780842 min
Removing gaps took 0.009725987911224365 min
Removing gaps took 0.038866110642751056 min
Removing gaps took 0.03223922650019328 min
Removing gaps took 0.011290466785430909 min
Removing gaps took 0.030785882472991945 min
Removing gaps took 0.03383784691492717 min
Removing gaps took 0.04176588853200277 min
Removing gaps took 0.01701324780782064 min
Removing gaps took 0.056421323617299395 min
Removing gaps took 0.0675422708193461 min
Removing gaps took 0.07285692691802978 min
Removing gaps took 0.00010458628336588542 min
Removing gaps took 0.060426489

Create a location to store the output of this method comparison.

In [3]:
output_dir = os.environ.get('OUTPUT_PATH')
small_set_out_dir = os.path.join(output_dir, 'ReproducibilityCheck')
if not os.path.isdir(small_set_out_dir):
    os.makedirs(small_set_out_dir)

## Setting Up Scoring For Each Method
To reduce memory load during prediction and evaluation, the scoring objects needed to compute the metrics used to compare methods will be created ahead of time so they are available to each method when it computes its predictions for a given protein. This will ensure that results do not need to be kept in memory while waiting for all other results to be computed, only the metrics measured for each method will be recorded.

In [5]:
from SeqAlignment import SeqAlignment
from PDBReference import PDBReference
from ContactScorer import ContactScorer, plot_z_scores
protein_order = list(summary['Protein_ID'])
method_order = ['MI', 'DI', 'DCA', 'ET-MIp', 'cET-MIp']
sequence_separation_order = ['Any', 'Neighbors', 'Short', 'Medium', 'Long']
protein_scorers = {}
small_comparison_df = None
small_comparison_fn = os.path.join(small_set_out_dir, 'Reproducibility_Check_Data.csv')
if os.path.isfile(small_comparison_fn):
    small_comparison_df = pd.read_csv(small_comparison_fn, sep='\t', header=0, index_col=False)
else:
    for p_id in summary['Protein_ID']:
        protein_scorers[p_id] = {}
        # Import alignment and remove gaps
        full_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id)
        full_aln.import_alignment()
        non_gap_aln = full_aln.remove_gaps()
        # Import structure
        pdb_structure = PDBReference(pdb_file=generator.protein_data[p_id]['PDB'])
        pdb_structure.import_pdb(structure_id=p_id)
        protein_scorers[p_id]['Structure'] = pdb_structure
        # Initialize Beta Carbon distance scorer
        contact_scorer_cb = ContactScorer(query=p_id, seq_alignment=non_gap_aln,
                                          pdb_reference=pdb_structure, cutoff=8.0)
        contact_scorer_cb.best_chain = generator.protein_data[p_id]['Chain']
        contact_scorer_cb.fit()
        contact_scorer_cb.measure_distance(method='CB')
        protein_scorers[p_id]['Scorer_CB'] = contact_scorer_cb
        # Initialize distance scorer minimizing distance between any atoms
        contact_scorer_any = ContactScorer(query=p_id, seq_alignment=non_gap_aln,
                                           pdb_reference=pdb_structure, cutoff=8.0)
        contact_scorer_any.best_chain = generator.protein_data[p_id]['Chain']
        contact_scorer_any.fit()
        contact_scorer_any.measure_distance(method='Any')
        protein_scorers[p_id]['Scorer_Any'] = contact_scorer_any
        # Initialize z-scoring subproblems
        protein_scorers[p_id]['biased_w2_ave'] = None
        protein_scorers[p_id]['unbiased_w2_ave'] = None
output_columns = ['Protein', 'Protein Length', 'Alignment Size', 'Method', 'Distance', 'Sequence_Separation',
                  'AUROC', 'AUPRC',
                  'Top K Predictions', 'Precision', 'Recall', 'F1 Score',
                  'Biased Z-Score at 10%', 'Biased Z-Score at 30%', 'Max Biased Z-Score', 'AUC Biased Z-Score',
                  'Unbiased Z-Score at 10%', 'Biased Z-Score at 30%', 'Max Unbiased Z-Score', 'AUC Unbiased Z-Score']

Removing gaps took 0.00015128850936889647 min
Importing the PDB file took 0.0015863418579101563 min
Removing gaps took 0.00014876921971638998 min
Importing the PDB file took 0.0007414420445760092 min
Mapping query sequence and pdb took 0.0010100364685058593 min
Computing the distance matrix based on the PDB file took 0.007689571380615235 min
Removing gaps took 7.832050323486328e-05 min
Importing the PDB file took 0.0004199981689453125 min
Mapping query sequence and pdb took 0.0005836168924967448 min
Computing the distance matrix based on the PDB file took 0.008163972695668539 min
Removing gaps took 0.00011109113693237304 min
Importing the PDB file took 0.0012097835540771484 min
Removing gaps took 0.00010420878728230794 min
Importing the PDB file took 0.0002633174260457357 min
Mapping query sequence and pdb took 0.0004086931546529134 min
Computing the distance matrix based on the PDB file took 0.0029820760091145834 min
Removing gaps took 0.00010976394017537435 min
Importing the PDB file

# Generating Values For Comparision#
To determine the effectiveness of the new method and implementation the covariation of the same proteins will be computed using the previous Evolutionary Trace covariation method (ET-MIp) and other methods in the field.

## ET-MIp##
Scoring the the covariation of the proteins using the previous Evolutionary Trace covariation method (ET-MIp).

In [15]:
from ETMIPWrapper import ETMIPWrapper
import numpy as np
if not os.path.isfile(small_comparison_fn):
    old_etmip_out_dir = os.path.join(small_set_out_dir, 'ET-MIp')
    if not os.path.isdir(old_etmip_out_dir):
        os.makedirs(old_etmip_out_dir)
    old_etmip_method_fn = os.path.join(old_etmip_out_dir, 'wetc_ET-MIp_Method_Data.csv')
    if os.path.isfile(old_etmip_method_fn):
        old_etmip_method_df = pd.read_csv(old_etmip_method_fn, sep='\t', header=0, index_col=False)
    else:    
        old_etmip_method_df = None
        counts = {'success':0, 'value': 0, 'attribute':0}
        for p_id in summary['Protein_ID']:
            if p_id == '2rh1A':
                continue
            print('Attempting to calculate ET-MIp covariance for: {}'.format(p_id))
            protein_dir = os.path.join(old_etmip_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
                counts['success'] += 1
            else:
#                 curr_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id,
#                                         polymer_type='Protein')
#                 curr_aln.import_alignment()
                curr_etmip = ETMIPWrapper(query='query_' + p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'],
                                          out_dir=protein_dir)
                curr_etmip.import_scores(method='ET-MIp', prefix=p_id)
                np.savez(os.path.join(protein_dir, 'ET-MIp.npz'), time=curr_etmip.time,
                         scores=curr_etmip.scores, coverage=curr_etmip.coverage)
                import pickle
                with open(os.path.join(protein_dir, 'ET-MIp.pkl'), 'wb') as handle:
                    pickle.dump((curr_etmip.distance_matrix, curr_etmip.tree, curr_etmip.rank_group_assignments),
                                handle, pickle.HIGHEST_PROTOCOL)
                print('Successfully computed ET-MIp covariance for: {} in {} sec'.format(p_id, end-start))
                # Compute statistics for the final scores of the ET-MIp model
                protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='ET-MIp_Scores_', plots=True)
                protein_df2, _, _ = protein_scorers[p_id]['Scorer_Any'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='Any', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='ET-MIp_Scores_Dist_Any', plots=True)
                protein_df = protein_df.append(protein_df2)
                # Score Prediction Clustering
                z_score_fn = os.path.join(protein_dir, 'ET-MIp_Scores_Dist-Any_{}_ZScores.tsv')
                z_score_plot_fn = os.path.join(protein_dir, 'ET-MIp_Scores_Dist-Any_{}_ZScores.png')
                z_score_biased, biased_w2_ave, biased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=True, file_path=z_score_fn.format('Biased'),
                    w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                if protein_scorers[p_id]['biased_w2_ave'] is None:
                    protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                    w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                    protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                # Record static data for this protein
                protein_df['Protein'] = p_id
                protein_df['Method'] = 'ET-MIp'
                protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                print('Metrics meastured for ET-MIp covariance for: {}'.format(p_id))
                counts['success'] += 1
            if old_etmip_method_df is None:
                old_etmip_method_df = protein_df
            else:
                old_etmip_method_df = old_etmip_method_df.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        old_etmip_method_df.to_csv(old_etmip_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if small_comparison_df is None:
        small_comparison_df = old_etmip_method_df
    else:
        small_comparison_df = small_comparison_df.append(old_etmip_method_df)

## DCA##
Scoring the the covariation of the proteins using a DCA julia implementation.

In [19]:
from DCAWrapper import DCAWrapper
if not os.path.isfile(small_comparison_fn):
    dca_out_dir = os.path.join(small_set_out_dir, 'DCA')
    if not os.path.isdir(dca_out_dir):
        os.makedirs(dca_out_dir)
    dca_method_fn = os.path.join(dca_out_dir, 'DCA_Method_Data.csv')
    if os.path.isfile(dca_method_fn):
        dca_method_df = pd.read_csv(dca_method_fn, sep='\t', header=0, index_col=False)
    else:
        dca_method_df = None
        counts = {'success':0, 'value': 0, 'attribute':0}
        for p_id in summary['Protein_ID']:
            print('Attempting to calculate DCA covariance for: {}'.format(p_id))
            protein_dir = os.path.join(dca_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
                counts['success'] += 1
            else:
#                 curr_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id,
#                                         polymer_type='Protein')
#                 curr_aln.import_alignment()
                # Since the DCA implementation used here does not provide a way to specify the query sequence we remove the gaps
                # from the query sequences so positions will be referenced correctly for that sequence (and unnecessary
                # computations can be avoided).
#                 curr_aln = curr_aln.remove_gaps()
#                 new_aln_fn = os.path.join(protein_dir, '{}_no_gap.fasta'.format(p_id))
#                 curr_aln.write_out_alignment(new_aln_fn)
#                 curr_aln.file_name = new_aln_fn
#                 curr_dca = DCAWrapper(alignment=curr_aln)
                curr_dca = DCAWrapper(query=p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'], out_dir=protein_dir)
#                 curr_dca.import_covariance_scores(out_path=os.path.join(protein_dir, '{}.DCA.txt'.format(p_id)))
                curr_dca.calculate_scores(delete_file=False)
                # Compute statistics for the final scores of the ET-MIp model
                protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                    predictor=curr_dca, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max', file_prefix='DCA_Scores_', plots=True)
                protein_df2, _, _ = protein_scorers[p_id]['Scorer_Any'].evaluate_predictor(
                    predictor=curr_dca, verbosity=2, out_dir=protein_dir, dist='Any', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max', file_prefix='DCA_Scores_Dist_Any', plots=True)
                protein_df = protein_df.append(protein_df2)
                # Score Prediction Clustering
                _, dca_coverage  = compute_rank_and_coverage(seq_length=curr_dca.alignment.seq_length, scores=curr_dca.scores, pos_size=2,
                    rank_type='max')
                z_score_fn = os.path.join(protein_dir, 'DCA_Scores_Dist-Any_{}_ZScores.tsv')
                z_score_plot_fn = os.path.join(protein_dir, 'DCA_Scores_Dist-Any_{}_ZScores.png')
                z_score_biased, biased_w2_ave, biased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - dca_coverage, bias=True, file_path=z_score_fn.format('Biased'),
                    w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                if protein_scorers[p_id]['biased_w2_ave'] is None:
                    protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - dca_coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                    w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                    protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                # Record execution times
                protein_df['Init Time'] = None
                protein_df['Import Time'] = None
                protein_df['Dist Tree Time'] = None
                protein_df['Trace Time'] = None
                protein_df['Total Time'] = None
                # Record static data for this protein
                protein_df['Protein'] = p_id
                protein_df['Method'] = 'DCA'
                protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                print('Successfully computed DCA covariance for: {}'.format(p_id))
                counts['success'] += 1
            if dca_method_df is None:
                dca_method_df = protein_df
            else:
                dca_method_df = dca_method_df.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        dca_method_df.to_csv(dca_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if small_comparison_df is None:
        small_comparison_df = dca_method_df
    else:
        small_comparison_df = small_comparison_df.append(dca_method_df)

Attempting to calculate DCA covariance for: 3q05A
Attempting to calculate DCA covariance for: 2b59A
Attempting to calculate DCA covariance for: 7hvpA
Attempting to calculate DCA covariance for: 1c17A
Attempting to calculate DCA covariance for: 206lA
Attempting to calculate DCA covariance for: 1bolA
Attempting to calculate DCA covariance for: 2z0eA
Attempting to calculate DCA covariance for: 1axbA
Removing gaps took 0.012909332911173502 min
Compute SCW Z-Score took 0.021410731474558513 min
Compute SCW Z-Score took 0.011387709776560466 min
Successfully computed DCA covariance for: 1axbA
Attempting to calculate DCA covariance for: 135lA
Attempting to calculate DCA covariance for: 2rh1A
Attempting to calculate DCA covariance for: 4lliA
Attempting to calculate DCA covariance for: 1a26A
Attempting to calculate DCA covariance for: 1c0kA
Attempting to calculate DCA covariance for: 2zxeA
Attempting to calculate DCA covariance for: 1jwlA
Attempting to calculate DCA covariance for: 1hckA
Attempti

## cET-MIp
This segment the ET-MIp method, when constrained to an arbitrary set of nodes (1, 2, 3, 5, 7, 10, 25) at the top of the phylogenetic tree.

In [24]:
from utils import compute_rank_and_coverage
def import_cETMIp_cov_scores(path, aln):
    data = pd.read_csv(path, sep='\t', header=0, index_col=None)
    scores = np.zeros((aln.seq_length, aln.seq_length))
    positions = sorted(set(data['Pos1']) | set(data['Pos2']))
    pos_map = {x: i for i, x in enumerate(positions)}
    data['Final_Pos1'] = data['Pos1'].apply(lambda x: pos_map[x])
    data['Final_Pos2'] = data['Pos2'].apply(lambda x: pos_map[x])
    scores[data['Final_Pos1'].values, data['Final_Pos2'].values] = data['ETMIp_Score'].values
    _, coverages = compute_rank_and_coverage(seq_length=aln.seq_length, scores=scores, pos_size=2, rank_type='max')
    return scores, coverages

if not os.path.isfile(small_comparison_fn):
    old_cetmip_out_dir = os.path.join(small_set_out_dir, 'cET-MIp')
    if not os.path.isdir(old_cetmip_out_dir):
        os.makedirs(old_cetmip_out_dir)
    old_cetmip_method_fn = os.path.join(old_cetmip_out_dir, 'cET-MIp_Method_Data.csv')
    if os.path.isfile(old_cetmip_method_fn):
        old_cetmip_method_df = pd.read_csv(old_cetmip_method_fn, sep='\t', header=0, index_col=False)
    else:    
        old_cetmip_method_df = None
        counts = {'success':0, 'value': 0, 'attribute':0}
        for p_id in summary['Protein_ID']:
            print('Attempting to calculate cET-MIp covariance for: {}'.format(p_id))
            protein_dir = os.path.join(old_cetmip_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
                counts['success'] += 1
            else:
#                 curr_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id,
#                                         polymer_type='Protein')
#                 curr_aln.import_alignment()
                curr_etmip = ETMIPWrapper(query='query_' + p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'],
                                          out_dir=protein_dir)
                curr_etmip.scores, curr_etmip.coverage = import_cETMIp_cov_scores(
                    os.path.join(protein_dir, '{}.cetmip.txt'.format(p_id)), curr_aln)
                np.savez(os.path.join(protein_dir, 'cET-MIp.npz'), time=curr_etmip.time,
                         scores=curr_etmip.scores, coverage=curr_etmip.coverage)
                import pickle
                with open(os.path.join(protein_dir, 'cET-MIp.pkl'), 'wb') as handle:
                    pickle.dump((curr_etmip.distance_matrix, curr_etmip.tree, curr_etmip.rank_group_assignments),
                                handle, pickle.HIGHEST_PROTOCOL)
                print('Successfully computed cET-MIp covariance for: {} in {} sec'.format(p_id, end-start))
                # Compute statistics for the final scores of the ET-MIp model
                protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='cET-MIp_Scores_', plots=True)
                protein_df2, _, _ = protein_scorers[p_id]['Scorer_Any'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='Any', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='cET-MIp_Scores_Dist_Any', plots=True)
                protein_df = protein_df.append(protein_df2)
                # Score Prediction Clustering
                z_score_fn = os.path.join(protein_dir, 'cET-MIp_Scores_Dist-Any_{}_ZScores.tsv')
                z_score_plot_fn = os.path.join(protein_dir, 'cET-MIp_Scores_Dist-Any_{}_ZScores.png')
                z_score_biased, biased_w2_ave, biased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=True, file_path=z_score_fn.format('Biased'),
                    w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                if protein_scorers[p_id]['biased_w2_ave'] is None:
                    protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                    w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                    protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                # Record static data for this protein
                protein_df['Protein'] = p_id
                protein_df['Method'] = 'cET-MIp'
                protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                print('Metrics meastured for cET-MIp covariance for: {}'.format(p_id))
                counts['success'] += 1
            if old_cetmip_method_df is None:
                old_cetmip_method_df = protein_df
            else:
                old_cetmip_method_df = old_cetmip_method_df.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        old_cetmip_method_df.to_csv(old_cetmip_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if small_comparison_df is None:
        small_comparison_df = old_cetmip_method_df
    else:
        small_comparison_df = small_comparison_df.append(old_cetmip_method_df)

Attempting to calculate cET-MIp covariance for: 3q05A
Successfully computed cET-MIp covariance for: 3q05A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.01861115296681722 min
Compute SCW Z-Score took 0.013350764910380045 min
Metrics meastured for cET-MIp covariance for: 3q05A
Attempting to calculate cET-MIp covariance for: 2b59A
Successfully computed cET-MIp covariance for: 2b59A in 33.034722566604614 sec
Compute SCW Z-Score took 0.009998353322347005 min
Compute SCW Z-Score took 0.010991899172465007 min
Metrics meastured for cET-MIp covariance for: 2b59A
Attempting to calculate cET-MIp covariance for: 7hvpA
Successfully computed cET-MIp covariance for: 7hvpA in 33.034722566604614 sec
Compute SCW Z-Score took 0.007678552468617757 min
Compute SCW Z-Score took 0.008338566621144612 min
Metrics meastured for cET-MIp covariance for: 7hvpA
Attempting to calculate cET-MIp covariance for: 1c17A
Successfully computed cET-MIp covariance for: 1c17A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.00685197114944458 min
Compute SCW Z-Score took 0.007689126332600911 min
Metrics meastured for cET-MIp covariance for: 1c17A
Attempting to calculate cET-MIp covariance for: 206lA
Successfully computed cET-MIp covariance for: 206lA in 33.034722566604614 sec
Compute SCW Z-Score took 0.014568122227986653 min
Compute SCW Z-Score took 0.010161197185516358 min
Metrics meastured for cET-MIp covariance for: 206lA
Attempting to calculate cET-MIp covariance for: 1bolA
Successfully computed cET-MIp covariance for: 1bolA in 33.034722566604614 sec
Compute SCW Z-Score took 0.01884549856185913 min
Compute SCW Z-Score took 0.016194522380828857 min
Metrics meastured for cET-MIp covariance for: 1bolA
Attempting to calculate cET-MIp covariance for: 2z0eA
Successfully computed cET-MIp covariance for: 2z0eA in 33.034722566604614 sec
Compute SCW Z-Score took 0.09317988554636637 min
Compute SCW Z-Score took 0.11031901439030965 min
Metrics meastured for cET-MIp covariance for: 2z0eA


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.008471016089121501 min
Compute SCW Z-Score took 0.009182437260945638 min
Metrics meastured for cET-MIp covariance for: 2ysdA
Attempting to calculate cET-MIp covariance for: 2iopA
Successfully computed cET-MIp covariance for: 2iopA in 33.034722566604614 sec
Compute SCW Z-Score took 0.22693382501602172 min
Compute SCW Z-Score took 0.214168651898702 min
Metrics meastured for cET-MIp covariance for: 2iopA
Attempting to calculate cET-MIp covariance for: 3b6vA
Successfully computed cET-MIp covariance for: 3b6vA in 33.034722566604614 sec
Compute SCW Z-Score took 0.08326433499654134 min
Compute SCW Z-Score took 0.08648212353388468 min
Metrics meastured for cET-MIp covariance for: 3b6vA
Attempting to calculate cET-MIp covariance for: 4ycuA
Successfully computed cET-MIp covariance for: 4ycuA in 33.034722566604614 sec
Compute SCW Z-Score took 0.13809781869252521 min
Compute SCW Z-Score took 0.12764538923899332 min
Metrics meastured for cET-MIp covariance for: 4ycuA
Atte

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.022269082069396973 min
Compute SCW Z-Score took 0.025943811734517416 min
Metrics meastured for cET-MIp covariance for: 2werA
Attempting to calculate cET-MIp covariance for: 3tnuA
Successfully computed cET-MIp covariance for: 3tnuA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.013906260331471762 min
Compute SCW Z-Score took 0.010667447249094646 min
Metrics meastured for cET-MIp covariance for: 3tnuA
23	Successes
0	Value Errors
0	Attribute Errors


## MI
This segment imports MI scores generated as intermediate values by EV Couplings.

In [25]:
from utils import compute_rank_and_coverage
def import_MI_cov_scores(path, aln):
    data = pd.read_csv(path, sep='\s+', header=None, index_col=None, names=['Pos1', 'AA1', 'Pos2', 'AA2', 'MI', 'DI'])
    scores = np.zeros((aln.seq_length, aln.seq_length))
    positions = sorted(set(data['Pos1']) | set(data['Pos2']))
    pos_map = {x: i for i, x in enumerate(positions)}
    data['Final_Pos1'] = data['Pos1'].apply(lambda x: pos_map[x])
    data['Final_Pos2'] = data['Pos2'].apply(lambda x: pos_map[x])
    scores[data['Final_Pos1'].values, data['Final_Pos2'].values] = data['MI'].values
    _, coverages = compute_rank_and_coverage(seq_length=aln.seq_length, scores=scores, pos_size=2, rank_type='max')
    return scores, coverages

if not os.path.isfile(small_comparison_fn):
    old_mi_out_dir = os.path.join(small_set_out_dir, 'EVCouplings_MI')
    if not os.path.isdir(old_mi_out_dir):
        os.makedirs(old_mi_out_dir)
    old_mi_method_fn = os.path.join(old_mi_out_dir, 'MI_Method_Data.csv')
    if os.path.isfile(old_mi_method_fn):
        old_mi_method_df = pd.read_csv(old_mi_method_fn, sep='\t', header=0, index_col=False)
    else:    
        old_mi_method_df = None
        counts = {'success':0, 'value': 0, 'attribute':0}
        for p_id in summary['Protein_ID']:
            print('Attempting to calculate MI covariance for: {}'.format(p_id))
            protein_dir = os.path.join(old_mi_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
                counts['success'] += 1
            else:
#                 curr_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id,
#                                         polymer_type='Protein')
#                 curr_aln.import_alignment()
                curr_etmip = ETMIPWrapper(query='query_' + p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'],
                                          out_dir=protein_dir)
                curr_etmip.scores, curr_etmip.coverage = import_MI_cov_scores(
                    os.path.join(protein_dir, '{}_MI_DI.txt'.format(p_id)), curr_aln)
                np.savez(os.path.join(protein_dir, 'MI.npz'), time=curr_etmip.time,
                         scores=curr_etmip.scores, coverage=curr_etmip.coverage)
                import pickle
                with open(os.path.join(protein_dir, 'MI.pkl'), 'wb') as handle:
                    pickle.dump((curr_etmip.distance_matrix, curr_etmip.tree, curr_etmip.rank_group_assignments),
                                handle, pickle.HIGHEST_PROTOCOL)
                print('Successfully computed MI covariance for: {} in {} sec'.format(p_id, end-start))
                # Compute statistics for the final scores of the ET-MIp model
                protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='MI_Scores_', plots=True)
                protein_df2, _, _ = protein_scorers[p_id]['Scorer_Any'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='Any', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='MI_Scores_Dist_Any', plots=True)
                protein_df = protein_df.append(protein_df2)
                # Score Prediction Clustering
                z_score_fn = os.path.join(protein_dir, 'MI_Scores_Dist-Any_{}_ZScores.tsv')
                z_score_plot_fn = os.path.join(protein_dir, 'MI_Scores_Dist-Any_{}_ZScores.png')
                z_score_biased, biased_w2_ave, biased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=True, file_path=z_score_fn.format('Biased'),
                    w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                if protein_scorers[p_id]['biased_w2_ave'] is None:
                    protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                    w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                    protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                # Record static data for this protein
                protein_df['Protein'] = p_id
                protein_df['Method'] = 'MI'
                protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                print('Metrics meastured for MI covariance for: {}'.format(p_id))
                counts['success'] += 1
            if old_mi_method_df is None:
                old_mi_method_df = protein_df
            else:
                old_mi_method_df = old_mi_method_df.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        old_mi_method_df.to_csv(old_mi_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if small_comparison_df is None:
        small_comparison_df = old_mi_method_df
    else:
        small_comparison_df = small_comparison_df.append(old_mi_method_df)

Attempting to calculate MI covariance for: 3q05A
Successfully computed MI covariance for: 3q05A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.019817384084065755 min
Compute SCW Z-Score took 0.0148736039797465 min
Metrics meastured for MI covariance for: 3q05A
Attempting to calculate MI covariance for: 2b59A
Successfully computed MI covariance for: 2b59A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.01036144495010376 min
Compute SCW Z-Score took 0.010043176015218098 min
Metrics meastured for MI covariance for: 2b59A
Attempting to calculate MI covariance for: 7hvpA
Successfully computed MI covariance for: 7hvpA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.007858351866404215 min
Compute SCW Z-Score took 0.007786663373311361 min
Metrics meastured for MI covariance for: 7hvpA
Attempting to calculate MI covariance for: 1c17A
Successfully computed MI covariance for: 1c17A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.007564135392506917 min
Compute SCW Z-Score took 0.0077718575795491535 min
Metrics meastured for MI covariance for: 1c17A
Attempting to calculate MI covariance for: 206lA
Successfully computed MI covariance for: 206lA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.014216025670369467 min
Compute SCW Z-Score took 0.010912875334421793 min
Metrics meastured for MI covariance for: 206lA
Attempting to calculate MI covariance for: 1bolA
Successfully computed MI covariance for: 1bolA in 33.034722566604614 sec
Compute SCW Z-Score took 0.014923528830210368 min
Compute SCW Z-Score took 0.02121281623840332 min
Metrics meastured for MI covariance for: 1bolA
Attempting to calculate MI covariance for: 2z0eA
Successfully computed MI covariance for: 2z0eA in 33.034722566604614 sec
Compute SCW Z-Score took 0.08788102865219116 min
Compute SCW Z-Score took 0.0910598874092102 min
Metrics meastured for MI covariance for: 2z0eA
Attempting to calculate MI covariance for: 1axbA
Successfully computed MI covariance for: 1axbA in 33.034722566604614 sec
Compute SCW Z-Score took 0.026435808340708414 min
Compute SCW Z-Score took 0.023765432834625243 min
Metrics meastured for MI covariance for: 1axbA
Attempting to calculate MI covariance for: 135lA
S

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.040933644771575926 min
Compute SCW Z-Score took 0.04050557613372803 min
Metrics meastured for MI covariance for: 2rh1A
Attempting to calculate MI covariance for: 4lliA
Successfully computed MI covariance for: 4lliA in 33.034722566604614 sec
Compute SCW Z-Score took 0.06761047840118409 min
Compute SCW Z-Score took 0.07163734833399454 min
Metrics meastured for MI covariance for: 4lliA
Attempting to calculate MI covariance for: 1a26A
Successfully computed MI covariance for: 1a26A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.13854771455128986 min
Compute SCW Z-Score took 0.1327943444252014 min
Metrics meastured for MI covariance for: 1a26A
Attempting to calculate MI covariance for: 1c0kA
Successfully computed MI covariance for: 1c0kA in 33.034722566604614 sec
Compute SCW Z-Score took 0.1327655553817749 min
Compute SCW Z-Score took 0.14542426665623984 min
Metrics meastured for MI covariance for: 1c0kA
Attempting to calculate MI covariance for: 2zxeA
Successfully computed MI covariance for: 2zxeA in 33.034722566604614 sec
Compute SCW Z-Score took 0.617483651638031 min
Compute SCW Z-Score took 0.5900325576464335 min
Metrics meastured for MI covariance for: 2zxeA
Attempting to calculate MI covariance for: 1jwlA
Successfully computed MI covariance for: 1jwlA in 33.034722566604614 sec
Compute SCW Z-Score took 0.0475423534711202 min
Compute SCW Z-Score took 0.04368751049041748 min
Metrics meastured for MI covariance for: 1jwlA
Attempting to calculate MI covariance for: 1hckA
Successfull

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.06903199354807536 min
Compute SCW Z-Score took 0.06816544930140177 min
Metrics meastured for MI covariance for: 1hckA
Attempting to calculate MI covariance for: 1h1vA
Successfully computed MI covariance for: 1h1vA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.13476604223251343 min
Compute SCW Z-Score took 0.1351438045501709 min
Metrics meastured for MI covariance for: 1h1vA
Attempting to calculate MI covariance for: 2ysdA
Successfully computed MI covariance for: 2ysdA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.008566888173421223 min
Compute SCW Z-Score took 0.008618477980295818 min
Metrics meastured for MI covariance for: 2ysdA
Attempting to calculate MI covariance for: 2iopA
Successfully computed MI covariance for: 2iopA in 33.034722566604614 sec
Compute SCW Z-Score took 0.21591771841049195 min
Compute SCW Z-Score took 0.20942237774531047 min
Metrics meastured for MI covariance for: 2iopA
Attempting to calculate MI covariance for: 3b6vA
Successfully computed MI covariance for: 3b6vA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.0802385409673055 min
Compute SCW Z-Score took 0.07609207232793172 min
Metrics meastured for MI covariance for: 3b6vA
Attempting to calculate MI covariance for: 4ycuA
Successfully computed MI covariance for: 4ycuA in 33.034722566604614 sec
Compute SCW Z-Score took 0.13149775664011637 min
Compute SCW Z-Score took 0.14062191247940065 min
Metrics meastured for MI covariance for: 4ycuA
Attempting to calculate MI covariance for: 2werA
Successfully computed MI covariance for: 2werA in 33.034722566604614 sec
Compute SCW Z-Score took 0.024577407042185466 min
Compute SCW Z-Score took 0.022806549072265626 min
Metrics meastured for MI covariance for: 2werA
Attempting to calculate MI covariance for: 3tnuA
Successfully computed MI covariance for: 3tnuA in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.009751419226328531 min
Compute SCW Z-Score took 0.009075323740641275 min
Metrics meastured for MI covariance for: 3tnuA
23	Successes
0	Value Errors
0	Attribute Errors


## DI
This segment imports DI scores generated as intermediate values by EV Couplings.

In [29]:
from utils import compute_rank_and_coverage
def import_DI_cov_scores(path, aln):
    data = pd.read_csv(path, sep='\s+', header=None, index_col=None, names=['Pos1', 'AA1', 'Pos2', 'AA2', 'MI', 'DI'])
    scores = np.zeros((aln.seq_length, aln.seq_length))
    positions = sorted(set(data['Pos1']) | set(data['Pos2']))
    pos_map = {x: i for i, x in enumerate(positions)}
    data['Final_Pos1'] = data['Pos1'].apply(lambda x: pos_map[x])
    data['Final_Pos2'] = data['Pos2'].apply(lambda x: pos_map[x])
    scores[data['Final_Pos1'].values, data['Final_Pos2'].values] = data['DI'].values
    _, coverages = compute_rank_and_coverage(seq_length=aln.seq_length, scores=scores, pos_size=2, rank_type='max')
    return scores, coverages

if not os.path.isfile(small_comparison_fn):
    old_di_out_dir = os.path.join(small_set_out_dir, 'EVCouplings_DI')
    if not os.path.isdir(old_di_out_dir):
        os.makedirs(old_di_out_dir)
    old_di_method_fn = os.path.join(old_di_out_dir, 'DI_Method_Data.csv')
    if os.path.isfile(old_di_method_fn):
        old_di_method_df = pd.read_csv(old_di_method_fn, sep='\t', header=0, index_col=False)
    else:    
        old_di_method_df = None
        counts = {'success':0, 'value': 0, 'attribute':0}
        for p_id in summary['Protein_ID']:
            print('Attempting to calculate DI covariance for: {}'.format(p_id))
            protein_dir = os.path.join(old_di_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
                counts['success'] += 1
            else:
#                 curr_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id='query_' + p_id,
#                                         polymer_type='Protein')
#                 curr_aln.import_alignment()
                curr_etmip = ETMIPWrapper(query='query_' + p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln']protein_dir,
                                          out_dir=protein_dir)
                curr_etmip.scores, curr_etmip.coverage = import_DI_cov_scores(
                    os.path.join(protein_dir, '{}_MI_DI.txt'.format(p_id)), curr_aln)
                np.savez(os.path.join(protein_dir, 'dI.npz'), time=curr_etmip.time,
                         scores=curr_etmip.scores, coverage=curr_etmip.coverage)
                import pickle
                with open(os.path.join(protein_dir, 'dI.pkl'), 'wb') as handle:
                    pickle.dump((curr_etmip.distance_matrix, curr_etmip.tree, curr_etmip.rank_group_assignments),
                                handle, pickle.HIGHEST_PROTOCOL)
                print('Successfully computed dI covariance for: {} in {} sec'.format(p_id, end-start))
                # Compute statistics for the final scores of the ET-MIp model
                protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='DI_Scores_', plots=True)
                protein_df2, _, _ = protein_scorers[p_id]['Scorer_Any'].evaluate_predictor(
                    predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='Any', biased_w2_ave=None,
                    unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                    file_prefix='DI_Scores_Dist_Any', plots=True)
                protein_df = protein_df.append(protein_df2)
                # Score Prediction Clustering
                z_score_fn = os.path.join(protein_dir, 'DI_Scores_Dist-Any_{}_ZScores.tsv')
                z_score_plot_fn = os.path.join(protein_dir, 'DI_Scores_Dist-Any_{}_ZScores.png')
                z_score_biased, biased_w2_ave, biased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=True, file_path=z_score_fn.format('Biased'),
                    w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                if protein_scorers[p_id]['biased_w2_ave'] is None:
                    protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = protein_scorers[p_id]['Scorer_Any'].score_clustering_of_contact_predictions(
                    1.0 - curr_etmip.coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                    w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                    protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.10][0]
                protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= float(len(protein_scorers[p_id]['Scorer_Any'].query_structure.seq[protein_scorers[p_id]['Scorer_Any'].best_chain])) * 0.30][0]
                protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                # Record static data for this protein
                protein_df['Protein'] = p_id
                protein_df['Method'] = 'DI'
                protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                print('Metrics meastured for DI covariance for: {}'.format(p_id))
                counts['success'] += 1
            if old_di_method_df is None:
                old_di_method_df = protein_df
            else:
                old_di_method_df = old_di_method_df.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        old_di_method_df.to_csv(old_di_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if small_comparison_df is None:
        small_comparison_df = old_di_method_df
    else:
        small_comparison_df = small_comparison_df.append(old_di_method_df)

Attempting to calculate DI covariance for: 3q05A
Successfully computed dI covariance for: 3q05A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.014219431082407634 min
Compute SCW Z-Score took 0.01660402218500773 min
Metrics meastured for DI covariance for: 3q05A
Attempting to calculate DI covariance for: 2b59A
Successfully computed dI covariance for: 2b59A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Compute SCW Z-Score took 0.011115042368570964 min
Compute SCW Z-Score took 0.01041925350824992 min
Metrics meastured for DI covariance for: 2b59A
Attempting to calculate DI covariance for: 7hvpA
Successfully computed dI covariance for: 7hvpA in 33.034722566604614 sec
Compute SCW Z-Score took 0.007634969552357992 min
Compute SCW Z-Score took 0.007882972558339437 min
Metrics meastured for DI covariance for: 7hvpA
Attempting to calculate DI covariance for: 1c17A
Successfully computed dI covariance for: 1c17A in 33.034722566604614 sec


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.0075304587682088215 min
Compute SCW Z-Score took 0.011876654624938966 min
Metrics meastured for DI covariance for: 1c17A
Attempting to calculate DI covariance for: 206lA
Successfully computed dI covariance for: 206lA in 33.034722566604614 sec
Compute SCW Z-Score took 0.015273336569468181 min
Compute SCW Z-Score took 0.010190558433532716 min
Metrics meastured for DI covariance for: 206lA
Attempting to calculate DI covariance for: 1bolA
Successfully computed dI covariance for: 1bolA in 33.034722566604614 sec
Compute SCW Z-Score took 0.014323906103769938 min
Compute SCW Z-Score took 0.01763444741566976 min
Metrics meastured for DI covariance for: 1bolA
Attempting to calculate DI covariance for: 2z0eA
Successfully computed dI covariance for: 2z0eA in 33.034722566604614 sec
Compute SCW Z-Score took 0.08505027294158936 min
Compute SCW Z-Score took 0.09089635610580445 min
Metrics meastured for DI covariance for: 2z0eA
Attempting to calculate DI covariance for: 1axbA

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', avera

Compute SCW Z-Score took 0.008866433302561443 min
Compute SCW Z-Score took 0.010055756568908692 min
Metrics meastured for DI covariance for: 3tnuA
23	Successes
0	Value Errors
0	Attribute Errors


In [40]:
# Write out final comparison data so it can be loaded later for generating figures.
if not os.path.isfile(small_comparison_fn):
    small_comparison_df['Protein Length'] = small_comparison_df['Protein'].apply(lambda x: generator.protein_data[x]['Length'])
    small_comparison_df.to_csv(small_comparison_fn, sep='\t', header=True, index=False, columns=output_columns)

## Method Comparison
We now begin comparing methods based on their ability to predict the structural contacts in the proteins in this test set. There is an important consideration in the case of sequence separation and different measures by which to compare the methods.

### Data Cleaning
These data need an additional cleaning step beyond what was performed for the timing comparison. Since there are multiple categories of sequence separation and some proteins may not have any True Positive contacts for a category the scoring for that protein is incomplete. We will remove all such proteins from the comparison, performing the clean up separate for each metric of success. Another contributing factor which necessitates this kind of cleaning is assessment of the top K predictions for a protein or best L/K predictions which for poor predictions may not include any predictions of True Positives.

### Sequence Separation
One important consideration for the difficulty of prediction and interest in predictions is the distance between the residues for which coupling was predicted. As has been documented in the literature, especially in the CASP competitions, there are several categories of prediction:
* Neighbors (1 - 5 residues apart) - This is the least interesting category of predictions. It is highly likely that residues this close together will show covariance signal. Predicting two residues are in contact that are this close together is trivial and uninformative.
* Short (6 - 12 residues apart) - This is also not a very interesting type of prediction. Residues this close in proximity can be more easily modeled by alogrithms which focus on 2D protein structure modeling (identifying beta sheets, alpha helices, etc.).
* Medium (13 - 24 residues apart) - This is a more interesting type of prediction. The resiudes in this range of separation are on the edge of the 2D protein structure prediction range.
* Long (24 and more residues apart) - The most interesting category of predictions. Resiudes this far apart are not easily modeled by 2D protein structure modeling systems. They are also very useful for 3D and 4D protein structure prediction becausae they provide constraints on potential protein (similar to NMR data) folds which makes protein folding a more tractable problem for modelers.
* Any/All - All categories can be considered at once, this provides a summary value, but is often skewed by one particularly good category of predictions.

### Metrics of Success
* AUROC - This measures the True Positive Rate vs the False Positive Rate of prediction, it can be considered a measure of the accuracy of the measure. This can be strongly influenced by the class imbalance which is present when predicting structural contacts since there are many fewer contacts than non-contacts. The True Positive case is if the C-beta of two amino acids is within 8.0 Angstroms of one another (as is done in the CASP competitions).

In [42]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wilcoxon
protein_method_groups = small_comparison_df[['Protein', 'Method']].drop_duplicates().groupby('Protein').count()
method_max = protein_method_groups['Method'].max()
proteins_to_keep = protein_method_groups.index[protein_method_groups['Method'] == method_max]
comparable_method_proteins = small_comparison_df[small_comparison_df['Protein'].isin(proteins_to_keep)]
protein_length_order = comparable_method_proteins.sort_values('Protein Length')['Protein'].unique()
protein_alignment_order = comparable_method_proteins.sort_values('Alignment Size')['Protein'].unique()
auroc_columns = ['Protein', 'Method', 'Distance', 'Sequence_Separation', 'AUROC']
protein_auroc_groups = comparable_method_proteins[auroc_columns].drop_duplicates(
    subset=['Protein', 'Method', 'Sequence_Separation', 'Distance']).groupby('Protein')['AUROC'].apply(
    lambda x: not x.isnull().any())
complete_proteins = protein_auroc_groups.index[protein_auroc_groups.values]
comparable_auroc_proteins = small_comparison_df[small_comparison_df['Protein'].isin(complete_proteins)]
auroc_protein_length_order = [x for x in protein_length_order if x in complete_proteins]
auroc_protein_alignment_order = [x for x in protein_alignment_order if x in complete_proteins]
auroc_subset_df = comparable_auroc_proteins.loc[:, auroc_columns].drop_duplicates()
auroc_subset_df = auroc_subset_df.loc[auroc_subset_df['Distance'] == 'CB', :]
# Plot the methods vs AUROC per protein ordered by protein length
auroc_subset_df.to_csv(os.path.join(small_set_out_dir, 'Small_AUROC_Comaprison_Data.csv'), sep='\t', header=True, index=False,
                       columns=auroc_columns)
protein_order_auroc_plot = sns.catplot(x="Protein", y="AUROC", hue="Method", row="Sequence_Separation", data=auroc_subset_df, kind="bar",
                                       ci=None, order=auroc_protein_length_order, hue_order=method_order, legend=True, legend_out=True)
protein_order_auroc_plot.set_xticklabels(auroc_protein_length_order, rotation=90)
protein_order_auroc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUROC_Comparison_Protein_Length_Order.png'),
                                 bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUROC per protein ordered by protein alignment size
alignment_order_auroc_plot = sns.catplot(x="Protein", y="AUROC", hue="Method", row="Sequence_Separation", data=auroc_subset_df, kind="bar",
                                       ci=None, order=auroc_protein_alignment_order, hue_order=method_order, legend=True, legend_out=True)
alignment_order_auroc_plot.set_xticklabels(auroc_protein_alignment_order, rotation=90)
alignment_order_auroc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUROC_Comparison_Alignment_Size_Order.png'),
                                   bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUROC grouped together to see overall trends
overall_auroc_plot = sns.boxplot(x="Sequence_Separation", y="AUROC", hue="Method", data=auroc_subset_df,
                                 order=sequence_separation_order, hue_order=method_order)
overall_auroc_plot.set_xticklabels(sequence_separation_order, rotation=90)
overall_auroc_plot.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUROC_Comparison.png'), bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Compute statistics comparing methods at each sequence separation
auroc_statistics = {'Sequence Separation': [], 'Method 1': [], 'Method 2': [], 'Statistic': [], 'P-Value': []}
for sep in sequence_separation_order:
    sep_auroc_subset_df = auroc_subset_df.loc[auroc_subset_df['Sequence_Separation'] == sep, :]
    for i in range(len(method_order)):
        m1_sep_auroc_subset_df = sep_auroc_subset_df.loc[sep_auroc_subset_df['Method'] == method_order[i], :]
        for j in range(i + 1, len(method_order)):
            m2_sep_auroc_subset_df = sep_auroc_subset_df.loc[sep_auroc_subset_df['Method'] == method_order[j], :]
            try:
                stat, p_val = wilcoxon(x=m1_sep_auroc_subset_df['AUROC'], y=m2_sep_auroc_subset_df['AUROC'], zero_method='wilcox')
            except ValueError:
                print(method_order[i])
                print(len(m1_sep_auroc_subset_df['AUROC']))
                print(m1_sep_auroc_subset_df)
                print(method_order[j])
                print(len(m2_sep_auroc_subset_df['AUROC']))
                print(m2_sep_auroc_subset_df)
            auroc_statistics['Sequence Separation'].append(sep)
            auroc_statistics['Method 1'].append(method_order[i])
            auroc_statistics['Method 2'].append(method_order[j])
            auroc_statistics['Statistic'].append(stat)
            auroc_statistics['P-Value'].append(p_val)
pd.DataFrame(auroc_statistics).to_csv(os.path.join(small_set_out_dir, 'Small_AUROC_Comaprison_Statistics.csv'), sep='\t', header=True,
                                      index=False, columns=['Sequence Separation', 'Method 1', 'Method 2', 'Statistic', 'P-Value'])

MI
21
   Protein Method Distance Sequence_Separation     AUROC
10   3q05A     MI       CB           Neighbors  0.520454
10   2b59A     MI       CB           Neighbors  0.513605
10   7hvpA     MI       CB           Neighbors  0.513136
10   1c17A     MI       CB           Neighbors  0.560120
10   206lA     MI       CB           Neighbors  0.536861
10   1bolA     MI       CB           Neighbors  0.505232
10   2z0eA     MI       CB           Neighbors  0.548819
10   1axbA     MI       CB           Neighbors  0.565785
10   135lA     MI       CB           Neighbors  0.493339
10   4lliA     MI       CB           Neighbors  0.478922
10   1a26A     MI       CB           Neighbors  0.565014
10   1c0kA     MI       CB           Neighbors  0.596392
10   2zxeA     MI       CB           Neighbors  0.564293
10   1jwlA     MI       CB           Neighbors  0.534891
10   1hckA     MI       CB           Neighbors  0.506842
10   1h1vA     MI       CB           Neighbors  0.537764
10   2ysdA     MI       C

In [43]:
auroc_columns = ['Protein', 'Method', 'Distance', 'Sequence_Separation', 'AUROC']
protein_auroc_groups = comparable_method_proteins[auroc_columns].drop_duplicates().groupby('Protein')['AUROC'].apply(
    lambda x: not x.isnull().any())
complete_proteins = protein_auroc_groups.index[protein_auroc_groups.values]
comparable_auroc_proteins = small_comparison_df[small_comparison_df['Protein'].isin(complete_proteins)]
auroc_protein_length_order = [x for x in protein_length_order if x in complete_proteins]
auroc_protein_alignment_order = [x for x in protein_alignment_order if x in complete_proteins]
auroc_subset_df = comparable_auroc_proteins.loc[:, auroc_columns].drop_duplicates(subset=['Protein', 'Method', 'Sequence_Separation', 'Distance'])
auroc_subset_df = auroc_subset_df.loc[auroc_subset_df['Distance'] == 'Any', :]
# Plot the methods vs AUROC per protein ordered by protein length
auroc_subset_df.to_csv(os.path.join(small_set_out_dir, 'Small_AUROC_Comaprison_Data_Dist_Any.csv'), sep='\t', header=True, index=False,
                       columns=auroc_columns)
protein_order_auroc_plot = sns.catplot(x="Protein", y="AUROC", hue="Method", row="Sequence_Separation", data=auroc_subset_df, kind="bar",
                                       ci=None, order=auroc_protein_length_order, hue_order=method_order, legend=True, legend_out=True)
protein_order_auroc_plot.set_xticklabels(auroc_protein_length_order, rotation=90)
protein_order_auroc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUROC_Comparison_Protein_Length_Order_Dist_Any.png'),
                                 bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUROC per protein ordered by protein alignment size
alignment_order_auroc_plot = sns.catplot(x="Protein", y="AUROC", hue="Method", row="Sequence_Separation", data=auroc_subset_df, kind="bar",
                                       ci=None, order=auroc_protein_alignment_order, hue_order=method_order, legend=True, legend_out=True)
alignment_order_auroc_plot.set_xticklabels(auroc_protein_alignment_order, rotation=90)
alignment_order_auroc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUROC_Comparison_Alignment_Size_Order_Dist_Any.png'),
                                   bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUROC grouped together to see overall trends
overall_auroc_plot = sns.boxplot(x="Sequence_Separation", y="AUROC", hue="Method", data=auroc_subset_df,
                                 order=sequence_separation_order, hue_order=method_order)
overall_auroc_plot.set_xticklabels(sequence_separation_order, rotation=90)
overall_auroc_plot.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUROC_Comparison_Dist_Any.png'), bbox_inches='tight', transparent=True,
            dpi=300)
plt.close()
# Compute statistics comparing methods at each sequence separation
auroc_statistics = {'Sequence Separation': [], 'Method 1': [], 'Method 2': [], 'Statistic': [], 'P-Value': []}
for sep in sequence_separation_order:
    sep_auroc_subset_df = auroc_subset_df.loc[auroc_subset_df['Sequence_Separation'] == sep, :]
    for i in range(len(method_order)):
        m1_sep_auroc_subset_df = sep_auroc_subset_df.loc[sep_auroc_subset_df['Method'] == method_order[i], :]
        m1_sep_auroc_subset_df.drop_duplicates(inplace=True)
        for j in range(i + 1, len(method_order)):
            m2_sep_auroc_subset_df = sep_auroc_subset_df.loc[sep_auroc_subset_df['Method'] == method_order[j], :]
            m2_sep_auroc_subset_df.drop_duplicates(inplace=True)
            try:
                stat, p_val = wilcoxon(x=m1_sep_auroc_subset_df['AUROC'], y=m2_sep_auroc_subset_df['AUROC'], zero_method='wilcox')
            except ValueError:
                print(method_order[i])
                print(len(m1_sep_auroc_subset_df['AUROC']))
                print(m1_sep_auroc_subset_df)
                print(method_order[j])
                print(len(m2_sep_auroc_subset_df['AUROC']))
                print(m2_sep_auroc_subset_df)
            auroc_statistics['Sequence Separation'].append(sep)
            auroc_statistics['Method 1'].append(method_order[i])
            auroc_statistics['Method 2'].append(method_order[j])
            auroc_statistics['Statistic'].append(stat)
            auroc_statistics['P-Value'].append(p_val)
pd.DataFrame(auroc_statistics).to_csv(os.path.join(small_set_out_dir, 'Small_AUROC_Comaprison_Statistics_Dist_any.csv'), sep='\t',
                                      header=True, index=False,
                                      columns=['Sequence Separation', 'Method 1', 'Method 2', 'Statistic', 'P-Value'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Metrics of Success (Continued)
* AUPRC - This measures the Precision vs the Recall of the predictions, it can be considered a measure of the accuracy of the measure. This is less strongly influenced by the class imbalance which is present when predicting structural contacts since there are many fewer contacts than non-contacts. The True Positive case is if the C-beta of two amino acids is within 8.0 Angstroms of one another (as is done in the CASP competitions).

In [44]:
auprc_columns = ['Protein', 'Method', 'Distance', 'Sequence_Separation', 'AUPRC']
protein_auprc_groups = comparable_method_proteins[auprc_columns].drop_duplicates().groupby('Protein')['AUPRC'].apply(
    lambda x: not x.isnull().any())
complete_proteins = protein_auprc_groups.index[protein_auprc_groups.values]
comparable_auprc_proteins = small_comparison_df[small_comparison_df['Protein'].isin(complete_proteins)]
auprc_protein_length_order = [x for x in protein_length_order if x in complete_proteins]
auprc_protein_alignment_order = [x for x in protein_alignment_order if x in complete_proteins]
auprc_subset_df = comparable_auprc_proteins.loc[:, auprc_columns].drop_duplicates(subset=['Protein', 'Method', 'Sequence_Separation', 'Distance'])
auprc_subset_df = auprc_subset_df.loc[auprc_subset_df['Distance'] == 'CB', :]
# Plot the methods vs AUPRC per protein ordered by protein length
auprc_subset_df.to_csv(os.path.join(small_set_out_dir, 'Small_AUPRC_Comaprison_Data.csv'), sep='\t', header=True, index=False,
                       columns=auprc_columns)
protein_order_auprc_plot = sns.catplot(x="Protein", y="AUPRC", hue="Method", row="Sequence_Separation", data=auprc_subset_df, kind="bar",
                                       ci=None, order=auprc_protein_length_order, hue_order=method_order, legend=True, legend_out=True)
protein_order_auprc_plot.set_xticklabels(auprc_protein_length_order, rotation=90)
protein_order_auprc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUPRC_Comparison_Protein_Length_Order.png'),
                                 bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUPRC per protein ordered by protein alignment size
alignment_order_auprc_plot = sns.catplot(x="Protein", y="AUPRC", hue="Method", row="Sequence_Separation", data=auprc_subset_df, kind="bar",
                                       ci=None, order=auprc_protein_alignment_order, hue_order=method_order, legend=True, legend_out=True)
alignment_order_auprc_plot.set_xticklabels(auprc_protein_alignment_order, rotation=90)
alignment_order_auprc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUPRC_Comparison_Alignment_Size_Order.png'),
                                   bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUPRC grouped together to see overall trends
overall_auprc_plot = sns.boxplot(x="Sequence_Separation", y="AUPRC", hue="Method", data=auprc_subset_df,
                                 order=sequence_separation_order, hue_order=method_order)
overall_auprc_plot.set_xticklabels(sequence_separation_order, rotation=90)
overall_auprc_plot.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUPRC_Comparison.png'), bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Compute statistics comparing methods at each sequence separation
auprc_statistics = {'Sequence Separation': [], 'Method 1': [], 'Method 2': [], 'Statistic': [], 'P-Value': []}
for sep in sequence_separation_order:
    sep_auprc_subset_df = auprc_subset_df.loc[auprc_subset_df['Sequence_Separation'] == sep, :]
    for i in range(len(method_order)):
        m1_sep_auprc_subset_df = sep_auprc_subset_df.loc[sep_auprc_subset_df['Method'] == method_order[i], :]
        for j in range(i + 1, len(method_order)):
            m2_sep_auprc_subset_df = sep_auprc_subset_df.loc[sep_auprc_subset_df['Method'] == method_order[j], :]
            try:
                stat, p_val = wilcoxon(x=m1_sep_auprc_subset_df['AUPRC'], y=m2_sep_auprc_subset_df['AUPRC'], zero_method='wilcox')
            except ValueError:
                print(method_order[i])
                print(len(m1_sep_auroc_subset_df['AUROC']))
                print(m1_sep_auroc_subset_df)
                print(method_order[j])
                print(len(m2_sep_auroc_subset_df['AUROC']))
                print(m2_sep_auroc_subset_df)
            auprc_statistics['Sequence Separation'].append(sep)
            auprc_statistics['Method 1'].append(method_order[i])
            auprc_statistics['Method 2'].append(method_order[j])
            auprc_statistics['Statistic'].append(stat)
            auprc_statistics['P-Value'].append(p_val)
pd.DataFrame(auprc_statistics).to_csv(os.path.join(small_set_out_dir, 'Small_AUPRC_Comaprison_Statistics.csv'), sep='\t', header=True,
                                      index=False, columns=['Sequence Separation', 'Method 1', 'Method 2', 'Statistic', 'P-Value'])

In [45]:
auprc_columns = ['Protein', 'Method', 'Distance', 'Sequence_Separation', 'AUPRC']
protein_auprc_groups = comparable_method_proteins[auprc_columns].drop_duplicates().groupby('Protein')['AUPRC'].apply(
    lambda x: not x.isnull().any())
complete_proteins = protein_auprc_groups.index[protein_auprc_groups.values]
comparable_auprc_proteins = small_comparison_df[small_comparison_df['Protein'].isin(complete_proteins)]
auprc_protein_length_order = [x for x in protein_length_order if x in complete_proteins]
auprc_protein_alignment_order = [x for x in protein_alignment_order if x in complete_proteins]
auprc_subset_df = comparable_auprc_proteins.loc[:, auprc_columns].drop_duplicates(subset=['Protein', 'Method', 'Sequence_Separation', 'Distance'])
auprc_subset_df = auprc_subset_df.loc[auprc_subset_df['Distance'] == 'Any', :]
# Plot the methods vs AUPRC per protein ordered by protein length
auprc_subset_df.to_csv(os.path.join(small_set_out_dir, 'Small_AUPRC_Comaprison_Data_Dist_Any.csv'), sep='\t', header=True, index=False,
                       columns=auprc_columns)
protein_order_auprc_plot = sns.catplot(x="Protein", y="AUPRC", hue="Method", row="Sequence_Separation", data=auprc_subset_df, kind="bar",
                                       ci=None, order=auprc_protein_length_order, hue_order=method_order, legend=True, legend_out=True)
protein_order_auprc_plot.set_xticklabels(auprc_protein_length_order, rotation=90)
protein_order_auprc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUPRC_Comparison_Protein_Length_Order_Dist_Any.png'),
                                 bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUPRC per protein ordered by protein alignment size
alignment_order_auprc_plot = sns.catplot(x="Protein", y="AUPRC", hue="Method", row="Sequence_Separation", data=auprc_subset_df, kind="bar",
                                       ci=None, order=auprc_protein_alignment_order, hue_order=method_order, legend=True, legend_out=True)
alignment_order_auprc_plot.set_xticklabels(auprc_protein_alignment_order, rotation=90)
alignment_order_auprc_plot.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUPRC_Comparison_Alignment_Size_Order_Dist_Any.png'),
                                   bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Plot the methods vs AUPRC grouped together to see overall trends
overall_auprc_plot = sns.boxplot(x="Sequence_Separation", y="AUPRC", hue="Method", data=auprc_subset_df,
                                 order=sequence_separation_order, hue_order=method_order)
overall_auprc_plot.set_xticklabels(sequence_separation_order, rotation=90)
overall_auprc_plot.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
plt.savefig(os.path.join(small_set_out_dir, 'Protein_Method_AUPRC_Comparison_Dist_Any.png'), bbox_inches='tight', transparent=True, dpi=300)
plt.close()
# Compute statistics comparing methods at each sequence separation
auprc_statistics = {'Sequence Separation': [], 'Method 1': [], 'Method 2': [], 'Statistic': [], 'P-Value': []}
for sep in sequence_separation_order:
    sep_auprc_subset_df = auprc_subset_df.loc[auprc_subset_df['Sequence_Separation'] == sep, :]
    for i in range(len(method_order)):
        m1_sep_auprc_subset_df = sep_auprc_subset_df.loc[sep_auprc_subset_df['Method'] == method_order[i], :]
        for j in range(i + 1, len(method_order)):
            m2_sep_auprc_subset_df = sep_auprc_subset_df.loc[sep_auprc_subset_df['Method'] == method_order[j], :]
            try:
                stat, p_val = wilcoxon(x=m1_sep_auprc_subset_df['AUPRC'], y=m2_sep_auprc_subset_df['AUPRC'], zero_method='wilcox')
            except ValueError:
                print(method_order[i])
                print(len(m1_sep_auroc_subset_df['AUROC']))
                print(m1_sep_auroc_subset_df)
                print(method_order[j])
                print(len(m2_sep_auroc_subset_df['AUROC']))
                print(m2_sep_auroc_subset_df)
            auprc_statistics['Sequence Separation'].append(sep)
            auprc_statistics['Method 1'].append(method_order[i])
            auprc_statistics['Method 2'].append(method_order[j])
            auprc_statistics['Statistic'].append(stat)
            auprc_statistics['P-Value'].append(p_val)
pd.DataFrame(auprc_statistics).to_csv(os.path.join(small_set_out_dir, 'Small_AUPRC_Comaprison_Statistics_Dist_Any.csv'), sep='\t',
                                      header=True, index=False,
                                      columns=['Sequence Separation', 'Method 1', 'Method 2', 'Statistic', 'P-Value'])