# Large Test Set #
## Goal ##
The goal of this test set is to show that what was found in the smaller data set is generalizable to a larger number of proteins with a wide range of sizes and available homologs, orthologs, and paralogs. This set of proteins was used previously (Marcos et al., 2011) to evaluate covariance predictions. By using this dataset it should be possible to show the generalizability of the method as well as make a better comparison to previous methods.
## Warning ##
Before attempting to use this notebook, make sure that your .env file has been properly set up to reflect the correct locations of command line tools and the location of files and directories needed for execution.
### Initial Import ###
This first cell performs the necessary imports required to begin this notebook.

In [1]:
import os
import sys

from dotenv import find_dotenv, load_dotenv

# Locate the .env file: first with find_dotenv's default search and, if that
# raises IOError, again with usecwd=True. The second attempt is allowed to
# raise, so a missing .env file stops the notebook here.
try:
    dotenv_path = find_dotenv(raise_error_if_not_found=True)
except IOError:
    dotenv_path = find_dotenv(raise_error_if_not_found=True, usecwd=True)
load_dotenv(dotenv_path)

# Make <PROJECT_PATH>/src and <PROJECT_PATH>/src/SupportingClasses importable
# (in that order) so the project modules used by later cells can be found.
sys.path.extend(
    os.path.join(os.environ.get('PROJECT_PATH'), *sub_dirs)
    for sub_dirs in (('src',), ('src', 'SupportingClasses'))
)

# Directory under INPUT_PATH that holds every input for the large test set.
input_dir = os.path.join(os.environ.get('INPUT_PATH'), 'Large_Test_Set')

## Data Set Construction ##
The first task required to test the data set is to download the required data and construct any necessary input files for all downstream analyses.
In this case that means:
* Downloading PDB files for the proteins in our large test set.
* Extracting a query sequence from each PDB file.
* Searching for paralogs, homologs, and orthologs in a custom BLAST database built by filtering the Uniref90 database.
* Filtering the hits from the BLAST search to meet minimum and maximum length requirements, as well as minimum and maximum identity requirements.
* Building alignments using CLUSTALW in both the fasta and msf formats since some of the tools which will be used for comparison need different formats.
* Filtering the alignment for maximum identity similarity between sequences.
* Re-aligning the filtered sequences using CLUSTALW.
This is all handled by the DataSetGenerator class found in the src/SupportingClasses folder.

In [2]:
from time import time
from DataSetGenerator import DataSetGenerator
# Directory inside the large test set input directory that holds protein ID
# lists; it is created (with any missing parents) when it does not exist yet.
protein_list_dir = os.path.join(input_dir, 'ProteinLists')
os.makedirs(protein_list_dir, exist_ok=True)
# Path of the text file listing the proteins that make up the large data set.
large_list_fn = os.path.join(protein_list_dir, 'LargeDataSet.txt')
if not os.path.isfile(large_list_fn):
    proteins_of_interest = ['153lA', '154lA', '1a04A', '1a0bA', '1a0pA', '1ae9A', '1al3A', '1atgA', '1b7eA', '1b9mA', '1b9nA',
                            '1biaA', '1bibA', '1bl0A', '1booA', '1bslA', '1byiA', '1byqA', '1c02A', '1c52A', '1c5kA', '1c75A',
                            '1cb7A', '1ccwA', '1cp2A', '1crxA', '1crzA', '1ctjA', '1d4aA', '1d5yA', '1dadA', '1daeA', '1dagA',
                            '1dahA', '1daiA', '1dakA', '1dd9A', '1ddeA', '1di6A', '1di7A', '1dljA', '1dtsA', '1durA', '1e2xA',
                            '1e3uA', '1e4dA', '1e4fT', '1e4gT', '1e8cA', '1eclA', '1efaA', '1efdN', '1eg2A', '1ek9A', '1eszA',
                            '1etkA', '1etoA', '1etvA', '1etwA', '1etxA', '1etyA', '1ezwA', '1f07A', '1f1uA', '1f44A', '1f48A',
                            '1f5vA', '1f9iA', '1fcaA', '1fdnA', '1fepA', '1fiaA', '1fipA', '1fp6A', '1fr3A', '1fseA', '1fxoA',
                            '1g1lA', '1g1mA', '1g20A', '1g28A', '1g5pA', '1g60A', '1g6oA', '1g72A', '1g8kA', '1gbsA', '1gdtA',
                            '1gg4A', '1gqyA', '1gu9A', '1gugA', '1gunA', '1gusA', '1gutA', '1h3lA', '1h4iA', '1h7lA', '1h7qA',
                            '1h8zA', '1h98A', '1h9gA', '1h9jA', '1h9kA', '1h9mA', '1h9sA', '1hfeS', '1hfeL', '1hm9A', '1hw1A',
                            '1hxdA',
                            '1i0rA', '1i1gA', '1i52A', '1i58A', '1i5nA', '1i74A', '1i8oA', '1i9cA', '1icrA', '1id0A', '1id1A',
                            '1ihcA', '1ihrA', '1ihuA', '1ii0A', '1ii9A', '1iniA', '1injA', '1ir6A', '1iujA', '1ixcA', '1ixgA',
                            '1ixhA', '1iz1A', '1j5yA', '1j6uA', '1jbgA', '1jbwA', '1je8A', '1jetA', '1jeuA', '1jevA', '1jftA',
                            '1jh9A', '1jiwP', '1jiwI', '1jljA', '1jnuA', '1jpuA', '1jq5A', '1jykA', '1k20A', '1k2vN', '1k38A',
                            '1k4fA',
                            '1k54A', '1k56A', '1kapP', '1kb0A', '1kbuA', '1kgsA', '1kmoA', '1kmpA', '1kq3A', '1ku3A', '1ku7A',
                            '1kv9A', '1kw3B', '1kw6B', '1l3lA', '1lj9A', '1lq9A', '1lqkA', '1lqpA', '1lr0A', '1ls9A', '1lspA',
                            '1lssA', '1lucA', '1lvwA', '1m65A', '1m68A', '1m6kA', '1m70A', '1m7jA', '1ma7A', '1mb3A', '1mdoA',
                            '1mkmA', '1mkzA', '1mm8A', '1mnzA', '1moqA', '1muhA', '1murA', '1musA', '1muwA', '1mv8A', '1mw8X',
                            '1mw9X', '1n2zA', '1n9lA', '1n9nA', '1nfpA', '1nkiA', '1nlyA', '1nnfA', '1noxA', '1nqeA', '1nw5A',
                            '1nw6A', '1nw7A', '1nw8A', '1nwzA', '1ny5A', '1ny6A', '1o1hA', '1o2dA', '1o61A', '1o69A', '1o7lA',
                            '1oadA', '1oapA', '1oddA', '1odvA', '1oj7A', '1oltA', '1opcA', '1opxA', '1or7A', '1ot6A', '1ot9A',
                            '1otaA', '1otbA', '1oxkA', '1p2fA', '1p31A', '1p3dA', '1p7dA', '1p9rA', '1p9wA', '1pb0A', '1pb7A',
                            '1pb8A', '1pjrA', '1pnzA', '1po0A', '1pt7A', '1pvpA', '1q05A', '1q06A', '1q07A', '1q08A', '1q09A',
                            '1q0aA', '1q35A', '1q7eA', '1qg8A', '1qgqA', '1qgsA', '1qhgA', '1qhhA', '1qksA', '1qpzA', '1qsaA',
                            '1qteA', '1qtwA', '1qu7A', '1qwyA', '1qxxA', '1r1mA', '1r1tA', '1r1uA', '1r23A', '1r62A', '1r8dA',
                            '1r8eA', '1r9xA', '1r9yA', '1r9zA', '1ra0A', '1ra5A', '1rakA', '1reqA', '1rhcA', '1rioA', '1rk6A',
                            '1rp3A', '1rrmA', '1rttA', '1rzuA', '1rzvA', '1s5mA', '1s5nA', '1s8nA', '1sfxA', '1sg0A', '1si0A',
                            '1sigA', '1slyA', '1sqeA', '1sqsA', '1sumB', '1suuA', '1t3tA', '1t5bA', '1t72A', '1ta9A', '1td5A',
                            '1tf1A', '1tqgA', '1tqqA', '1tv8A', '1tvlA', '1tzbA', '1tzcA', '1u07A', '1u2wA', '1u8bA', '1u8tA',
                            '1uaaA', '1uc8A', '1uc9A', '1us4A', '1us5A', '1uscA', '1usfA', '1uuxA', '1uuyA', '1uylA', '1v4yA',
                            '1v51A', '1v8pA', '1v96A', '1vctA', '1ve2A', '1vf7A', '1vgtA', '1vgwA', '1vhdA', '1vhvA', '1vimA',
                            '1vj7A', '1vkeA', '1vljA', '1vz0A', '1w55A', '1w6sA', '1w77A', '1w78A', '1w8iA', '1wetA', '1wmiA',
                            '1woqA', '1wp1A', '1wpmA', '1wpnA', '1wppA', '1ws6A', '1x74A', '1x9hA', '1x9iA', '1xa3A', '1xc3A',
                            '1xd7A', '1xi2A', '1xjaA', '1xk6A', '1xk7A', '1xkwA', '1xkzA', '1xmaA', '1xo0A', '1xocA', '1xw3A',
                            '1y0hA', '1y1zA', '1y20A', '1y7mA', '1y7yA', '1y80A', '1y82A', '1y9uA', '1yc9A', '1ydxA', '1ye5A',
                            '1yf2A', '1yg2A', '1yioA', '1yiqA', '1ylfA', '1yoyA', '1yspA', '1ysqA', '1yviA', '1z05A', '1z19A',
                            '1z7uA', '1zatA', '1zi0A', '1zljA', '1zvtA', '1zvuA', '1zzcA', '2a0bA', '2a3nA', '2a5hA', '2a5lA',
                            '2a61A', '2aa4A', '2aacA', '2ad6A', '2ad7A', '2ad8A', '2aefA', '2aejA', '2afhA', '2am1A', '2anuA',
                            '2ap1A', '2ar0A', '2araA', '2arcA', '2aznA', '2b02A', '2b0pA', '2b13A', '2b3zA', '2b44A', '2basA',
                            '2bfwA', '2bknA', '2bkoA', '2bkpA', '2bm4A', '2bm5A', '2bm6A', '2bm7A', '2bnmA', '2brcA', '2byiA',
                            '2c2aA', '2c81A', '2ce0A', '2cg4A', '2ch7A', '2cviA', '2cwqA', '2cyyA', '2d1hA', '2d1vA', '2d5mA',
                            '2d5nA', '2d5wA', '2dbbA', '2dekA', '2df8A', '2dg6A', '2di3A', '2dqlA', '2dvzA', '2dxwA', '2dxxA',
                            '2e15A', '2e1nA', '2e4nA', '2e5fA', '2e7wA', '2e7xA', '2e7zA', '2eb7A', '2ecuA', '2efnA', '2eh3A',
                            '2ehlA', '2ehzA', '2ek5A', '2eshA', '2esnA', '2esrA', '2ewnA', '2ewvA', '2eyuA', '2f00A', '2f2eA',
                            '2f5xA', '2f6gA', '2f6pA', '2f7aA', '2f7bA', '2f8lA', '2f9fA', '2fa1A', '2fa5A', '2fb2A', '2fbhA',
                            '2fcjA', '2fdnA', '2fe1A', '2fezA', '2ff4A', '2ffuA', '2fhpA', '2fn9A', '2fnuA', '2fpoA', '2fswA',
                            '2fvyA', '2fw0A', '2g2cA', '2g6vA', '2g7uA', '2gaiA', '2gajA', '2gciA', '2gd0A', '2gd2A', '2gd9A',
                            '2gj3A', '2gjgA', '2gkgA', '2glkA', '2gm5A', '2gmsA', '2gmyA', '2gqpA', '2gskA', '2gu1A', '2gufA',
                            '2guhA', '2gupA', '2gxgA', '2gzaA', '2h1cA', '2h98A', '2h99A', '2h9bA', '2hawA', '2hekA', '2heuA',
                            '2hklA', '2hmtA', '2hmuA', '2hmvA', '2hnhA', '2hoeA', '2hofA', '2hphA', '2hq0A', '2hqsA', '2hs5A',
                            '2hsgA', '2hsiA', '2hwvA', '2hxvA', '2i0mA', '2i5rA', '2ia2A', '2ia4A', '2ibdA', '2ictA', '2iftA',
                            '2ikkA', '2iplA', '2ipmA', '2ipnA', '2is1A', '2is2A', '2is4A', '2is6A', '2is8A', '2iu5A', '2iuyA',
                            '2iv7A', '2iw1A', '2iw4A', '2iwxA', '2jbaA', '2jcgA', '2jfgA', '2nipA', '2npnA', '2nq2A', '2nq9A',
                            '2nqhA', '2nt3A', '2nt4A', '2o08A', '2o0yA', '2o3jA', '2o4dA', '2o7iA', '2o7pA', '2o8xA', '2o99A',
                            '2o9aA', '2obcA', '2ofyA', '2ogiA', '2ojhA', '2okcA', '2olbA', '2oocA', '2oqgA', '2oqrA', '2oxoA',
                            '2oyoA', '2p19A', '2p4gA', '2p5vA', '2p7oA', '2paqA', '2pbqA', '2pfxA', '2ph1A', '2pjrA', '2pkhA',
                            '2pmhA', '2pn6A', '2pq7A', '2pt7A', '2pucA', '2pudA', '2px7A', '2q0oA', '2q0tA', '2q1zA', '2q4fA',
                            '2q8pA', '2qb6A', '2qb7A', '2qb8A', '2qczA', '2qdfA', '2qdlA', '2qeuA', '2qgqA', '2qgzA', '2qi9A',
                            '2qj7A', '2qm1A', '2qmoA', '2qpqA', '2qsxA', '2qwxA', '2qx4A', '2qx6A', '2qx8A', '2r01A', '2r0xA',
                            '2r1jA', '2r25A', '2r4tA', '2r6gA', '2r6oA', '2r6vA', '2ra5A', '2rb9A', '2rc7A', '2rc8A', '2rcaA',
                            '2rdeA', '2riiA', '2rilA', '2rslA', '2uagA', '2v25A', '2v2kA', '2v9yA', '2vhaA', '2vjqA', '2vk2A',
                            '2vkeA', '2vkrA', '2vlgA', '2vmaA', '2vmbA', '2vpzA', '2vshA', '2w27A', '2w8bA', '2w8iA', '2yveA',
                            '2yx0A', '2yxbA', '2yxoA', '2yxzA', '2yyeA', '2yz5A', '2z1eA', '2z1fA', '2z1uA', '2z2lA', '2z2mA',
                            '2z4gA', '2z4pA', '2z6rA', '2z8xA', '2z98A', '2z9bA', '2zauA', '2zbcA', '2zc3A', '2zc4A', '2zcmA',
                            '2zdpA', '2zf8A', '2zieA', '2zifA', '2zigA', '2zkiA', '2zkzA', '2zodA', '2zovA', '2zxjA', '3b4yA',
                            '3b6iA', '3b8xA', '3b9oA', '3bcvA', '3be6A', '3bemA', '3bg2A', '3bhqA', '3bkhA', '3bkvA', '3bm7A',
                            '3bpkA', '3bpqA', '3bpvA', '3bqxA', '3breA', '3bs3A', '3bvpA', '3bwgA', '3c1qA', '3c29A', '3c3wA',
                            '3c48A', '3c57A', '3c7jA', '3c85A', '3c8fA', '3c8nA', '3c9uA', '3canA', '3ccgA', '3cijA', '3cixA',
                            '3ckjA', '3cknA', '3ckvA', '3cloA', '3cnrA', '3cnvA', '3cp5A', '3ctpA', '3cuoA', '3cwrA', '3cx4A',
                            '3cyiA', '3cypB', '3cypC', '3cypD', '3cyqA', '3d5kA', '3d6zA', '3d7iA', '3dboA', '3df7A', '3df8A', '3dmaA', '3dr4A',
                            '3drfA', '3drjA', '3dsgA', '3du1X', '3e10A', '3e38A', '3e4rA', '3e4vA', '3e7lA', '3e8oA', '3eagA',
                            '3ec2A', '3eccA', '3echA', '3ecpA', '3edpA', '3eetA', '3efmA', '3eiwA', '3eixA', '3ekoA', '3elkA',
                            '3eusA', '3ex8A', '3eywA', '3ezuA', '3f1cA', '3f1nA', '3f1oA', '3f1pA', '3f2bA', '3f44A', '3f52A',
                            '3f6cA', '3f6oA', '3f6vA', '3f8bA', '3f8cA', '3f8fA', '3fd3A', '3fgvA', '3fisA', '3fmsA', '3fwyA',
                            '3fwzA', '3fxaA', '3fzvA', '3g13A', '3g5oA', '3g7rA', '3gdiA', '3gfaA', '3gfvA', '3gfxA', '3gfyA',
                            '3gfzA', '3gg0A', '3gg1A', '3gg2A', '3ghjA', '3gp4A', '3gpvA', '3gr3A', '3guvA', '3h4oA', '3h5tA',
                            '3h87A', '3hfiA', '3hh0A', '3hhhA', '3hl0A', '3hmzA', '3hn7A', '3hoiA', '3htvA', '3hvwA', '3pypA',
                            '3uagA', '4aahA', '4crxA', '4reqA', '4uagA', '5reqA', '6reqA', '7reqA', '8abpA']
    # Persist the protein IDs, one per line, so the list file exists on later runs.
    with open(large_list_fn, 'w') as large_list_handle:
        large_list_handle.writelines('{}\n'.format(entry) for entry in proteins_of_interest)
# Build the PDB/alignment data set for every protein in the list file and time
# the whole step. The call returns a summary table with one row per protein.
generator = DataSetGenerator(input_dir)
start = time()
summary = generator.build_pdb_alignment_dataset(
    protein_list_fn=os.path.basename(large_list_fn), processes=10,
    database='uniref90_05122020/custom_uniref90_05122020.fasta',
    max_target_seqs=20000, e_value_threshold=0.05, remote=False,
    min_fraction=0.7, min_identity=0.40, max_identity=0.98,
    msf=True, fasta=True, sources=['UNP', 'GB', 'PDB'], verbose=False)

# Copy the per-protein details kept in generator.protein_data into the summary.
# The loop value is bound as a lambda default so each column reads its own key.
for detail in ('Chain', 'Accession', 'Length'):
    summary[detail] = summary['Protein_ID'].apply(
        lambda protein_id, key=detail: generator.protein_data[protein_id][key])

# Total_Size is the product of the 'Length' and 'Filtered_Alignment' values of each row.
summary['Total_Size'] = summary.apply(
    lambda row: float(row['Length']) * float(row['Filtered_Alignment']), axis=1)

# Columns (and their order) used for both the printed and the saved table.
summary_columns = ['Protein_ID', 'Chain', 'Accession', 'BLAST_Hits', 'Filtered_BLAST',
                   'Filtered_Alignment', 'Length', 'Total_Size']
summary.sort_values(by=['Filtered_Alignment', 'Length'], axis=0, inplace=True)
print(summary[summary_columns])

end = time()
print('It took {} min to generate the data set.'.format((end - start) / 60.0))

# Save the summary as a tab-separated file next to the other inputs.
summary_fn = os.path.join(input_dir, 'large_data_set_summary.tsv')
summary.to_csv(summary_fn, sep='\t', index=False, header=True, columns=summary_columns)

STARTING identify_protein_sequences
IMPORTED PROTEIN LIST
PROTEIN ID: 153lA
PROTEIN ID: 154lA
PROTEIN ID: 1a04A
PROTEIN ID: 1a0bA
PROTEIN ID: 1a0pA
PROTEIN ID: 1ae9A
PROTEIN ID: 1al3A
PROTEIN ID: 1atgA
PROTEIN ID: 1b7eA
PROTEIN ID: 1b9mA
PROTEIN ID: 1b9nA
PROTEIN ID: 1biaA
PROTEIN ID: 1bibA
PROTEIN ID: 1bl0A
PROTEIN ID: 1booA
PROTEIN ID: 1bslA
PROTEIN ID: 1byiA
PROTEIN ID: 1byqA
PROTEIN ID: 1c02A
PROTEIN ID: 1c52A
PROTEIN ID: 1c5kA
PROTEIN ID: 1c75A
PROTEIN ID: 1cb7A
PROTEIN ID: 1ccwA
PROTEIN ID: 1cp2A
PROTEIN ID: 1crxA
PROTEIN ID: 1crzA
PROTEIN ID: 1ctjA
PROTEIN ID: 1d4aA
PROTEIN ID: 1d5yA
PROTEIN ID: 1dadA
PROTEIN ID: 1daeA
PROTEIN ID: 1dagA
PROTEIN ID: 1dahA
PROTEIN ID: 1daiA
PROTEIN ID: 1dakA
PROTEIN ID: 1dd9A
PROTEIN ID: 1ddeA
PROTEIN ID: 1di6A
PROTEIN ID: 1di7A
PROTEIN ID: 1dljA
PROTEIN ID: 1dtsA
PROTEIN ID: 1durA
PROTEIN ID: 1e2xA
PROTEIN ID: 1e3uA
PROTEIN ID: 1e4dA
PROTEIN ID: 1e4fT
PROTEIN ID: 1e4gT
PROTEIN ID: 1e8cA
PROTEIN ID: 1eclA
PROTEIN ID: 1efaA
PROTEIN ID: 1efdN
PROTEI



PROTEIN ID: 1kmoA
PROTEIN ID: 1kmpA
PROTEIN ID: 1kq3A
PROTEIN ID: 1ku3A
PROTEIN ID: 1ku7A
PROTEIN ID: 1kv9A
PROTEIN ID: 1kw3B
PROTEIN ID: 1kw6B
PROTEIN ID: 1l3lA
PROTEIN ID: 1lj9A
PROTEIN ID: 1lq9A
PROTEIN ID: 1lqkA
PROTEIN ID: 1lqpA
PROTEIN ID: 1lr0A
PROTEIN ID: 1ls9A
PROTEIN ID: 1lspA
PROTEIN ID: 1lssA
PROTEIN ID: 1lucA
PROTEIN ID: 1lvwA
PROTEIN ID: 1m65A
PROTEIN ID: 1m68A
PROTEIN ID: 1m6kA
PROTEIN ID: 1m70A
PROTEIN ID: 1m7jA
PROTEIN ID: 1ma7A
PROTEIN ID: 1mb3A
PROTEIN ID: 1mdoA
PROTEIN ID: 1mkmA
PROTEIN ID: 1mkzA
PROTEIN ID: 1mm8A
PROTEIN ID: 1mnzA
PROTEIN ID: 1moqA
PROTEIN ID: 1muhA
Desired structure doesn't exists
PROTEIN ID: 1murA
PROTEIN ID: 1musA
PROTEIN ID: 1muwA
PROTEIN ID: 1mv8A
PROTEIN ID: 1mw8X
PROTEIN ID: 1mw9X
PROTEIN ID: 1n2zA
PROTEIN ID: 1n9lA
PROTEIN ID: 1n9nA
PROTEIN ID: 1nfpA
PROTEIN ID: 1nkiA
PROTEIN ID: 1nlyA
PROTEIN ID: 1nnfA
PROTEIN ID: 1noxA
PROTEIN ID: 1nqeA
PROTEIN ID: 1nw5A
PROTEIN ID: 1nw6A
PROTEIN ID: 1nw7A
PROTEIN ID: 1nw8A
PROTEIN ID: 1nwzA
PROTEIN ID: 1



PROTEIN ID: 1y9uA
PROTEIN ID: 1yc9A
PROTEIN ID: 1ydxA
PROTEIN ID: 1ye5A
PROTEIN ID: 1yf2A
PROTEIN ID: 1yg2A
PROTEIN ID: 1yioA
PROTEIN ID: 1yiqA
PROTEIN ID: 1ylfA
PROTEIN ID: 1yoyA
PROTEIN ID: 1yspA
PROTEIN ID: 1ysqA
PROTEIN ID: 1yviA
PROTEIN ID: 1z05A
PROTEIN ID: 1z19A
PROTEIN ID: 1z7uA
PROTEIN ID: 1zatA
PROTEIN ID: 1zi0A
PROTEIN ID: 1zljA
PROTEIN ID: 1zvtA
PROTEIN ID: 1zvuA
PROTEIN ID: 1zzcA
PROTEIN ID: 2a0bA
PROTEIN ID: 2a3nA
PROTEIN ID: 2a5hA
PROTEIN ID: 2a5lA
PROTEIN ID: 2a61A
PROTEIN ID: 2aa4A
PROTEIN ID: 2aacA
PROTEIN ID: 2ad6A
PROTEIN ID: 2ad7A
PROTEIN ID: 2ad8A
PROTEIN ID: 2aefA
PROTEIN ID: 2aejA
PROTEIN ID: 2afhA
PROTEIN ID: 2am1A
PROTEIN ID: 2anuA
PROTEIN ID: 2ap1A
PROTEIN ID: 2ar0A
PROTEIN ID: 2araA
PROTEIN ID: 2arcA
PROTEIN ID: 2aznA
PROTEIN ID: 2b02A
PROTEIN ID: 2b0pA
PROTEIN ID: 2b13A
PROTEIN ID: 2b3zA
PROTEIN ID: 2b44A
PROTEIN ID: 2basA
PROTEIN ID: 2bfwA
PROTEIN ID: 2bknA
PROTEIN ID: 2bkoA
PROTEIN ID: 2bkpA
PROTEIN ID: 2bm4A
PROTEIN ID: 2bm5A
PROTEIN ID: 2bm6A
PROTEIN ID



Importing the PDB file took 0.0008391976356506347 min
{'UNP': {'L': [('P69202', 'RPC2_BPP22', 1, 68)], 'R': [('P69202', 'RPC2_BPP22', 1, 68)]}}
PROTEIN ID: 2r25A
PROTEIN ID: 2r4tA
PROTEIN ID: 2r6gA
PROTEIN ID: 2r6oA
PROTEIN ID: 2r6vA
PROTEIN ID: 2ra5A
PROTEIN ID: 2rb9A
PROTEIN ID: 2rc7A
PROTEIN ID: 2rc8A
PROTEIN ID: 2rcaA
PROTEIN ID: 2rdeA
PROTEIN ID: 2riiA
PROTEIN ID: 2rilA
PROTEIN ID: 2rslA
PROTEIN ID: 2uagA
PROTEIN ID: 2v25A
PROTEIN ID: 2v2kA
PROTEIN ID: 2v9yA
PROTEIN ID: 2vhaA
PROTEIN ID: 2vjqA
PROTEIN ID: 2vk2A
PROTEIN ID: 2vkeA
PROTEIN ID: 2vkrA
PROTEIN ID: 2vlgA
PROTEIN ID: 2vmaA
PROTEIN ID: 2vmbA
PROTEIN ID: 2vpzA
PROTEIN ID: 2vshA
PROTEIN ID: 2w27A
PROTEIN ID: 2w8bA
PROTEIN ID: 2w8iA
PROTEIN ID: 2yveA
PROTEIN ID: 2yx0A
PROTEIN ID: 2yxbA
PROTEIN ID: 2yxoA
PROTEIN ID: 2yxzA
PROTEIN ID: 2yyeA
PROTEIN ID: 2yz5A
PROTEIN ID: 2z1eA
PROTEIN ID: 2z1fA
PROTEIN ID: 2z1uA
PROTEIN ID: 2z2lA
PROTEIN ID: 2z2mA
PROTEIN ID: 2z4gA
PROTEIN ID: 2z4pA
PROTEIN ID: 2z6rA
PROTEIN ID: 2z8xA
PROTEIN ID



Desired structure doesn't exists
PROTEIN ID: 3dmaA
PROTEIN ID: 3dr4A
PROTEIN ID: 3drfA
PROTEIN ID: 3drjA
PROTEIN ID: 3dsgA
PROTEIN ID: 3du1X
Importing the PDB file took 0.0007538636525472005 min
{'UNP': {'X': [('Q8YQS9', 'Q8YQS9_ANASP', 1, 237)]}}




PROTEIN ID: 3e10A
PROTEIN ID: 3e38A
PROTEIN ID: 3e4rA
PROTEIN ID: 3e4vA
PROTEIN ID: 3e7lA
PROTEIN ID: 3e8oA
PROTEIN ID: 3eagA
PROTEIN ID: 3ec2A
PROTEIN ID: 3eccA
PROTEIN ID: 3echA
PROTEIN ID: 3ecpA
PROTEIN ID: 3edpA
PROTEIN ID: 3eetA
PROTEIN ID: 3efmA
PROTEIN ID: 3eiwA
PROTEIN ID: 3eixA
PROTEIN ID: 3ekoA
PROTEIN ID: 3elkA
PROTEIN ID: 3eusA
PROTEIN ID: 3ex8A
PROTEIN ID: 3eywA
Importing the PDB file took 0.002374744415283203 min
{'UNP': {'A': [('P03819', 'KEFC_ECOLI', 401, 620), ('P0A754', 'KEFF_ECOLI', 1, 176)], 'B': [('P03819', 'KEFC_ECOLI', 401, 620), ('P0A754', 'KEFF_ECOLI', 1, 176)]}}
PROTEIN ID: 3ezuA
PROTEIN ID: 3f1cA
PROTEIN ID: 3f1nA
PROTEIN ID: 3f1oA
PROTEIN ID: 3f1pA
PROTEIN ID: 3f2bA
PROTEIN ID: 3f44A
PROTEIN ID: 3f52A
PROTEIN ID: 3f6cA
PROTEIN ID: 3f6oA
PROTEIN ID: 3f6vA
PROTEIN ID: 3f8bA
PROTEIN ID: 3f8cA
PROTEIN ID: 3f8fA
PROTEIN ID: 3fd3A
PROTEIN ID: 3fgvA
PROTEIN ID: 3fisA
PROTEIN ID: 3fmsA
PROTEIN ID: 3fwyA
PROTEIN ID: 3fwzA
PROTEIN ID: 3fxaA
PROTEIN ID: 3fzvA
PROTEIN I

ApplicationError: Non-zero return code 137 from '/home/daniel/lib/ncbi-blast-2.9.0+-src/c++/ReleaseMT/bin/blastp -out /media/daniel/ExtraDrive1/Results/Newest_Covariation/Input/Large_Test_Set/BLAST/LargeDataSet_All_Seqs.xml -outfmt 5 -query /media/daniel/ExtraDrive1/Results/Newest_Covariation/Input/Large_Test_Set/Sequences/LargeDataSet.fasta -db /media/daniel/ExtraDrive1/blast_databases/uniref90_05122020/custom_uniref90_05122020.fasta -evalue 0.05 -max_target_seqs 20000 -num_threads 10', message 'Killed'

Create a location to store the output of this validation.

In [3]:
# Root of all outputs, read from the OUTPUT_PATH environment variable.
output_dir = os.environ.get('OUTPUT_PATH')
# Everything produced by this notebook goes into its own sub-directory, which
# is created (with any missing parents) when it does not exist yet.
large_set_out_dir = os.path.join(output_dir, 'LargeTestSet')
os.makedirs(large_set_out_dir, exist_ok=True)

## Setting Up Scoring For Each Method
To reduce memory load during prediction and evaluation, the scoring objects needed to compute the metrics used to compare methods will be created ahead of time so they are available to each method when it computes its predictions for a given protein. This will ensure that results do not need to be kept in memory while waiting for all other results to be computed, only the metrics measured for each method will be recorded.

In [None]:
# pandas is imported here because this cell calls pd.read_csv at its end;
# without it the cell raises NameError whenever the saved comparison file
# already exists and the notebook is executed from the top.
import pandas as pd
from SeqAlignment import SeqAlignment
from PDBReference import PDBReference
from ContactScorer import ContactScorer, plot_z_scores

# Orderings defined once for use by later cells.
protein_order = list(summary['Protein_ID'])
method_order = ['DCA', 'EV Couplings', 'EV Couplings MF', 'ET-MIp', 'cET-MIp']
sequence_separation_order = ['Any', 'Neighbors', 'Short', 'Medium', 'Long']

# protein_scorers maps each protein ID to the objects built for it below.
# counts tallies successes and failures, split by the exception type raised.
protein_scorers = {}
counts = {'success': 0, 'attribute': 0, 'value': 0}
for p_id in summary['Protein_ID']:
    try:
        # NOTE: the entry is created before anything can fail, so a protein
        # whose setup raises keeps an empty or partially filled dictionary.
        protein_scorers[p_id] = {}
        # Import alignment and remove gaps
        full_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id=p_id)
        full_aln.import_alignment()
        non_gap_aln = full_aln.remove_gaps()
        # Import structure
        pdb_structure = PDBReference(pdb_file=generator.protein_data[p_id]['PDB'])
        pdb_structure.import_pdb(structure_id=p_id)
        protein_scorers[p_id]['Structure'] = pdb_structure
        # Initialize Beta Carbon distance scorer
        # (cutoff=8.0 is presumably in Angstroms -- TODO confirm against ContactScorer)
        contact_scorer_cb = ContactScorer(query=p_id, seq_alignment=non_gap_aln,
                                          pdb_reference=pdb_structure, cutoff=8.0)
        # The chain recorded by the data set generator is set explicitly before fitting.
        contact_scorer_cb.best_chain = generator.protein_data[p_id]['Chain']
        contact_scorer_cb.fit()
        contact_scorer_cb.measure_distance(method='CB')
        protein_scorers[p_id]['Scorer_CB'] = contact_scorer_cb
        # Initialize distance scorer minimizing distance between any atoms
        contact_scorer_any = ContactScorer(query=p_id, seq_alignment=non_gap_aln,
                                           pdb_reference=pdb_structure, cutoff=8.0)
        contact_scorer_any.best_chain = generator.protein_data[p_id]['Chain']
        contact_scorer_any.fit()
        contact_scorer_any.measure_distance(method='Any')
        protein_scorers[p_id]['Scorer_Any'] = contact_scorer_any
        # Initialize z-scoring subproblems
        protein_scorers[p_id]['biased_w2_ave'] = None
        protein_scorers[p_id]['unbiased_w2_ave'] = None
        print('Successfully initialized scoring for: {}'.format(p_id))
        counts['success'] += 1
    except AttributeError:
        print('Could not initialize scoring for: {}\tATTRIBUTE'.format(p_id))
        counts['attribute'] += 1
    except ValueError:
        print('Could not initialize scoring for: {}\tVALUE'.format(p_id))
        counts['value'] += 1
print('Completed initializing scorers:\nSuccesses:\t{}\nAttribute:\t{}\nValue:\t{}'.format(
    counts['success'], counts['attribute'], counts['value']))

# Column names defined for the comparison table.
output_columns = ['Protein', 'Protein Length', 'Alignment Size', 'Method', 'Distance', 'Init Time', 'Import Time',
                  'Dist Tree Time', 'Trace Time', 'Total Time',
                  'Sequence_Separation', 'AUROC', 'AUPRC',
                  'Top K Predictions', 'Precision', 'Recall', 'F1 Score',
                  'Biased Z-Score at 10%', 'Biased Z-Score at 30%', 'Max Biased Z-Score', 'AUC Biased Z-Score',
                  'Unbiased Z-Score at 10%', 'Unbiased Z-Score at 30%', 'Max Unbiased Z-Score', 'AUC Unbiased Z-Score']

# Reload previously saved comparison data when it exists; otherwise the frame
# stays None. The file is tab-separated despite its .csv extension.
large_comparison_df = None
large_comparison_fn = os.path.join(large_set_out_dir, 'Large_Comparision_Data.csv')
if os.path.isfile(large_comparison_fn):
    large_comparison_df = pd.read_csv(large_comparison_fn, sep='\t', header=0, index_col=False)

# Generating Values For Comparison #
To determine the effectiveness of the new method and implementation, the covariation of the same proteins will be computed using the previous Evolutionary Trace covariation method (ET-MIp) and other methods in the field.

## ET-MIp ##
Scoring the covariation of the proteins using the previous Evolutionary Trace covariation method (ET-MIp).

In [None]:
# from ETMIPWrapper import ETMIPWrapper
# etmip_out_dir = os.path.join(large_set_out_dir, 'ET-MIp')
# if not os.path.isdir(etmip_out_dir):
#     os.makedirs(etmip_out_dir)
# etmip_scores = {}
# counts = {'success':0, 'value': 0, 'attribute':0}
# for p_id in generator.protein_data:
#     print('Attempting to calculate ET-MIp covariance for: {}'.format(p_id))
#     try:
#         protein_out_dir = os.path.join(etmip_out_dir, p_id)
#         if not os.path.isdir(protein_out_dir):
#             os.makedirs(protein_out_dir)
##         curr_aln = SeqAlignment(file_name=generator.protein_data[p_id]['Final_FA_Aln'], query_id=p_id, polymer_type='Protein')
##         curr_aln.import_alignment()
#         curr_etmip = ETMIPWrapper(query=p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'],
#                                   out_dir=protein_out_dir)
#         curr_etmip.calculate_scores(delete_files=False)
#         etmip_scores[p_id] = curr_etmip
#         print('Successfully computed ET-MIp covariance for: {}'.format(p_id))
#         counts['success'] += 1
#     except ValueError:
#         print('Could not compute ET-MIp covariance for: {} with seq_length: {} and size: {}'.format(
#             p_id, curr_aln.seq_length, curr_aln.size))
#         counts['value'] += 1
#     except AttributeError:
#         print('Could not compute ET-MIp covariance for: {} no alignment'.format(p_id))
#         counts['attribute'] += 1
# print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
#                                                                      counts['attribute']))

## ET-MIp (Continued)
The previous implementation is not able to run for alignments of the size used here. Instead we use the new implementation with the same parameterization used by the previous implementation (Distance Model - blosum62 similarity, Tree - ET UPGMA variant, Scoring Metric - filtered average product corrected mutual information, Ranks - all).

In [None]:
from EvolutionaryTrace import EvolutionaryTrace
import numpy as np
import pandas as pd
# Run ET-MIp with the new EvolutionaryTrace implementation for every protein in the summary table, evaluate
# the predictions with the per-protein scorers, and cache results both per protein ('<id>_Protein_Data.csv')
# and per method ('ET-MIp_Method_Data.csv') so finished work is read back from disk instead of recomputed.
# The whole cell is skipped once the combined comparison file (large_comparison_fn) exists.
if not os.path.isfile(large_comparison_fn):
    etmip_out_dir = os.path.join(large_set_out_dir, 'ET-MIp')
    if not os.path.isdir(etmip_out_dir):
        os.makedirs(etmip_out_dir)
    etmip_method_fn = os.path.join(etmip_out_dir, 'ET-MIp_Method_Data.csv')
    if os.path.isfile(etmip_method_fn):
        etmip_method_df = pd.read_csv(etmip_method_fn, sep='\t', header=0, index_col=False)
    else:
        etmip_method_df = None
        counts = {'success': 0, 'value': 0, 'attribute': 0, 'type': 0}
        for p_id in summary['Protein_ID']:  # generator.protein_data:
            print('Attempting to calculate ET-MIp covariance for: {}'.format(p_id))
            protein_dir = os.path.join(etmip_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
            else:
                # Reset so the error handlers never report on a predictor left over from a previous protein
                # (or hit a NameError) when the constructor itself fails.
                curr_etmip = None
                try:
                    curr_etmip = EvolutionaryTrace(query=p_id, polymer_type='Protein',
                                                   aln_file=generator.protein_data[p_id]['Final_FA_Aln'], et_distance=True,
                                                   distance_model='blosum62', tree_building_method='et', tree_building_options={},
                                                   ranks=None, position_type='pair',
                                                   scoring_metric='filtered_average_product_corrected_mutual_information',
                                                   gap_correction=None, out_dir=protein_dir,
                                                   output_files={'original_aln', 'non_gap_aln', 'tree', 'scores'},
                                                   processors=10, low_memory=True)
                    curr_etmip.calculate_scores()
                    print('Successfully computed ET-MIp covariance for: {}'.format(p_id))
                    # Compute statistics for the final scores of the ET-MIp model
                    protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                        predictor=curr_etmip, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                        unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=curr_etmip.scorer.position_size,
                        rank_type=curr_etmip.scorer.rank_type, file_prefix='ET-MIp_Scores_', plots=True)
                    # Score Prediction Clustering
                    scorer_any = protein_scorers[p_id]['Scorer_Any']
                    z_score_fn = os.path.join(protein_dir, 'ET-MIp_Scores_Dist-Any_{}_ZScores.tsv')
                    z_score_plot_fn = os.path.join(protein_dir, 'ET-MIp_Scores_Dist-Any_{}_ZScores.png')
                    z_score_biased, biased_w2_ave, biased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - curr_etmip.coverage, bias=True, file_path=z_score_fn.format('Biased'),
                        w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                    # Keep the first w2 average computed for this protein so later calls can pass it as w2_ave_sub.
                    if protein_scorers[p_id]['biased_w2_ave'] is None:
                        protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                    # Non-numeric Z-Score entries become NaN and are ignored by nanmax.
                    biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                    # Length of the best chain of the query structure; the 10%/30% cut-offs are fractions of it.
                    chain_length = float(len(scorer_any.query_structure.seq[scorer_any.best_chain]))
                    protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                    # First Z-score at which at least 10% / 30% of the chain's residues are included.
                    protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                    plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                    z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - curr_etmip.coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                        w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                    if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                        protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                    unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                    protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                    protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                    plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                    # Record execution times. Only the total time reported by the predictor is stored; the per-stage
                    # timings are left empty (presumably so every column listed in output_columns exists).
                    protein_df['Init Time'] = None
                    protein_df['Import Time'] = None
                    protein_df['Dist Tree Time'] = None
                    protein_df['Trace Time'] = None
                    protein_df['Total Time'] = curr_etmip.time
                    # Record static data for this protein
                    protein_df['Protein'] = p_id
                    protein_df['Method'] = 'ET-MIp'
                    protein_df['Protein Length'] = generator.protein_data[p_id]['Length']
                    protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                    protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                    # Remove intermediate per-node files, keeping only the final pair rank score arrays.
                    temp_data = os.path.join(protein_dir, 'unique_node_data')
                    for temp_fn in os.listdir(temp_data):
                        if not temp_fn.endswith("_pair_rank_filtered_average_product_corrected_mutual_information_score.npz"):
                            os.remove(os.path.join(temp_data, temp_fn))
                    print('Metrics measured for ET-MIp covariance for: {}'.format(p_id))
                    counts['success'] += 1
                except ValueError:
                    # The predictor or its alignment may not exist yet, so look them up defensively.
                    failed_aln = getattr(curr_etmip, 'original_aln', None)
                    print('Could not compute ET-MIp covariance for: {} with seq_length: {} and size: {}'.format(
                        p_id, getattr(failed_aln, 'seq_length', None), getattr(failed_aln, 'size', None)))
                    counts['value'] += 1
                    continue
                except AttributeError:
                    print('Could not compute ET-MIp covariance for: {} no alignment'.format(p_id))
                    counts['attribute'] += 1
                    continue
                except TypeError:
                    print('Could not compute ET-MIp covariance for: {} type error'.format(p_id))
                    counts['type'] += 1
                    continue
            if etmip_method_df is None:
                etmip_method_df = protein_df
            else:
                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent in every version.
                etmip_method_df = pd.concat([etmip_method_df, protein_df])
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors\n{}\tType Errors'.format(
            counts['success'], counts['value'], counts['attribute'], counts['type']))
        # Nothing to write if every protein failed.
        if etmip_method_df is not None:
            etmip_method_df.to_csv(etmip_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if large_comparison_df is None:
        large_comparison_df = etmip_method_df
    elif etmip_method_df is not None:
        large_comparison_df = pd.concat([large_comparison_df, etmip_method_df])

## cET-MIp
This segment evaluates the ET-MIp method when it is constrained to an arbitrary set of nodes (1, 2, 3, 5, 7, 10, 25) at the top of the phylogenetic tree.

In [None]:
# Run cET-MIp (ET-MIp restricted to ranks 1, 2, 3, 5, 7, 10 and 25) for every protein in generator.protein_data,
# evaluate the predictions with the per-protein scorers, and cache results both per protein
# ('<id>_Protein_Data.csv') and per method ('cET-MIp_Method_Data.csv') so finished work is read back from disk.
# The whole cell is skipped once the combined comparison file (large_comparison_fn) exists.
if not os.path.isfile(large_comparison_fn):
    cetmip_out_dir = os.path.join(large_set_out_dir, 'cET-MIp')
    if not os.path.isdir(cetmip_out_dir):
        os.makedirs(cetmip_out_dir)
    cetmip_method_fn = os.path.join(cetmip_out_dir, 'cET-MIp_Method_Data.csv')
    if os.path.isfile(cetmip_method_fn):
        cetmip_method_df = pd.read_csv(cetmip_method_fn, sep='\t', header=0, index_col=False)
    else:
        cetmip_method_df = None
        counts = {'success': 0, 'value': 0, 'attribute': 0, 'key': 0}
        for p_id in generator.protein_data:
            print('Attempting to calculate cET-MIp covariance for: {}'.format(p_id))
            protein_dir = os.path.join(cetmip_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
            else:
                # Reset so the error handlers never report on a predictor left over from a previous protein
                # (or hit a NameError) when the constructor itself fails.
                curr_cetmip = None
                try:
                    curr_cetmip = EvolutionaryTrace(query=p_id, polymer_type='Protein',
                                                    aln_file=generator.protein_data[p_id]['Final_FA_Aln'], et_distance=True,
                                                    distance_model='blosum62', tree_building_method='et', tree_building_options={},
                                                    ranks=[1, 2, 3, 5, 7, 10, 25], position_type='pair',
                                                    scoring_metric='filtered_average_product_corrected_mutual_information',
                                                    gap_correction=None, out_dir=protein_dir,
                                                    output_files={'original_aln', 'non_gap_aln', 'tree', 'scores'},
                                                    processors=10, low_memory=True)
                    curr_cetmip.calculate_scores()
                    # Compute statistics for the final scores of the cET-MIp model
                    protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                        predictor=curr_cetmip, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                        unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=curr_cetmip.scorer.position_size,
                        rank_type=curr_cetmip.scorer.rank_type, file_prefix='cET-MIp_Scores_', plots=True)
                    # Score Prediction Clustering
                    scorer_any = protein_scorers[p_id]['Scorer_Any']
                    z_score_fn = os.path.join(protein_dir, 'cET-MIp_Scores_Dist-Any_{}_ZScores.tsv')
                    z_score_plot_fn = os.path.join(protein_dir, 'cET-MIp_Scores_Dist-Any_{}_ZScores.png')
                    z_score_biased, biased_w2_ave, biased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - curr_cetmip.coverage, bias=True, file_path=z_score_fn.format('Biased'),
                        w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                    # Keep the first w2 average computed for this protein so later calls can pass it as w2_ave_sub.
                    if protein_scorers[p_id]['biased_w2_ave'] is None:
                        protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                    # Non-numeric Z-Score entries become NaN and are ignored by nanmax.
                    biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                    # Length of the best chain of the query structure; the 10%/30% cut-offs are fractions of it.
                    chain_length = float(len(scorer_any.query_structure.seq[scorer_any.best_chain]))
                    protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                    # First Z-score at which at least 10% / 30% of the chain's residues are included.
                    protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                    plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                    z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - curr_cetmip.coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                        w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                    if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                        protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                    unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                    protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                    protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                    plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                    # Record execution times. Only the total time reported by the predictor is stored; the per-stage
                    # timings are left empty (presumably so every column listed in output_columns exists).
                    protein_df['Init Time'] = None
                    protein_df['Import Time'] = None
                    protein_df['Dist Tree Time'] = None
                    protein_df['Trace Time'] = None
                    protein_df['Total Time'] = curr_cetmip.time
                    # Record static data for this protein
                    protein_df['Protein'] = p_id
                    protein_df['Method'] = 'cET-MIp'
                    protein_df['Protein Length'] = generator.protein_data[p_id]['Length']
                    protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                    protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                    # Remove intermediate per-node files, keeping only the final pair rank score arrays.
                    temp_data = os.path.join(protein_dir, 'unique_node_data')
                    for temp_fn in os.listdir(temp_data):
                        if not temp_fn.endswith("_pair_rank_filtered_average_product_corrected_mutual_information_score.npz"):
                            os.remove(os.path.join(temp_data, temp_fn))
                    print('Successfully computed cET-MIp covariance for: {}'.format(p_id))
                    counts['success'] += 1
                except ValueError:
                    # Report on this cell's predictor (not the ET-MIp one from the previous cell); the predictor
                    # or its alignment may not exist yet, so look them up defensively.
                    failed_aln = getattr(curr_cetmip, 'original_aln', None)
                    print('Could not compute cET-MIp covariance for: {} with seq_length: {} and size: {}'.format(
                        p_id, getattr(failed_aln, 'seq_length', None), getattr(failed_aln, 'size', None)))
                    counts['value'] += 1
                    continue
                except AttributeError:
                    print('Could not compute cET-MIp covariance for: {} no alignment'.format(p_id))
                    counts['attribute'] += 1
                    continue
                except KeyError:
                    print('Could not compute cET-MIp covariance for: {} not enough sequences'.format(p_id))
                    counts['key'] += 1
                    continue
            if cetmip_method_df is None:
                cetmip_method_df = protein_df
            else:
                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent in every version.
                cetmip_method_df = pd.concat([cetmip_method_df, protein_df])
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors\n{}\tKey Errors'.format(
            counts['success'], counts['value'], counts['attribute'], counts['key']))
        # Nothing to write if every protein failed.
        if cetmip_method_df is not None:
            cetmip_method_df.to_csv(cetmip_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if large_comparison_df is None:
        large_comparison_df = cetmip_method_df
    elif cetmip_method_df is not None:
        large_comparison_df = pd.concat([large_comparison_df, cetmip_method_df])

## DCA##
Scoring the covariation of the proteins using a Julia implementation of DCA.

In [None]:
from DCAWrapper import DCAWrapper
from utils import compute_rank_and_coverage
# Run DCA (through DCAWrapper) for every protein in generator.protein_data, evaluate the predictions with the
# per-protein scorers, and cache results both per protein ('<id>_Protein_Data.csv') and per method
# ('DCA_Method_Data.csv') so finished work is read back from disk instead of recomputed.
# The whole cell is skipped once the combined comparison file (large_comparison_fn) exists.
if not os.path.isfile(large_comparison_fn):
    dca_out_dir = os.path.join(large_set_out_dir, 'DCA')
    if not os.path.isdir(dca_out_dir):
        os.makedirs(dca_out_dir)
    dca_method_fn = os.path.join(dca_out_dir, 'DCA_Method_Data.csv')
    if os.path.isfile(dca_method_fn):
        dca_method_df = pd.read_csv(dca_method_fn, sep='\t', header=0, index_col=False)
    else:
        dca_method_df = None
        counts = {'success': 0, 'value': 0, 'attribute': 0}
        for p_id in generator.protein_data:
            print('Attempting to calculate DCA covariance for: {}'.format(p_id))
            protein_dir = os.path.join(dca_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
            else:
                # Reset so the error handlers never report on a predictor left over from a previous protein
                # (or hit a NameError) when the constructor itself fails.
                curr_dca = None
                try:
                    # NOTE(review): an earlier version of this cell removed the gap columns of the query sequence
                    # before running DCA, because the DCA implementation used here cannot be told which sequence
                    # is the query. Presumably DCAWrapper now does this itself when given query/aln_file - confirm.
                    curr_dca = DCAWrapper(query=p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'], out_dir=protein_dir)
                    curr_dca.calculate_scores(delete_file=False)
                    # Compute statistics for the final scores of the DCA model
                    protein_df, _, _ = protein_scorers[p_id]['Scorer_CB'].evaluate_predictor(
                        predictor=curr_dca, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                        unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max', file_prefix='DCA_Scores_', plots=True)
                    # Score Prediction Clustering
                    # The coverage is derived here from the raw DCA scores with compute_rank_and_coverage.
                    _, dca_coverage = compute_rank_and_coverage(seq_length=curr_dca.alignment.seq_length, scores=curr_dca.scores,
                                                                pos_size=2, rank_type='max')
                    scorer_any = protein_scorers[p_id]['Scorer_Any']
                    z_score_fn = os.path.join(protein_dir, 'DCA_Scores_Dist-Any_{}_ZScores.tsv')
                    z_score_plot_fn = os.path.join(protein_dir, 'DCA_Scores_Dist-Any_{}_ZScores.png')
                    z_score_biased, biased_w2_ave, biased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - dca_coverage, bias=True, file_path=z_score_fn.format('Biased'),
                        w2_ave_sub=protein_scorers[p_id]['biased_w2_ave'], processes=10)
                    # Keep the first w2 average computed for this protein so later calls can pass it as w2_ave_sub.
                    if protein_scorers[p_id]['biased_w2_ave'] is None:
                        protein_scorers[p_id]['biased_w2_ave'] = biased_w2_ave
                    # Non-numeric Z-Score entries become NaN and are ignored by nanmax.
                    biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                    # Length of the best chain of the query structure; the 10%/30% cut-offs are fractions of it.
                    chain_length = float(len(scorer_any.query_structure.seq[scorer_any.best_chain]))
                    protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                    # First Z-score at which at least 10% / 30% of the chain's residues are included.
                    protein_df['Biased Z-Score at 10%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Biased Z-Score at 30%'] = biased_z_score_array[z_score_biased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                    plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                    z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - dca_coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                        w2_ave_sub=protein_scorers[p_id]['unbiased_w2_ave'], processes=10)
                    if protein_scorers[p_id]['unbiased_w2_ave'] is None:
                        protein_scorers[p_id]['unbiased_w2_ave'] = unbiased_w2_ave
                    unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                    protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                    protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[z_score_unbiased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                    plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                    # Record execution times. No timings are collected for DCA; the columns are left empty
                    # (presumably so every column listed in output_columns exists).
                    protein_df['Init Time'] = None
                    protein_df['Import Time'] = None
                    protein_df['Dist Tree Time'] = None
                    protein_df['Trace Time'] = None
                    protein_df['Total Time'] = None
                    # Record static data for this protein
                    protein_df['Protein'] = p_id
                    protein_df['Method'] = 'DCA'
                    protein_df['Protein Length'] = generator.protein_data[p_id]['Length']
                    protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[summary['Protein_ID'] == p_id][0]
                    protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                    print('Successfully computed DCA covariance for: {}'.format(p_id))
                    counts['success'] += 1
                except ValueError:
                    # The standalone curr_aln object no longer exists in this cell; report on the wrapper's own
                    # alignment instead. The wrapper or its alignment may not exist yet, so look them up defensively.
                    failed_aln = getattr(curr_dca, 'alignment', None)
                    print('Could not compute DCA covariance for: {} with seq_length: {} and size: {}'.format(
                        p_id, getattr(failed_aln, 'seq_length', None), getattr(failed_aln, 'size', None)))
                    counts['value'] += 1
                    continue
                except AttributeError:
                    print('Could not compute DCA covariance for: {} no alignment'.format(p_id))
                    counts['attribute'] += 1
                    continue
            if dca_method_df is None:
                dca_method_df = protein_df
            else:
                # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent in every version.
                dca_method_df = pd.concat([dca_method_df, protein_df])
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        # Nothing to write if every protein failed.
        if dca_method_df is not None:
            dca_method_df.to_csv(dca_method_fn, sep='\t', header=True, index=False, columns=output_columns)
    if large_comparison_df is None:
        large_comparison_df = dca_method_df
    elif dca_method_df is not None:
        large_comparison_df = pd.concat([large_comparison_df, dca_method_df])

## EVCouplings##
Scoring the covariation of the proteins using the standard protocol of the EVCouplings method.

In [None]:
from EVCouplingsWrapper import EVCouplingsWrapper
# Score every protein with the EVCouplings 'standard' protocol and add the results to large_comparison_df.
# Nothing is recomputed when the final comparison file already exists; per-method and per-protein
# results are read back from disk when their files are present.
if not os.path.isfile(large_comparison_fn):
    evc_standard_out_dir = os.path.join(large_set_out_dir, 'EVCouplings_Standard')
    if not os.path.isdir(evc_standard_out_dir):
        os.makedirs(evc_standard_out_dir)
    evc_standard_method_fn = os.path.join(evc_standard_out_dir, 'EVCouplings_Standard_Method_Data.csv')
    if os.path.isfile(evc_standard_method_fn):
        evc_standard_method_df = pd.read_csv(evc_standard_method_fn, sep='\t', header=0, index_col=False)
    else:
        # Per-protein frames are collected and concatenated once at the end (DataFrame.append was
        # removed in pandas 2.0; pd.concat keeps the same row order and index labels).
        evc_standard_frames = []
        counts = {'success': 0, 'value': 0, 'attribute': 0}
        for p_id in generator.protein_data:
            print('Attempting to calculate EV couplings standard protocol covariance for: {}'.format(p_id))
            protein_dir = os.path.join(evc_standard_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
            else:
                # Bound before the try so the ValueError handler can report on it even when the
                # wrapper constructor itself is what failed.
                curr_evc = None
                try:
                    curr_evc = EVCouplingsWrapper(query=p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'],
                                                  protocol='standard', out_dir=protein_dir)
                    curr_evc.calculate_scores(cores=10, delete_files=True)
                    scorers = protein_scorers[p_id]
                    scorer_any = scorers['Scorer_Any']
                    # Evaluate the EVCouplings standard protocol scores with the 'CB' distance scorer
                    protein_df, _, _ = scorers['Scorer_CB'].evaluate_predictor(
                        predictor=curr_evc, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                        unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2,
                        rank_type='max', file_prefix='EVC_Standard_Scores_', plots=True)
                    # Score Prediction Clustering
                    _, evc_standard_coverage = compute_rank_and_coverage(
                        seq_length=curr_evc.alignment.seq_length, scores=curr_evc.scores, pos_size=2,
                        rank_type='max')
                    z_score_fn = os.path.join(protein_dir, 'EVC_Standard_Scores_Dist-Any_{}_ZScores.tsv')
                    z_score_plot_fn = os.path.join(protein_dir, 'EVC_Standard_Scores_Dist-Any_{}_ZScores.png')
                    # Biased clustering Z-scores
                    z_score_biased, biased_w2_ave, biased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - evc_standard_coverage, bias=True, file_path=z_score_fn.format('Biased'),
                        w2_ave_sub=scorers['biased_w2_ave'], processes=10)
                    # Keep the returned w2_ave so later calls for this protein can pass it as w2_ave_sub
                    if scorers['biased_w2_ave'] is None:
                        scorers['biased_w2_ave'] = biased_w2_ave
                    biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                    protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                    # Length of the best chain of the query structure, used for the 10% / 30% cut-offs
                    chain_length = float(len(scorer_any.query_structure.seq[scorer_any.best_chain]))
                    # First Z-score whose Num_Residues value reaches the given fraction of the chain length
                    protein_df['Biased Z-Score at 10%'] = biased_z_score_array[
                        z_score_biased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Biased Z-Score at 30%'] = biased_z_score_array[
                        z_score_biased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                    plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                    # Unbiased clustering Z-scores
                    z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - evc_standard_coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                        w2_ave_sub=scorers['unbiased_w2_ave'], processes=10)
                    if scorers['unbiased_w2_ave'] is None:
                        scorers['unbiased_w2_ave'] = unbiased_w2_ave
                    unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                    protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                    protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[
                        z_score_unbiased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[
                        z_score_unbiased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                    plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                    # Record execution times (not measured for this method, columns kept for a uniform table)
                    protein_df['Init Time'] = None
                    protein_df['Import Time'] = None
                    protein_df['Dist Tree Time'] = None
                    protein_df['Trace Time'] = None
                    protein_df['Total Time'] = None
                    # Record static data for this protein
                    protein_df['Protein'] = p_id
                    protein_df['Method'] = 'EVC Standard'
                    protein_df['Protein Length'] = generator.protein_data[p_id]['Length']
                    protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[
                        summary['Protein_ID'] == p_id][0]
                    protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                    print('Successfully computed EV couplings standard protocol covariance for: {}'.format(p_id))
                    counts['success'] += 1
                except ValueError:
                    # This cell never defines curr_aln, so the alignment is taken from the wrapper
                    # when it exists; None is reported for anything that is unavailable.
                    failed_aln = getattr(curr_evc, 'alignment', None)
                    print('Could not compute EV couplings standard protocol covariance for: {} with seq_length: {} and size: {}'.format(
                        p_id, getattr(failed_aln, 'seq_length', None), getattr(failed_aln, 'size', None)))
                    counts['value'] += 1
                    continue
                except AttributeError:
                    print('Could not compute EV couplings standard protocol covariance for: {} no alignment'.format(p_id))
                    counts['attribute'] += 1
                    continue
            evc_standard_frames.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        if evc_standard_frames:
            evc_standard_method_df = pd.concat(evc_standard_frames)
            evc_standard_method_df.to_csv(evc_standard_method_fn, sep='\t', header=True, index=False,
                                          columns=output_columns)
        else:
            # Every protein failed: keep an empty table in memory, but do not write a method file so
            # that a later run attempts the computation again instead of loading an empty result.
            evc_standard_method_df = pd.DataFrame(columns=output_columns)
    if large_comparison_df is None:
        large_comparison_df = evc_standard_method_df
    else:
        large_comparison_df = pd.concat([large_comparison_df, evc_standard_method_df])

Scoring the covariation of the proteins using the EVCouplings method mean field protocol.

In [None]:
# Score every protein with the EVCouplings 'mean_field' protocol and add the results to large_comparison_df.
# Nothing is recomputed when the final comparison file already exists; per-method and per-protein
# results are read back from disk when their files are present.
if not os.path.isfile(large_comparison_fn):
    evc_mf_out_dir = os.path.join(large_set_out_dir, 'EVCouplings_Mean_Field')
    if not os.path.isdir(evc_mf_out_dir):
        os.makedirs(evc_mf_out_dir)
    evc_mf_method_fn = os.path.join(evc_mf_out_dir, 'EVCouplings_Mean_Field_Method_Data.csv')
    if os.path.isfile(evc_mf_method_fn):
        evc_mf_method_df = pd.read_csv(evc_mf_method_fn, sep='\t', header=0, index_col=False)
    else:
        # Per-protein frames are collected and concatenated once at the end (DataFrame.append was
        # removed in pandas 2.0; pd.concat keeps the same row order and index labels).
        evc_mf_frames = []
        counts = {'success': 0, 'value': 0, 'attribute': 0}
        for p_id in generator.protein_data:
            print('Attempting to calculate EV couplings covariance for: {}'.format(p_id))
            protein_dir = os.path.join(evc_mf_out_dir, p_id)
            if not os.path.isdir(protein_dir):
                os.makedirs(protein_dir)
            # The per-protein output path must be defined here: without it the results were written
            # to whatever protein_fn an earlier cell left behind (or the cell failed with a NameError).
            protein_fn = os.path.join(protein_dir, '{}_Protein_Data.csv'.format(p_id))
            if os.path.isfile(protein_fn):
                # Reuse a previously written result, as the standard protocol cell does
                protein_df = pd.read_csv(protein_fn, sep='\t', header=0, index_col=False)
            else:
                # Bound before the try so the ValueError handler can report on it even when the
                # wrapper constructor itself is what failed.
                curr_evc = None
                try:
                    curr_evc = EVCouplingsWrapper(query=p_id, aln_file=generator.protein_data[p_id]['Final_FA_Aln'],
                                                  protocol='mean_field', out_dir=protein_dir)
                    curr_evc.calculate_scores(cores=10, delete_files=True)
                    scorers = protein_scorers[p_id]
                    scorer_any = scorers['Scorer_Any']
                    # Evaluate the EVCouplings mean field scores with the 'CB' distance scorer. The file
                    # prefix names the mean field protocol so outputs match the Z-score files below.
                    protein_df, _, _ = scorers['Scorer_CB'].evaluate_predictor(
                        predictor=curr_evc, verbosity=2, out_dir=protein_dir, dist='CB', biased_w2_ave=None,
                        unbiased_w2_ave=None, processes=10, threshold=0.5, pos_size=2, rank_type='max',
                        file_prefix='EVC_Mean_Field_Scores_', plots=True)
                    # Score Prediction Clustering
                    _, evc_mf_coverage = compute_rank_and_coverage(
                        seq_length=curr_evc.alignment.seq_length, scores=curr_evc.scores, pos_size=2,
                        rank_type='max')
                    z_score_fn = os.path.join(protein_dir, 'EVC_Mean_Field_Scores_Dist-Any_{}_ZScores.tsv')
                    z_score_plot_fn = os.path.join(protein_dir, 'EVC_Mean_Field_Scores_Dist-Any_{}_ZScores.png')
                    # Biased clustering Z-scores
                    z_score_biased, biased_w2_ave, biased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - evc_mf_coverage, bias=True, file_path=z_score_fn.format('Biased'),
                        w2_ave_sub=scorers['biased_w2_ave'], processes=10)
                    # Keep the returned w2_ave so later calls for this protein can pass it as w2_ave_sub
                    if scorers['biased_w2_ave'] is None:
                        scorers['biased_w2_ave'] = biased_w2_ave
                    biased_z_score_array = np.array(pd.to_numeric(z_score_biased['Z-Score'], errors='coerce'))
                    protein_df['Max Biased Z-Score'] = np.nanmax(biased_z_score_array)
                    # Length of the best chain of the query structure, used for the 10% / 30% cut-offs
                    chain_length = float(len(scorer_any.query_structure.seq[scorer_any.best_chain]))
                    # First Z-score whose Num_Residues value reaches the given fraction of the chain length
                    protein_df['Biased Z-Score at 10%'] = biased_z_score_array[
                        z_score_biased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Biased Z-Score at 30%'] = biased_z_score_array[
                        z_score_biased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Biased Z-Score'] = biased_scw_z_auc
                    plot_z_scores(z_score_biased, z_score_plot_fn.format('Biased'))
                    # Unbiased clustering Z-scores
                    z_score_unbiased, unbiased_w2_ave, unbiased_scw_z_auc = scorer_any.score_clustering_of_contact_predictions(
                        1.0 - evc_mf_coverage, bias=False, file_path=z_score_fn.format('Unbiased'),
                        w2_ave_sub=scorers['unbiased_w2_ave'], processes=10)
                    if scorers['unbiased_w2_ave'] is None:
                        scorers['unbiased_w2_ave'] = unbiased_w2_ave
                    unbiased_z_score_array = np.array(pd.to_numeric(z_score_unbiased['Z-Score'], errors='coerce'))
                    protein_df['Max Unbiased Z-Score'] = np.nanmax(unbiased_z_score_array)
                    protein_df['Unbiased Z-Score at 10%'] = unbiased_z_score_array[
                        z_score_unbiased['Num_Residues'] >= chain_length * 0.10][0]
                    protein_df['Unbiased Z-Score at 30%'] = unbiased_z_score_array[
                        z_score_unbiased['Num_Residues'] >= chain_length * 0.30][0]
                    protein_df['AUC Unbiased Z-Score'] = unbiased_scw_z_auc
                    plot_z_scores(z_score_unbiased, z_score_plot_fn.format('Unbiased'))
                    # Record execution times (not measured for this method, columns kept for a uniform table)
                    protein_df['Init Time'] = None
                    protein_df['Import Time'] = None
                    protein_df['Dist Tree Time'] = None
                    protein_df['Trace Time'] = None
                    protein_df['Total Time'] = None
                    # Record static data for this protein
                    protein_df['Protein'] = p_id
                    protein_df['Method'] = 'EVC Mean Field'
                    protein_df['Protein Length'] = generator.protein_data[p_id]['Length']
                    protein_df['Alignment Size'] = summary['Filtered_Alignment'].values[
                        summary['Protein_ID'] == p_id][0]
                    protein_df.to_csv(protein_fn, sep='\t', header=True, index=False, columns=output_columns)
                    print('Successfully computed EV couplings covariance for: {}'.format(p_id))
                    counts['success'] += 1
                except ValueError:
                    # This cell never defines curr_aln, so the alignment is taken from the wrapper
                    # when it exists; None is reported for anything that is unavailable.
                    failed_aln = getattr(curr_evc, 'alignment', None)
                    print('Could not compute EV couplings covariance for: {} with seq_length: {} and size: {}'.format(
                        p_id, getattr(failed_aln, 'seq_length', None), getattr(failed_aln, 'size', None)))
                    counts['value'] += 1
                    continue
                except AttributeError:
                    print('Could not compute EV couplings covariance for: {} no alignment'.format(p_id))
                    counts['attribute'] += 1
                    continue
            evc_mf_frames.append(protein_df)
        print('{}\tSuccesses\n{}\tValue Errors\n{}\tAttribute Errors'.format(counts['success'], counts['value'],
                                                                             counts['attribute']))
        if evc_mf_frames:
            evc_mf_method_df = pd.concat(evc_mf_frames)
            evc_mf_method_df.to_csv(evc_mf_method_fn, sep='\t', header=True, index=False, columns=output_columns)
        else:
            # Every protein failed: keep an empty table in memory, but do not write a method file so
            # that a later run attempts the computation again instead of loading an empty result.
            evc_mf_method_df = pd.DataFrame(columns=output_columns)
    if large_comparison_df is None:
        large_comparison_df = evc_mf_method_df
    else:
        large_comparison_df = pd.concat([large_comparison_df, evc_mf_method_df])

In [None]:
# Persist the combined comparison table as a tab-separated file (restricted to output_columns)
# so that it can be reloaded later when generating figures.
large_comparison_df.to_csv(
    large_comparison_fn,
    sep='\t',
    header=True,
    index=False,
    columns=output_columns
)