In [171]:
# Load libraries
from collections import defaultdict
import time
from Bio import SeqIO
import pandas as pd
import torch.nn  as nn
import torch
import networkx as nx
from io import StringIO
import numpy as np
from sklearn.decomposition import PCA
# Cosine-similarity module; not referenced by any function visible in this section.
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
# Same value as the default `top_n` of find_mutual_matches_optimized.
top_n = 3
# Threshold read by cleanup_matches: matches on a diagonal whose accumulated
# segment length is below this value are discarded.
min_diagonal_length = 10
# Same value as the default `max_mismatches` of get_matches_new.
max_mismatches = 3


# Module-level counter; not referenced by any function visible in this section.
i = 0
def get_data_matrix(seq_1, seq_2):
    """Return the pairwise cosine-similarity matrix of two embeddings.

    Args:
        seq_1: mapping whose ``["representations"][36]`` entry is a 2-D
            tensor with one row per position of the first sequence.
        seq_2: same structure for the second sequence.

    Returns:
        pandas.DataFrame whose cell ``[i, j]`` is the cosine similarity
        between row ``i`` of the first tensor and row ``j`` of the second.
    """
    x_tensor = seq_1["representations"][36]
    y_tensor = seq_2["representations"][36]

    # Scale every row to unit length so the matrix product below yields
    # cosine similarities directly.
    x_norm = x_tensor / x_tensor.norm(dim=1)[:, None]
    y_norm = y_tensor / y_tensor.norm(dim=1)[:, None]

    cosine_similarity_matrix = torch.mm(x_norm, y_norm.transpose(0, 1))

    # .numpy() refuses tensors that require grad or live on an accelerator;
    # detach and move to CPU first (approximate_similarity_matrix detaches too).
    return pd.DataFrame(cosine_similarity_matrix.detach().cpu().numpy())


def approximate_similarity_matrix(seq1_embeddings, seq2_embeddings, n_components=5):
    """
    Build a cosine-similarity matrix from PCA-reduced embeddings.

    The layer-36 representations of both inputs are stacked, a single PCA
    with ``n_components`` components is fitted on the stacked rows, and the
    cosine similarity between every reduced row of the first input and every
    reduced row of the second is returned as a DataFrame.
    """
    first = seq1_embeddings["representations"][36]
    second = seq2_embeddings["representations"][36]
    n_first = len(first)

    # One PCA fitted on both sequences so they share the same reduced space.
    stacked = torch.cat((first, second), dim=0).detach().numpy()
    projected = PCA(n_components=n_components).fit_transform(stacked)

    def _unit_rows(rows):
        # Divide each row by its Euclidean norm.
        return rows / np.linalg.norm(rows, axis=1, keepdims=True)

    left = _unit_rows(projected[:n_first])
    right = _unit_rows(projected[n_first:])

    return pd.DataFrame(np.dot(left, right.T))


def find_mutual_matches_optimized(data, top_n=3):
    """
    Return the set of (row, column) cells that are mutual top-``top_n`` hits.

    A cell is kept when its column is among the ``top_n`` highest-scoring
    columns of its row AND its row is among the ``top_n`` highest-scoring
    rows of its column.
    """
    scores = data.values

    # Best columns per row (one line per row) and best rows per column
    # (one column per column of ``data``).
    best_cols_per_row = np.argsort(-scores, axis=1)[:, :top_n]
    best_rows_per_col = np.argsort(-scores, axis=0)[:top_n, :]

    return {
        (row, col)
        for row, candidate_cols in enumerate(best_cols_per_row)
        for col in candidate_cols
        if row in best_rows_per_col[:, col]
    }


def add_matching_neighbors_optimized(seq_1_str, seq_2_str, matches):
    """
    Extend ``matches`` with diagonal neighbours that carry identical characters.

    For every (i, j) in ``matches`` the cells (i - 1, j - 1) and
    (i + 1, j + 1) are added when they lie inside both sequences and the
    characters at those positions are equal. Returns a new set; ``matches``
    itself is not modified.
    """
    last_1 = len(seq_1_str) - 1
    last_2 = len(seq_2_str) - 1
    neighbours = set()

    for row, col in matches:
        prev_cell = (row - 1, col - 1)
        next_cell = (row + 1, col + 1)

        # Upper-left neighbour on the same diagonal
        if row > 0 and col > 0 and seq_1_str[prev_cell[0]] == seq_2_str[prev_cell[1]]:
            neighbours.add(prev_cell)

        # Lower-right neighbour on the same diagonal
        if row < last_1 and col < last_2 and seq_1_str[next_cell[0]] == seq_2_str[next_cell[1]]:
            neighbours.add(next_cell)

    return matches | neighbours


def find_exclusive_intervals_optimized(intervals):
    """
    Drop every interval that is contained in another interval.

    Args:
        intervals: iterable of ``(start, end)`` pairs. It is not modified.

    Returns:
        list of the intervals that are not enclosed by an earlier one,
        ordered by start and, for equal starts, by descending end.
    """
    # Work on a sorted copy so the caller's sequence keeps its order.
    # Sorting by (start, -end) puts an enclosing interval before anything it
    # contains.
    ordered = sorted(intervals, key=lambda interval: (interval[0], -interval[1]))

    exclusive_intervals = []
    # -inf rather than -1 so intervals with negative end points are not
    # silently discarded.
    max_end_so_far = float("-inf")

    for interval in ordered:
        # An interval reaching past every end seen so far cannot be contained
        # in any of the preceding intervals.
        if interval[1] > max_end_so_far:
            exclusive_intervals.append(interval)
            max_end_so_far = interval[1]

    return exclusive_intervals


def find_matches_optimized(s, t, offset_val, matches, k, nb_errors=2):
    """
    Find segments along one diagonal that collect ``k`` hits.

    ``s`` and ``t`` are compared position by position. Position ``p`` is a
    hit when ``s[p] == t[p]`` or when ``(p, p + offset_val)`` is in
    ``matches``; anything else is an error. From every start position the
    scan continues until either more than ``nb_errors`` errors were seen
    (nothing is recorded) or ``k`` hits were collected, in which case
    ``(start, p - errors)`` is recorded.

    Returns the recorded intervals after filtering them with
    find_exclusive_intervals_optimized.
    """
    total = len(s)
    candidates = []

    # Only start positions that leave at least k characters are tried.
    for begin in range(total - k + 1):
        hits = 0
        misses = 0

        for pos in range(begin, total):
            is_hit = s[pos] == t[pos] or (pos, pos + offset_val) in matches

            if is_hit:
                hits += 1
            else:
                misses += 1
                if misses > nb_errors:
                    # Too many errors from this start position: give up.
                    break

            if hits >= k:
                # Recorded end is the scan position minus the errors seen.
                candidates.append((begin, pos - misses))
                break

    return find_exclusive_intervals_optimized(candidates)


def get_matches_new(seq_1_str, seq_2_str, data, max_mismatches=3):
    """
    Run the match-selection pipeline on a similarity matrix.

    Steps: mutual top hits from ``data`` -> add identical diagonal
    neighbours -> collect segments over all diagonals -> accumulate segment
    length per diagonal -> keep only matches on sufficiently long diagonals.
    Returns whatever cleanup_matches returns.
    """
    mutual = find_mutual_matches_optimized(data)
    seeded = add_matching_neighbors_optimized(seq_1_str, seq_2_str, mutual)

    segments = find_all_matches_optimized(seq_1_str, seq_2_str, max_mismatches, seeded)
    # Order segments by their start position in the first sequence.
    segments.sort(key=lambda segment: segment[0][0])

    diagonals = get_valid_diagonals(segments)
    return cleanup_matches(seeded, diagonals)


def generate_rrotation(s, t, offset):
    """
    Shift ``t`` to the right by ``offset`` positions against ``s``.

    s = seq_1_str
    t = seq_2_str
    offset = number of positions ``t`` is moved to the right

    The result has the length of ``s``: ``offset`` gap characters ('-'),
    then as much of the beginning of ``t`` as still fits, then gap
    characters up to the length of ``s``.

    Raises an Exception when ``offset`` is not smaller than ``len(s)``.
    """
    target_len = len(s)

    if offset >= target_len:
        raise Exception(f"offset {offset} larger than seq length {len(s)}")

    # Leading gaps followed by the prefix of 't' that remains in the window.
    shifted = '-' * offset + t[:target_len - offset]

    # Pad on the right when 't' was too short to fill the window.
    return shifted + '-' * (target_len - len(shifted))


def generate_lrotation(s, t, offset):
    """
    Shift ``t`` to the left by ``offset`` positions against ``s``.

    s = seq_1_str
    t = seq_2_str
    offset = number of leading characters of ``t`` that are dropped

    The result has the length of ``s``: the slice ``t[offset:len(s)]``
    followed by gap characters ('-') up to the length of ``s``.

    Raises an Exception when ``offset`` is not smaller than ``len(t)``.
    """
    # The guard compares against 't', so the message reports len(t) as well.
    if offset >= len(t):
        raise Exception(f"offset {offset} larger than seq length {len(t)}")

    # Part of 't' that is still inside the window after the shift.
    my_str = t[offset:len(s)]

    # Right gaps that fill the window up to the length of 's'.
    rgaps = '-' * (len(s) - len(my_str))

    return my_str + rgaps


def find_all_matches_optimized(s, t, k, matched_pairs):
    """
    Collect matching segments over every diagonal of the (s, t) grid.

    s = seq_1 sequence string ('seq_1_str')
    t = seq_2 sequence string ('seq_2_str')
    k = value forwarded as the ``k`` argument of find_matches_optimized
    matched_pairs = current matches (mutual matches plus matching neighbours)

    NOTE(review): get_matches_new passes its ``max_mismatches`` here, while
    find_matches_optimized uses ``k`` as the number of hits a segment must
    collect and keeps its own ``nb_errors`` default - confirm this is the
    intended meaning.

    Returns a list of ``(segment_in_s, segment_in_t)`` pairs, each segment
    being a ``(start, end)`` tuple; right shifts come first, then left shifts.
    """
    diagonal_segments = []

    # Right shifts: 't' moved right by 0 .. len(s) - 1 positions. A position
    # p of 's' then faces position p - shift of 't'.
    for shift in range(len(s)):
        shifted_t = generate_rrotation(s, t, shift)
        in_s = find_matches_optimized(s, shifted_t, -shift, matched_pairs, k)
        in_t = [(start - shift, end - shift) for start, end in in_s]
        diagonal_segments.extend(zip(in_s, in_t))

    # Left shifts: 't' moved left by 1 .. len(t) - 1 positions. A position
    # p of 's' then faces position p + shift of 't'.
    for shift in range(1, len(t)):
        shifted_t = generate_lrotation(s, t, shift)
        in_s = find_matches_optimized(s, shifted_t, +shift, matched_pairs, k)
        in_t = [(start + shift, end + shift) for start, end in in_s]
        diagonal_segments.extend(zip(in_s, in_t))

    return diagonal_segments


def build_paths_graph(data, matches):
    """
    Build a directed graph whose nodes are matches and whose edges link a
    match to the matches that may follow it.

    data = similarity matrix (DataFrame); read via ``data.iloc[row, col]``
    matches = non-empty collection of (row, col) pairs

    Matches are sorted by their second element. For every match, the later
    matches (in that order) are scanned; an edge is added to a later match
    when it has a different row and column, a strictly larger row, and a row
    strictly smaller than the row of the last successor added for this match
    (initially the maximum row over all matches).

    Raises ValueError (from ``max``) when ``matches`` is empty.

    NOTE(review): the edge attribute is spelled ``weigth``. Code that looks
    up the attribute ``weight`` will not see this value - confirm whether
    the spelling is intentional before changing it, since it affects which
    path a weighted longest-path search would pick.
    NOTE(review): because the first comparison is ``< max_depth``, a match
    located on the maximum row is never added as a successor while
    ``last_depth`` still holds its initial value - confirm this is intended.
    """
    # Adjacency lists are filled below but never read or returned.
    dag = {}

    graph = nx.DiGraph()

    # Largest row index among the matches; initial upper bound for successors.
    max_depth = max([x[0] for x in matches])

    # Sort the matches based on the second element of the match pairs.
    sorted_matches = sorted(matches, key=lambda x: x[1])

    # Loop over the sorted matches and
    # add edges between them to build the graph.
    for i in range(len(sorted_matches) - 1):
        # Reset the row bound for every source match.
        last_depth = max_depth
        dag[sorted_matches[i]] = []

        for j in range(i + 1, len(sorted_matches)):

            if (sorted_matches[i][0] == sorted_matches[j][0]) or (sorted_matches[i][1] == sorted_matches[j][1]):
                # Don't consider overlapping cells
                continue

            # Candidate must lie below the source row and above the row of
            # the previously accepted successor.
            if (sorted_matches[j][0]) < last_depth and (sorted_matches[j][0] > sorted_matches[i][0]):
                dag[sorted_matches[i]].append(sorted_matches[j])
                seq_1_idx, seq_2_idx = sorted_matches[j]
                # Edge carries the similarity score of the target cell.
                graph.add_edge(sorted_matches[i], sorted_matches[j], weigth=data.iloc[seq_1_idx, seq_2_idx])
                # Tighten the bound so later successors must sit on a smaller row.
                last_depth = sorted_matches[j][0]

    return graph


def get_valid_diagonals(valid_segments):
    """
    Sum the segment lengths found on each diagonal.

    valid_segments = iterable of ``((s_start, s_end), (t_start, t_end))``

    A diagonal is identified by the pair of start positions shifted so that
    the smaller of the two becomes 0. Returns a ``defaultdict(int)`` mapping
    that key to the total length (``s_end - s_start + 1`` summed over all
    segments on the diagonal).
    """
    diagonal_lengths = defaultdict(int)

    for segment in valid_segments:
        seq_1_span = segment[0]
        seq_2_span = segment[1]

        # Normalise the start pair so one coordinate is 0.
        anchor = min(seq_1_span[0], seq_2_span[0])
        key = (seq_1_span[0] - anchor, seq_2_span[0] - anchor)

        diagonal_lengths[key] += seq_1_span[1] - seq_1_span[0] + 1

    return diagonal_lengths


def cleanup_matches(matches, valid_diagonals, min_length=None):
    """
    Keep only the matches that lie on a sufficiently long diagonal.

    Args:
        matches: iterable of (row, col) pairs.
        valid_diagonals: mapping from a diagonal key (the pair shifted so
            that its smaller coordinate is 0) to the accumulated segment
            length on that diagonal. It is only read, never modified.
        min_length: minimum accumulated length a diagonal needs for its
            matches to be kept. Defaults to the module-level
            ``min_diagonal_length``.

    Returns:
        list of the distinct matches whose diagonal reaches ``min_length``.
    """
    if min_length is None:
        min_length = min_diagonal_length

    remove_elems = []

    for match in matches:
        anchor = min(match[0], match[1])
        diag = (match[0] - anchor, match[1] - anchor)
        # .get() treats an unknown diagonal as length 0 without inserting a
        # key into a defaultdict or raising KeyError on a plain dict.
        if valid_diagonals.get(diag, 0) < min_length:
            remove_elems.append(match)

    return list(set(matches).difference(remove_elems))


def get_longest_path(data, matches):
    """
    Return the longest path through the graph built from ``matches``.

    With no matches an empty list is returned; otherwise the paths graph is
    built from ``data`` and ``matches`` and handed to
    ``nx.dag_longest_path``.
    """
    # Nothing to connect: skip graph construction entirely.
    if len(matches) == 0:
        return []

    return nx.dag_longest_path(build_paths_graph(data, matches))


def soft_align_time_new(seq_1_str, seq_2_str, seq_1_embedding, seq_2_embedding, pca_matrix = False): 
    """
    Align two sequences from their embeddings and return the longest path.

    The similarity matrix comes from approximate_similarity_matrix when
    ``pca_matrix`` is truthy and from get_data_matrix otherwise. Matches are
    then selected with get_matches_new and chained with get_longest_path.
    """
    build_similarity = approximate_similarity_matrix if pca_matrix else get_data_matrix
    data = build_similarity(seq_1_embedding, seq_2_embedding)

    selected = get_matches_new(seq_1_str, seq_2_str, data)
    return get_longest_path(data, selected)

In [5]:
from Bio import SeqIO

In [5]:
!pip install memory_profiler

Collecting memory_profiler
  Obtaining dependency information for memory_profiler from https://files.pythonhosted.org/packages/49/26/aaca612a0634ceede20682e692a6c55e35a94c21ba36b807cc40fe910ae1/memory_profiler-0.61.0-py3-none-any.whl.metadata
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
[33mDEPRECATION: pytorch-lightning 1.6.5 has a non-standard dependency specifier torch>=1.8.*. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [37]:
from transformers import EsmForSequenceClassification, EsmModel, AutoConfig, EsmConfig, EsmForTokenClassification
from peft import PeftModel
import torch
import esm
from Bio import SeqIO
import matplotlib.pyplot as plt
import numpy as np
import json
import os
from Bio.Seq import Seq
import logging
from tqdm import tqdm
import gc
from memory_profiler import profile
import statistics
import psutil
import time
from scipy.stats import gaussian_kde
import torch
from collections import Counter
gc.collect()


1151

In [2]:
from src.unique_virus_fasta import proteins_fasta, virus_fasta
from src.unique_virus_interactions import full_interactions
from src.get_attentions_viruses import parse_fasta, process_sequences
from src.distance_scores import *
from src.virus_contacts_numbers import count_contacts
from src.analyse_contacts_virus import all_metrics, read_full_interaction_file, read_interaction_file


In [6]:
def apc(x):
    """
    Subtract, in place, the product of row sums and column sums divided by
    the total sum from ``x`` and return ``x`` (the same object).

    Sums are taken over the last two dimensions.
    """
    row_totals = x.sum(-1, keepdims=True)
    col_totals = x.sum(-2, keepdims=True)
    grand_total = x.sum((-1, -2), keepdims=True)

    # In-place division and subtraction keep the number of temporaries low.
    correction = (row_totals * col_totals).div_(grand_total)
    x.sub_(correction)
    return x

In [28]:
def get_attention_block(self_attention_t_weighted, p1_boundaries, p2_boundaries, apc_norm=True):
    """
    Return the symmetrized attention between two proteins.

    Args:
        self_attention_t_weighted: self-attention tensor; only element
            ``[0]`` is used and it is sliced as a 2-D matrix
            (presumably shaped (1, L, L) - confirm against the producer).
        p1_boundaries: ``(start, end)`` slice bounds of the first protein.
        p2_boundaries: ``(start, end)`` slice bounds of the second protein.
        apc_norm: when True (default) the result is passed through ``apc``;
            when False the raw symmetrized block is returned.

    Returns:
        Tensor with one row per position of the first protein and one
        column per position of the second: attention p1 -> p2 plus the
        transposed attention p2 -> p1.
    """
    attention = self_attention_t_weighted[0]
    p1_rows = slice(p1_boundaries[0], p1_boundaries[1])
    p2_rows = slice(p2_boundaries[0], p2_boundaries[1])

    block1 = attention[p1_rows, p2_rows]
    block2 = attention[p2_rows, p1_rows]

    # The sum creates a fresh tensor, so the in-place work done by apc does
    # not touch the input attention matrix.
    mutual_information_tensor = block1 + block2.t()

    if apc_norm:
        return apc(mutual_information_tensor)
    return mutual_information_tensor



In [8]:
# attention_block = torch.load("./sequence_0_mutual_information_apc/4_5_attention.pt") # 295 -> 54
# attention_block = torch.load("./sequence_0_mutual_information_apc/10_11_attention.pt") # 926 ->  574
# attention_block = torch.load("./sequence_0_mutual_information_apc/5_9_attention.pt") #  255 -> 38
# attention_block = torch.load("./sequence_0_mutual_information_apc/1_9_attention.pt") #  177 -> 21
# attention_block = torch.load("./sequence_0_mutual_information_apc/4_10_attention.pt") # 794 -> 94
# attention_block = torch.load("./sequence_0_mutual_information_apc/0_3_attention.pt") # 808 -> 285
# attention_block = torch.load("./sequence_0_mutual_information_apc/1_5_attention.pt") #  741 -> 114
# attention_block = torch.load("./sequence_0_mutual_information_apc/2_9_attention.pt") #  217 -> 8
# attention_block = torch.load("./sequence_0_mutual_information_apc/1_2_attention.pt") # 736  -> 24




In [170]:
pola_ref_str = "MVQIPQNPLILVDGSSYLYRAYHAFPPLTNSAGEPTGAMYGVLNMLRSLIMQYKPTHAAVVFDAKGKTFRDELFEHYKSHRPPMPDDLRAQIEPLHAMVKAMGLPLLAVSGVEADDVIGTLAREAEKAGRPVLISTGDKDMAQLVTPNITLINTMTNTILGPEEVVNKYGVPPELIIDFLALMGDSSDNIPGVPGVGEKTAQALLQGLGGLDTLYAEPEKIAGLSFRGAKTMAAKLEQNKEVAYLSYQLATIKTDVELELTCEQLEVQPPAAEELLGLFKKYEFKRWTADVEAGKWLQAKGVKPAARPQETSVADEAPEVTATVISYDNYVTILDEETLKEWIAKLEKAPVFAFDTETDSLDNISANLVGLSFAIEPGVAAYIPVAHDYLDAPDQISRERALELLKPLLEDEKALKVGQNLKYDRGILANYGIELRGIAFDTMLESYILNSVAGRHDMDSLAERWLKHKTITFEEIAGKGKNQLTFNQIALEEAGRYAAEDADVTLQLHLKMWPDLQKHKGPLNVFENIEMPLVPVLSRIERNGVKIDPKVLHNHSEELTLRLAELEKKAHEIAGEEFNLSSTKQLQTILFEKQGIKPLKKTPGGAPSTSEEVLEELALDYPLPKVILEYRGLAKLKSTYTDKLPLMINPKTGRVHTSYHQAVTATGRLSSTDPNLQNIPVRNEEGRRIRQAFIAPEDYVIVSADYSQIELRIMAHLSRDKGLLTAFAEGKDIHRATAAEVFGLPLETVTSEQRRSAKAINFGLIYGMSAFGLARQLNIPRKEAQKYMDLYFERYPGVLEYMERTRAQAKEQGYVETLDGRRLYLPDIKSSNGARRAAAERAAINAPMQGTAADIIKRAMIAVDAWLQAEQPRVRMIMQVHDELVFEVHKDDVDAVAKQIHQLMENCTRLDVPLLVEVGSGENWDQAH"
rnr_ref_str = "MNQNLLVTKRDGSTERINLDKIHRVLDWAAEGLHNVSISQVELRSHIQFYDGIKTSDIHETIIKAAADLISRDAPDYQYLAARLAIFHLRKKAYGQFEPPALYDHVVKMVEMGKYDNHLLEDYTEEEFKQMDTFIDHDRDMTFSYAAVKQLEGKYLVQNRVTGEIYESAQFLYILVAACLFSNYPRETRLQYVKRFYDAVSTFKISLPTPIMSGVRTPTRQFSSCVLIECGDSLDSINATSSAIVKYVSQRAGIGINAGRIRALGSPIRGGEAFHTGCIPFYKHFQTAVKSCSQGGVRGGAATLFYPMWHLEVESLLVLKNNRGVEGNRVRHMDYGVQINKLMYTRLLKGEDITLFSPSDVPGLYDAFFADQEEFERLYTKYEKDDSIRKQRVKAVELFSLMMQERASTGRIYIQNVDHCNTHSPFDPAIAPVRQSNLCLEIALPTKPLNDVNDENGEIALCTLSAFNLGAINNLDELEELAILAVRALDALLDYQDYPIPAAKRGAMGRRTLGIGVINFAYYLAKHGKRYSDGSANNLTHKTFEAIQYYLLKASNELAKEQGACPWFNETTYAKGILPIDTYKKDLDTIANEPLHYDWEALRESIKTHGLRNSTLSALMPSETSSQISNATNGIEPPRGYVSIKASKDGILRQVVPDYEHLHDAYELLWEMPGNDGYLQLVGIMQKFIDQSISANTNYDPSRFPSGKVPMQQLLKDLLTAYKFGVKTLYYQNTRDGAEDAQDDLVPSIQDDGCESGACKI"

In [29]:
def get_highest_attn(attention_block, threshold=0.99, plot=False):
    """
    Return the (row, column) positions whose attention is unusually high.

    attention_block: 2-D attention values between two proteins
    threshold: cumulative probability (under a Gaussian KDE of all values)
        above which a value counts as an outlier
    plot: accepted but not used

    A Gaussian KDE is fitted to all values and evaluated on a grid of step
    0.001 between the smallest and largest value. The cut-off is the first
    grid point at which the cumulative probability mass reaches
    ``threshold``; every cell strictly above it is reported.

    Raises AssertionError when the probability mass over the grid is not
    within a relative tolerance of 1e-3 of 1.
    """
    step = 0.001

    flat_values = np.array(attention_block.reshape(-1).tolist())
    grid = np.arange(min(flat_values), max(flat_values), step)

    density = gaussian_kde(flat_values).evaluate(grid)
    mass_per_bin = density * step
    assert np.isclose(sum(mass_per_bin), 1, 1e-3)

    # First grid point where the cumulative mass reaches the threshold.
    reached = np.where(np.cumsum(mass_per_bin) >= threshold)[0]
    cutoff = grid[reached[0]]

    rows, cols = np.where(attention_block > cutoff)
    return list(zip(rows, cols))
    
#     if plot : 
#         plt.plot(x, eval)
#         plt.axvline(x=x[last_index_limit], color='black', linestyle='--', label='0.95')
#         plt.tight_layout()
#         plt.show()

        

    
    

In [176]:
# Load two FASTA files into {record id: record} dictionaries.
# NOTE(review): the first path is an absolute, machine-specific location.
all_seqs = SeqIO.to_dict(SeqIO.parse("/Users/mahdi/Documents/GitHub/phage_proj_2/ENA_clusters/ENA_votu_embedding_formatted.pep", 'fasta'))
seqs = SeqIO.to_dict(SeqIO.parse("./test_files/ENA_pol_rnr_assembled.fasta", 'fasta'))


# Select one assembled record by its position in the file.
current_seq_id = 0
seq_id2name = dict(enumerate(seqs.keys()))
seq_id = seq_id2name[current_seq_id]
# The record name ends with "prots_<len>_<len>_..._<len>_"; take the part
# after "prots", strip the surrounding underscores and parse the lengths.
lengths = list(map(int, seq_id.split("prots")[1][1:-1].split("_")))
print(len(lengths))
print(sum(lengths))

# The listed lengths must add up to the length of the assembled record.
assert sum(lengths) == len(seqs[seq_id])


9
2346


In [177]:
seq_id2name[5]

'ENA_AY954970_AY954970_1_57620_58669_56_to_ENA_AY954970_AY954970_1_63253_66456_65_prots_349_229_108_349_89_102_240_98_103_1067_'

In [178]:
# Boundaries of the first listed protein (p1) and the last listed protein (p2)
# inside the assembled record.
p1_start, p1_end, p2_start, p2_end = 0, lengths[0],  sum(lengths[:-1]), sum(lengths)
# p1_start, p1_end, p2_start, p2_end = sum(lengths[:1]), sum(lengths[:2]),  sum(lengths[:-1]), sum(lengths)

# Saved attention tensor for the selected record.
segment_attentions = torch.load(f"./test_files/pol_rnr_{current_seq_id}_full_attentions_weighted.pt")

# Symmetrized attention between p1 and p2.
attention_block = get_attention_block(segment_attentions, (p1_start, p1_end), (p2_start, p2_end), apc_norm=True)
print(attention_block.shape)
# The block dimensions must equal the two protein lengths (in either order).
assert sorted(attention_block.shape) == sorted([lengths[0], lengths[-1]])

torch.Size([706, 797])


In [179]:
# (row, column) positions of attention_block above the 0.999 cumulative-probability cut-off.
x_attention_sites = get_highest_attn(attention_block, threshold=0.999, plot=False)
x_attention_sites

[(0, 619),
 (0, 796),
 (2, 463),
 (9, 261),
 (14, 726),
 (15, 519),
 (18, 272),
 (19, 0),
 (19, 1),
 (19, 2),
 (19, 3),
 (19, 5),
 (19, 6),
 (19, 8),
 (19, 10),
 (19, 11),
 (19, 12),
 (19, 15),
 (19, 16),
 (19, 18),
 (19, 19),
 (19, 23),
 (19, 27),
 (19, 33),
 (19, 48),
 (19, 51),
 (19, 52),
 (19, 56),
 (19, 63),
 (19, 71),
 (19, 72),
 (19, 73),
 (19, 74),
 (19, 75),
 (19, 76),
 (19, 79),
 (19, 82),
 (19, 83),
 (19, 84),
 (19, 85),
 (19, 86),
 (19, 87),
 (19, 88),
 (19, 89),
 (19, 90),
 (19, 91),
 (19, 92),
 (19, 93),
 (19, 94),
 (19, 95),
 (19, 96),
 (19, 97),
 (19, 98),
 (19, 99),
 (19, 100),
 (19, 101),
 (19, 102),
 (19, 103),
 (19, 104),
 (19, 105),
 (19, 106),
 (19, 107),
 (19, 108),
 (19, 109),
 (19, 110),
 (19, 111),
 (19, 112),
 (19, 113),
 (19, 114),
 (19, 115),
 (19, 116),
 (19, 117),
 (19, 118),
 (19, 119),
 (19, 120),
 (19, 121),
 (19, 122),
 (19, 123),
 (19, 124),
 (19, 125),
 (19, 126),
 (19, 127),
 (19, 128),
 (19, 129),
 (19, 130),
 (19, 131),
 (19, 132),
 (19, 133),
 (

In [186]:
seq_id2name

{0: 'ENA_AP011616_AP011616_1_30378_32498_33_to_ENA_AP011616_AP011616_1_35201_37594_41_prots_706_96_88_92_109_142_142_174_797_',
 1: 'ENA_AB605730_AB605730_1_103181_105283_177_to_ENA_AB605730_AB605730_1_116962_119646_192_prots_700_361_218_69_128_78_80_212_171_896_637_122_382_149_106_894_',
 2: 'ENA_AF189021_AF189021_1_15502_17244_27_to_ENA_AF189021_AF189021_1_20174_22189_35_prots_580_178_134_287_134_61_101_60_671_',
 3: 'ENA_AP011617_AP011617_1_30261_32381_33_to_ENA_AP011617_AP011617_1_34895_37324_40_prots_706_96_88_92_124_142_174_809_',
 4: 'ENA_AP014858_AP014858_1_32524_30659_63_to_ENA_AP014858_AP014858_1_129359_126762_208_prots_621_58_153_281_173_280_78_42_104_549_95_80_45_162_219_494_76_110_304_288_204_221_249_78_96_164_75_194_114_81_113_109_81_464_71_124_149_187_66_79_64_63_87_51_73_86_101_153_152_64_120_79_116_334_533_63_145_73_326_107_78_174_444_89_67_79_76_60_74_510_190_176_328_120_345_169_290_596_53_960_635_999_463_411_884_88_102_46_217_233_123_167_109_88_76_109_209_169_133_225