In [1]:
# Install the external command-line tools this notebook relies on.
# t_coffee is invoked by the evaluation cells further down (seq_reformat / aln_compare);
# mmseqs2 is presumably required for the sequence-weight computation — TODO confirm.
# NOTE(review): no versions are pinned here, so results may vary between environments.
!mamba install t-coffee mmseqs2 -y -q

In [1]:
import os
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from learnMSA.msa_hmm import Configuration, Align, Visualize, Emitter, Transitioner, Initializers, Training
from learnMSA.msa_hmm.SequenceDataset import SequenceDataset
from learnMSA.protein_language_models import Common, EmbeddingBatchGenerator

2024-02-06 15:10:13.502863: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-06 15:10:13.571038: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-06 15:10:13.586046: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-06 15:10:13.972181: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvi


# Experimental demo: learnMSA + protein language model

This notebook demonstrates how to align a set of protein sequences with learnMSA supported by a large, pre-trained protein language model.

This configuration of learnMSA — although a prototype — is the most accurate variant of learnMSA currently available. It is also the most computationally expensive, so running this notebook on a GPU is recommended.

In [2]:
# --- Input data ---------------------------------------------------------------
# Fasta file containing the unaligned sequences that should be aligned.
# (A small alternative example is "test/data/egf.fasta".)
train_filename = "../../snakeMSA/data/homfam/train/peroxidase"

# Reference alignment used for evaluation. Its sequence IDs have to match the IDs
# of (possibly only a subset of) the sequences in train_filename.
# Use an empty string if there is no reference.
# (The matching small example is "test/data/egf.ref".)
ref_filename = "../../snakeMSA/data/homfam/refs/peroxidase"

# --- Training setup -----------------------------------------------------------
# How many models are trained independently of each other.
num_models = 4

HMM training supported by protein embeddings:

In [3]:
def align(filename, out_filename):
    """Align the sequences of a fasta file with learnMSA and a protein language model.

    The number of trained models is taken from the notebook-level `num_models`.

    Args:
        filename: Path to the fasta file with the unaligned input sequences.
            It is used both for loading the dataset and for computing the
            sequence weights, so the two always refer to the same sequences.
        out_filename: Output path handed to `Align.run_learnMSA`.

    Returns:
        The object returned by `Align.run_learnMSA`.
    """
    scoring_model_config = Common.ScoringModelConfig(dim=32, lm_name="protT5", activation="softmax", scaled=False)
    config = Configuration.make_default(num_models,
                                        use_language_model=True,
                                        scoring_model_config=scoring_model_config,
                                        frozen_insertions=True,
                                        num_prior_components=10,
                                        V2_emitter=True,
                                        V2_temperature=3.)
    # we have to define a special model- and batch generator if using a language model
    # because the emission probabilities are computed differently and the LM requires specific inputs
    model_gen = EmbeddingBatchGenerator.make_generic_embedding_model_generator(config["scoring_model_config"].dim)
    batch_gen = EmbeddingBatchGenerator.EmbeddingBatchGenerator(config["scoring_model_config"])
    # Use the `filename` argument (not the global `train_filename`) so that the
    # function really aligns the file it was asked to align.
    with SequenceDataset(filename, fmt="fasta") as data:
        # Crop sequences that are longer than three times the mean sequence length.
        config["crop_long_seqs"] = int(np.ceil(3 * np.mean(data.seq_lens)))
        # Weights are computed from the same file; "tmp" is the working directory
        # passed to compute_sequence_weights.
        sequence_weights = Align.compute_sequence_weights(filename, "tmp", config["cluster_seq_id"])
        alignment_model = Align.run_learnMSA(data,
                                            out_filename,
                                            config,
                                            model_generator=model_gen,
                                            batch_generator=batch_gen,
                                            sequence_weights=sequence_weights,
                                            verbose=True,
                                            align_insertions=True)
    return alignment_model

In [4]:
# Train on the configured input file and write the resulting alignment to disk.
alignment_model = align(
    filename=train_filename,
    out_filename="test/data/interactive.alignment.fasta",
)
# Print and plot the model stored as `best_model` on the returned alignment model.
Visualize.print_and_plot(alignment_model, alignment_model.best_model)

Training of 4 models on file peroxidase
Configuration: 
{
num_models : 4
transitioner : ProfileHMMTransitioner(
 transition_init=
    {
    begin_to_match : DefaultEntry() , match_to_end : DefaultExit() , 
    match_to_match : DefaultMatchTransition(1) , match_to_insert : DefaultMatchTransition(-1) , 
    insert_to_match : Norm(0, 0.1) , insert_to_insert : Norm(-0.5, 0.1) , 
    match_to_delete : DefaultMatchTransition(-1) , delete_to_match : Norm(0, 0.1) , 
    delete_to_delete : Norm(-0.5, 0.1) , left_flank_loop : Norm(0, 0.1) , 
    left_flank_exit : Norm(-1, 0.1) , right_flank_loop : Norm(0, 0.1) , 
    right_flank_exit : Norm(-1, 0.1) , unannotated_segment_loop : Norm(0, 0.1) , 
    unannotated_segment_exit : Norm(-1, 0.1) , end_to_unannotated_segment : Norm(-9, 0.1) , 
    end_to_right_flank : Norm(0, 0.1) , end_to_terminal : Norm(0, 0.1)
    },
 flank_init=Const(0.0),
 prior=ProfileHMMTransitionPrior(match_comp=1, insert_comp=1, delete_comp=1, alpha_flank=7000, alpha_single=1000

  from .autonotebook import tqdm as notebook_tqdm
2024-02-06 15:11:04.151958: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Computing all embeddings (this may take a while).
10% done.
20% done.
30% done.
40% done.
50% done.
60% done.
70% done.
80% done.
90% done.
100% done.
Fitting models of lengths [203 203 201 196] on 4514 sequences.
Batch size= 100 Learning rate= 0.05
Using sequence weights  [0.11111111 0.11111111 0.11111111 ... 0.01020408 0.1        0.03571429] .
Using 1 GPUs.


2024-02-06 15:13:15.471236: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:15.471392: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:15.471472: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:15.471589: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:15.471668: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

Epoch 1/10


2024-02-06 15:13:17.073981: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:17.074151: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:17.074229: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:17.074342: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:13:17.074419: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

67/67 - 28s - loss: 681.2725 - loglik: -6.1808e+02 - logprior: -6.2820e+01 - 28s/epoch - 412ms/step
Epoch 2/10
67/67 - 20s - loss: 522.7279 - loglik: -5.2296e+02 - logprior: 0.9082 - 20s/epoch - 297ms/step
Epoch 3/10
67/67 - 19s - loss: 507.8738 - loglik: -5.1518e+02 - logprior: 8.0145 - 19s/epoch - 291ms/step
Epoch 4/10
67/67 - 20s - loss: 501.9378 - loglik: -5.1203e+02 - logprior: 10.7974 - 20s/epoch - 292ms/step
Epoch 5/10
67/67 - 20s - loss: 498.1382 - loglik: -5.1019e+02 - logprior: 12.7633 - 20s/epoch - 294ms/step
Epoch 6/10
67/67 - 20s - loss: 501.3918 - loglik: -5.1486e+02 - logprior: 14.1830 - 20s/epoch - 296ms/step


2024-02-06 15:15:24.074296: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:24.074460: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:24.074538: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:24.074653: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:24.074730: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

expansions model 0: [(0, 14), (5, 2), (6, 3), (24, 5), (36, 2), (43, 1), (46, 2), (52, 1), (54, 2), (78, 2), (79, 1), (82, 2), (95, 2), (96, 2), (107, 11), (108, 2), (110, 1), (139, 2), (143, 2), (147, 1), (152, 1), (155, 1), (158, 2), (160, 1), (162, 3), (170, 1), (180, 1), (181, 16), (191, 2), (192, 2), (203, 2)]
discards model 0: [143]
expansions model 1: [(0, 14), (7, 4), (8, 1), (24, 5), (36, 2), (44, 2), (52, 1), (55, 1), (78, 2), (80, 1), (83, 2), (84, 2), (95, 2), (96, 2), (107, 11), (108, 3), (109, 3), (117, 1), (142, 2), (151, 2), (154, 1), (157, 1), (158, 1), (160, 1), (161, 1), (165, 2), (178, 2), (181, 15), (189, 2), (203, 2)]
discards model 1: [ 35  36 145 146 147]
expansions model 2: [(0, 14), (5, 1), (6, 4), (7, 1), (8, 2), (22, 5), (35, 1), (43, 1), (44, 1), (55, 1), (62, 1), (80, 1), (83, 2), (84, 2), (95, 2), (96, 2), (107, 11), (108, 3), (109, 3), (137, 2), (143, 2), (151, 2), (159, 1), (162, 2), (163, 1), (170, 1), (179, 17), (186, 2), (187, 3), (201, 2)]
discards 

2024-02-06 15:15:30.037570: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:30.037737: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:30.037816: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:30.037931: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:30.038009: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

Epoch 1/4


2024-02-06 15:15:31.084744: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:31.084903: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:31.084981: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:31.085096: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-02-06 15:15:31.085176: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from S

134/134 - 42s - loss: 532.2435 - loglik: -4.9874e+02 - logprior: -3.2885e+01 - 42s/epoch - 314ms/step
Epoch 2/4
134/134 - 35s - loss: 456.6059 - loglik: -4.9063e+02 - logprior: 34.6737 - 35s/epoch - 265ms/step
Epoch 3/4
134/134 - 35s - loss: 445.0650 - loglik: -4.8876e+02 - logprior: 44.3358 - 35s/epoch - 264ms/step
Epoch 4/4


KeyboardInterrupt: 

In [None]:
# Debug inspection: call make_B() on the first emitter of the cell that belongs to the
# third-to-last layer of alignment_model.model and display the slice [0, :, 4].
# NOTE(review): presumably the axes are (model, state, symbol), i.e. this shows one
# symbol's emission probability across all states of model 0 — TODO confirm.
# NOTE(review): layers[-3] is a positional lookup and depends on the model architecture.
alignment_model.model.layers[-3].cell.emitter[0].make_B()[0,:,4]

<tf.Tensor: shape=(369,), dtype=float32, numpy=
array([2.24936139e-02, 6.23358214e-17, 3.47443699e-14, 1.49089886e-18,
       1.50349558e-18, 1.55535401e-18, 3.98634235e-20, 1.69753214e-20,
       1.68715506e-20, 5.58980880e-03, 2.08687163e-18, 1.83283740e-20,
       6.14831038e-02, 2.60568913e-02, 1.66628394e-20, 2.44942768e-20,
       1.20309098e-18, 2.19452616e-20, 3.77059433e-22, 3.63020990e-22,
       2.76029073e-02, 3.71161303e-22, 3.54347180e-22, 3.24690938e-02,
       3.64542040e-22, 3.57854797e-22, 5.95521927e-22, 1.31001620e-21,
       1.48310566e-21, 1.09716672e-02, 7.65357747e-21, 3.57599004e-22,
       3.60278001e-22, 3.66362428e-22, 5.19395582e-02, 3.58105894e-22,
       3.87421506e-22, 2.24134251e-02, 3.70907933e-22, 3.50702820e-22,
       3.72396116e-22, 3.54342712e-22, 3.79569525e-22, 4.15133394e-22,
       3.60572113e-22, 3.60776081e-22, 3.58379434e-22, 3.56883551e-22,
       3.63621635e-22, 3.62074105e-22, 3.69959356e-22, 1.32112559e-02,
       3.69539203e-22, 3.5814

In [None]:
!id_list=$(sed -n '/^>/p' {ref_filename} | sed 's/^.//') ; export MAX_N_PID_4_TCOFFEE=10000000 ; t_coffee -other_pg seq_reformat -in test/data/interactive.alignment.fasta -action +extract_seq_list ${{id_list[@]}} +rm_gap > test/data/interactive.projection.fasta

HERE: 1tme
HERE: 2mev
HERE: 1bbt
HERE: 1r1a
HERE: 4rhv
HERE: 2plv


In [None]:
# Score the projected learnMSA alignment (-al2) against the reference alignment (-al1)
# using t_coffee's aln_compare program with compare mode "sp".
# NOTE(review): "sp" presumably stands for sum-of-pairs — confirm against the T-Coffee docs.
!t_coffee -other_pg aln_compare -al1 {ref_filename} -al2 test/data/interactive.projection.fasta -compare_mode sp

*****************************************************
seq1       seq2          Sim   [ALL]           Tot  
rhv           6          33.1    67.2 [100.0]   [20998]
