In [1]:
import os
import pathlib
import numpy as np
from Bio.AlignIO import read
from Bio.Align import MultipleSeqAlignment
from aldiscore.scoring import pairwise
from aldiscore.scoring import set_based
from aldiscore.datastructures.ensemble import Ensemble
from aldiscore.datastructures.alignment import Alignment

In [2]:
# Directory holding the example MSA files, resolved against the current working directory.
TEST_DATA_DIR = pathlib.Path.cwd() / "data"

_alignments: list[Alignment] = []

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting the
# names makes the position of each alignment inside the ensemble reproducible across
# machines and runs (presumably this also fixes the row/column order of matrix-style
# results computed from the ensemble -- confirm against the Ensemble implementation).
for msa_file in sorted(os.listdir(TEST_DATA_DIR)):
    # Only files whose name starts with "protein" belong to this example ensemble.
    if msa_file.startswith("protein"):
        msa = read(TEST_DATA_DIR / msa_file, "fasta")
        _alignment = Alignment(msa=msa, sort_sequences=True)
        _alignments.append(_alignment)
_ensemble = Ensemble(_alignments)


# Summary of what was loaded.
# NOTE(review): `_sequence_lengths` is a private attribute of the dataset object;
# switch to a public accessor if the library offers one.
_seq_lengths = _ensemble.dataset._sequence_lengths
num_seqs = len(_ensemble.dataset.records)
min_seq_length = min(_seq_lengths)
max_seq_length = max(_seq_lengths)
alignment_shapes = [a.shape for a in _ensemble.alignments]
print(f"Ensemble of {len(_ensemble.alignments)} alignments.")
print(f"{num_seqs} sequences of lengths between {min_seq_length} and {max_seq_length}.")
print(f"Alignment dimensions: {alignment_shapes}")

Ensemble of 4 alignments.
14 sequences of lengths between 329 and 465.
Alignment dimensions: [(14, 764), (14, 806), (14, 774), (14, 774)]


**Compute aggregated confusion scores**

In [3]:
# Run every set-based confusion measure with aggregate="site" and print one
# labelled result per measure.
for label, measure_cls in (
    ("ConfSet:", set_based.ConfusionSet),
    ("ConfEntropy:", set_based.ConfusionEntropy),
    ("ConfDisplace:", set_based.ConfusionDisplace),
):
    measure = measure_cls(aggregate="site")
    score = measure.compute(_ensemble)
    print(label, score)

ConfSet: 0.33005356043693557
ConfEntropy: 0.3742146871758933
ConfDisplace: 0.12681854154444624


**Compute ConfEntropy at three levels of aggregation**

In [4]:
# 1) aggregate="site": printed under the "Overall mean" heading.
measure = set_based.ConfusionEntropy(aggregate="site")
score = measure.compute(_ensemble)
print("Overall mean", score, "", sep="\n")

# 2) aggregate="sequence": printed under the "Mean per sequence" heading.
measure = set_based.ConfusionEntropy(aggregate="sequence")
scores = measure.compute(_ensemble)
print("Mean per sequence", scores, "", sep="\n")

# 3) aggregate=None: no aggregation requested. Only the first 24 values of the
#    first entry are shown to keep the output short.
measure = set_based.ConfusionEntropy(aggregate=None)
scores = measure.compute(_ensemble)
print("Residue-level scores", scores[0][:24], "...", sep="\n")

Overall mean
0.3742146871758933

Mean per sequence
[0.4851988  0.49368874 0.2950834  0.36375336 0.32025744 0.33836156
 0.35935771 0.3025806  0.28247488 0.40750361 0.32739424 0.42480393
 0.38748052 0.38967909]

Residue-level scores
[0.         0.06240601 0.2547137  0.53808558 0.70899363 0.91581839
 0.7547137  0.87735685 0.89658762 0.89658762 0.89658762 0.89658762
 0.92307692 0.94230769 0.94230769 0.94230769 0.94230769 0.96153846
 0.96153846 0.86538462 0.88461538 0.94230769 0.96153846 0.98076923]
...


**Compute aggregated pairwise scores**

In [5]:
# Compute each pairwise distance measure on the ensemble and report the mean of
# the values it returns.
for label, measure_cls in (
    ("d_SSP:", pairwise.SSPDistance),
    ("d_seq:", pairwise.DSeqDistance),
    ("d_pos:", pairwise.DPosDistance),
    ("pHash:", pairwise.PHashDistance),
):
    measure = measure_cls()
    score = measure.compute(_ensemble).mean()
    print(label, score)

d_SSP: 0.4973944949996321
d_seq: 0.3485447730342033
d_pos: 0.42303328464202705


pHash: 0.2775297619047619


**Compute distance matrix for d_pos**

In [6]:
# Request the d_pos result in "matrix" format instead of reducing it to a mean,
# and show it rounded to two decimals.
measure = pairwise.DPosDistance()
score = measure.compute(_ensemble, format="matrix")
print("distance matrix", score.round(2), sep="\n")

distance matrix
[[0.   0.43 0.3  0.47]
 [0.43 0.   0.39 0.48]
 [0.3  0.39 0.   0.47]
 [0.47 0.48 0.47 0.  ]]
