In [10]:
import numpy as np
import pathlib
from aldiscore.scoring import confusion
from aldiscore.datastructures.alignment import Alignment
from aldiscore.datastructures.ensemble import Ensemble
import os
from Bio.Align import MultipleSeqAlignment
from Bio.AlignIO import read

In [38]:
# Directory with the example alignments, resolved relative to the notebook's
# working directory (expects the notebook to run one level below the repo root).
TEST_DATA_DIR = pathlib.Path.cwd().parent / "tests" / "data"

_alignments: list[Alignment] = []

# os.listdir returns entries in arbitrary order; sorting makes the ensemble
# order (and the printed alignment shapes) reproducible across machines.
for msa_file in sorted(os.listdir(TEST_DATA_DIR)):
    if msa_file.startswith("protein"):
        msa: MultipleSeqAlignment = read(TEST_DATA_DIR / msa_file, "fasta")
        _alignment = Alignment(msa=msa, sort_sequences=True)
        _alignments.append(_alignment)

# Fail early with a clear message instead of hitting a NameError further down
# when no matching file was found.
if not _alignments:
    raise FileNotFoundError(
        f"No alignment files starting with 'protein' found in {TEST_DATA_DIR}"
    )

# The dataset is derived from the last loaded alignment.
# NOTE(review): presumably every alignment covers the same sequences — confirm.
_dataset = _alignments[-1].get_dataset()
_ensemble = Ensemble(_alignments, _dataset)


# Summary statistics of the loaded ensemble.
# NOTE(review): `_sequence_lengths` is a private attribute of the dataset;
# switch to a public accessor if the library offers one.
num_seqs = len(_ensemble.dataset.records)
min_seq_length = min(_ensemble.dataset._sequence_lengths)
max_seq_length = max(_ensemble.dataset._sequence_lengths)
alignment_shapes = [a.shape for a in _ensemble.alignments]
print(f"Ensemble of {len(_ensemble.alignments)} alignments.")
print(f"{num_seqs} sequences of lengths between {min_seq_length} and {max_seq_length}.")
print(f"Alignment dimensions: {alignment_shapes}")

Ensemble of 4 alignments.
14 sequences of lengths between 329 and 465.
Alignment dimensions: [(14, 764), (14, 806), (14, 774), (14, 774)]


**Compute three variants of the confusion score**

In [52]:
# Run each confusion-score variant on the same ensemble with aggregate="site"
# and print the result next to a short label.
for label, metric_cls in (
    ("ConfSet:", confusion.ConfusionSet),
    ("ConfEntropy:", confusion.ConfusionEntropy),
    ("ConfDisplace:", confusion.ConfusionDisplace),
):
    metric = metric_cls(aggregate="site")
    score = metric.compute(_ensemble)
    print(label, score)

ConfSet: 0.33005356043693557
ConfEntropy: 0.3742146871758933
ConfDisplace: 0.12681854154444624


**Compute ConfEntropy at three levels of aggregation**

In [54]:
# aggregate="site": the whole ensemble reduced to one value.
metric = confusion.ConfusionEntropy(aggregate="site")
score = metric.compute(_ensemble)
print("Overall mean", score, "", sep="\n")

# aggregate="sequence": one value per sequence.
metric = confusion.ConfusionEntropy(aggregate="sequence")
scores = metric.compute(_ensemble)
print("Mean per sequence", scores, "", sep="\n")

# Default aggregation: only the first 40 values of the first entry are shown
# to keep the output short.
metric = confusion.ConfusionEntropy()
scores = metric.compute(_ensemble)
print("Residue-level scores", scores[0][:40], "...", sep="\n")

Overall mean
0.3742146871758933

Mean per sequence
[0.4851988  0.49368874 0.2950834  0.36375336 0.32025744 0.33836156
 0.35935771 0.3025806  0.28247488 0.40750361 0.32739424 0.42480393
 0.38748052 0.38967909]

Residue-level scores
[0.         0.06240601 0.2547137  0.53808558 0.70899363 0.91581839
 0.7547137  0.87735685 0.89658762 0.89658762 0.89658762 0.89658762
 0.92307692 0.94230769 0.94230769 0.94230769 0.94230769 0.96153846
 0.96153846 0.86538462 0.88461538 0.94230769 0.96153846 0.98076923
 0.98076923 0.98076923 0.98076923 0.98076923 0.98076923 0.94230769
 0.92307692 0.88461538 0.96153846 0.96153846 0.96153846 0.96153846
 0.98076923 0.94230769 0.86538462 0.60558125]
...
