In [1]:
import os
import pathlib
import numpy as np
from Bio.AlignIO import read
from Bio.Align import MultipleSeqAlignment
from aldiscore.scoring import pairwise
from aldiscore.scoring import set_based
from aldiscore.datastructures.ensemble import Ensemble
from aldiscore.datastructures.alignment import Alignment

In [2]:
# Folder with the example alignments, resolved against the current working directory.
TEST_DATA_DIR = pathlib.Path.cwd() / "data"

# Keep only the entries whose names start with "protein"; sorting fixes the
# order in which the alignments enter the ensemble.
protein_files = sorted(
    name for name in os.listdir(TEST_DATA_DIR) if name.startswith("protein")
)

_alignments: list[Alignment] = []
for msa_file in protein_files:
    print(msa_file)
    msa = read(TEST_DATA_DIR / msa_file, "fasta")
    _alignment = Alignment(msa=msa, sort_sequences=False)
    _alignments.append(_alignment)
_ensemble = Ensemble(_alignments)

# Summarise what was loaded: number of records, range of sequence lengths,
# and the shape of every alignment in the ensemble.
# NOTE(review): `_sequence_lengths` is a private attribute of the dataset —
# confirm whether a public accessor exists.
seq_lengths = _ensemble.dataset._sequence_lengths
num_seqs = len(_ensemble.dataset.records)
min_seq_length, max_seq_length = min(seq_lengths), max(seq_lengths)
alignment_shapes = [aln.shape for aln in _ensemble.alignments]
print(
    f"Ensemble of {len(_ensemble.alignments)} alignments.",
    f"{num_seqs} sequences of lengths between {min_seq_length} and {max_seq_length}.",
    f"Alignment dimensions: {alignment_shapes}",
    sep="\n",
)

protein.0.fasta


protein.1.fasta
protein.2.fasta
protein.3.fasta
Ensemble of 4 alignments.
14 sequences of lengths between 329 and 465.
Alignment dimensions: [(14, 774), (14, 806), (14, 764), (14, 774)]


**Compute aggregated confusion scores**

In [3]:
# Each set-based measure is built with its default arguments and evaluated on
# the same ensemble; the label is the prefix printed in front of the score.
confusion_measures = (
    ("ConfSet:", set_based.ConfusionSet),
    ("ConfEntropy:", set_based.ConfusionEntropy),
    ("ConfDisplace:", set_based.ConfusionDisplace),
)

for label, measure_cls in confusion_measures:
    measure = measure_cls()
    score = measure.compute(_ensemble)
    print(label, score)

ConfSet: 0.3300535604369356
ConfEntropy: 0.3742146871758933
ConfDisplace: 0.12681854154444622


**Compute ConfEntropy at three levels of aggregation**

In [4]:
# Default format: the value printed under "Overall mean".
measure = set_based.ConfusionEntropy()
score = measure.compute(_ensemble)
print("Overall mean", score, "", sep="\n")

# format="sequence": the values printed under "Mean per sequence".
measure = set_based.ConfusionEntropy(format="sequence")
scores = measure.compute(_ensemble)
print("Mean per sequence", scores, "", sep="\n")

# format="site": only the first 24 values of the first entry are displayed,
# followed by an ellipsis marking the truncation.
measure = set_based.ConfusionEntropy(format="site")
scores = measure.compute(_ensemble)
first_entry_head = scores[0][:24]
print("Residue-level scores", first_entry_head, "...", sep="\n")

Overall mean
0.3742146871758933

Mean per sequence
[0.32025744 0.35935771 0.4851988  0.36375336 0.49368874 0.2950834
 0.40750361 0.38967909 0.32739424 0.42480393 0.28247488 0.33836156
 0.3025806  0.38748052]

Residue-level scores
[0.         0.06240601 0.31203005 0.44193173 0.50688257 0.74274147
 0.69702139 0.73548293 0.74745517 0.67779062 0.81966454 0.80043377
 0.81966454 0.81966454 0.81966454 0.81966454 0.78846154 0.78846154
 0.80043377 0.81966454 0.76197224 0.74274147 0.74274147 0.74274147]
...


**Compute aggregated pairwise scores**

In [5]:
# Each pairwise distance is built with its default arguments and evaluated on
# the same ensemble; the label is the prefix printed in front of the score.
pairwise_measures = (
    ("d_SSP:", pairwise.SSPDistance),
    ("d_seq:", pairwise.DSeqDistance),
    ("d_pos:", pairwise.DPosDistance),
    ("pHash:", pairwise.PHashDistance),
)

for label, measure_cls in pairwise_measures:
    measure = measure_cls()
    score = measure.compute(_ensemble)
    print(label, score)

d_SSP: 0.49739449499963223
d_seq: 0.34854477303420334
d_pos: 0.423033284642027


pHash: 0.23214285714285718


**Compute distance matrix for d_pos**

In [6]:
# format="matrix" makes compute() return the full matrix rather than a single
# aggregated value; it is rounded to two decimals for display only.
measure = pairwise.DPosDistance(format="matrix")
score = measure.compute(_ensemble)
rounded_matrix = score.round(2)
print("distance matrix", rounded_matrix, sep="\n")

distance matrix
[[0.   0.39 0.3  0.47]
 [0.39 0.   0.43 0.48]
 [0.3  0.43 0.   0.47]
 [0.47 0.48 0.47 0.  ]]


**Compare reference-free and reference-based scores**

In [7]:
# The first alignment of the ensemble stands in as the "reference" purely for
# demonstration purposes.
reference = _ensemble.alignments[0]  # Not an actual reference!
measure = pairwise.DPosDistance()

# Without a reference argument.
ref_free = measure.compute(_ensemble)
print("Reference-free:", ref_free, sep=" ", end="\n")

# Same measure, with the stand-in reference passed as second argument.
ref_based = measure.compute(_ensemble, reference)
print("Reference-based:", ref_based, sep=" ", end="\n")

Reference-free: 0.423033284642027
Reference-based: 0.2887819364440647
