In [1]:
import dotenv
import numpy as np
import pathlib
import itertools
import features.pairwise
import utils.enums
from alignment.dataset import Dataset
from alignment.ensemble import Ensemble
from alignment.alignment import Alignment
import os
from pypythia.custom_types import FileFormat
from features.aligned import EnsembleFeatureExtractor
from utils.path import WildcardPath
from collections import defaultdict
import pandas as pd
from utils.enums import FeatureEnum as FE

In [2]:
DATA_DIR = WildcardPath("/hits/fast/cme/bodynems/data/")
RESULT_DIR = WildcardPath("/hits/fast/cme/bodynems/MSA_difficulty/results")

run_suffix = ""
# run_suffix = "_muscle5"
# run_suffix = "_reduced"

BENCHMARKS = ["balibase3/RV11" + run_suffix, "balibase3/RV12" + run_suffix]
benchmark_mapping = dict(zip(BENCHMARKS, ["RV11 (hard)", "RV12 (easy)"]))
TOOLS = list(utils.enums.ToolEnum)
MUSCLE5 = utils.enums.AlignerEnum.MUSCLE5.path
METAL = pathlib.Path("/hits/fast/cme/bodynems/tools/metal")

OUTPUT_DIR = DATA_DIR / "output" / "{benchmark}"
TOOL_DIR = OUTPUT_DIR / "{dataset}" / "{tool}"

ssp_new = features.pairwise.SSPDistance()
seq_new = features.pairwise.HomologySeqDistance()
pos_new = features.pairwise.HomologyPosDistance()
metrics = {metric.name: metric for metric in [ssp_new, seq_new, pos_new]}

In [48]:
def compute_metric_deviation(extractor):
    dists_new_dict = defaultdict(list)

    for alignment_x, alignment_y in itertools.combinations(
        extractor._ensemble.ensemble, r=2
    ):
        for name, metric in metrics.items():
            dist = metric.compute(alignment_x, alignment_y)
            dists_new_dict[name].append(dist)
    dists_old_dict = extractor._metal_stats()

    results = {}
    for name in dists_new_dict:
        dists_new = np.array(dists_new_dict[name])
        dists_old = np.array(dists_old_dict[name + "_metal"])
        results[name] = {
            "deviation": np.abs(dists_new - dists_old),
            "original": dists_old,
        }
    return results


def deviation_job(dataset_dir):
    deviations = defaultdict(list)
    originals = defaultdict(list)
    dataset = Dataset(dataset_dir / "sequences.fasta")
    ensemble = Ensemble.from_efa(
        MUSCLE5,
        dataset_dir / "all_tools.efa",
        dataset=dataset,
    )
    extractor = EnsembleFeatureExtractor(
        ensemble,
        dataset,
        None,
        None,
        None,
        METAL,
        compute_distance_matrix=False,
        threads=1,
    )
    out = compute_metric_deviation(extractor)
    for key in out:
        deviations[key].extend(out[key]["deviation"])
        originals[key].extend(out[key]["original"])
    return (deviations, originals)

In [None]:
from multiprocessing import Pool

dataset_dirs = []
for benchmark in BENCHMARKS:
    dataset_dir = OUTPUT_DIR.format(benchmark=benchmark)
    datasets = dataset_dir.listdir(dirs_only=True)
    for dataset_name in datasets:
        dataset_dirs.append(dataset_dir / dataset_name)


with Pool(24) as pool:
    pile = pool.map(deviation_job, dataset_dirs)

deviation_dict = defaultdict(list)
score_dict = defaultdict(list)
for entry in pile:
    for metric_name in metrics:
        deviation_dict[metric_name].extend(entry[0][metric_name])
        score_dict[metric_name].extend(entry[1][metric_name])

deviation_df = pd.DataFrame(deviation_dict)
score_df = pd.DataFrame(score_dict)
# deviation_df.to_pickle(RESULT_DIR / "technical" / "metal_deviation_df.pkl")
# score_df.to_pickle(RESULT_DIR / "technical" / "metal_score_df.pkl")

In [3]:
deviation_df = pd.read_pickle(RESULT_DIR / "technical" / "metal_deviation_df.pkl")
score_df = pd.read_pickle(RESULT_DIR / "technical" / "metal_score_df.pkl")

print(deviation_df.shape, score_df.shape)
print(deviation_df[FE.HOMOLOGY_POS_DIST].max())

(182736, 3) (182736, 3)
0.029619404456714626


In [12]:
import seaborn as sn

plot_df = deviation_df[[FE.HOMOLOGY_POS_DIST]]

print((plot_df > 1e-4).mean())

# np.quantile(plot_df, np.arange(1, 11) / 10)
datasets = pd.Series(np.arange(162).repeat((48 * 47 / 2)))
plot_df = (
    plot_df.groupby(datasets)
    .mean()
    .sort_values(by=FE.HOMOLOGY_POS_DIST, ascending=False, ignore_index=True)
)
percentiles = [0.5, 0.75, 0.9, 0.95, 0.99, 1]
names = [f"$P_{{{q}}}$" for q in percentiles]
names[-1] = "max"
qs = np.quantile(plot_df[FE.HOMOLOGY_POS_DIST], percentiles)

print(pd.Series(qs, index=names).to_frame().T.round(6).to_latex())

homology_pos_dist    0.008373
dtype: float64
\begin{tabular}{lrrrrrr}
\toprule
 & $P_{0.5}$ & $P_{0.75}$ & $P_{0.9}$ & $P_{0.95}$ & $P_{0.99}$ & max \\
\midrule
0 & 0.000000 & 0.000002 & 0.000018 & 0.000035 & 0.000093 & 0.001081 \\
\bottomrule
\end{tabular}



In [None]:
 for name, stats in deviation_dict.items():
    scores = score_dict[name]
    print()
    print(d)
    print(stats)
    print(name, np.max(stats))

['abs', 'rel', 'abs', 'rel']


UFuncTypeError: ufunc 'maximum' did not contain a loop with signature matching types (dtype('<U3'), dtype('<U3')) -> None