# Notebook to evaluate inferred microbial anomaly scores

To run this notebook you need to create and activate the following conda environment:

```
conda create --name score_eval numpy pandas matplotlib seaborn scipy ipython ipykernel -y
conda activate score_eval
pip install -e .
```


## Setup

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd

from src.utils_eval_score import (
    _get_abx_info,
    _get_all_scores,
    _plot_score_after_nth_abx_exposure,
    _plot_score_over_age,
    _select_samples_around_nth_abx_exposure,
    display_scatterplot_w_scores,
    get_age_at_1st_2nd_3rd_abx_exposure,
    plot_time_between_abx_exposures,
    plot_trajectory,
)

%load_ext autoreload
%autoreload 2
%matplotlib inline

# avg. number of days per month
DAYS_PER_MONTH = 30.437

USER input: define the inferred model and linked datasets to evaluate here:

In [2]:
#### USER INPUT START
# name of the model; interpolated into the `../data/{model_name}/anomaly_detection/`
# paths from which scores are read and under which evaluation output is stored
model_name = "saved_models_microbial_novel_alpha_div/id-2_test"
# which model version to evaluate: "best" or "last"
# (selects the `scores_{point_to_evaluate}` input folder)
point_to_evaluate = "best"

# name of feature dataset used for model
# (read from `../data/original_data/{ft_name}.tsv`)
ft_name = "ft_vat19_anomaly_v20240806_entero_family"
# name of abx time-series used for model
# (read from `../data/original_data/{abx_ts_name}.tsv`)
abx_ts_name = "ts_vat19_abx_v20240806"

# limit evaluation to time range up to this many months (if None no limit is set
# and all scores are evaluated); forwarded to the score and abx loaders
limit_months = 24.0

# whether to group samples prior to abx exposure in analysis
group_samples = False

# how many samples prior and after abx exposure to consider; both values are
# forwarded unchanged to the sample-selection and plotting helpers
# (presumably a negative value counts samples before the exposure - TODO confirm)
min_samples = -3.0
max_samples = 12.0
#### USER INPUT END

## Prepare data

In [3]:
# folder holding the inferred scores of the selected model version
scores_path = f"../data/{model_name}/anomaly_detection/scores_{point_to_evaluate}/"
# folder handed to the plotting helpers below as their output location
evaluation_path = (
    f"../data/{model_name}/anomaly_detection/evaluation_{point_to_evaluate}_overall/"
)

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
os.makedirs(evaluation_path, exist_ok=True)


In [4]:
# load the scores of both splits with the same month limit (train first, then val)
scores_train, scores_val = (
    _get_all_scores(scores_path, split, limit_months=limit_months)
    for split in ("train", "val")
)


In [5]:
# rows with abx == False per split; only score_1 is kept for these rows,
# so score_2 and score_3 are dropped
noabx_train = scores_train.loc[~scores_train["abx"]].drop(
    columns=["score_2", "score_3"]
)
noabx_val = scores_val.loc[~scores_val["abx"]].drop(columns=["score_2", "score_3"])

In [6]:
# pool the abx rows of train and val into one frame (train rows first):
# none of the abx samples were used for training, so the split is irrelevant here
abx_scores_flat_train = scores_train.loc[scores_train["abx"]].copy()
abx_scores_flat_val = scores_val.loc[scores_val["abx"]].copy()

abx_scores_flat = pd.concat([abx_scores_flat_train, abx_scores_flat_val])


In [None]:
# add metadata matching samples over time from ft
ft_df = pd.read_csv(f"../data/original_data/{ft_name}.tsv", sep="\t", index_col=0)
# `age_days` becomes the integer `day` key used to match the score rows
ft_df["age_days"] = ft_df["age_days"].astype(int)
ft_df = ft_df.rename(columns={"age_days": "day"})

cols_to_evaluate = [
    "abx_any_cumcount",
    "abx_max_count_ever",
    "abx_any_last_t_dmonths",
    "abx_any_last_dur_days",
    "geo_location_name",
]
ft_df = ft_df[["day", "host_id"] + cols_to_evaluate].copy()
# per-host maximum of `abx_any_cumcount` over the rows of the feature table
ft_df = ft_df.assign(
    max_abx_w_microbiome=lambda df: df.groupby("host_id")["abx_any_cumcount"].transform(
        "max"
    ),
)

# add additional information to inferred scores
# validate="many_to_one" raises if the feature table repeats a (host_id, day)
# key, which would otherwise silently duplicate score rows in the left merge;
# the printed shapes must therefore differ only in the number of columns
print(abx_scores_flat.shape)
abx_scores_flat = abx_scores_flat.merge(
    ft_df, on=["host_id", "day"], how="left", validate="many_to_one"
)
print(abx_scores_flat.shape)

In [8]:
# get start of each abx course per host
# (read from the abx time-series file selected in the user input, with the same
# month limit that is applied to the scores)
path_to_abx_ts = f"../data/original_data/{abx_ts_name}.tsv"
abx_df = _get_abx_info(path_to_abx_ts, limit_months=limit_months)

In [9]:
# get age at n-th abx exposures
# (the columns `age_1st_abx`, `age_2nd_abx` and `age_3rd_abx` of the result are
# used further below)
abx_age_at_all = get_age_at_1st_2nd_3rd_abx_exposure(abx_df)

## Score after 1st, 2nd and 3rd abx exposure


### score_1, score_2 and score_3 respectively

In [None]:
# the n-th exposure is evaluated with the score column of the same index:
# score_1 for the 1st, score_2 for the 2nd and score_3 for the 3rd exposure
sample_window = {"min_samples": min_samples, "max_samples": max_samples}
for n in (1, 2, 3):
    score_col = f"score_{n}"
    # select the samples around the n-th abx exposure ...
    scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
        abx_scores_flat,
        abx_df,
        n=n,
        group_samples=group_samples,
        score_var=score_col,
        **sample_window,
    )
    # ... and plot their score against the distance to that exposure
    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        n=n,
        path_to_save=evaluation_path,
        flag=score_col,
        grouped_samples=group_samples,
        **sample_window,
    )

In [None]:
# time between consecutive exposures: 1st -> 2nd, then 2nd -> 3rd
for n0_label, n1_label in (("1st", "2nd"), ("2nd", "3rd")):
    plot_time_between_abx_exposures(
        abx_age_at_all,
        n0_label=n0_label,
        n1_label=n1_label,
        path_to_save=evaluation_path,
    )
    plt.show()

### score_1 only - for comparison with alpha diversity matching

In [None]:
# same plots as above, but score_1 is used for every exposure
score_col = "score_1"
for n in (1, 2, 3):
    # samples around the n-th abx exposure
    scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
        abx_scores_flat,
        abx_df,
        score_var=score_col,
        n=n,
        group_samples=group_samples,
        min_samples=min_samples,
        max_samples=max_samples,
    )
    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples,
        n=n,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        flag="score_1_only",
        path_to_save=evaluation_path,
        grouped_samples=group_samples,
        min_samples=min_samples,
        max_samples=max_samples,
    )

## Score after 2nd abx: split by time since 1st abx exposure

In [13]:
# cut-off on the time between the 1st and the 2nd abx exposure; hosts are split
# into "<= threshold" and "> threshold" groups below
duration_threshold = 8  # in months

In [14]:
# two bins, closed on the right: (-inf, threshold] and (threshold, inf)
bins = [-float("inf"), duration_threshold, float("inf")]
between_labels = [f"<= {duration_threshold} months", f"> {duration_threshold} months"]

# time from the 1st to the 2nd exposure and its category; the index of
# abx_age_at_all is moved into a regular column at the end
abx_time_between = abx_age_at_all.assign(
    time_since_1st=lambda df: df["age_2nd_abx"] - df["age_1st_abx"],
    time_since_1st_cat=lambda df: pd.cut(
        df["time_since_1st"],
        bins=bins,
        labels=between_labels,
        right=True,
    ),
).reset_index()

In [None]:
# get samples around n-th abx exposure
n = 2
score_col = f"score_{n}"
scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
    abx_scores_flat,
    abx_df,
    n=n,
    min_samples=min_samples,
    max_samples=max_samples,
    group_samples=group_samples,
    score_var=score_col,
)

# one plot per time-since-previous-exposure category; `i` numbers the output flag
for i, cat in enumerate(between_labels):
    # filter only for scores of hosts where 2nd abx is in cat time duration
    hosts = abx_time_between.loc[
        abx_time_between["time_since_1st_cat"] == cat, "host_id"
    ].unique()

    scores_abx_nth_samples_f = scores_abx_nth_samples[
        scores_abx_nth_samples["host_id"].isin(hosts)
    ].copy()
    flag = f"time_since_{n-1}th_{i}"
    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples_f,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        n=n,
        path_to_save=evaluation_path,
        flag=flag,
        tag=f"time since {n-1}-th: {cat}",
        min_samples=min_samples,
        max_samples=max_samples,
        grouped_samples=group_samples,
    )

## Score after 1st abx: split by duration: < 6 days vs. >= 6 days

In [None]:
n = 1
score_col = f"score_{n}"
scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
    abx_scores_flat,
    abx_df,
    n=n,
    score_var=score_col,
    group_samples=group_samples,
    min_samples=min_samples,
    max_samples=max_samples,
)

# histogram of `abx_any_last_dur_days` before it is binned
scores_abx_nth_samples["abx_any_last_dur_days"].hist(bins=10)

# two bins, closed on the left: [-inf, 6) and [6, inf)
dur_labels = ["< 6 days", ">= 6 days"]
bins = [-float("inf"), 6, float("inf")]
scores_abx_nth_samples = scores_abx_nth_samples.assign(
    abx_duration_category=pd.cut(
        scores_abx_nth_samples["abx_any_last_dur_days"],
        bins=bins,
        labels=dur_labels,
        right=False,
    )
)

# number of samples per duration category (missing durations included)
scores_abx_nth_samples["abx_duration_category"].value_counts(dropna=False)

In [None]:
# one plot per duration category; `i` numbers the output flag
for i, dur in enumerate(dur_labels):
    print(dur)
    # hosts with at least one selected sample in this duration category ...
    host_w_dur = (
        scores_abx_nth_samples.loc[
            scores_abx_nth_samples["abx_duration_category"] == dur, "host_id"
        ]
        .unique()
        .tolist()
    )
    # ... contribute all of their selected samples, whatever the sample's own category
    scores_abx_nth_samples_dur = scores_abx_nth_samples.loc[
        scores_abx_nth_samples["host_id"].isin(host_w_dur)
    ].copy()
    flag_dur = f"dur_{i}"
    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples_dur,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        n=n,
        path_to_save=evaluation_path,
        flag=flag_dur,
        tag=f"duration: {dur}",
        min_samples=min_samples,
        max_samples=max_samples,
        grouped_samples=group_samples,
    )

## Score after 1st abx: split by time of life

In [None]:
n = 1
score_col = f"score_{n}"
scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
    abx_scores_flat,
    abx_df,
    n=n,
    min_samples=min_samples,
    max_samples=max_samples,
    group_samples=group_samples,
    score_var=score_col,
)

# histogram of the age at the n-th abx exposure before it is binned
scores_abx_nth_samples["age_nth_abx"].hist(bins=24)

# right=False -> bins are closed on the left: [-inf, 6), [6, 12), [12, 18), [18, inf)
bins_age = [-float("inf"), 6, 12, 18, float("inf")]
# <6: pre weaning
# NOTE: the first label must read "<" (an age of exactly 6 falls into the next
# bin); the last bin is open-ended, so its "18 - 24 months" label is only exact
# while ages are capped at 24 months (presumably via limit_months - TODO confirm)
age_labels = ["< 6 months", "6 - 12 months", "12 - 18 months", "18 - 24 months"]
scores_abx_nth_samples["age_nth_abx_category"] = pd.cut(
    scores_abx_nth_samples["age_nth_abx"], bins=bins_age, labels=age_labels, right=False
)

scores_abx_nth_samples["age_nth_abx_category"].value_counts(dropna=False)

In [None]:
# one plot per age category; `i` numbers the output flag
for i, a in enumerate(age_labels):
    print(a)

    # age at time of abx exposure should match a
    scores_abx_nth_samples_age = scores_abx_nth_samples.loc[
        scores_abx_nth_samples["age_nth_abx_category"] == a
    ].copy()
    flag_age = f"age_{i}"
    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples_age,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        n=n,
        path_to_save=evaluation_path,
        flag=flag_age,
        tag=f"age: {a}",
        min_samples=min_samples,
        max_samples=max_samples,
        grouped_samples=group_samples,
    )

## Score after 1st abx: split by type of abx (top 1 vs. others)

In [None]:
n = 1
score_col = f"score_{n}"
scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
    abx_scores_flat,
    abx_df,
    n=n,
    score_var=score_col,
    group_samples=group_samples,
    min_samples=min_samples,
    max_samples=max_samples,
)

# collapse the abx type into two groups: "Penicillin" vs. everything else
# (any other value, including a missing one, becomes "Others")
is_penicillin = scores_abx_nth_samples["abx_type"].eq("Penicillin")
scores_abx_nth_samples["abx_type_cat"] = is_penicillin.map(
    {True: "Penicillin", False: "Others"}
)
scores_abx_nth_samples["abx_type_cat"].value_counts(dropna=False)

In [None]:
# select types to look for
# (by default every value present in the two-group `abx_type_cat` column; the
# commented-out list is an alternative that names individual abx types)
abx_types = scores_abx_nth_samples["abx_type_cat"].unique().tolist()
# abx_types = ["Penicillin", "Cephalosporine", "Cotrimoxazole", "Macrolide"]
abx_types

In [None]:
# column that the entries of `abx_types` refer to
abx_col = "abx_type_cat"
# abx_col = "abx_type"
for abx in abx_types:
    print(abx)
    # file-name friendly version of the type, used in the output flag
    abx_str = abx.lower().replace(" ", "_").replace(",", "_")
    flag_abx = f"abx_type_{abx_str}"

    # keep the samples whose abx type contains the current name: literal
    # substring match (regex=False), missing values never match (na=False)
    scores_abx_nth_samples_abx = scores_abx_nth_samples.loc[
        scores_abx_nth_samples[abx_col].str.contains(abx, regex=False, na=False)
    ].copy()

    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples_abx,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        n=n,
        path_to_save=evaluation_path,
        flag=flag_abx,
        tag=f"Abx type: {abx}",
        min_samples=min_samples,
        max_samples=max_samples,
        grouped_samples=group_samples,
    )

## Score after 1st abx: split by type of abx reason (top 1 vs. others)

In [None]:
n = 1
score_col = f"score_{n}"
scores_abx_nth_samples = _select_samples_around_nth_abx_exposure(
    abx_scores_flat,
    abx_df,
    n=n,
    score_var=score_col,
    group_samples=group_samples,
    min_samples=min_samples,
    max_samples=max_samples,
)
# raw reasons first, to see which one is the most frequent
print(scores_abx_nth_samples["abx_reason"].value_counts(dropna=False))

# collapse the reason into two groups: "Otitis media" vs. everything else
# (any other value, including a missing one, becomes "Others")
is_otitis_media = scores_abx_nth_samples["abx_reason"].eq("Otitis media")
scores_abx_nth_samples["abx_reason_cat"] = is_otitis_media.map(
    {True: "Otitis media", False: "Others"}
)
print(scores_abx_nth_samples["abx_reason_cat"].value_counts(dropna=False))

reasons = scores_abx_nth_samples["abx_reason_cat"].unique().tolist()


In [24]:
# # top 4 reasons
# reasons = (
#     scores_abx_nth_samples["abx_reason"]
#     .value_counts(dropna=False)
#     .iloc[:4]
#     .index.tolist()
# )
# reasons

In [None]:
# column that the entries of `reasons` refer to: `reasons` is built from the
# two-group `abx_reason_cat` column, so the rows must be filtered on that column
# too (filtering on the raw `abx_reason` leaves the "Others" group empty/wrong)
reason_col = "abx_reason_cat"
# reason_col = "abx_reason"  # when `reasons` lists raw reasons (e.g. the top 4)
for r in reasons:
    print(r)
    # file-name friendly version of the reason, used in the output flag
    r_str = r.lower().replace(" ", "_").replace(",", "_")

    # samples whose abx reason (category) matches r
    scores_abx_nth_samples_r = scores_abx_nth_samples.loc[
        scores_abx_nth_samples[reason_col] == r
    ].copy()
    flag_reason = f"abx_reason_{r_str}"
    _plot_score_after_nth_abx_exposure(
        scores_abx_nth_samples_r,
        x_axis="diff_age_nth_abx",
        y_axis=score_col,
        n=n,
        path_to_save=evaluation_path,
        flag=flag_reason,
        tag=f"Abx reason: {r}",
        min_samples=min_samples,
        max_samples=max_samples,
        grouped_samples=group_samples,
    )

## Score over age range

In [None]:
# plot name -> [score column, score frame, ages at the matching abx exposure]
# (no exposure ages are passed for the two noabx splits)
dic_splits_n_scores = {
    "train_noabx": ["score_1", noabx_train, None],
    "val_noabx": ["score_1", noabx_val, None],
    "abx_1st": ["score_1", abx_scores_flat, abx_age_at_all["age_1st_abx"]],
    "abx_2nd": ["score_2", abx_scores_flat, abx_age_at_all["age_2nd_abx"]],
    "abx_3rd": ["score_3", abx_scores_flat, abx_age_at_all["age_3rd_abx"]],
}

for name, (score_col, scores, abx_age_values) in dic_splits_n_scores.items():
    _plot_score_over_age(scores, score_col, name, evaluation_path, abx_age_values)
    plt.show()

## Score overall - scatter

In [27]:
# sort both abx dataframes by increasing abx exposure in same way
abx_scores_flat.sort_values(
    [
        "abx_max_count_ever",
        "max_abx_w_microbiome",
        "host_id",
        "day",
    ],
    ascending=[True, True, True, True],
    inplace=True,
)

# sort abx_df accordingly
# sort abx_df in same order and remove samples that don't exist in md_df
abx_events = pd.DataFrame()
abx_events["host_id"] = abx_scores_flat["host_id"].unique()
abx_events = pd.merge(abx_events, abx_df, on="host_id", how="left")

assert abx_events.host_id.unique().tolist() == abx_scores_flat.host_id.unique().tolist()


In [None]:
# panel name -> [score column, score frame, abx events (None for noabx splits)]
dic_splits = dict(
    train_noabx=["score_1", noabx_train, None],
    val_noabx=["score_1", noabx_val, None],
    abx=["score_1", abx_scores_flat, abx_events],
)

display_scatterplot_w_scores(
    dic_splits,
    False,
    path_to_output=evaluation_path,
    flag="noabx_vs_abx",
)

In [None]:
# one panel per exposure rank on the same abx frame: abx_1st -> score_1,
# abx_2nd -> score_2, abx_3rd -> score_3
dic_splits = {
    f"abx_{rank}": [f"score_{k}", abx_scores_flat, abx_events]
    for k, rank in enumerate(("1st", "2nd", "3rd"), start=1)
}

display_scatterplot_w_scores(
    dic_splits,
    False,
    True,
    path_to_output=evaluation_path,
    flag="all_abx",
)

## Individual trajectories: score

### abx

In [None]:
# host P006862: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "P006862",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

In [None]:
# summary statistics of `abx_any_cumcount` over the scored samples of host E024646
abx_scores_flat.loc[abx_scores_flat.host_id == "E024646", "abx_any_cumcount"].describe()

In [None]:
# host E024646: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "E024646",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

In [None]:
# summary statistics of `abx_any_cumcount` over the scored samples of host E009676
abx_scores_flat.loc[abx_scores_flat.host_id == "E009676", "abx_any_cumcount"].describe()

In [None]:
# host E009676: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "E009676",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

In [None]:
# every scored row (all columns) of host E004898
abx_scores_flat.loc[abx_scores_flat.host_id == "E004898", :]

In [None]:
# host E004898: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "E004898",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

In [None]:
# host E004628: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "E004628",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

In [None]:
# host E021822: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "E021822",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

In [None]:
# host E003188: score_1, score_2 and score_3, with the abx events table passed along
plot_trajectory(
    abx_scores_flat,
    abx_events,
    "E003188",
    ["score_1", "score_2", "score_3"],
    path_to_output=evaluation_path,
    flag="all_scores",
)

### noabx

In [None]:
# host E035134 from the noabx train split: score_1 only, no abx events (None)
plot_trajectory(
    noabx_train,
    None,
    "E035134",
    ["score_1"],
    path_to_output=evaluation_path,
    flag="noabx_score1",
)

In [None]:
# host E022497 from the noabx train split: score_1 only, no abx events (None)
plot_trajectory(
    noabx_train,
    None,
    "E022497",
    ["score_1"],
    path_to_output=evaluation_path,
    flag="noabx_score1",
)