# Notebook to evaluate anomaly scores by unsupervised prediction performance

To run this notebook you need to create and activate the following conda environment:

```
conda create --name score_eval -c conda-forge -c defaults numpy pandas matplotlib seaborn scipy scikit-learn ipython ipykernel -y
conda activate score_eval
pip install -e .
```


## Setup

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

from src.utils_eval_score import (
    _add_month_bins,
    _get_abx_info,
    _transform_scores,
    display_scatterplot_w_scores,
    get_scores_n_abx_info,
)
from src.utils_prediction import calculate_metrics

%load_ext autoreload
%autoreload 2
%matplotlib inline

USER input: define the inferred model and linked datasets to evaluate here:

In [None]:
#### USER INPUT START
# name of the model: its directory relative to ../data/, used below to build the
# scores and evaluation paths
model_name = "saved_models_microbial_novel_alpha_div2/id-55"
# which model version to evaluate: "best" or "last"
point_to_evaluate = "best"

# name of the feature dataset used for the model
ft_name = "ft_vat19_anomaly_v20240806_entero_genus"
# directory containing the abx time-series file
# NOTE(review): not referenced in any cell visible here - confirm it is still needed
path_to_abx_data = "../data/original_data/"
# name of the abx time-series used for the model
abx_ts_name = "ts_vat19_abx_v20240806"

# limit evaluation to the time range up to this many months; set to None to apply
# no limit and evaluate all scores
limit_months = 24.0
#### USER INPUT END

In [None]:
def get_no_cutoff_scores(scores_path, split, limit_months=None):
    """Load the scores of one split and optionally restrict their time range.

    Reads ``{scores_path}{split}_ad_scores_False_coord-0.csv``, passes the table
    through ``_transform_scores`` and ``_add_month_bins`` and, when
    ``limit_months`` is given, keeps only rows with
    ``month5_bin <= limit_months``.

    Args:
        scores_path: Path prefix of the score files. It is concatenated directly
            with the file name, so a directory must end with a path separator.
        split: Name of the split, used as the file name prefix.
        limit_months: Inclusive upper bound on ``month5_bin``. ``None`` keeps
            all rows.

    Returns:
        The transformed score table; a filtered copy when ``limit_months`` is
        set.
    """
    csv_file = f"{scores_path}{split}_ad_scores_False_coord-0.csv"
    binned = _add_month_bins(_transform_scores(pd.read_csv(csv_file)))

    # no limit requested: hand back the full table
    if limit_months is None:
        return binned

    within_limit = binned["month5_bin"] <= limit_months
    return binned[within_limit].copy()

## Prepare data

In [None]:
# directory holding the anomaly scores of the selected model version
scores_path = (
    f"../data/{model_name}/anomaly_detection/scores_{point_to_evaluate}_normal/"
)
# directory that is passed as output path to the plotting helper below
evaluation_path = (
    f"../data/{model_name}/anomaly_detection/"
    f"evaluation_{point_to_evaluate}_unsupervised_pred/"
)

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
os.makedirs(evaluation_path, exist_ok=True)

In [None]:
# get scores
# NOTE(review): the meaning of the five returned objects is read off their names
# only (noabx train/val scores, flattened abx scores, abx records, abx ages) -
# confirm against get_scores_n_abx_info in src.utils_eval_score
noabx_train, noabx_val, abx_scores_flat, abx_df, abx_age_at_all = get_scores_n_abx_info(
    scores_path, ft_name, limit_months, abx_ts_name
)


## Score distribution

In [None]:
# preview the first rows of the abx score table loaded above
abx_scores_flat.head()

In [None]:
# sort abx samples by increasing abx exposure, then by host and day;
# sort_values returns a new frame and sorts all keys ascending by default, so
# re-running the cell never mutates a frame that an earlier cell displayed
abx_scores_flat = abx_scores_flat.sort_values(
    ["abx_max_count_ever", "max_abx_w_microbiome", "host_id", "day"]
)

# bring abx_df into the same host order: start from the hosts in sorted order
# and left-join their abx records; hosts without scored samples are dropped
abx_events = pd.DataFrame({"host_id": abx_scores_flat["host_id"].unique()}).merge(
    abx_df, on="host_id", how="left"
)
assert (
    abx_events["host_id"].unique().tolist()
    == abx_scores_flat["host_id"].unique().tolist()
), "host order of abx_events differs from abx_scores_flat"

# display scatter
# each entry presumably holds [score column, score frame, abx events or None] -
# confirm against display_scatterplot_w_scores
dic_splits = {
    "train_noabx": ["score_0", noabx_train, None],
    "val_noabx": ["score_0", noabx_val, None],
    "abx": ["score_0", abx_scores_flat, abx_events],
}

display_scatterplot_w_scores(
    dic_splits, False, path_to_output=evaluation_path, flag="noabx_vs_abx"
)

Open points:
* **TODO**: verify that `score_0` was scaled as expected

### Infer score threshold from `noabx_val` (samples without abx exposure)

In [None]:
# summary statistics of score_0 in noabx_val, the basis for the threshold below
noabx_val.score_0.describe()

In [None]:
# quantile of the noabx_val score_0 distribution used as decision threshold;
# the table below compares the 3M metrics for several quantile choices
q_threshold = 0.95
thresh = noabx_val["score_0"].quantile(q_threshold)
thresh

| q-threshold | 3M true_f1_score | 3M weighted_avg_f1_score | 3M accuracy |
|-------------|------------------|--------------------------|-------------|
| 0.7         | 0.21             | 0.66                     | 0.58        |
| 0.8         | 0.20             | 0.73                     | 0.68        |
| 0.9         | 0.19             | 0.80                     | 0.79        |
| 0.95        | 0.13             | 0.81                     | 0.82        |


## Evaluate performance of inferred threshold

Goal: detect samples observed within the first 1, 2, or 3 months after the first abx exposure.

In [None]:
# evaluate the threshold for windows of 3, 2 and 1 month(s) after abx exposure
for th_sample_after_abx_months in (3, 2, 1):
    print(th_sample_after_abx_months)

    # samples that were observed within x months after abx exposure
    within_window = (
        abx_scores_flat["abx_any_last_t_dmonths"] <= th_sample_after_abx_months
    )
    # samples whose abx cumcount equals 1
    first_exposure = abx_scores_flat["abx_any_cumcount"] == 1

    # assign returns a new frame, so abx_scores_flat itself stays untouched;
    # true target: sample lt x months after abx & cumcount == 1
    abx_scores_flat_th = abx_scores_flat.assign(
        sample_lt_xm_after_abx=within_window,
        true_target=within_window & first_exposure,
    )
    print(abx_scores_flat_th["true_target"].value_counts(dropna=False))
    print()

    # predicted target: score above the threshold
    abx_scores_flat_th = abx_scores_flat_th.assign(
        pred_target=abx_scores_flat_th["score_0"] > thresh
    )

    # evaluate classification
    # NOTE(review): the return value is not captured - presumably
    # calculate_metrics reports its results itself; confirm
    calculate_metrics(
        abx_scores_flat_th["true_target"], abx_scores_flat_th["pred_target"]
    )