# Notebook to infer reliable time horizon

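To run this notebook, create and activate the following conda environment and then install this project in editable mode (run `pip install -e .` from the project's root directory so that the `src` package can be imported):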

```
conda create --name score_eval numpy pandas matplotlib seaborn scipy ipython ipykernel -y
conda activate score_eval
pip install -e .
```


## Setup

In [1]:
import os

import numpy as np
import pandas as pd

from src.utils_eval_score import (
    _plot_score_after_nth_abx_exposure,
)
from src.utils_t_horizon import (
    enrich_scores,
    plot_cutoff_date_distribution,
    transform_cutoff_scores,
)

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Average number of days per month (365.2425 days per year / 12 ≈ 30.437).
# NOTE(review): not referenced in the cells visible here — presumably used to
# convert day offsets into months; confirm against later cells / src helpers.
DAYS_PER_MONTH = 30.437

**User input:** in the cell below, specify the inferred model and the linked datasets to evaluate:

In [2]:
#### USER INPUT START
# name of the model; used below as the sub-directory of ../data/ from which the
# cutoff scores are read and under which the evaluation output is stored
model_name = "saved_models_microbial_novel_alpha_div/id-2_test"

# name of feature dataset used for model
# NOTE(review): not referenced in the cells visible here
ft_name = "ft_vat19_anomaly_v20240806_entero_family"
# name of abx time-series used for model
# NOTE(review): not referenced in the cells visible here
abx_ts_name = "ts_vat19_abx_v20240806"

# limit evaluation to a time range of up to this many months (if None, no limit
# is set and all scores are evaluated)
# NOTE(review): not referenced in the cells visible here
limit_months = 24.0

# whether to group samples prior to cutoff in analysis (forwarded to the
# plotting helper as `grouped_samples`)
group_samples = False

# lower/upper bound of the window around the cutoff to consider; both bounds
# are compared inclusively against `months_since_cutoff` below, so they are
# expressed in months relative to the cutoff (presumably negative = before the
# cutoff — confirm); they are also forwarded to the plotting helper
min_samples = -3.0
max_samples = 24.0
#### USER INPUT END

## Prepare data

In [3]:
# Directory from which the per-cutoff score files are read (input of this
# notebook).
cutoff_scores_path = (
    f"../data/{model_name}/anomaly_detection/reliability_eval-val-noabx/"
)
# Output directory; passed as `path_to_save` to the plotting helper below.
evaluation_path = f"../data/{model_name}/anomaly_detection/reliability_evaluation/"

# Create the output directory (including missing parents). `exist_ok=True`
# replaces the separate existence check, so the cell is safe to re-run and has
# no check-then-create race.
os.makedirs(evaluation_path, exist_ok=True)


In [None]:
# cutoff days to load: 0, 30, ..., 1140 (one score file per cutoff day)
i_values = list(range(0, 1141, 30))


def _load_cutoff_scores(cutoff_day):
    """Read the score file of one cutoff day and return it transformed and enriched."""
    raw_scores = pd.read_csv(
        f"{cutoff_scores_path}val_noabx_ad_scores_{cutoff_day}_coord-0.csv"
    )
    # reshape scores from wide to long format
    long_scores = transform_cutoff_scores(raw_scores)
    # NOTE(review): enrich_scores presumably adds the derived columns used
    # further down (e.g. `nb_obs_before_cutoff`, `months_since_cutoff`) —
    # confirm in src.utils_t_horizon. No rows are filtered in this cell; the
    # "at least one observation before the cutoff" restriction is applied in a
    # later cell.
    return enrich_scores(long_scores)


# one frame per cutoff day, stacked into a single long frame
c_scores_list = [_load_cutoff_scores(day) for day in i_values]
c_scores_all = pd.concat(c_scores_list, ignore_index=True)
c_scores_all

In [None]:
# Plot on all loaded scores, i.e. before the `nb_obs_before_cutoff > 0` filter
# is applied. The second argument is a text suffix (presumably appended to the
# plot title — confirm in src.utils_t_horizon).
plot_cutoff_date_distribution(c_scores_all, "- before filtering")

In [None]:
# keep only rows with at least one observation before the cutoff
has_obs_before_cutoff = c_scores_all["nb_obs_before_cutoff"].gt(0)
c_scores_all_f = c_scores_all.loc[has_obs_before_cutoff].copy()

# same distribution plot as before, now on the filtered scores
plot_cutoff_date_distribution(c_scores_all_f, "- after filtering")

## Visualize all cutoffs

In [None]:
# restrict the filtered scores to rows whose `months_since_cutoff` lies within
# [min_samples, max_samples] (both bounds inclusive)
within_window = c_scores_all_f["months_since_cutoff"].between(
    min_samples, max_samples
)
c_scores_subset_f = c_scores_all_f.loc[within_window]
c_scores_subset_f.shape

In [None]:
# Cutoff day that is excluded from the per-cutoff plots.
# NOTE(review): 1140 is the last cutoff day loaded above — presumably excluded
# because no scores follow the final cutoff; confirm.
excluded_cutoff = 1140

# Filter instead of `list.remove`, which raises ValueError when the excluded
# cutoff is not present in the (already filtered) subset.
all_cutoffs = [
    cutoff_day
    for cutoff_day in c_scores_subset_f.use_obs_until_day.unique().tolist()
    if cutoff_day != excluded_cutoff
]

# One plot per remaining cutoff day, in order of first appearance in the data.
for cutoff in all_cutoffs:
    # rows belonging to the current cutoff only (copied so the helper cannot
    # modify the shared subset frame)
    c_scores_subset_f_ss = c_scores_subset_f[
        c_scores_subset_f["use_obs_until_day"] == cutoff
    ].copy()
    print(cutoff)
    _plot_score_after_nth_abx_exposure(
        c_scores_subset_f_ss,
        x_axis="months_since_cutoff",
        y_axis="score",
        n=0,
        path_to_save=evaluation_path,
        flag=f"noabx_cutoff{int(cutoff)}_scores",
        tag=f"with cutoff={cutoff}",
        min_samples=min_samples,
        max_samples=max_samples,
        grouped_samples=group_samples,
    )