# Notebook exploring distribution of score_0 vs. score_2

To run this notebook, create and activate the following conda environment and then install this project into it in editable mode:

```
conda create --name score_eval -c conda-forge -c defaults numpy pandas matplotlib seaborn scipy scikit-learn ipython ipykernel -y
conda activate score_eval
pip install -e .
```


## Setup

In [None]:
import os

import pandas as pd

from src.utils_eval_score import (
    display_scatterplot_w_scores,
    get_scores_n_abx_info,
)
from src.utils_prediction import plot_distribution

# Enable the autoreload extension; mode 2 lets edits to imported modules
# (e.g. the src.* helpers above) be picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

USER input: define the inferred model and linked datasets to evaluate here:

In [None]:
#### USER INPUT START
# name of the model; used as the sub-folder of ../data/ from which all
# score and evaluation paths are built
model_name = "saved_models_microbial_novel_alpha_div2/id-55"
# which model version to evaluate: "best" or "last"
# (inserted into the scores_* and evaluation_* folder names)
point_to_evaluate = "best"

# name of feature dataset used for model
ft_name = "ft_vat19_anomaly_v20240806_entero_genus"
# path to abx time-series file
# NOTE(review): not referenced by any cell visible in this notebook section -
# confirm whether it is still needed or should be passed to a helper
path_to_abx_data = "../data/original_data/"
# name of abx time-series used for model
abx_ts_name = "ts_vat19_abx_v20240806"

# limit evaluation to the time range up to this many months (if None, no limit
# is set and all scores are evaluated)
limit_months = 24.0

# scaling factor options:
# True selects the "using-SF_..." scores sub-folder and a matching evaluation
# folder; False uses the plain scores/evaluation folders
scaling_factors_used = True

# if scaling_factors_used is True, then the following options are required
# (all three are inserted into the folder names):
# non-centered = "nc_std" or centered = "std"
stddev_type = "nc_std"
# moving average window size: 30 or 10
moving_avg = 10
# whether to include duplicates: "--RD-True" or ""
duplicates = ""

#### USER INPUT END

## Prepare data

In [None]:
# Root folder holding all anomaly-detection outputs of the selected model.
base_path = f"../data/{model_name}/anomaly_detection/"

if scaling_factors_used:
    print("Scaling factors used.")
    # Sub-folder of the scores directory that encodes the scaling-factor options.
    folder_name = f"using-SF_{stddev_type}_z_scores--moving_avg-{moving_avg}-cummax-lower_bound-1{duplicates}"

    # Build the suffix outside the f-string: reusing double quotes inside a
    # double-quoted f-string is a SyntaxError on Python versions before 3.12.
    duplicates_suffix = duplicates.replace("-", "_").lower()

    scores_path = f"{base_path}scores_{point_to_evaluate}_normal/{folder_name}/"
    evaluation_path = f"{base_path}evaluation_{point_to_evaluate}_unsupervised_pred_{stddev_type}_ma{moving_avg}{duplicates_suffix}/"
else:
    scores_path = f"{base_path}scores_{point_to_evaluate}_normal/"
    evaluation_path = f"{base_path}evaluation_{point_to_evaluate}_unsupervised_pred/"

# Create the output folder (including parents); exist_ok=True keeps the cell
# re-runnable and avoids the check-then-create race of a separate exists() test.
os.makedirs(evaluation_path, exist_ok=True)

In [None]:
# Load the score tables and abx information for the configured model.
# The helper returns five objects; judging by the names they are the no-abx
# train and validation scores, the abx scores, the abx events and the abx ages
# (TODO confirm against get_scores_n_abx_info).
(
    noabx_train,
    noabx_val,
    abx_scores_flat,
    abx_df,
    abx_age_at_all,
) = get_scores_n_abx_info(scores_path, ft_name, limit_months, abx_ts_name)


## Exploration: Score distribution

In [None]:
# Summary statistics of the two scores available for the no-abx training split.
noabx_train.loc[:, ["score_0", "score_1"]].describe()

In [None]:
# Summary statistics of score_0 to score_3 for the abx cohort.
abx_scores_flat.loc[:, ["score_0", "score_1", "score_2", "score_3"]].describe()

In [None]:
# Distribution of score_0 and score_1 within the abx cohort.
plot_distribution(
    dataframes=dict(abx_scores_flat=abx_scores_flat),
    columns=["score_0", "score_1"],
    kde=True,
    figsize=(4, 5),
)

In [None]:
# Distribution of score_0 across three cohorts: abx, no-abx train and
# no-abx validation.
plot_distribution(
    dataframes=dict(
        abx_scores_flat=abx_scores_flat,
        noabx_train=noabx_train,
        noabx_val=noabx_val,
    ),
    columns="score_0",
    kde=True,
    figsize=(10, 6),
)

In [None]:
# Order the abx scores by increasing abx exposure, then by host and day.
# The sorted frame is bound back to the same name instead of using
# inplace=True, so the object displayed by earlier cells is not mutated while
# later cells still see the sorted data. Ascending order is the sort_values
# default for every key, so it is not spelled out.
abx_scores_flat = abx_scores_flat.sort_values(
    ["abx_max_count_ever", "max_abx_w_microbiome", "host_id", "day"]
)

# Bring abx_df into the same host order: start from the hosts in the order in
# which they appear in the sorted scores and left-join the abx events onto
# them. Hosts that occur only in abx_df are dropped by the left join.
abx_events = pd.DataFrame({"host_id": abx_scores_flat["host_id"].unique()})
abx_events = pd.merge(abx_events, abx_df, on="host_id", how="left")
assert (
    abx_events.host_id.unique().tolist() == abx_scores_flat.host_id.unique().tolist()
), "host order of abx_events does not match abx_scores_flat"

# One entry per cohort: [score column, score dataframe, abx events or None].
dic_splits = {
    "train_noabx": ["score_0", noabx_train, None],
    "val_noabx": ["score_0", noabx_val, None],
    "abx": ["score_0", abx_scores_flat, abx_events],
}

# Draw the score_0 scatterplots for all cohorts; evaluation_path is passed as
# the output location and "noabx_vs_abx" as the flag.
display_scatterplot_w_scores(
    dic_splits, False, path_to_output=evaluation_path, flag="noabx_vs_abx"
)