# Results
The evaluation of automatic predictions had two different scenarios or sub-tracks:

1.  **NER offset and entity type classification**: the first sub-track was focused
on the identification and classification of sensitive information (e.g., patient
names, telephones, addresses, etc.).  

2.  **Sensitive span detection**: the second sub-track was focused on the detection
of sensitive text more specific to the practical scenario necessary for the
release of de-identified clinical documents, where the objective is to identify
and to mask confidential data, regardless of the real type of entity or the
correct identification of PHI type.

We evaluate our models using the evaluation script of each sub-track and report the F1-score averaged over three runs with different seeds (five runs for the LSTM-CRF + Flair + WE model).

First, we define some helper code that automatically extracts the evaluation results.

In [1]:
import os
from collections import defaultdict
from pathlib import Path
from typing import Callable, DefaultDict, List, NamedTuple

import pandas as pd


class SubtrackScores(NamedTuple):
    """Precision/recall pair read from one evaluation report file."""
    # Value parsed from the report line selected as the precision line.
    precision: float
    # Value parsed from the report line selected as the recall line.
    recall: float

def _get_scores(folder_path: Path, filename: str, precision_line: int, recall_line: int) -> SubtrackScores:
    """Read precision and recall from an evaluation report file.

    The file content is split on newline characters and the two requested
    entries are parsed by converting the text after the last ``=`` sign to
    ``float``. Note that a file ending with a newline yields a final empty
    entry, which counts when indexing from the end with negative indices.

    Args:
        folder_path: Directory containing the report (``Path`` or ``str``).
        filename: Name of the report file inside ``folder_path``.
        precision_line: Index into the split lines of the precision entry
            (negative values count from the end).
        recall_line: Index into the split lines of the recall entry.

    Returns:
        A ``SubtrackScores`` holding the parsed precision and recall.

    Raises:
        FileNotFoundError: If the report file does not exist.
        IndexError: If a requested line index is outside the file.
        ValueError: If the text after ``=`` on a requested line is not a number.
    """
    # Path(...) / filename also accepts a plain string folder.
    report_path = Path(folder_path) / filename
    if not report_path.exists():
        raise FileNotFoundError(f"{report_path} not found!")

    lines = report_path.read_text().split("\n")

    def _parse(line_index: int, label: str) -> float:
        # Re-raise with the file and line in the message so a truncated or
        # malformed report can be located without a debugger.
        try:
            line = lines[line_index]
        except IndexError as err:
            raise IndexError(
                f"{report_path}: no line {line_index} for {label} "
                f"(file has {len(lines)} lines)"
            ) from err
        try:
            return float(line.split("=")[-1])
        except ValueError as err:
            raise ValueError(
                f"{report_path}: cannot parse {label} from line {line_index}: {line!r}"
            ) from err

    precision = _parse(precision_line, "precision")
    recall = _parse(recall_line, "recall")

    return SubtrackScores(precision, recall)

def get_subtrack1_scores(folder_path: Path) -> SubtrackScores:
    """Read the sub-track 1 precision/recall from the ``ner`` report.

    Precision is taken from the third-to-last and recall from the
    second-to-last entry of the newline-split file in ``folder_path``.
    """
    scores = _get_scores(folder_path, filename="ner", precision_line=-3, recall_line=-2)
    return scores

def get_subtrack2_strict_scores(folder_path: Path) -> SubtrackScores:
    """Read the sub-track 2 (strict) precision/recall from the ``spans`` report.

    Precision is taken from the sixth-to-last and recall from the
    fifth-to-last entry of the newline-split file in ``folder_path``.
    """
    scores = _get_scores(folder_path, filename="spans", precision_line=-6, recall_line=-5)
    return scores

def get_subtrack2_merged_scores(folder_path: Path) -> SubtrackScores:
    """Read the sub-track 2 (merged) precision/recall from the ``spans`` report.

    Precision is taken from the third-to-last and recall from the
    second-to-last entry of the newline-split file in ``folder_path``.
    """
    scores = _get_scores(folder_path, filename="spans", precision_line=-3, recall_line=-2)
    return scores

def get_scores_as_df(seeds: List[int], get_folder: Callable[[int], Path]) -> pd.DataFrame:
    """Collect precision, recall and F1 of every sub-track for each seed.

    Args:
        seeds: Seeds of the runs to read; one result column is produced per
            seed, in the given order.
        get_folder: Maps a seed to the folder holding that run's evaluation
            reports.

    Returns:
        A DataFrame whose rows are indexed by a ``(Track, Scores)`` MultiIndex
        (three tracks x precision/recall/f1) and whose columns are numbered
        ``0..len(seeds) - 1``.
    """
    # (column prefix, display label, reader). This single table fixes both the
    # column order and the index labels, so the two cannot drift apart.
    subtracks = [
        ("1", "Subtrack 1", get_subtrack1_scores),
        ("2_1", "Subtrack 2 [Strict]", get_subtrack2_strict_scores),
        ("2_2", "Subtrack 2 [Merged]", get_subtrack2_merged_scores),
    ]

    subtracks_scores: DefaultDict[str, List[float]] = defaultdict(list)

    for seed in seeds:
        fpth = get_folder(seed)
        for prefix, _, read_scores in subtracks:
            p, r = read_scores(fpth)
            subtracks_scores[f"{prefix}_p"].append(p)
            subtracks_scores[f"{prefix}_r"].append(r)

    df = pd.DataFrame(subtracks_scores)
    for prefix, _, _ in subtracks:
        precision = df[f"{prefix}_p"]
        recall = df[f"{prefix}_r"]
        denominator = precision + recall
        # With precision == recall == 0 the ratio is 0/0 (NaN). Report 0.0
        # instead, otherwise NaN-skipping aggregations such as describe()
        # would silently drop the failed run from the average.
        df[f"{prefix}_f1"] = (2 * precision * recall / denominator).where(denominator != 0, 0.0)

    # Reorder columns: precision, recall, f1 for each sub-track
    ordered_columns = [
        f"{prefix}_{metric}"
        for prefix, _, _ in subtracks
        for metric in ("p", "r", "f1")
    ]

    # Prepare multi index names (same order as ordered_columns)
    multi_index = pd.MultiIndex.from_product(
        [
            [label for _, label, _ in subtracks],
            ["precision", "recall", "f1"],
        ],
        names=["Track", "Scores"],
    )
    # Transpose so that scores become rows and seeds become columns
    return pd.DataFrame(df[ordered_columns].to_numpy().T, index=multi_index)

Define the root folder where all the results and the trained models are stored.

In [2]:
# Root folder of the project. The default is the original author's local
# path; set the MEDDOCAN_ROOT environment variable to run the notebook on
# another machine without editing this cell.
base_folder = Path(os.environ.get("MEDDOCAN_ROOT", "/home/wave/Project/MedDocAn"))

Store the evaluation for each model in a ``pandas.DataFrame``.

In [3]:
# F1 mean/std over seeds for the `corpus_sentence_flair_we_lstm_crf` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_flair_we_lstm_crf/results_seed_{seed}/evals/test"


seeds = [1, 10, 25, 33, 42]
df = get_scores_as_df(seeds, get_folders)

# describe() runs on the transposed frame (one row per seed), so its
# statistics are computed across seeds; only the F1 rows are kept.
LSTM_CRF_FLAIR_WE = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
LSTM_CRF_FLAIR_WE

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.968385,0.001274
Subtrack 2 [Strict],f1,0.973075,0.001142
Subtrack 2 [Merged],f1,0.984384,0.000787


In [4]:
# F1 mean/std over seeds for the `corpus_sentence_flair_lstm_crf` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_flair_lstm_crf/an_wh_rs_True_dpt_0.08716810045694838_emb_seed_{seed}_Stack(0_lm-es-forward.pt, 1_lm-es-backward.pt)_hdn_sz_256_lr_0.1_it_150_bs_4_opti_SGD_pjct_emb_True_rnn_ly_2_sdl_AnnealOnPlateau_use_crf_True_use_rnn_True/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
LSTM_CRF_FLAIR = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
LSTM_CRF_FLAIR

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.968768,0.001296
Subtrack 2 [Strict],f1,0.976428,0.001584
Subtrack 2 [Merged],f1,0.983721,0.001211


In [5]:
# F1 mean/std over seeds for the `corpus_sentence_bert_context_finetune` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_context_finetune/an_wh_rs_False_dpt_0_emb_beto-cased-context_FT_True_Ly_-1_seed_{seed}_lr_5e-06_it_150_bs_4_opti_AdamW_pjct_emb_False_sdl_LinearSchedulerWithWarmup_use_crf_False_use_rnn_False_wup_0.1/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
FINE_TUNE_BETO_CONTEXT = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
FINE_TUNE_BETO_CONTEXT

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.973673,0.001587
Subtrack 2 [Strict],f1,0.97966,0.001506
Subtrack 2 [Merged],f1,0.985812,0.000989


In [6]:
# F1 mean/std over seeds for the `corpus_sentence_bert_finetune` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_finetune/an_wh_rs_False_dpt_0_emb_beto-cased_FT_True_Ly_-1_seed_{seed}_lr_5e-06_it_40_bs_4_opti_AdamW_pjct_emb_False_sdl_LinearSchedulerWithWarmup_use_crf_False_use_rnn_False_wup_0.05/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
FINE_TUNE_BETO = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
FINE_TUNE_BETO

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.972018,0.001341
Subtrack 2 [Strict],f1,0.977234,0.001437
Subtrack 2 [Merged],f1,0.984776,0.000964


In [7]:
# NOTE(review): this cell repeats the preceding FINE_TUNE_BETO cell verbatim
# (same experiment path, same seeds, same variable). Confirm whether another
# experiment was meant to be evaluated here, or drop the cell.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_finetune/an_wh_rs_False_dpt_0_emb_beto-cased_FT_True_Ly_-1_seed_{seed}_lr_5e-06_it_40_bs_4_opti_AdamW_pjct_emb_False_sdl_LinearSchedulerWithWarmup_use_crf_False_use_rnn_False_wup_0.05/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
FINE_TUNE_BETO = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
FINE_TUNE_BETO

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.972018,0.001341
Subtrack 2 [Strict],f1,0.977234,0.001437
Subtrack 2 [Merged],f1,0.984776,0.000964


In [8]:
# F1 mean/std over seeds for the `corpus_sentence_bert_lstm_crf` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_lstm_crf/an_wh_rs_False_dpt_0_emb_beto_Ly_all_mean_seed_{seed}_hdn_sz_256_lr_0.1_it_500_bs_4_opti_SGD_pjct_emb_False_rnn_ly_2_sdl_AnnealOnPlateau_use_crf_True_use_rnn_True/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
LSTM_CRF_BETO = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
LSTM_CRF_BETO

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.971994,0.000463
Subtrack 2 [Strict],f1,0.978217,0.000734
Subtrack 2 [Merged],f1,0.985028,0.000503


In [9]:
# F1 mean/std over seeds for the `corpus_sentence_bert_we_lstm_crf` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_we_lstm_crf/an_wh_rs_False_dpt_0_emb_Stack(0_es-wiki-fasttext-300d-1M, 1_1-beto_Ly_all_mean_seed_{seed})_hdn_sz_256_lr_0.1_it_500_bs_4_opti_SGD_pjct_emb_False_rnn_ly_2_sdl_AnnealOnPlateau_use_crf_True_use_rnn_True/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
LSTM_CRF_BETO_WE = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
LSTM_CRF_BETO_WE

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.973711,0.000714
Subtrack 2 [Strict],f1,0.979351,0.000444
Subtrack 2 [Merged],f1,0.986297,0.00063


In [10]:
# F1 mean/std over seeds for the `corpus_sentence_bert_context_we_lstm_crf` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_context_we_lstm_crf/an_wh_rs_False_dpt_0_emb_Stack(0_es-wiki-fasttext-300d-1M, 1_1-beto_Ly_all_mean_context_seed_{seed})_hdn_sz_256_lr_0.1_it_500_bs_4_opti_SGD_pjct_emb_False_rnn_ly_2_sdl_AnnealOnPlateau_use_crf_True_use_rnn_True/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
LSTM_CRF_BETO_WE_CONTEXT = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
LSTM_CRF_BETO_WE_CONTEXT

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.974963,0.000642
Subtrack 2 [Strict],f1,0.980899,0.000815
Subtrack 2 [Merged],f1,0.986733,0.000919


In [11]:
# F1 mean/std over seeds for the `corpus_sentence_bert_context_lstm_crf` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_bert_context_lstm_crf/an_wh_rs_False_dpt_0_emb_beto_Ly_all_mean_context_seed_{seed}_hdn_sz_256_lr_0.1_it_500_bs_4_opti_SGD_pjct_emb_False_rnn_ly_2_sdl_AnnealOnPlateau_use_crf_True_use_rnn_True/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
LSTM_CRF_BETO_CONTEXT = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
LSTM_CRF_BETO_CONTEXT

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.970914,0.000867
Subtrack 2 [Strict],f1,0.977782,0.000641
Subtrack 2 [Merged],f1,0.984388,0.001002


In [12]:
# F1 mean/std over seeds for the
# `corpus_sentence_grid_search_flert_xlm-roberta_docstart` experiment.
def get_folders(seed):
    """Return the test-evaluation folder of the run trained with ``seed``."""
    return base_folder / f"experiments/corpus_sentence_grid_search_flert_xlm-roberta_docstart/an_wh_rs_False_dpt_0_emb_xlm-roberta-large-cased-context_FT_True_Ly_-1_seed_{seed}_lr_5e-06_it_40_bs_4_opti_AdamW_pjct_emb_False_sdl_LinearSchedulerWithWarmup_use_crf_False_use_rnn_False_wup_0.1/0/evals/test"


seeds = [1, 12, 33]
df = get_scores_as_df(seeds, get_folders)

# Statistics are computed across seeds; only the F1 rows are kept.
FINE_TUNE_XLMR_LARGE_CONTEXT = (
    df.T
    .describe()
    .T[["mean", "std"]]
    .loc[pd.IndexSlice[:, ['f1']], :]
)
FINE_TUNE_XLMR_LARGE_CONTEXT

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std
Track,Scores,Unnamed: 2_level_1,Unnamed: 3_level_1
Subtrack 1,f1,0.974911,0.000691
Subtrack 2 [Strict],f1,0.980135,0.000723
Subtrack 2 [Merged],f1,0.986529,0.000465


Group the evaluation for all the models in a ``pandas.DataFrame``.

In [14]:
# Display label -> per-model F1 summary (mean/std) computed in the cells above.
# Insertion order is the row order of the final table.
data = {
    "FINETUNE + XLMR LARGE + CONTEXT": FINE_TUNE_XLMR_LARGE_CONTEXT,
    "FINETUNE + BETO + CONTEXT": FINE_TUNE_BETO_CONTEXT,
    "FINETUNE + BETO": FINE_TUNE_BETO,
    # "FINETUNE + BETO + WE": TODO,
    # "FINETUNE + BETO + WE + CONTEXT": TODO,
    "LSTM CRF + BETO + CONTEXT": LSTM_CRF_BETO_CONTEXT,
    "LSTM CRF + BETO": LSTM_CRF_BETO,
    "LSTM CRF + BETO + WE + CONTEXT": LSTM_CRF_BETO_WE_CONTEXT,
    "LSTM CRF + BETO + WE": LSTM_CRF_BETO_WE,
    "LSTM CRF + FLAIR + WE": LSTM_CRF_FLAIR_WE,
    "LSTM CRF + FLAIR": LSTM_CRF_FLAIR,
}

# Place the summaries side by side under a (Model, computation) column index,
# then transpose so each (model, statistic) pair becomes a row.
result_metrics = pd.concat(
    list(data.values()),
    axis=1,
    keys=list(data.keys()),
    names=["Model", "computation"],
).T

Visualize the results.

In [15]:
# Render the mean F1 of every model as percentages with two decimals.
(
    (result_metrics * 100)
    # Select the "mean" rows by label instead of by an every-second-row
    # stride, so the table stays correct if the per-model summaries ever
    # carry a different set or order of statistics. drop_level=False keeps
    # the two-level index that the .hide(axis="index", level=1) call expects.
    .xs("mean", level="computation", drop_level=False)
    .style
    .background_gradient()
    .set_table_styles(
        [
            {'selector': '.index_name', 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'},
            {'selector': 'th.col_heading', 'props': 'text-align: center;'},
            {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},
            {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},
        ],
        overwrite=False,
    )
    # Hide the "computation" index level and the "Scores" column level;
    # after the selections above each holds a single value.
    .hide(axis="index", level=1)
    .hide(axis="columns", level=1)
    .format(precision=2)
)

Track,Subtrack 1,Subtrack 2 [Strict],Subtrack 2 [Merged]
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FINETUNE + XLMR LARGE + CONTEXT,97.49,98.01,98.65
FINETUNE + BETO + CONTEXT,97.37,97.97,98.58
FINETUNE + BETO,97.2,97.72,98.48
LSTM CRF + BETO + CONTEXT,97.09,97.78,98.44
LSTM CRF + BETO,97.2,97.82,98.5
LSTM CRF + BETO + WE + CONTEXT,97.5,98.09,98.67
LSTM CRF + BETO + WE,97.37,97.94,98.63
LSTM CRF + FLAIR + WE,96.84,97.31,98.44
LSTM CRF + FLAIR,96.88,97.64,98.37
