Import required libraries and define constants:
- `CORPORA`: List of corpora (`DisTEMIST`, `MedProcNER`, `SympTEMIST`).
- `DATA_PATH` and `OUTPUT_PATH`: Paths for input and output data.

In [1]:
import os
import sys
import pandas as pd

sys.path.append(os.path.join(os.getcwd(), '../src'))
from utils import load_corpus_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Corpora handled by this notebook, in processing order.
CORPORA = [
    "DisTEMIST",
    "MedProcNER",
    "SympTEMIST",
]

# Root directory passed to the corpus loader (input data).
DATA_PATH = "/scratch/data/"

# Root directory under which one sub-folder per corpus is written.
OUTPUT_PATH = "../data/"

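For each corpus:
1. Load the test, train, and gazetteer datasets.
2. Identify unseen mentions (UM: test mentions whose term appears in neither the training set nor the gazetteer) and unseen codes (UC: test mentions whose code does not appear in the training set).
3. Filter out composite codes (those containing `+`) and `NO_CODE` entries.
4. Save the filtered subsets (`df_um.tsv` and `df_uc.tsv`) to the corpus's subfolder of the output directory.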

In [3]:
# Regex matching codes that are excluded from the saved subsets:
# composite codes (containing a literal "+") and the "NO_CODE" placeholder.
# Kept as a raw string so "\+" is a regex escape, not an (invalid) Python
# string escape.
EXCLUDED_CODE_PATTERN = r"\+|NO_CODE"


def drop_excluded_codes(df):
    """Return the rows of `df` whose `code` does not match EXCLUDED_CODE_PATTERN.

    Rows with a missing `code` are retained: `na=False` makes the match
    evaluate to False for them, and the mask is then negated.
    """
    return df[~df["code"].str.contains(EXCLUDED_CODE_PATTERN, na=False)]


for corpus in CORPORA:
    test_df, train_df, gaz_df = load_corpus_data(DATA_PATH, corpus)

    # Known (term, code) pairs: training annotations plus gazetteer entries,
    # with exact duplicate pairs removed. Only used for the summary count.
    train_gaz_df = pd.concat(
        [train_df[["term", "code"]], gaz_df[["term", "code"]]],
        ignore_index=True,
    ).drop_duplicates()

    # Test rows with a usable single code. Only used for the summary count.
    # NOTE(review): this drops rows with a missing code, whereas the saved
    # UM/UC subsets below keep them (see drop_excluded_codes) - confirm which
    # behaviour is intended if the test data can contain missing codes.
    clean_df = test_df[
        test_df["code"].notna()
        & (test_df["code"] != "NO_CODE")
        & ~test_df["code"].str.contains(r"\+", na=False)
    ]

    aux_path = os.path.join(OUTPUT_PATH, corpus)
    os.makedirs(aux_path, exist_ok=True)

    # Unseen mentions: test terms found in neither the training set nor the gazetteer.
    term_seen = test_df["term"].isin(train_df["term"]) | test_df["term"].isin(gaz_df["term"])
    df_um = test_df[~term_seen]

    # Unseen codes: test codes absent from the training set.
    # NOTE(review): the gazetteer is not consulted here, unlike for mentions -
    # presumably intentional; confirm.
    df_uc = test_df[~test_df["code"].isin(train_df["code"])]

    df_um_filtered = drop_excluded_codes(df_um)
    df_uc_filtered = drop_excluded_codes(df_uc)

    df_um_filtered.to_csv(os.path.join(aux_path, "df_um.tsv"), sep="\t", index=False)
    df_uc_filtered.to_csv(os.path.join(aux_path, "df_uc.tsv"), sep="\t", index=False)

    print(f"CORPUS: {corpus}")
    print(f"Train + Gaz: {train_gaz_df.shape[0]}")
    print(f"Gold standard:  {test_df.shape[0]}")
    print(f"Cleaned: {clean_df.shape[0]}")
    print(f"Unseen mentions (filtered): {df_um_filtered.shape[0]}")
    print(f"Unseen codes (filtered): {df_uc_filtered.shape[0]}")
    print("="*50)


CORPUS: DisTEMIST
Train + Gaz: 149305
Gold standard:  2598
Cleaned: 2507
Unseen mentions (filtered): 1375
Unseen codes (filtered): 1115
CORPUS: MedProcNER
Train + Gaz: 237565
Gold standard:  3618
Cleaned: 3512
Unseen mentions (filtered): 1730
Unseen codes (filtered): 878
CORPUS: SympTEMIST
Train + Gaz: 169814
Gold standard:  2848
Cleaned: 2741
Unseen mentions (filtered): 1573
Unseen codes (filtered): 763
