# In [25]:
import pandas as pd
from tqdm import tqdm

from fusion import load_information_files


def fusion_examination(category) -> pd.DataFrame:
    """Merge the raw examination CSV files of one category into a single table.

    The files belonging to ``category`` are selected from the information
    table returned by ``load_information_files``, then every selected CSV is
    written into one wide DataFrame indexed by ``SEQN``.

    Args:
        category: Key used to index ``splitting_examination``; the value found
            there is the collection of ``data_file_description`` entries that
            belong to the category.

    Returns:
        A DataFrame indexed by ``SEQN`` holding the columns of every selected
        file. Rows without a single value are removed and object columns are
        converted to ``str``.

    Raises:
        ValueError: If no selected file provides any ``SEQN`` value.
    """
    splitting_examination, information_files = load_information_files(prefix="../")

    # Drop files that are not corresponding to the category
    # Drop PAXRAW_D: cannot be downloaded properly
    # Drop P_BPXO, P_BMX, P_OHXDEN and P_OHXREF: files not accessible and covered by others
    # Drop AUXAR_I, AUXTYM_I, AUXWBR_I, PAXRAW_C, SPXRAW_E, SPXRAW_F, SPXRAW_G, PAXDAY_G, PAXDAY_H, PAXHR_G, PAXHR_H, PAXMIN_G, PAXMIN_H: those files are time series
    files_to_drop = [
        "PAXRAW_D",
        "P_BPXO",
        "P_BMX",
        "P_OHXDEN",
        "P_OHXREF",
        "AUXAR_I",
        "AUXTYM_I",
        "AUXWBR_I",
        "PAXRAW_C",
        "SPXRAW_E",
        "SPXRAW_F",
        "SPXRAW_G",
        "PAXDAY_G",
        "PAXDAY_H",
        "PAXHR_G",
        "PAXHR_H",
        "PAXMIN_G",
        "PAXMIN_H",
    ]

    information_files.drop(
        index=information_files.index[
            (
                ~information_files["data_file_description"].isin(
                    splitting_examination[category]
                )
            )
            | information_files["data_file_name"].isna()
            | information_files["data_file_name"].isin(files_to_drop)
        ],
        inplace=True,
    )

    file_names = information_files["data_file_name"].drop_duplicates()
    raw_data_directory = "extraction/raw_data/examination/"

    # Get the minimum and maximum SEQN number.
    # Only the SEQN column is parsed here: the full files are read in the
    # second pass, so loading every column twice would be wasted work.
    min_seqn = float("inf")
    max_seqn = -float("inf")
    for file_name in tqdm(file_names):
        seqn = pd.read_csv(
            raw_data_directory + file_name + ".csv", usecols=["SEQN"]
        )["SEQN"]

        file_min = seqn.min()
        file_max = seqn.max()
        if file_min < min_seqn:
            min_seqn = file_min
        if max_seqn < file_max:
            max_seqn = file_max

    # The bounds are untouched when there is no file, or no SEQN value at all;
    # int(inf) below would otherwise fail with an unhelpful OverflowError.
    if min_seqn > max_seqn:
        raise ValueError(
            f"No SEQN value found in the examination files of category {category!r}"
        )

    # Fill the dataframe
    data_category = pd.DataFrame(
        None, index=pd.Index(range(int(min_seqn), int(max_seqn) + 1), name="SEQN")
    )

    for file_name in tqdm(file_names):
        raw_data = pd.read_csv(raw_data_directory + file_name + ".csv").set_index(
            "SEQN"
        )

        if (
            "SPXRAW" not in file_name
        ):  # "Spirometry - Raw Curve Data" does not contain extra columns
            raw_data.drop(
                columns=["file_name", "cycle", "begin_year", "end_year"], inplace=True
            )

        data_category.loc[raw_data.index, raw_data.columns] = raw_data

    # Remove the empty rows BEFORE the string conversion: astype(str) turns
    # missing values into the literal "nan", after which no row would be
    # considered entirely empty anymore.
    data_category.dropna(how="all", inplace=True)

    columns_object = data_category.columns[data_category.dtypes == "object"]
    data_category[columns_object] = data_category[columns_object].astype(str)

    # Saving to disk is currently disabled:
    # data_category.reset_index().to_feather(
    #     f"fusion/fusionned_data/examination/{category}.feather"
    # )
    return data_category