In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Name of the dataset folder under ./data/datasets/ that this notebook cleans.
dataset_name = 'signal_dataset_chu'

# Read the parquet data stored under cases/ and the meta.parquet table, then
# combine them on 'caseid' (pandas' default inner join keeps only ids present
# in both tables).
data = pd.read_parquet(f'./data/datasets/{dataset_name}/cases/', engine='pyarrow')
static = pd.read_parquet(f'./data/datasets/{dataset_name}/meta.parquet', engine='pyarrow')
data = pd.merge(data, static, on='caseid')

# Expose the 'mbp_19' column under an explicit name as well.
data = data.assign(last_map_value=data["mbp_19"])

# Keep only the rows in which every one of these flag columns equals 0.
for flag_column in ('ioh_in_leading_time', 'ioh_at_time_t', 'intervention'):
    data = data[data[flag_column] == 0]

In [6]:
# Signal name prefixes; the columns of each signal are named '<prefix>_<time step>'.
signal_types = [
    'mbp', 'sbp', 'dbp',
    'hr', 'rr', 'spo2', 'etco2', 'mac',
    'pp_ct', 'rf_ct', 'body_temp',
]

def get_signal_time(signal):
    """Return the integer time index found after the last '_' of a column name.

    The text following the final underscore (or the whole name when it has no
    underscore) is parsed as an integer. When that text is not a valid
    integer, 0 is returned instead.
    """
    trailing_part = signal.rpartition('_')[2]
    try:
        return int(trailing_part)
    except ValueError:
        # Non-numeric (or empty) suffix: treat the column as time step 0.
        return 0

# Number of consecutive time steps (0 .. N_TIME_STEPS - 1) kept per signal type.
N_TIME_STEPS = 20


def _is_time_step_column(col, signal):
    """Return True when `col` is exactly '<signal>_<non-negative integer>'."""
    prefix = f"{signal}_"
    return (
        isinstance(col, str)
        and col.startswith(prefix)
        and col[len(prefix):].isdecimal()
    )


# Group columns by signal type and sort them by their time index.
# Only exact '<signal>_<int>' names are accepted: a bare prefix match would
# also capture unrelated families sharing the prefix, and names with a
# non-numeric suffix would sort as time step 0 and could push real time steps
# out of the slice taken below.
grouped_columns = {
    signal: sorted(
        (col for col in data.columns if _is_time_step_column(col, signal)),
        key=get_signal_time,
    )
    for signal in signal_types
}

other_columns = ['age','bmi','asa',"last_map_value","label", "label_id", 'caseid', 'time',]

# --- Selecting the desired signal columns ---
selected_signal_columns = []
for signal_type in signal_types:
    # Keep the first N_TIME_STEPS time-sorted columns of the current signal type
    cols_for_signal = grouped_columns.get(signal_type, [])
    selected_signal_columns.extend(cols_for_signal[:N_TIME_STEPS])

all_desired_columns = selected_signal_columns + other_columns

# reindex() silently creates an all-NaN column for every requested name that
# is absent from the frame, so make that situation visible before it happens.
absent_columns = [col for col in all_desired_columns if col not in data.columns]
if absent_columns:
    print(f"Warning: columns not found in data and created as all-NaN: {absent_columns}")

data = data.reindex(columns=all_desired_columns)

def check_signal_time_steps(data: pd.DataFrame, signal_types: list, max_time_step: int = 19):
    """
    Report which expected time-step columns are absent for each signal type.

    A signal type ``s`` is expected to have one column named ``f"{s}_{i}"``
    for every ``i`` from 0 to ``max_time_step`` (inclusive).

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose column labels are inspected; its values are never read.
    signal_types : list
        Signal name prefixes to check.
    max_time_step : int, default 19
        Highest expected time index.

    Returns
    -------
    dict
        Maps every entry of ``signal_types`` to the ascending list of missing
        time indices. An empty list means every expected column is present.
    """
    # Build the lookup set once instead of rescanning all columns for every
    # signal. Exact-name membership also means non-string column labels are
    # simply never matched, rather than raising on ``.startswith``.
    present_columns = set(data.columns)

    return {
        signal_type: [
            step
            for step in range(max_time_step + 1)
            if f"{signal_type}_{step}" not in present_columns
        ]
        for signal_type in signal_types
    }

# Per signal type, list the expected time steps that have no column in `data`.
missing_steps = check_signal_time_steps(data=data, signal_types=signal_types)
missing_steps

{'mbp': [],
 'sbp': [],
 'dbp': [],
 'hr': [],
 'rr': [],
 'spo2': [7, 8, 9, 10, 11, 12],
 'etco2': [],
 'mac': [],
 'pp_ct': [],
 'rf_ct': [],
 'body_temp': []}

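Across the whole dataset, the spo2 columns for time steps 7 to 12 are missing. I will therefore fill each of these time steps with the row-wise average of spo2_6 and spo2_13, the two nearest available time steps.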

In [None]:
# Time steps whose 'spo2' column is missing
missing_spo2_steps = [7, 8, 9, 10, 11, 12]

# --- Compute the imputation value ---
# Both neighbouring columns are required to compute the row-wise mean.
anchor_columns = ['spo2_6', 'spo2_13']

if not all(anchor in data.columns for anchor in anchor_columns):
    print("Les colonnes 'spo2_6' ou 'spo2_13' (ou les deux) sont manquantes. Impossible de calculer la moyenne pour l'imputation.")
else:
    imputation_value = data[anchor_columns].mean(axis=1)

    # Fill in the missing columns
    for step in missing_spo2_steps:
        col_name = f'spo2_{step}'
        if col_name in data.columns:
            # Never overwrite a column that is already present.
            print(f"La colonne '{col_name}' existe déjà. Aucune imputation n'a été effectuée.")
            continue
        data[col_name] = imputation_value
        print(f"La colonne '{col_name}' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.")

La colonne 'spo2_7' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.
La colonne 'spo2_8' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.
La colonne 'spo2_9' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.
La colonne 'spo2_10' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.
La colonne 'spo2_11' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.
La colonne 'spo2_12' a été ajoutée et imputée avec la moyenne de spo2_6 et spo2_13.


In [10]:
# Destination folder for the cleaned dataset.
output_directory = 'data/datasets/clean_chu_trends'

# Make sure the destination folder exists before anything is written into it.
os.makedirs(output_directory, exist_ok=True)

# Write the DataFrame to <output_directory>/data.parquet.
output_filepath = os.path.join(output_directory, 'data.parquet')
data.to_parquet(output_filepath)