<a href="https://colab.research.google.com/github/MestDash/PID/blob/main/notebooks/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Functions

In [1]:
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
import os

def cleaner(df):
    """Return a cleaned copy of a raw PID DataFrame.

    The steps are applied in this order:

    1. Missing values in the five "(normalized against RV)" label columns
       become the string 'NA', and the codes 0 / -1 / 1 become
       'normal' / 'lower' / 'higher'.
    2. A fixed list of columns is dropped when present, together with
       'Unnamed: 69' .. 'Unnamed: 88'.
    3. In the five immunoglobulin columns every value matching the
       pattern "IR" is replaced by NaN.
    4. The two measurement columns that carry stray whitespace in their
       names are renamed, and every known numeric column that is present
       is cast to float.
    5. "% lympho" (when present) is divided by 100.

    Parameters:
    - df: input DataFrame. It must contain the five label columns and the
      five immunoglobulin columns; all other columns are optional.

    Returns:
    - a new DataFrame; the frame passed in is left untouched.

    Raises:
    - KeyError if a label or immunoglobulin column is missing.
    - ValueError if a column to convert holds a value that cannot be
      cast to float.
    """
    # Work on a copy: the column assignments below would otherwise write
    # into the caller's DataFrame before the first copy-producing operation.
    df = df.copy()

    # Fill NA in label columns
    label_columns = ['IgG (normalized against RV)', 'IGG2 (normalized against RV)',
                     'IGG3 (normalized against RV)', 'IgM (normalized against RV)',
                     'IGA (normalized against RV)']
    df[label_columns] = df[label_columns].fillna('NA')

    # Replace numerical labels with string labels
    replacement_dict = {0: 'normal', -1: 'lower', 1: 'higher'}
    df[label_columns] = df[label_columns].replace(replacement_dict)

    # Drop irrelevant or redundant columns.
    # 'Humoral or cellular (B or T-cell mediated)' is removed here, so it
    # needs no whitespace normalization beforehand.
    to_drop = [
        'Age (d)', 'labels: lymphoid PID vs HC vc DC vs non-lymphoid PID',
        'Subgroep labels', 'Diagnose PID', 'Detailed diagnosis',
        'Humoral or cellular (B or T-cell mediated)',
        'Deviations seen in current PIDOT routine appraoch (MG + comparison EuroFlow reference range)',
        'Opmerking', 'Date of birth ', 'Date of sample'
    ] + [f'Unnamed: {i}' for i in range(69, 89)]

    df = df.drop(columns=[col for col in to_drop if col in df.columns])

    # Replace invalid strings with NaN
    columns_to_clean = ['IgG', 'IGG2', 'IGG3', 'IgM', 'IGA']
    df[columns_to_clean] = df[columns_to_clean].replace(to_replace=["IR"], value=np.nan, regex=True)

    # Columns to convert to float (names as they appear in the raw file,
    # including stray leading/trailing spaces)
    columns_to_convert = [
        'Age (Y)', 'WBC', 'Lymfo/µL (Sysmex)', 'IgG', 'IGG2', 'IGG3', 'IgM', 'IGA', '% lympho', ' lympho/µL',
        'B-cells/µL', 'preGC B-cells/µL', 'MBC + PC/µL', 'Uns memory B-cells/µL', 'Sw memory B-cells/µL',
        'T-cells/µL', 'CD4+ T-cells/µL', 'naieve CD4+ T-cells/µL', 'CM CD4+ T-cells/µL', 'EM CD4+ T cells/µL',
        'Effector TD CD4+ T-cells/µL', 'CD8+ T-cells/µL', 'naieve CD8+ T-cells/µL', 'CM CD8+ T-cells/µL',
        'EM CD8+ T-cells/µL ', 'Effector TD 27+ CD8+ T-cells/µL', 'Effector TD  CD8+ T-cells/µL',
        'DNT TCRgd-T-cells/µL', 'TCRgd= T-cells/µL', 'NK  cells/µL',
        '% B-cells (on lymfo)', '% preGC B-cells (on B)', '% MBC + PC (on B)',
        '% Uns memory B-cells (on B)', '% Sw memory B-cells (on B)',
        '% T-cells (on lymfo)', '% CD4+ T-cells (on T)', '% naieve CD4+ T-cells (on CD4 T)',
        '% CM CD4+ T-cells (on CD4 T)', '% EM CD4+ T cells (on CD4 T)', '% Effector TD CD4+ T-cells (on CD4 T)',
        '% CD8+ T-cells (on T)', '% naieve CD8+ T-cells (on CD8 T)', '% CM CD8+ T-cells (on CD8 T)',
        '% EM CD8+ T-cells (on CD8 T)', '% Effector TD 27+ CD8+ T-cells (on CD8 T)',
        '% Effector TD  CD8+ T-cells (on CD8 T)', '% DNT TCRgd-T-cells (on T)',
        '% TCRgd+ T-cells (on T)', '% NK  cells (on lymfo)'
    ]

    # Standardize column names before conversion
    df = df.rename(columns={
        " lympho/µL": "lympho/µL",
        "EM CD8+ T-cells/µL ": "EM CD8+ T-cells/µL"
    })

    # Strip the listed names so they match the renamed columns, and keep
    # only those that actually exist in this frame.
    columns_to_convert = [col.strip() for col in columns_to_convert if col.strip() in df.columns]
    df[columns_to_convert] = df[columns_to_convert].astype(float)

    # Normalize percentage column
    if "% lympho" in df.columns:
        df["% lympho"] = df["% lympho"] / 100

    return df


def split_main_strat(
    df, main_label_col, sub_label_col, pcode_col,
    target_main_class="III: Predominantly antibody deficiencies",
    train_size=0.7, val_size=0.15, test_size=0.15,
    random_state=42,
    save_to_csv=False,
    output_dir='splits'
):
    """
    Split a DataFrame into train / validation / test sets, stratified on the
    main label only, and add a simplified sub-label column.

    Parameters:
    - df: input DataFrame (not modified; a copy is used internally)
    - main_label_col: name of the main label column (used for stratification)
    - sub_label_col: name of the sub-label column
    - pcode_col: name of the identifier column
    - target_main_class: main-label value whose rows get a simplified
      sub-label; all other rows get 'NA'
    - train_size, val_size, test_size: proportions, must sum to 1.0
    - random_state: seed passed to both train_test_split calls
    - save_to_csv: when True, write X_/y_/meta_<split>.csv into output_dir
    - output_dir: directory for the CSV files

    Returns:
    - dict with keys 'train', 'val', 'test'; each value is a dict with
      'X' (feature columns), 'y' (main label + 'simplified_sub') and
      'meta' (every non-feature column), all with a fresh 0..n-1 index.

    NOTE(review): the sample rows shown in this notebook spell the class
    "III: Predominantly antibody defciencies"; the default
    target_main_class would not equal that value -- confirm which spelling
    the data actually uses and pass it explicitly if needed.
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Splits must sum to 1.0"

    # Automatically detect feature columns: everything except the two label
    # columns and the identifier column.
    feature_columns = df.drop(columns=[main_label_col, sub_label_col, pcode_col]).columns.tolist()

    # Simplify sub-labels for the target main class
    def simplify_sub_label(row):
        if row[main_label_col] != target_main_class:
            return 'NA'
        # [:1] rather than [0]: a blank sub-label falls into 'group_4'
        # instead of raising IndexError.
        first_char = str(row[sub_label_col]).strip()[:1]
        return 'group_1to3' if first_char in {'1', '2', '3'} else 'group_4'

    df = df.copy()
    df['simplified_sub'] = df.apply(simplify_sub_label, axis=1)

    # Stratify only on main_label
    strat_labels = df[main_label_col]

    # First split: Train vs Temp
    df_train, df_temp = train_test_split(
        df,
        stratify=strat_labels,
        test_size=(1.0 - train_size),
        random_state=random_state
    )

    # Second split: Validation vs Test (test share relative to the temp part)
    df_val, df_test = train_test_split(
        df_temp,
        stratify=df_temp[main_label_col],
        test_size=(test_size / (val_size + test_size)),
        random_state=random_state
    )

    # Metadata columns = every column that is not a feature. Built in the
    # frame's own column order so the layout of `meta` (and of the CSV
    # files) is the same on every run; a set difference would not be.
    feature_lookup = set(feature_columns)
    meta_columns = [col for col in df.columns if col not in feature_lookup]

    # Helper to extract and optionally save splits
    def process_and_save_split(df_split, split_name):
        X = df_split[feature_columns].reset_index(drop=True)
        y = df_split[[main_label_col, 'simplified_sub']].reset_index(drop=True)
        meta = df_split[meta_columns].reset_index(drop=True)

        if save_to_csv:
            os.makedirs(output_dir, exist_ok=True)
            X.to_csv(os.path.join(output_dir, f'X_{split_name}.csv'), index=False)
            y.to_csv(os.path.join(output_dir, f'y_{split_name}.csv'), index=False)
            meta.to_csv(os.path.join(output_dir, f'meta_{split_name}.csv'), index=False)

        return {'X': X, 'y': y, 'meta': meta}

    return {
        'train': process_and_save_split(df_train, 'train'),
        'val': process_and_save_split(df_val, 'val'),
        'test': process_and_save_split(df_test, 'test')
    }


def split_hierarchical_strat(
    df, main_label_col, sub_label_col, pcode_col,
    train_size=0.7, val_size=0.15, test_size=0.15,
    random_state=42,
    save_to_csv=False,
    output_dir='splits'
):
    """
    Split a DataFrame into train / validation / test sets, stratified on the
    combination of main label and sub-label.

    Rows whose (main, sub) combination occurs only once cannot be
    stratified. They are set aside and assigned to a split at random, with
    probabilities equal to the requested split sizes. The same is done for
    combinations that are left with a single row in the hold-out part after
    the first split: those rows are assigned at random to validation or
    test.

    Parameters:
    - df: input DataFrame (not modified)
    - main_label_col, sub_label_col: label columns forming the strata
    - pcode_col: name of the identifier column
    - train_size, val_size, test_size: proportions, must sum to 1.0
    - random_state: seed for both train_test_split calls and for the
      random assignment of non-stratifiable rows
    - save_to_csv: when True, write X_/y_/meta_<split>.csv into output_dir
    - output_dir: directory for the CSV files

    Returns:
    - dict with keys 'train', 'val', 'test'; each value is a dict with
      'X' (feature columns), 'y' (main label + sub-label) and 'meta'
      (every non-feature column), all with a fresh 0..n-1 index.
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Splits must sum to 1.0"

    # Automatically detect feature columns
    feature_columns = df.drop(columns=[main_label_col, sub_label_col, pcode_col]).columns.tolist()

    def split_off_singletons(frame):
        """Return (stratifiable rows, their stratification keys, singleton rows)."""
        keys = frame[main_label_col].astype(str) + '__' + frame[sub_label_col].astype(str)
        counts = keys.value_counts()
        singleton = keys.isin(counts[counts == 1].index)
        return frame[~singleton].copy(), keys[~singleton], frame[singleton].copy()

    # Private generator: same draws as seeding the global NumPy RNG with
    # random_state, without overwriting the caller's global random state.
    rng = np.random.RandomState(random_state)

    df_common, common_keys, df_rare = split_off_singletons(df)

    # First split: Train vs Temp
    df_train, df_temp = train_test_split(
        df_common,
        stratify=common_keys,
        test_size=(1.0 - train_size),
        random_state=random_state
    )

    # Second split: Val vs Test. A stratum can be left with a single row in
    # df_temp; stratified splitting rejects such strata, so they are set
    # aside here exactly like the full-dataset singletons above.
    relative_val_size = val_size / (val_size + test_size)
    df_temp_common, temp_keys, df_temp_rare = split_off_singletons(df_temp)
    df_val, df_test = train_test_split(
        df_temp_common,
        stratify=temp_keys,
        test_size=(1.0 - relative_val_size),
        random_state=random_state
    )

    # Assign RARE (non-stratifiable) samples. The full-dataset singletons
    # are drawn first so their assignment does not depend on whether any
    # hold-out singletons exist.
    rare_split_labels = rng.choice(
        ['train', 'val', 'test'],
        size=len(df_rare),
        p=[train_size, val_size, test_size]
    )
    temp_rare_labels = rng.choice(
        ['val', 'test'],
        size=len(df_temp_rare),
        p=[relative_val_size, 1.0 - relative_val_size]
    )

    df_train = pd.concat(
        [df_train, df_rare[rare_split_labels == 'train']],
        ignore_index=True
    )
    df_val = pd.concat(
        [df_val, df_rare[rare_split_labels == 'val'], df_temp_rare[temp_rare_labels == 'val']],
        ignore_index=True
    )
    df_test = pd.concat(
        [df_test, df_rare[rare_split_labels == 'test'], df_temp_rare[temp_rare_labels == 'test']],
        ignore_index=True
    )

    # Metadata columns = every column that is not a feature, in the frame's
    # own column order so the layout is identical on every run.
    feature_lookup = set(feature_columns)
    meta_columns = [col for col in df.columns if col not in feature_lookup]

    # Helper to process and optionally save splits
    def process_and_save_split(df_split, split_name):
        X = df_split[feature_columns].reset_index(drop=True)
        y = df_split[[main_label_col, sub_label_col]].reset_index(drop=True)
        meta = df_split[meta_columns].reset_index(drop=True)

        if save_to_csv:
            os.makedirs(output_dir, exist_ok=True)
            X.to_csv(os.path.join(output_dir, f'X_{split_name}.csv'), index=False)
            y.to_csv(os.path.join(output_dir, f'y_{split_name}.csv'), index=False)
            meta.to_csv(os.path.join(output_dir, f'meta_{split_name}.csv'), index=False)

        return {'X': X, 'y': y, 'meta': meta}

    return {
        'train': process_and_save_split(df_train, 'train'),
        'val': process_and_save_split(df_val, 'val'),
        'test': process_and_save_split(df_test, 'test')
    }


def count_label_combinations(df, main_label_col, sub_label_col):
    """
    Count how many rows fall into each (main label, sub-label) pair.

    Returns a DataFrame with the two label columns plus a 'count' column,
    ordered by main label and then by ascending count, with a fresh
    0..n-1 index.
    """
    pair_sizes = df.groupby([main_label_col, sub_label_col]).size()
    table = pair_sizes.rename('count').reset_index()
    ordered = table.sort_values(by=[main_label_col, 'count'], ascending=[True, True])
    return ordered.reset_index(drop=True)

def encode_iuis_columns(df):
    """
    Encode the text columns 'IUIS' and 'IUIS extended' as integers and
    derive the columns 'seq1', 'seq2' and 'seq3'.

    Encoding:
    - 'IUIS': the text before the first ':' is looked up in
      {'I': 1, 'II': 2, 'III': 3, 'IV': 4}; anything else, including
      "No arguments for lymphoid-PID", becomes 0.
    - 'IUIS extended': the leading integer of the text (e.g. "2. Severe
      ..." -> 2). "Unclassified" and every "No arguments for
      lymphoid-PID" row become 0.
    - 'seq1': 1 where IUIS == 1, else 0.
    - 'seq2': 1 where IUIS != 0, else 0.
    - 'seq3': IUIS 0 -> 5, 1 -> 4, 2 -> 0, 4 -> 3; IUIS 3 -> 2 when
      'IUIS extended' == 4, otherwise 1; -1 as fallback.

    Parameters:
    - df: DataFrame with 'IUIS' and 'IUIS extended' columns (not modified)

    Returns:
    - an encoded copy of df.

    Raises:
    - ValueError if an 'IUIS extended' value does not start with a digit
      and is neither "Unclassified" nor on a no-PID row.
    """
    iuis_class_map = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}
    no_pid_label = "No arguments for lymphoid-PID"

    df = df.copy()  # never modify the caller's DataFrame

    # Step 1: main class. The no-PID label has no known class prefix, so it
    # falls through to 0 together with every other unrecognized value.
    iuis_text = df["IUIS"].astype(str)
    no_pid_mask = iuis_text == no_pid_label
    df["IUIS"] = iuis_text.map(
        lambda label: iuis_class_map.get(str(label).split(":")[0], 0)
    ).astype(int)

    # Step 2: 'IUIS extended'. Parse the whole leading number rather than
    # only the first character, so "10. ..." is read as 10 and not as 1.
    ext_text = df["IUIS extended"].astype(str).str.strip()
    ext_text = ext_text.mask(no_pid_mask | (ext_text == "Unclassified"), "0")
    ext_code = ext_text.str.extract(r"^(\d+)", expand=False)

    unparsable = ext_code.isna()
    if unparsable.any():
        examples = list(pd.unique(ext_text[unparsable]))[:5]
        raise ValueError(
            f"Cannot parse a leading number from 'IUIS extended' values: {examples!r}"
        )
    df["IUIS extended"] = pd.to_numeric(ext_code).astype(int)

    # Step 3: subclassification columns
    df["seq1"] = (df["IUIS"] == 1).astype(int)
    df["seq2"] = (df["IUIS"] != 0).astype(int)

    # seq3: class-level lookup, then the one class (3) that depends on the
    # extended code is refined.
    seq3_by_class = {0: 5, 1: 4, 2: 0, 3: 1, 4: 3}
    seq3 = df["IUIS"].map(seq3_by_class)
    seq3 = seq3.mask((df["IUIS"] == 3) & (df["IUIS extended"] == 4), 2)
    df["seq3"] = seq3.fillna(-1).astype(int)  # -1 = fallback for unmapped classes

    return df


def stratified_split_and_save(
    df,
    non_feature_cols,
    label_col,
    output_dir="output_splits",
    train_size=0.6,
    val_size=0.2,
    test_size=0.2,
    random_state=17
):
    """
    Produce stratified train / validation / test partitions of a DataFrame
    and write the features, labels and metadata of each partition to CSV.

    Parameters:
    - df: input DataFrame
    - non_feature_cols: columns that must not appear among the features
    - label_col: label column used for stratification (has to be listed
      in non_feature_cols)
    - output_dir: target directory for the nine CSV files
    - train_size / val_size / test_size: partition proportions
      (defaults 0.6 / 0.2 / 0.2), must add up to 1.0
    - random_state: seed used for both splits

    Returns:
    - three 3-tuples ordered (train, val, test): the feature frames, the
      single-column label frames, and the metadata frames (full copies of
      each partition).
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Splits must sum to 1.0"
    assert label_col in df.columns, f"Label column '{label_col}' not found in DataFrame"
    assert label_col in non_feature_cols, f"Label column '{label_col}' must be in non_feature_cols"

    # Stage 1: carve the training part off from the hold-out part.
    df_train, df_holdout = train_test_split(
        df,
        stratify=df[label_col],
        test_size=val_size + test_size,
        random_state=random_state
    )

    # Stage 2: divide the hold-out part; the test share is expressed
    # relative to the hold-out size.
    val_ratio = val_size / (val_size + test_size)
    df_val, df_test = train_test_split(
        df_holdout,
        stratify=df_holdout[label_col],
        test_size=1 - val_ratio,
        random_state=random_state
    )

    partitions = {"train": df_train, "val": df_val, "test": df_test}
    feature_cols = [col for col in df.columns if col not in non_feature_cols]

    # One dict per artefact kind, each keyed by partition name.
    features = {name: part[feature_cols] for name, part in partitions.items()}
    labels = {name: part[[label_col]] for name, part in partitions.items()}
    metadata = {name: part.copy() for name, part in partitions.items()}

    os.makedirs(output_dir, exist_ok=True)

    # Files are written kind by kind: X_*, then y_*, then meta_*.
    for prefix, frames in (("X", features), ("y", labels), ("meta", metadata)):
        for name, frame in frames.items():
            frame.to_csv(os.path.join(output_dir, f"{prefix}_{name}.csv"), index=False)

    print(f"Data split and saved to '{output_dir}'")

    return tuple(features.values()), tuple(labels.values()), tuple(metadata.values())

# Dataset clean-up

In [19]:
# Load the raw workbook and keep only the rows that have an IUIS label.
df = pd.read_excel("ds_FLOWSOM_29092021.xlsx")
df = df.dropna(subset=["IUIS"])
# Displayed as the cell result: (rows, columns) after the filter.
df.shape

  warn(msg)


(399, 89)

In [7]:
# Iterating a DataFrame yields its column labels, so this prints every column name.
print(list(df))

['IUIS', 'IUIS extended', 'PCODE', 'M:F', 'Age (Y)', 'WBC', 'Lymfo/µL (Sysmex)', 'IgG', 'IGG2', 'IGG3', 'IgM', 'IGA', 'IgG (normalized against RV)', 'IGG2 (normalized against RV)', 'IGG3 (normalized against RV)', 'IgM (normalized against RV)', 'IGA (normalized against RV)', '% lympho', ' lympho/µL', 'B-cells/µL', 'preGC B-cells/µL', 'MBC + PC/µL', 'Uns memory B-cells/µL', 'Sw memory B-cells/µL', 'T-cells/µL', 'CD4+ T-cells/µL', 'naieve CD4+ T-cells/µL', 'CM CD4+ T-cells/µL', 'EM CD4+ T cells/µL', 'Effector TD CD4+ T-cells/µL', 'CD8+ T-cells/µL', 'naieve CD8+ T-cells/µL', 'CM CD8+ T-cells/µL', 'EM CD8+ T-cells/µL ', 'Effector TD 27+ CD8+ T-cells/µL', 'Effector TD  CD8+ T-cells/µL', 'DNT TCRgd-T-cells/µL', 'TCRgd= T-cells/µL', 'NK  cells/µL', '% B-cells (on lymfo)', '% preGC B-cells (on B)', '% MBC + PC (on B)', '% Uns memory B-cells (on B)', '% Sw memory B-cells (on B)', '% T-cells (on lymfo)', '% CD4+ T-cells (on T)', '% naieve CD4+ T-cells (on CD4 T)', '% CM CD4+ T-cells (on CD4 T)', 

In [20]:
# Apply the cleaning steps of cleaner() (defined in the Functions section).
df = cleaner(df)

  df[columns_to_clean] = df[columns_to_clean].replace(to_replace=["IR"], value=np.nan, regex=True)


In [21]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,IUIS,IUIS extended,PCODE,M:F,Age (Y),WBC,Lymfo/µL (Sysmex),IgG,IGG2,IGG3,...,% Effector TD CD4+ T-cells (on CD4 T),% CD8+ T-cells (on T),% naieve CD8+ T-cells (on CD8 T),% CM CD8+ T-cells (on CD8 T),% EM CD8+ T-cells (on CD8 T),% Effector TD 27+ CD8+ T-cells (on CD8 T),% Effector TD CD8+ T-cells (on CD8 T),% DNT TCRgd-T-cells (on T),% TCRgd+ T-cells (on T),% NK cells (on lymfo)
0,III: Predominantly antibody defciencies,2. Severe Reduction in at Least 2 Serum Immuno...,P001,F,12.25,8720.0,2280.0,6.0,2.07,0.178,...,0.004,0.3,0.67,0.27,0.015,0.031,0.01,0.02,0.06,0.07
1,III: Predominantly antibody defciencies,2. Severe Reduction in at Least 2 Serum Immuno...,P002,M,15.083333,6780.0,1940.0,5.9,1.63,0.175,...,0.001,0.27,0.57,0.35,0.031,0.044,0.005,0.013,0.121,0.081
2,III: Predominantly antibody defciencies,2. Severe Reduction in at Least 2 Serum Immuno...,P003,F,17.25,5950.0,2040.0,11.8,3.53,0.253,...,0.0,0.34,0.52,0.37,0.035,0.075,0.013,,0.076,0.089
3,III: Predominantly antibody defciencies,2. Severe Reduction in at Least 2 Serum Immuno...,P004,M,15.75,3910.0,1350.0,10.0,2.46,0.929,...,0.0,0.43,0.43,0.42,0.141,0.088,0.041,0.025,0.018,0.047
4,III: Predominantly antibody defciencies,2. Severe Reduction in at Least 2 Serum Immuno...,P005,M,15.0,9200.0,1860.0,12.6,2.79,1.27,...,0.02,0.33,0.45,0.01,0.013,0.443,0.08,0.018,0.26,0.153


In [22]:
# Persist the cleaned frame without the index column.
df.to_csv("ds_FLOWSOM.csv", index=False)

# Subset split

In [28]:
# NOTE(review): this reads "df_FLOWSOM.csv" (prefix "df_"), not the
# "ds_FLOWSOM.csv" written by the clean-up section above. In this notebook
# "df_FLOWSOM.csv" is written at the end of the "Cluster dataset" section,
# so that section apparently has to be run first -- confirm.
df = pd.read_csv("df_FLOWSOM.csv")

In [29]:
# Encode 'IUIS' / 'IUIS extended' as integers and add seq1, seq2, seq3.
df = encode_iuis_columns(df)

In [30]:
# Tabulate how many rows each (IUIS, IUIS extended) pair has and save the table.
label_counts = count_label_combinations(df, main_label_col='IUIS', sub_label_col='IUIS extended')
label_counts.to_csv("label_counts.csv", index=False)

In [31]:
# Stratified 60/20/20 split (the function defaults) written to the "split" directory.
(X_train, X_val, X_test), (y_train, y_val, y_test), (meta_train, meta_val, meta_test) = stratified_split_and_save(
    df=df,
    non_feature_cols=['PCODE', 'IUIS', 'IUIS extended', 'seq1', 'seq2', 'seq3'],
    label_col='seq3', # stratify on and export the derived 'seq3' label
    output_dir="split"
)

Data split and saved to 'split'


# Cluster dataset

In [12]:
# Load the three cluster tables and the metadata table.
df1 = pd.read_csv("cluster2mcl_perc.csv")
df2 = pd.read_csv("mcluster_counts.csv")
df3 = pd.read_csv("mcluster_perc.csv")
md = pd.read_csv("metadata.csv")

In [17]:
# In the three cluster tables the unnamed first column becomes the join key 'PCODE'.
pcode_header = {'Unnamed: 0': 'PCODE'}
df1, df2, df3 = [table.rename(columns=pcode_header) for table in (df1, df2, df3)]
# In the metadata table the unnamed column is discarded instead.
md = md.drop(columns='Unnamed: 0')

In [18]:
from functools import reduce


def _inner_join_on_pcode(left, right):
    """Inner-join two frames on their shared 'PCODE' column."""
    return pd.merge(left, right, on='PCODE', how='inner')


# Fold the four tables into one, joining left to right; only patient codes
# present in every table survive the inner joins.
data_frames = [df1, df2, df3, md]
df_merged = reduce(_inner_join_on_pcode, data_frames)

In [34]:
# Print the merged frame's column names; its shape is displayed as the cell result.
columns = df_merged.columns.tolist()
print(columns)
df_merged.shape

['PCODE', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26', 'C27', 'C28', 'C29', 'C30', 'C31', 'C32', 'C33', 'C34', 'C35', 'C36', 'C37', 'C38', 'C39', 'C40', 'C41', 'C42', 'C43', 'C44', 'C45', 'C46', 'C47', 'C48', 'C49', 'C50', 'C51', 'C52', 'C53', 'C54', 'C55', 'C56', 'C57', 'C58', 'C59', 'C60', 'C61', 'C62', 'C63', 'C64', 'C65', 'C66', 'C67', 'C68', 'C69', 'C70', 'C71', 'C72', 'C73', 'C74', 'C75', 'C76', 'C77', 'C78', 'C79', 'C80', 'C81', 'C82', 'C83', 'C84', 'C85', 'C86', 'C87', 'C88', 'C89', 'C90', 'C91', 'C92', 'C93', 'C94', 'C95', 'C96', 'C97', 'C98', 'C99', 'C100', 'C101', 'C102', 'C103', 'C104', 'C105', 'C106', 'C107', 'C108', 'C109', 'C110', 'C111', 'C112', 'C113', 'C114', 'C115', 'C116', 'C117', 'C118', 'C119', 'C120', 'C121', 'C122', 'C123', 'C124', 'C125', 'C126', 'C127', 'C128', 'C129', 'C130', 'C131', 'C132', 'C133', 'C134', 'C135', 'C136', 'C137', 'C

(398, 274)

In [35]:
# Run the same cleaning steps on the merged cluster frame and preview the result.
df_merged = cleaner(df_merged)
df_merged.head()

Unnamed: 0,PCODE,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,% Effector TD CD4+ T-cells (on CD4 T),% CD8+ T-cells (on T),% naieve CD8+ T-cells (on CD8 T),% CM CD8+ T-cells (on CD8 T),% EM CD8+ T-cells (on CD8 T),% Effector TD 27+ CD8+ T-cells (on CD8 T),% Effector TD CD8+ T-cells (on CD8 T),% TCRgd+ T-cells (on T),% NK cells (on lymfo),agegroup
0,P001,0.397924,0.07931,0.100449,0.053025,0.006496,0.071802,0.022489,0.004081,0.022648,...,0.004,0.3,0.67,0.27,0.015,0.0314,0.0095,0.06,0.07,10-17 jaar
1,P002,0.194802,0.226996,0.080528,0.006319,0.004162,0.010503,0.111871,0.012791,0.113832,...,0.001,0.27,0.57,0.35,0.031,0.044,0.005,0.121,0.081,10-17 jaar
2,P003,0.379372,0.119038,0.069921,0.007297,0.003508,0.018203,0.076965,0.004939,0.048271,...,0.0,0.34,0.52,0.37,0.035,0.075,0.013,0.076,0.089,10-17 jaar
3,P004,0.172252,0.209472,0.135169,0.00611,0.008976,0.006434,0.033388,0.036957,0.086577,...,0.0,0.43,0.44,0.1253,0.0428,0.0295,0.3608,0.018,0.047,10-17 jaar
4,P005,0.450865,0.058827,0.075021,0.124902,0.05151,0.108361,0.040458,0.037787,0.06311,...,0.04708,0.33,0.4975,0.1475,0.018,0.2458,0.097,0.26,0.153,10-17 jaar


In [41]:
# Persist the cleaned cluster frame without the index column.
df_merged.to_csv("df_cluster.csv", index=False)

In [16]:
# Reload both saved datasets: the cleaned FLOWSOM table and the cleaned cluster table.
df_fl = pd.read_csv("ds_FLOWSOM.csv")
df_cl = pd.read_csv("df_cluster.csv")

In [20]:
def uppercase_column(df, column_name):
    """Upper-case every value of ``column_name`` in ``df`` (in place) and return ``df``."""
    df[column_name] = df[column_name].apply(lambda x: x.upper())
    return df

# Normalize the case BEFORE extracting the codes; extracting first would
# compare the un-normalized codes and make the upper-casing pointless.
df_fl = uppercase_column(df_fl, 'PCODE')

fl_codes = df_fl["PCODE"].unique()
cl_codes = df_cl["PCODE"].unique()

# Sets give constant-time membership tests for the two comparisons below.
fl_code_set = set(fl_codes)
cl_code_set = set(cl_codes)

# Codes present in the FLOWSOM table but missing from the cluster table.
for element in fl_codes:
    if element not in cl_code_set:
        print(element)

# Codes present in the cluster table but missing from the FLOWSOM table
# (each preceded by blank lines to tell the two lists apart).
for element in cl_codes:
    if element not in fl_code_set:
        print('\n')
        print(element)

P557


In [21]:
# Remove P557, the code printed by the comparison cell above.
df_fl = df_fl[df_fl["PCODE"] != "P557"]

In [23]:
# Remove two columns from the cluster frame.
to_drop = ['Date of birth', 'Label short']
df_cl = df_cl.drop(columns=to_drop)
# Promote every int64 column to float64.
int_columns = df_cl.select_dtypes(include='int64').columns
df_cl = df_cl.astype(dict.fromkeys(int_columns, 'float64'))

In [27]:
# Save both aligned tables. The FLOWSOM table goes to "df_FLOWSOM.csv"
# (prefix "df_", a different file from the "ds_FLOWSOM.csv" read above);
# "df_cluster.csv" is overwritten with the tidied cluster frame.
df_fl.to_csv("df_FLOWSOM.csv", index=False)
df_cl.to_csv("df_cluster.csv", index=False)

# Split

In [32]:
# Load the tidied cluster dataset.
df = pd.read_csv("df_cluster.csv")

In [33]:
# Encode 'IUIS' / 'IUIS extended' as integers and add seq1, seq2, seq3.
df = encode_iuis_columns(df)

In [34]:
# Tabulate how many rows each (IUIS, IUIS extended) pair has and save the table.
# NOTE: "label_counts.csv" is the same file name used in the "Subset split"
# section, so an existing file from that section is overwritten here.
label_counts = count_label_combinations(df, main_label_col='IUIS', sub_label_col='IUIS extended')
label_counts.to_csv("label_counts.csv", index=False)

In [35]:
# Stratified 60/20/20 split (the function defaults) written to the "cluster" directory.
(X_train, X_val, X_test), (y_train, y_val, y_test), (meta_train, meta_val, meta_test) = stratified_split_and_save(
    df=df,
    non_feature_cols=['PCODE', 'IUIS', 'IUIS extended', 'seq1', 'seq2', 'seq3'],
    label_col='seq3', # stratify on and export the derived 'seq3' label
    output_dir="cluster"
)

Data split and saved to 'cluster'
