# Data Processing

This notebook focuses on processing and preparing medical data for predictive modeling. It includes functions for feature engineering, handling missing data, scaling features, and removing outliers. The workflow involves loading datasets, applying transformations, and saving processed data for further analysis.

In [None]:
from tqdm import tqdm
from column_groups import *
tqdm.pandas()
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import zscore
import joblib

In [None]:

def separate_id(row):
    """
    Tells whether the row's 'epic_pmrn' identifier contains letters (and therefore comes from numom).
    
    Args:
        row (pd.Series): A row from the DataFrame containing the 'epic_pmrn' column.
    
    Returns:
        bool: False if the identifier consists only of digits, True if it contains
              at least one alphabetic character.
    
    Raises:
        ValueError: If the identifier is neither all digits nor contains any letter
                    (e.g. an empty string, punctuation only, or a value such as '12.0').
    """
    identifier = str(row.epic_pmrn)

    # Purely numeric identifiers are not numom identifiers
    if identifier.isdigit():
        return False

    # Any alphabetic character marks the identifier as numom.
    # NOTE(review): a missing value stringifies to 'nan' and is therefore
    # reported as True here - confirm this is intended.
    if any(char.isalpha() for char in identifier):
        return True

    raise ValueError(f"Unrecognised epic_pmrn format: {identifier!r}")


def process_hdp_labs(row):
    """
    Summarises the laboratory measurements of a single row.
    
    For every lab column, the pipe-separated values in the column are paired with the
    pipe-separated timestamps in the matching '<column>_datetime' column. Only values
    whose timestamp is strictly before the row's 'limit_date' are kept, and their
    minimum, maximum, mean and last value are written back to the row as
    '<column>_min', '<column>_max', '<column>_mean' and '<column>_most_recent'.
    
    Args:
        row (pd.Series): A row containing the lab columns, their '_datetime'
                         counterparts and 'limit_date'.
    
    Returns:
        pd.Series: The same row object, extended with the summary entries. When no
                   value precedes 'limit_date' the four entries are set to None; when
                   the '_datetime' entry is missing no entries are added for that lab.
    """
    lab_cols = ['labs_glucose_serum', 'labs_calcium_serum', 'labs_magnesium_blood', 'labs_ptt', 'labs_ptinr', 
                'labs_urine_protein_24h_derived_timeseries_mg_dl_d', 'labs_urine_protein_24h_derived_timeseries_volume',
                'labs_urine_spot_protein_timeseries', 'labs_urine_spot_creatinine_timeseries', 
                'labs_urine_protein_24h_derived_timeseries_mg_tv']
    vital_cols = []

    for val_col in vital_cols + lab_cols:
        date_col = val_col + '_datetime'
        if pd.isna(row[date_col]):
            # No timestamps recorded for this measurement: nothing to summarise
            continue

        # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
        # is the default), so it is no longer passed.
        timestamps = pd.to_datetime(pd.Series(row[date_col].split('|')))
        before_limit = np.array(timestamps < row.limit_date)
        raw_values = np.array(str(row[val_col]).split('|'))[before_limit]
        values = [float(x) for x in raw_values]

        if values:
            row[val_col + '_min'] = np.min(values)
            row[val_col + '_max'] = np.max(values)
            row[val_col + '_mean'] = np.mean(values)
            # NOTE(review): takes the last retained entry, which assumes the
            # pipe-separated series is stored in chronological order - confirm.
            row[val_col + '_most_recent'] = values[-1]
        else:
            row[val_col + '_min'] = None
            row[val_col + '_max'] = None
            row[val_col + '_mean'] = None
            row[val_col + '_most_recent'] = None
    return row


def remove_outliers_high_missingness(dfs_to_modify, labels, entry_missing_threshold=0.95):
    """
    Drops the rows with the highest share of missing values from each DataFrame.
    
    For every DataFrame the fraction of missing values per row is computed, and rows
    whose fraction exceeds the `entry_missing_threshold` quantile of that DataFrame's
    own per-row missingness distribution are removed. The matching labels are filtered
    with the same mask. Each DataFrame is processed independently.
    
    Args:
        dfs_to_modify (list of pd.DataFrame): DataFrames to be filtered.
        labels (list of pd.Series): Labels corresponding position-wise to the DataFrames.
        entry_missing_threshold (float): Quantile (between 0 and 1) of the per-row
            missingness distribution; rows above that quantile's value are removed.
            It is NOT an absolute missing-data percentage.
    
    Returns:
        tuple: The list of filtered DataFrames and the list of filtered labels.
    """
    new_dfs = []
    new_labels = []
    for position, df in enumerate(dfs_to_modify):
        label = labels[position]

        # Fraction of missing cells in each row
        entry_missing_percentages = df.isnull().mean(axis=1)
        # Build the keep-mask once and apply it to both the features and the labels
        cutoff = entry_missing_percentages.quantile(entry_missing_threshold)
        keep_rows = entry_missing_percentages <= cutoff

        new_dfs.append(df[keep_rows])
        new_labels.append(label[keep_rows])
    return new_dfs, new_labels


def scale_features(X_train, other_dfs, weeks, exclude_features=None):
    """
    Applies Min-Max scaling, fitted on the training DataFrame, to all given DataFrames.
    
    The scaler is fitted on the non-excluded columns of `X_train` and then reused to
    transform the same columns of every DataFrame in `other_dfs`. In each result the
    scaled columns come first, followed by the excluded columns unchanged, and the
    index is reset to a default range index. The fitted scaler is also written to
    '../models/scalers/{weeks}weeks_pretraining.scaler'.
    
    Args:
        X_train (pd.DataFrame): Training DataFrame used to fit the scaler.
        other_dfs (list of pd.DataFrame): Further DataFrames transformed with the fitted scaler.
        weeks (int): Number of weeks, used only in the name of the saved scaler file.
        exclude_features (list of str, optional): Columns to leave unscaled.
    
    Returns:
        tuple: The scaled training DataFrame and the list of scaled other DataFrames.
    """
    excluded = [] if exclude_features is None else exclude_features
    scaled_columns = [column for column in X_train.columns if column not in excluded]

    scaler = MinMaxScaler()

    def _assemble(frame, scaled_values):
        # Scaled columns first, then the untouched excluded columns, on a fresh index
        scaled_part = pd.DataFrame(scaled_values, columns=scaled_columns)
        untouched_part = frame[excluded].reset_index(drop=True)
        return pd.concat([scaled_part, untouched_part], axis=1)

    # Fit on the training data only, then reuse the fitted scaler everywhere else
    X_train_final = _assemble(X_train, scaler.fit_transform(X_train[scaled_columns]))
    other_dfs_final = [
        _assemble(frame, scaler.transform(frame[scaled_columns]))
        for frame in other_dfs
    ]

    # Persist the fitted scaler
    joblib.dump(scaler, f'../models/scalers/{weeks}weeks_pretraining.scaler')

    return X_train_final, other_dfs_final


def impute_median(X_train, other_dfs, exclude_features=None):
    """
    Fills missing values with per-feature medians computed on the training DataFrame.
    
    Medians are taken from `X_train` only and reused for every DataFrame in
    `other_dfs`. In each result the imputed columns come first, followed by the
    excluded columns unchanged, and the index is reset to a default range index.
    
    Args:
        X_train (pd.DataFrame): Training DataFrame used to compute the medians.
        other_dfs (list of pd.DataFrame): Further DataFrames imputed with the training medians.
        exclude_features (list of str, optional): Columns to leave untouched.
            Defaults to None, meaning no column is excluded.
    
    Returns:
        tuple: The imputed training DataFrame and the list of imputed other DataFrames.
    """
    # None (rather than a mutable [] default) stands for "exclude nothing"
    if exclude_features is None:
        exclude_features = []
    features_to_impute = [feature for feature in X_train.columns if feature not in exclude_features]

    # Medians come from the training data only
    imputing_values = X_train[features_to_impute].median()

    def _fill(df):
        # Imputed columns first, then the excluded columns, both on a fresh index
        filled = df[features_to_impute].fillna(imputing_values).reset_index(drop=True)
        untouched = df[exclude_features].reset_index(drop=True)
        return pd.concat([filled, untouched], axis=1)

    X_train_final = _fill(X_train)
    other_dfs_final = [_fill(df) for df in other_dfs]

    return X_train_final, other_dfs_final


In [None]:
# Columns carrying patient/record information; they are kept alongside the features
info_cols = ['epic_pmrn', 'delivery_date', 'delivery_hospital', 'pregnancy_start', 'limit_date', 'unique_id']

# Build one modelling dataset per weeks value
for weeks in [14, 20, 24, 28, 32, 34, 36, 38]:
    # Load the dataset for the given number of weeks
    DATA = pd.read_csv(f'../processed_data/processing_data/{weeks}_final.csv')

    # Turn the two date columns into flags: True when the date precedes limit_date.
    # NOTE(review): the columns come straight from the CSV, so this is presumably a
    # string comparison that relies on a sortable date format - confirm.
    DATA['magnesium_medication'] = DATA.magnesium_medication < DATA.limit_date
    DATA['proteinuria'] = DATA.proteinuria < DATA.limit_date

    # Rows whose identifier contains letters (numom rows, see separate_id)
    final_numom_rows = DATA.apply(separate_id, axis=1)

    # Keep rows from the listed hospitals that have a newborn gestational age, plus all numom rows
    hospital_rows = DATA.delivery_hospital.isin(['bwh', 'mgh', 'nwh']) & DATA.newborn_gestational_age.notna()
    DATA = DATA[hospital_rows | final_numom_rows.values].reset_index(drop=True)

    # Recompute the numom mask so it lines up with the filtered, re-indexed data
    final_numom_rows = DATA.apply(separate_id, axis=1)

    # Define feature sets
    features = [x for x in final_numom if x not in group_cols]
    engineered_features = [x for x in DATA.columns if '__' in x or 'coef' in x]
    model_cols = features + engineered_features + comparison_cols
    final_features = model_cols + info_cols

    # epic_pmrn values of the numom rows
    final_numom_epics = DATA.epic_pmrn[final_numom_rows]

    # Split the non-numom epic_pmrn values into training and testing sets
    train_epics, test_epics = train_test_split(DATA[~final_numom_rows].epic_pmrn.unique(), test_size=0.2, random_state=42)

    # Row masks for the three subsets, computed once and reused for X and y
    train_mask = DATA.epic_pmrn.isin(train_epics)
    test_mask = DATA.epic_pmrn.isin(test_epics)
    numom_mask = DATA.epic_pmrn.isin(final_numom_epics)

    # Impute with medians taken from the training rows only, so that no statistic
    # of the test or numom rows leaks into the imputed values.
    DATA[model_cols] = DATA[model_cols].fillna(DATA.loc[train_mask, model_cols].median())

    # Prepare training and testing data; the label is whether 'all_pet_xai3_dates' is present
    X_train, y_train = DATA[train_mask][final_features], DATA[train_mask]['all_pet_xai3_dates'].notna()
    X_test, y_test = DATA[test_mask][final_features], DATA[test_mask]['all_pet_xai3_dates'].notna()

    # Prepare numom data
    X_numom, y_numom = DATA[numom_mask][final_features], DATA[numom_mask]['all_pet_xai3_dates'].notna()

    # Remove outliers based on missingness.
    # NOTE(review): this runs after the median imputation above, so the missingness
    # it measures is mostly that of info_cols (and of columns without a median) -
    # confirm whether the filter was meant to run before imputation.
    [X_train, X_test, X_numom], [y_train, y_test, y_numom] = remove_outliers_high_missingness([X_train, X_test, X_numom], [y_train, y_test, y_numom])

    # Save the processed data to a file
    filepath = f"../processed_data/modelling_data/{weeks}_data.pkl"
    final_file = {'train': {'X': X_train, 'y': y_train},
                  'test': {'X': X_test, 'y': y_test},
                  'numom': {'X': X_numom, 'y': y_numom}}
    joblib.dump(final_file, filepath)
