In [2]:
import os
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder, RobustScaler, MinMaxScaler

In [4]:



def load_and_convert_grid_to_triplets(grid_data_path, output_file='grid_triplets.csv', 
                                     preprocess_params_dir='enhanced_preprocessed_data'):
    """
    Alternative method: Load existing grid data and convert it to triplets format.
    Useful if you already have the time-grid data from previous steps.
    
    Args:
        grid_data_path: Path to the patient grid data (from previous processing)
        output_file: Name of the output file to save triplets
        preprocess_params_dir: Directory containing preprocessing parameters
    
    Returns:
        DataFrame containing triplets
    """
    print(f"Loading grid data from {grid_data_path} and converting to triplets")
    
    # Load grid data
    grid_df = pd.read_csv(grid_data_path)
    
    # Load preprocessing parameters
    try:
        robust_scaler = pickle.load(open(os.path.join(preprocess_params_dir, 'robust_scaler.pkl'), 'rb'))
        minmax_scaler = pickle.load(open(os.path.join(preprocess_params_dir, 'minmax_scaler.pkl'), 'rb'))
        scaling_info = pickle.load(open(os.path.join(preprocess_params_dir, 'scaling_info.pkl'), 'rb'))
        
        # Extract scaling groups
        log_features = scaling_info['log_features']
        minmax_features = scaling_info['minmax_features']
        robust_features = scaling_info['robust_features']
        log_epsilon = scaling_info.get('log_epsilon', 1e-6)
        
        print(f"Loaded preprocessing parameters from {preprocess_params_dir}")
    except Exception as e:
        print(f"Error loading preprocessing parameters: {e}")
        # Default scaling groups if not loaded
        log_features = ['Glucose', 'BUN', 'Creatinine', 'AST', 'ALT', 'ALP', 'Bilirubin', 
                        'Troponin', 'Lactate', 'PaO2', 'Urine', 'WBC', 'Platelets']
        minmax_features = ['pH', 'FiO2', 'O2Sat', 'GCS', 'Age']
        log_epsilon = 1e-6
        robust_features = []
    
    # Identify columns to exclude from parameter list (metadata columns)
    exclude_columns = ['PatientID', 'Hour', 'Time', 'Gender', 'ICUType', 'Age', 'Height', 'Weight']
    
    # Identify parameter columns (all columns that are not excluded)
    param_columns = [col for col in grid_df.columns if col not in exclude_columns]
    
    if not robust_features:
        robust_features = [col for col in param_columns 
                          if col not in log_features and col not in minmax_features]
    
    # Create parameter encoder
    param_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    param_encoder.fit(np.array(param_columns).reshape(-1, 1))
    
    # Determine max hour for time scaling (typically 48 from the original code)
    max_hour = grid_df['Hour'].max()
    
    # Create triplets by melting the grid data
    print("Converting grid data to triplets format...")
    
    # Melt the DataFrame to long format (Hour, Parameter, Value)
    melted_df = pd.melt(
        grid_df, 
        id_vars=['PatientID', 'Hour'],
        value_vars=param_columns,
        var_name='Parameter',
        value_name='v_original'
    )
    
    # Remove rows with null values
    melted_df = melted_df.dropna(subset=['v_original'])
    
    # Scale time to [0,1]
    melted_df['t'] = melted_df['Hour'] / max_hour
    
    # Apply appropriate scaling to values based on parameter type
    melted_df['v'] = melted_df['v_original']  # Default, will update for each scaling type
    
    # Apply log scaling for log features
    for param in log_features:
        if param in param_columns:
            param_mask = melted_df['Parameter'] == param
            # Only apply to positive values
            valid_mask = param_mask & (melted_df['v_original'] > 0)
            if valid_mask.sum() > 0:
                melted_df.loc[valid_mask, 'v'] = np.log1p(
                    melted_df.loc[valid_mask, 'v_original'].clip(lower=log_epsilon)
                )
    
    # Apply MinMax scaling for minmax features
    for param in minmax_features:
        if param in param_columns:
            param_mask = melted_df['Parameter'] == param
            if param_mask.sum() > 0:
                values = melted_df.loc[param_mask, 'v_original'].values.reshape(-1, 1)
                if len(values) > 1:  # Need at least 2 values for proper scaling
                    param_minmax = MinMaxScaler().fit(values)
                    melted_df.loc[param_mask, 'v'] = param_minmax.transform(values).flatten()
    
    # Apply Robust scaling for robust features
    for param in robust_features:
        if param in param_columns:
            param_mask = melted_df['Parameter'] == param
            if param_mask.sum() > 0:
                values = melted_df.loc[param_mask, 'v_original'].values.reshape(-1, 1)
                if len(values) > 1:  # Need at least 2 values for proper scaling
                    param_robust = RobustScaler().fit(values)
                    melted_df.loc[param_mask, 'v'] = param_robust.transform(values).flatten()
    
    # Apply one-hot encoding for parameters
    param_encoded = param_encoder.transform(melted_df['Parameter'].values.reshape(-1, 1))
    
    # Create a DataFrame with the one-hot encoded parameters
    param_df = pd.DataFrame(
        param_encoded, 
        columns=param_encoder.get_feature_names_out(),
        index=melted_df.index
    )
    
    # Concatenate with melted DataFrame
    triplets_df = pd.concat([melted_df, param_df], axis=1)
    
    # Save the transformers for future use
    transformers = {
        'param_encoder': param_encoder,
        'max_hour': max_hour,
        'log_features': log_features,
        'minmax_features': minmax_features,
        'robust_features': robust_features,
        'log_epsilon': log_epsilon
    }
    
    # Save output if filename provided
    if output_file:
        triplets_df.to_csv(output_file, index=False)
        print(f"Triplets saved to {output_file}")
        
        # Save transformers
        pickle.dump(transformers, open(output_file.replace('.csv', '_transformers.pkl'), 'wb'))
        print(f"Transformers saved to {output_file.replace('.csv', '_transformers.pkl')}")
    
    print(f"Created {len(triplets_df)} triplets for {triplets_df['PatientID'].nunique()} patients")
    print(f"DataFrame shape: {triplets_df.shape}")
    
    return triplets_df, transformers

In [5]:



# Example usage (depending on whether starting from raw files or grid data)
if __name__ == "__main__":

    # Option 2: Convert from existing grid data
    grid_data_path = '/home/taekim/enhanced_preprocessed_data/enhanced_sampled_set-a.csv'  # From previous processing
    triplets_df, transformers = load_and_convert_grid_to_triplets(
         grid_data_path,
         output_file='grid_triplets_set-a.csv',
         preprocess_params_dir='enhanced_preprocessed_data'
    )
    
    # Display triplet examples
    print("\nTriplet examples:")
    print(triplets_df.head())
    
    # Display information about the categorical encoding
    if transformers:
        print("\nParameter encoding information:")
        print(f"Number of parameters encoded: {len(transformers['param_encoder'].get_feature_names_out())}")
        print(f"Encoded feature names: {transformers['param_encoder'].get_feature_names_out()}")

Loading grid data from /home/taekim/enhanced_preprocessed_data/enhanced_sampled_set-a.csv and converting to triplets
Loaded preprocessing parameters from enhanced_preprocessed_data


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'