Code to create the final numerical data set.

For each `hadm_id`, the code extracts the chart-time row with the fewest NaN values.

The chart events file is read in chunks (20,000,000 rows per chunk by default); the code runs through 16 iterations to ensure the whole data set is processed.

In [5]:
# Import libraries
import os
import pandas as pd
import numpy as np

In [None]:
# Function to load the items table
def load_items(file_path):
    """
    Read the items table and keep only the rows for the item IDs of interest.

    Parameters:
    file_path (str): Location of the items CSV file.

    Returns:
    pd.DataFrame: The 'itemid', 'label' and 'abbreviation' columns, restricted
    to the rows whose 'itemid' is one of the eight selected IDs.
    """
    wanted_item_ids = [220210, 220277, 225309, 220045, 220739, 223900, 223901, 223762]

    # Only the three columns needed downstream are read from disk.
    item_table = pd.read_csv(file_path, usecols=['itemid', 'label', 'abbreviation'])
    is_wanted = item_table['itemid'].isin(wanted_item_ids)
    return item_table[is_wanted]

# Function to load and filter the chart events data in chunks
def load_chart_events_in_chunks(file_path, chunk_size=20000000):
    """
    Stream the chart events CSV chunk by chunk, keeping only the item IDs of interest.

    Parameters:
    file_path (str): Location of the chart events CSV file.
    chunk_size (int): Number of CSV rows read per chunk.

    Yields:
    pd.DataFrame: One DataFrame per chunk, holding only the rows whose
    'itemid' is one of the eight selected IDs.
    """
    wanted_item_ids = [220210, 220277, 225309, 220045, 220739, 223900, 223901, 223762]
    columns_to_read = ['subject_id', 'hadm_id', 'charttime', 'itemid', 'valuenum', 'valueuom']

    # With chunksize set, read_csv returns an iterator instead of one big frame.
    reader = pd.read_csv(file_path, usecols=columns_to_read, chunksize=chunk_size)
    for raw_chunk in reader:
        keep = raw_chunk['itemid'].isin(wanted_item_ids)
        yield raw_chunk[keep]

# Function to combine the two datasets
def combine_data(chart, items):
    """
    Combines the chart events and items data into a single wide DataFrame.

    The chart events are joined to the items table on 'itemid' and pivoted so
    that each (subject_id, hadm_id, charttime) becomes one row with one column
    per item label holding 'valuenum' (pivot_table's default aggregation, the
    mean, is applied when a label occurs more than once for the same key).
    The three GCS component columns are then summed into 'GCS Total' and dropped.

    Note: the sum skips missing components, so a row with only some GCS
    components recorded gets a partial total. A total of 0 (no component
    recorded) is stored as NaN.

    Parameters:
    chart (pd.DataFrame): The chart events DataFrame.
    items (pd.DataFrame): The items DataFrame.

    Returns:
    pd.DataFrame: The combined DataFrame.
    """
    gcs_components = ['GCS - Eye Opening', 'GCS - Motor Response', 'GCS - Verbal Response']

    combined_data = pd.merge(chart, items, on=['itemid'])
    combined_data = combined_data.pivot_table(
        index=['subject_id', 'hadm_id', 'charttime'],
        columns='label',
        values='valuenum',
    ).reset_index()

    # pivot_table only creates columns for labels present in this chunk, so a
    # chunk without one of the GCS components would raise KeyError on a plain
    # column selection. reindex supplies an all-NaN column for any missing one.
    gcs_total = combined_data.reindex(columns=gcs_components).sum(axis=1)

    # Assign the replaced Series back instead of calling replace(inplace=True)
    # on a column selection: that chained in-place form does not update the
    # DataFrame under pandas Copy-on-Write.
    combined_data['GCS Total'] = gcs_total.replace(0, np.nan)

    combined_data = combined_data.drop(columns=gcs_components, errors='ignore')
    return combined_data

# Function to select the chart time with the fewest NaNs
def option_fewest_nans(data):
    """
    Selects the chart time with the fewest NaNs for each hadm_id.

    Rows are ordered by hadm_id, then by their number of NaN values, then by
    charttime, and the first row of each hadm_id is kept. Among rows of the
    same hadm_id with equally few NaNs, the smallest charttime therefore wins.

    The input DataFrame is left unmodified.

    Parameters:
    data (pd.DataFrame): The combined DataFrame.

    Returns:
    pd.DataFrame: A DataFrame with one row per hadm_id, the one with the fewest NaNs.
    """
    # assign() works on a copy, so the temporary 'nan_count' helper column
    # never leaks into the caller's DataFrame.
    nan_count = data.isna().sum(axis=1)
    ranked = data.assign(nan_count=nan_count).sort_values(by=['hadm_id', 'nan_count', 'charttime'])
    fewest_nans_data = ranked.drop_duplicates(subset=['hadm_id'], keep='first')
    return fewest_nans_data.drop(columns=['nan_count'])

# Function to load patient data
def load_patient_data(file_path):
    """
    Read the patient table, keeping only the subject ID and anchor age.

    Parameters:
    file_path (str): Location of the patient CSV file.

    Returns:
    pd.DataFrame: The 'subject_id' and 'anchor_age' columns of the file.
    """
    wanted_columns = ['subject_id', 'anchor_age']
    return pd.read_csv(file_path, usecols=wanted_columns)

# Function to merge patient data with another DataFrame
def merge_with_patient_data(patient_data, df):
    """
    Inner-join the patient data onto another DataFrame using 'subject_id'.

    Only subjects present in both inputs appear in the result; the patient
    columns come first.

    Parameters:
    patient_data (pd.DataFrame): The DataFrame containing patient data.
    df (pd.DataFrame): The DataFrame to join with the patient data.

    Returns:
    pd.DataFrame: The merged DataFrame.
    """
    return patient_data.merge(df, how='inner', on=['subject_id'])

# Function to process the data in chunks and output the results
def process_and_output_chunks(chart_file_path, items_file_path, patient_file_path, chunk_size=20000000):
    """
    Run the full per-chunk pipeline over the chart events file.

    The items and patient tables are loaded once. Each chart events chunk is
    then pivoted together with the items, reduced to the row with the fewest
    NaNs per hadm_id, joined with the patient data and de-duplicated. A
    progress line is printed after every chunk.

    Parameters:
    chart_file_path (str): The path to the chart events CSV file.
    items_file_path (str): The path to the items CSV file.
    patient_file_path (str): The path to the patient data CSV file.
    chunk_size (int): The number of rows per chunk to read from the chart events CSV file.

    Returns:
    list: One processed DataFrame per chunk, in chunk order.
    """
    items = load_items(items_file_path)
    patient_data = load_patient_data(patient_file_path)
    final_dataframes = []

    chunk_stream = load_chart_events_in_chunks(chart_file_path, chunk_size)
    for chunk_number, chart_chunk in enumerate(chunk_stream):
        wide_chunk = combine_data(chart_chunk, items)
        best_rows = option_fewest_nans(wide_chunk)
        with_patient = merge_with_patient_data(patient_data, best_rows)

        # Fully identical rows within the chunk are collapsed before storing.
        final_dataframes.append(with_patient.drop_duplicates())

        print(f"Processed chunk {chunk_number}")

    return final_dataframes

# Function to combine all DataFrames and clean the resulting DataFrame
def combine_and_clean_dataframes(dataframes):
    """
    Combines all DataFrames into one large DataFrame, keeps one row per hadm_id,
    and removes rows with more than 2 NaN values.

    A hadm_id whose chart events span several chunks contributes one row per
    chunk. Of those rows, the one with the fewest NaN values is kept; on a tie
    the row from the earliest DataFrame wins. Keeping simply the first row
    could retain a sparse row that the NaN threshold below then discards,
    losing the hadm_id even though a more complete row exists in a later chunk.

    Parameters:
    dataframes (list): List of DataFrames to be combined and cleaned.

    Returns:
    pd.DataFrame: The combined and cleaned DataFrame.
    """
    # Combine all DataFrames into one large DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    # Order rows by NaN count with a stable sort (mergesort), so rows with
    # equal counts keep their concatenation order and "first" stays the
    # earliest chunk on ties.
    nan_rank = combined_df.isna().sum(axis=1).sort_values(kind='mergesort')
    combined_df = combined_df.loc[nan_rank.index]

    # Keep the most complete row per hadm_id, then restore concatenation order
    combined_df = combined_df.drop_duplicates(subset=['hadm_id'], keep='first').sort_index()

    # Remove rows with more than 2 NaN values
    # (thresh is the minimum number of non-NaN values a row must have)
    cleaned_df = combined_df.dropna(thresh=len(combined_df.columns) - 2)

    # Output the shape of the final cleaned DataFrame
    print("Combined and cleaned DataFrame shape:", cleaned_df.shape)

    return cleaned_df

# Locations of the input CSV files, all relative to the parent directory
# of the current working directory.
chart_file_path = os.path.join('..', 'chartevents.csv')
items_file_path = os.path.join('..', 'd_items.csv')
# NOTE(review): this resolves to ../patients.csv/patients.csv, i.e. a directory
# named 'patients.csv' containing the file - confirm this layout is intended.
patient_file_path = os.path.join('..','patients.csv', 'patients.csv')

# Run the per-chunk pipeline; yields one processed DataFrame per chunk
final_dataframes = process_and_output_chunks(chart_file_path, items_file_path, patient_file_path)

# Concatenate the per-chunk results, de-duplicate on hadm_id and drop sparse rows
cleaned_data = combine_and_clean_dataframes(final_dataframes)

# Write the result to final.csv in the current working directory
cleaned_data.to_csv('final.csv')

In [9]:
# Write cleaned_data to final.csv again (same call as at the end of the previous cell).
cleaned_data.to_csv('final.csv')