This code is used to create the Numerical Dataset

Three options are used to create the Dataset

1. Option 1: Using the earliest chart time for each hadm_id
2. Option 2: Picking the charttime with the fewest NaNs
3. Option 3: Picking, for each measurement, the first available reading within one hour of the first recorded chart time of each hadm_id

Things to note:

Because the chartevents data is very large, only the first 20,000,000 rows are loaded

In [2]:
# Import libraries

import pandas as pd
import os
import numpy as np

In [3]:
# Function to load the items table
def load_items(file_path, item_ids=None):

    """
    Loads the items data from a CSV file and keeps only the requested item IDs.

    Parameters:
    file_path (str): The path to the items CSV file.
    item_ids (iterable of int, optional): The item IDs to keep. When None,
        the eight item IDs used throughout this notebook are kept.

    Returns:
    pd.DataFrame: The matching rows, restricted to the columns 'itemid',
        'label' and 'abbreviation'. The result is an independent copy, so it
        can be modified without affecting (or warning about) the loaded table.
    """
    if item_ids is None:
        item_ids = (220210, 220277, 225309, 220045, 220739, 223900, 223901, 223762)

    items = pd.read_csv(file_path, usecols=['itemid', 'label', 'abbreviation'])
    # list() lets callers pass any iterable (set, tuple, generator, ...)
    return items[items['itemid'].isin(list(item_ids))].copy()

In [4]:
# Function to load the chart events table
def load_chart_events(file_path, nrows=20000000, item_ids=None, chunksize=1000000):
    """
    Loads the chart events data from a CSV file and keeps only the requested item IDs.

    The file is read and filtered chunk by chunk, so only the matching rows
    (plus one chunk) are held in memory instead of all `nrows` rows.

    Parameters:
    file_path (str): The path to the chart events CSV file.
    nrows (int): The number of rows to read from the CSV file.
    item_ids (iterable of int, optional): The item IDs to keep. When None,
        the eight item IDs used throughout this notebook are kept.
    chunksize (int): The number of CSV rows to parse per chunk.

    Returns:
    pd.DataFrame: The matching rows, restricted to the columns 'subject_id',
        'hadm_id', 'charttime', 'itemid', 'valuenum' and 'valueuom'. The index
        holds each row's position in the CSV file.
    """
    if item_ids is None:
        item_ids = (220210, 220277, 225309, 220045, 220739, 223900, 223901, 223762)
    wanted_ids = list(item_ids)
    columns = ['subject_id', 'hadm_id', 'charttime', 'itemid', 'valuenum', 'valueuom']

    reader = pd.read_csv(file_path, usecols=columns, nrows=nrows, chunksize=chunksize)

    matching_chunks = []
    empty_template = None
    for chunk in reader:
        matching = chunk[chunk['itemid'].isin(wanted_ids)]
        if matching.empty:
            # Remember one empty slice so an all-empty result keeps the CSV's columns
            if empty_template is None:
                empty_template = matching
        else:
            matching_chunks.append(matching)

    if matching_chunks:
        return pd.concat(matching_chunks)
    if empty_template is not None:
        return empty_template.copy()
    return pd.DataFrame(columns=columns)

In [5]:
# Function to combine the two datasets
def combine_data(chart, items):
    """
    Combines the chart events and items data into a single wide DataFrame.

    The chart events are joined to the items on 'itemid' and pivoted so that
    each (subject_id, hadm_id, charttime) becomes one row with one column per
    item label. Several readings of the same item at the same chart time are
    averaged (the pivot_table default). The three GCS component columns are
    replaced by a single 'GCS Total' column.

    Parameters:
    chart (pd.DataFrame): The chart events DataFrame.
    items (pd.DataFrame): The items DataFrame.

    Returns:
    pd.DataFrame: The combined DataFrame. 'GCS Total' is NaN unless all three
        GCS components were recorded at that chart time.
    """
    gcs_components = ['GCS - Eye Opening', 'GCS - Motor Response', 'GCS - Verbal Response']

    merged = pd.merge(chart, items, on=['itemid'])
    wide = merged.pivot_table(
        index=['subject_id', 'hadm_id', 'charttime'],
        columns='label',
        values='valuenum',
    ).reset_index()

    # reindex supplies an all-NaN column for any component that never occurs
    # in the data, instead of failing with a KeyError.
    components = wide.reindex(columns=gcs_components)
    # min_count keeps the total missing when any component is missing; a plain
    # sum would report 0 (or a partial score) and hide the missing values.
    wide['GCS Total'] = components.sum(axis=1, min_count=len(gcs_components))
    return wide.drop(columns=gcs_components, errors='ignore')

In [6]:
# Function for option 1: using the earliest chart time for each hadm_id
def option_earliest_chart_time(data):
    """
    Selects the row with the earliest chart time for each hadm_id.

    The input DataFrame is left untouched; 'charttime' is converted to
    datetimes on a copy.

    Parameters:
    data (pd.DataFrame): The combined DataFrame.

    Returns:
    pd.DataFrame: One row per hadm_id (ordered by hadm_id, with a fresh
        0..n-1 index) holding the readings at that admission's earliest
        chart time. 'charttime' is a datetime column.
    """
    # assign() returns a new frame, so the caller's 'charttime' is not overwritten
    converted = data.assign(charttime=pd.to_datetime(data['charttime']))
    # idxmin finds the earliest row of each group directly; no sort is needed
    earliest_index = converted.groupby('hadm_id')['charttime'].idxmin()
    return converted.loc[earliest_index].reset_index(drop=True)

In [7]:
# Function for option 2: picking the chart time with the fewest NaNs
def option_fewest_nans(data):
    """
    Selects the chart time with the fewest NaNs for each hadm_id.

    The NaN count is computed on a temporary copy, so no helper column is
    added to the input DataFrame.

    Parameters:
    data (pd.DataFrame): The combined DataFrame.

    Returns:
    pd.DataFrame: One row per hadm_id (ordered by hadm_id, keeping the
        original index labels): the row of that admission with the fewest
        missing values.
    """
    # assign() works on a copy; writing data['nan_count'] would leak the helper
    # column into the caller's frame and into every later use of it.
    ranked = data.assign(nan_count=data.isna().sum(axis=1))
    ranked = ranked.sort_values(by=['hadm_id', 'nan_count'])
    fewest_nans_data = ranked.drop_duplicates(subset=['hadm_id'], keep='first')
    return fewest_nans_data.drop(columns=['nan_count'])

In [8]:
# Function for option 3: Picking the first reading within the hour (from the start of the first recorded time)
def option_first_within_hour(data):
    """
    Selects, for each hadm_id, the first available reading of every column
    within one hour of that admission's first recorded chart time.

    The window covers chart times from the first recorded time (inclusive) up
    to one hour later (exclusive). For every column, the earliest non-missing
    value inside the window is kept; a column with no reading in the window
    stays NaN. The input DataFrame is left untouched.

    Parameters:
    data (pd.DataFrame): The combined DataFrame.

    Returns:
    pd.DataFrame: One row per hadm_id (ordered by hadm_id) with the same
        columns as the input, minus any 'nan_count' helper column.
        'charttime' is the start of the window, i.e. the first recorded
        chart time of the admission.
    """
    # Work on copies so the caller's frame is neither converted nor extended
    frame = data.drop(columns=['nan_count'], errors='ignore')
    frame = frame.assign(charttime=pd.to_datetime(frame['charttime']))
    frame = frame.sort_values(by=['hadm_id', 'charttime'])

    # Start of each admission's window, broadcast to every row of the admission
    window_start = frame.groupby('hadm_id')['charttime'].transform('min')
    in_first_hour = frame['charttime'] < window_start + pd.Timedelta(hours=1)

    # GroupBy.first() returns the first non-missing value of each column; since
    # the rows are sorted by time, that is the earliest reading in the window
    # (and, for 'charttime', the window start itself).
    first_readings = frame[in_first_hour].groupby('hadm_id', as_index=False).first()
    return first_readings[list(frame.columns)]

In [9]:
# Function to load patient data and combine with other dataset
def load_patient_data(file_path):
    """
    Reads the patient table, keeping only the patient identifier and age columns.

    Parameters:
    file_path (str): Location of the patient CSV file.

    Returns:
    pd.DataFrame: The 'subject_id' and 'anchor_age' columns of the file.
    """
    wanted_columns = ['subject_id', 'anchor_age']
    return pd.read_csv(file_path, usecols=wanted_columns)

def merge_with_patient_data(patient_data, df):
    """
    Joins the patient data onto another DataFrame using 'subject_id' as the key.

    Only subjects present in both DataFrames are kept (pandas' default inner
    join); the patient columns come first in the result.

    Parameters:
    patient_data (pd.DataFrame): The patient DataFrame (left side of the join).
    df (pd.DataFrame): The DataFrame to attach the patient data to.

    Returns:
    pd.DataFrame: The joined DataFrame.
    """
    return patient_data.merge(df, on=['subject_id'])

In [10]:
# Build the input paths; all three CSV files are expected one directory
# above the current working directory.
items_path, chart_path, patient_path = (
    os.path.join('..', file_name)
    for file_name in ('d_items.csv', 'chartevents.csv', 'patients.csv')
)

# Read the three source tables
items = load_items(items_path)
chart = load_chart_events(chart_path)
patient_data = load_patient_data(patient_path)

# Join chart events with their item labels and pivot to one row per chart time
combined_data = combine_data(chart, items)

FileNotFoundError: [Errno 2] No such file or directory: '..\\d_items.csv'

In [None]:
# Option 1: keep the earliest chart time of every hadm_id
earliest_data = option_earliest_chart_time(data=combined_data)

# Attach the patient age to the selected rows
final_earliest_data = merge_with_patient_data(patient_data=patient_data, df=earliest_data)

# Preview the first rows of the result
final_earliest_data.head()

In [None]:
# Option 2: keep the chart time with the fewest NaNs of every hadm_id
fewest_nans_data = option_fewest_nans(data=combined_data)

# Attach the patient age to the selected rows
final_fewest_nans_data = merge_with_patient_data(patient_data=patient_data, df=fewest_nans_data)

# Preview the first rows of the result
final_fewest_nans_data.head()

In [None]:
# Option 3: keep the first reading within the first hour of every hadm_id
first_within_hour_data = option_first_within_hour(data=combined_data)

# Attach the patient age to the selected rows
final_first_within_hour_data = merge_with_patient_data(patient_data=patient_data, df=first_within_hour_data)

# Preview the first rows of the result
final_first_within_hour_data.head()