In [78]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle
import dataextraction as de
import utils

In [79]:
# Open the database connection through the project's `dataextraction` helper
# (the cell output reports a successful PostgreSQL connection).
conn = de.connect_to_database()
# Cursor on that connection. It is not used in the cells visible here —
# presumably later queries need it (TODO confirm; otherwise it can be dropped).
cur = conn.cursor()
# Load the base dataset through the helper. Downstream cells use the result as
# a pandas DataFrame (they call .isnull(), .duplicated(), .groupby() on it).
df = de.get_base_dataset(conn)

Connected to the PostgreSQL database
PostgreSQL version: PostgreSQL 14.0, compiled by Visual C++ build 1914, 64-bit


# Preprocessing

In [80]:
# Report every column that has more than 10,000 missing values,
# one "column: count" line per column.
null_values = df.isna()
null_counts = null_values.sum()
for column, n_missing in null_counts[null_counts > 10000].items():
    print(f"{column}: {n_missing}")

dod: 28030
deathtime: 43994
edregtime: 19526
edouttime: 19526
language: 16770
height: 30281
height_chart: 30281
height_echo: 49855
weight_admit: 12277
weight_daily: 25712
weight_echoinhosp: 49855
weight_echoprehosp: 49855
albumin_min: 31803
albumin_max: 31803
bands_min: 44052
bands_max: 44052
bilirubin_min: 27604
bilirubin_max: 27604
lactate_min: 20419
lactate_max: 20419


In [81]:
# Count the rows whose subject_id already appeared in an earlier row
df.duplicated(subset='subject_id').sum()

11976

In [82]:
# Keep only the rows whose `first_icu_stay` flag equals True
df = df.loc[df['first_icu_stay'].eq(True)]

In [83]:
# Re-count repeated subject_id rows now that only first ICU stays remain
df.duplicated(subset='subject_id').sum()

9321

In [84]:
# Aggregate means: create a `_mean` column for measurements that have min and
# max columns but no mean column (generally lab and vital measurements).
# NOTE(review): both helpers live in the project's `utils` module, which is not
# visible here — confirm there how the mean is actually derived from min/max.
missing_mean_columns = utils.detect_missing_mean_columns(df)
df = utils.add_missing_mean_columns(df, missing_mean_columns)

In [85]:
# Build the aggregation spec used to collapse multiple records of one patient:
# every column whose name contains '_mean' is averaged across those records.
mean_columns = list(filter(lambda name: '_mean' in name, df.columns))
aggregation_functions = dict.fromkeys(mean_columns, 'mean')
aggregation_functions

{'heartrate_mean': 'mean',
 'sysbp_mean': 'mean',
 'diasbp_mean': 'mean',
 'meanbp_mean': 'mean',
 'resprate_mean': 'mean',
 'tempc_mean': 'mean',
 'spo2_mean': 'mean',
 'glucose_mean': 'mean',
 'aniongap_mean': 'mean',
 'albumin_mean': 'mean',
 'bands_mean': 'mean',
 'bicarbonate_mean': 'mean',
 'bilirubin_mean': 'mean',
 'creatinine_mean': 'mean',
 'chloride_mean': 'mean',
 'hematocrit_mean': 'mean',
 'hemoglobin_mean': 'mean',
 'lactate_mean': 'mean',
 'platelet_mean': 'mean',
 'potassium_mean': 'mean',
 'ptt_mean': 'mean',
 'inr_mean': 'mean',
 'pt_mean': 'mean',
 'sodium_mean': 'mean',
 'bun_mean': 'mean',
 'wbc_mean': 'mean'}

In [86]:
# Collapse to one row per subject_id by averaging each `_mean` column over all
# of that subject's records (a subject can have several records, each with its
# own mean measurements).
# NOTE(review): the aggregation spec lists only the `_mean` columns, so the
# result holds `subject_id` plus those columns; every other column of `df` is
# not carried over. Confirm this is intended.
df = df.groupby('subject_id', as_index=False).agg(aggregation_functions)

In [87]:
# Count repeated subject_id values to verify that each subject now has
# exactly one row (expected result: 0)
df.duplicated(subset='subject_id').sum()

0