In [10]:
import os
import sys
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

# Make the sibling "utils" directory (one level above the current working
# directory) importable, so project helper modules can be loaded below.
current_dir = os.getcwd()
utils_path = os.path.join(current_dir, os.pardir, 'utils')
utils_abs_path = os.path.abspath(utils_path)

# Append only once so re-running this cell does not keep growing sys.path
if sys.path.count(utils_abs_path) == 0:
    sys.path.append(utils_abs_path)

import get_data

# Dataset location as returned by the project's get_data helper.
DATA_PATH = get_data.get_dataset_abspath()

# Build the per-set directories with os.path.join: it inserts a separator only
# when DATA_PATH does not already end with one, so the paths are correct either
# way (plain string "+" silently produced a wrong path without a trailing separator).
training_setA_path = os.path.join(DATA_PATH, 'training_setA')
training_setB_path = os.path.join(DATA_PATH, 'training_setB')

In [14]:
# READS TRAINING SET A FILES
# Tallies, for every column, how many values are NaN and how many rows were
# read across all ".psv" files in training_setA_path.
# Result layout: nan_countsA[column] == {"nan": <int>, "total": <int>}
nan_countsA = {}

# List all files in the directory; sorted so the processing order is
# reproducible (os.listdir does not guarantee any particular order).
file_list = sorted(os.listdir(training_setA_path))

# Given the number of files this loop is quite slow so do not execute this block needlessly
for file_name in file_list:
    if not file_name.endswith(".psv"):
        continue

    file_path = os.path.join(training_setA_path, file_name)
    # Files are pipe-separated
    data = pd.read_csv(file_path, sep='|')

    # One vectorized pass per file instead of one isna() call per column
    nan_per_column = data.isna().sum()
    row_count = len(data)

    for column, nan_count in nan_per_column.items():
        # setdefault creates the entry on first sight, so a column that is
        # absent from the first file no longer raises KeyError later on.
        counts = nan_countsA.setdefault(column, {"nan": 0, "total": 0})
        counts["nan"] += int(nan_count)  # store plain ints, not numpy scalars
        counts["total"] += row_count

In [15]:
# READS TRAINING SET B FILES
# Tallies, for every column, how many values are NaN and how many rows were
# read across all ".psv" files in training_setB_path.
# Result layout: nan_countsB[column] == {"nan": <int>, "total": <int>}
nan_countsB = {}

# List all files in the directory; sorted so the processing order is
# reproducible (os.listdir does not guarantee any particular order).
file_list = sorted(os.listdir(training_setB_path))

# Given the number of files this loop is quite slow so do not execute this block needlessly
for file_name in file_list:
    if not file_name.endswith(".psv"):
        continue

    file_path = os.path.join(training_setB_path, file_name)
    # Files are pipe-separated
    data = pd.read_csv(file_path, sep='|')

    # One vectorized pass per file instead of one isna() call per column
    nan_per_column = data.isna().sum()
    row_count = len(data)

    for column, nan_count in nan_per_column.items():
        # setdefault creates the entry on first sight, so a column that is
        # absent from the first file no longer raises KeyError later on.
        counts = nan_countsB.setdefault(column, {"nan": 0, "total": 0})
        counts["nan"] += int(nan_count)  # store plain ints, not numpy scalars
        counts["total"] += row_count

In [60]:
# Side-by-side report of the NaN tallies for training sets A and B,
# one column per printed line.

def _format_nan_row(column, counts):
    """Format one column's tally as fixed-width 'name:  nan / total  pct%' text.

    Args:
        column: Column name shown at the start of the row.
        counts: Dict holding the "nan" and "total" counts for that column.

    Returns:
        The formatted row; 44 characters wide as long as no field overflows
        its padding.
    """
    nan_count = counts["nan"]
    total_count = counts["total"]
    # A zero total prints 0.00% instead of raising ZeroDivisionError
    percentage = (nan_count / total_count) * 100 if total_count else 0
    return (
        f"{column}:".ljust(17)
        + f" {nan_count} ".rjust(10)
        + f"/ {total_count}".ljust(10)
        + f"{percentage:6.2f}%"
    )


# Width of one formatted row: 17 (name) + 10 (nan) + 10 (total) + 7 (percentage)
ROW_WIDTH = 17 + 10 + 10 + 7
SEPARATOR = "|".rjust(20)

# Offset the "Training Set B" title by the real row width so it sits directly
# above the set B data (a fixed 70 left it 6 characters to the right).
heading = "Training Set A".ljust(ROW_WIDTH + len(SEPARATOR)) + "Training Set B" +\
          "\n" + "_"*120
print(heading)

# Pair rows by column name rather than by dict position: with positional zip()
# a column missing from, or ordered differently in, one set was silently
# dropped or printed next to an unrelated column.
columns = list(nan_countsA) + [name for name in nan_countsB if name not in nan_countsA]
blank_row = " " * ROW_WIDTH

for column in columns:
    outputA = _format_nan_row(column, nan_countsA[column]) if column in nan_countsA else blank_row
    outputB = _format_nan_row(column, nan_countsB[column]) if column in nan_countsB else blank_row
    print(outputA + SEPARATOR + outputB)

Training Set A                                                        Training Set B
________________________________________________________________________________________________________________________
HR: 61189 / 790215                        7.74%                   |HR:                  92210 / 761995   12.10%
O2Sat: 95079 / 790215                    12.03%                   |O2Sat:              107657 / 761995   14.13%
Temp: 523314 / 790215                    66.22%                   |Temp:               503670 / 761995   66.10%
SBP: 120201 / 790215                     15.21%                   |SBP:                106064 / 761995   13.92%
MAP: 80858 / 790215                      10.23%                   |MAP:                112412 / 761995   14.75%
DBP: 380297 / 790215                     48.13%                   |DBP:                106257 / 761995   13.94%
Resp: 77258 / 790215                      9.78%                   |Resp:               161077 / 761995   21.14%
EtCO2: 790

In [23]:
# Print one "column: nan / total    pct%" line for each training set B column.
for name, tally in nan_countsB.items():
    missing, seen = tally["nan"], tally["total"]

    if seen:
        pct = (missing / seen) * 100
    else:
        pct = 0  # nothing counted for this column; skip the division

    label = f"{name}: {missing} / {seen}".ljust(40)
    print(label + f"{pct:6.2f}%")

HR: 92210 / 761995                       12.10%
O2Sat: 107657 / 761995                   14.13%
Temp: 503670 / 761995                    66.10%
SBP: 106064 / 761995                     13.92%
MAP: 112412 / 761995                     14.75%
DBP: 106257 / 761995                     13.94%
Resp: 161077 / 761995                    21.14%
EtCO2: 704359 / 761995                   92.44%
BaseExcess: 760231 / 761995              99.77%
HCO3: 760584 / 761995                    99.81%
FiO2: 744785 / 761995                    97.74%
pH: 745037 / 761995                      97.77%
PaCO2: 744982 / 761995                   97.77%
SaO2: 747594 / 761995                    98.11%
AST: 748632 / 761995                     98.25%
BUN: 719903 / 761995                     94.48%
Alkalinephos: 748586 / 761995            98.24%
Calcium: 709982 / 761995                 93.17%
Chloride: 757306 / 761995                99.38%
Creatinine: 719866 / 761995              94.47%
Bilirubin_direct: 760187 / 761995       