In [1]:
import os
import sys
import random
import pandas as pd
import numpy as np
from scipy.linalg import toeplitz
from copy import copy
import hypertools as hyp
# import matplotlib.pyplot as plt

# Display the value of every top-level expression in a cell rather than only the
# last one (IPython's default setting is "last_expr"). Comment out the two lines
# below to go back to that default.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Make the sibling ``utils`` directory importable: resolve it relative to the
# notebook's working directory and register it on sys.path at most once, so
# re-running this cell does not pile up duplicate entries.
current_dir = os.getcwd()
utils_path = os.path.join(current_dir, '../utils')
utils_abs_path = os.path.abspath(utils_path)
if sys.path.count(utils_abs_path) == 0:
    sys.path.append(utils_abs_path)

import get_data
# from impute_methods import *
from utils.impute_methods import impute_linear_interpolation

# Dataset location as reported by the project's get_data helper
# (presumably an absolute directory path, per the helper's name — confirm).
DATA_PATH = get_data.get_dataset_abspath()

# Build the per-set directories with os.path.join so the result is correct
# whether or not DATA_PATH ends with a path separator (plain string
# concatenation silently produced ".../<dir>training_setA" when it did not).
training_setA_path = os.path.join(DATA_PATH, 'training_setA')
training_setB_path = os.path.join(DATA_PATH, 'training_setB')

ImportError: libtiff.so.5: cannot open shared object file: No such file or directory

In [8]:
# READS TRAINING SET A FILES
# Accumulates, for every column, the number of NaN entries and the total number
# of entries across all .psv files of training set A:
#   nan_countsA[column] == {"nan": <NaN count>, "total": <row count>}
nan_countsA = {}

# List all files in the directory
file_list = os.listdir(training_setA_path)

# Given the number of files this loop is quite slow so do not execute this block needlessly
# Counting from 1 lets the progress indicator finish at N/N instead of (N-1)/N.
for files_done, file_name in enumerate(file_list, start=1):
    if file_name.endswith(".psv"):
        file_path = os.path.join(training_setA_path, file_name)
        # Read the pipe-separated file
        data = pd.read_csv(file_path, sep='|')

        # Count the NaNs of all columns in a single vectorised pass
        nan_per_column = data.isna().sum()
        num_rows = len(data)

        for column in data.columns:
            # setdefault creates the entry the first time a column is seen, so a
            # file containing a column that earlier files lacked no longer
            # raises KeyError.
            counts = nan_countsA.setdefault(column, {"nan": 0, "total": 0})
            counts["nan"] += int(nan_per_column[column])
            counts["total"] += num_rows
    # "\r" rewrites the same output line on every iteration
    print(f"\r{files_done}/{len(file_list)}", end="")

20335/20336

In [9]:
# READS TRAINING SET B FILES
# Accumulates, for every column, the number of NaN entries and the total number
# of entries across all .psv files of training set B:
#   nan_countsB[column] == {"nan": <NaN count>, "total": <row count>}
nan_countsB = {}

# List all files in the directory
file_list = os.listdir(training_setB_path)

# Given the number of files this loop is quite slow so do not execute this block needlessly
# Counting from 1 lets the progress indicator finish at N/N instead of (N-1)/N.
for files_done, file_name in enumerate(file_list, start=1):
    if file_name.endswith(".psv"):
        file_path = os.path.join(training_setB_path, file_name)
        # Read the pipe-separated file
        data = pd.read_csv(file_path, sep='|')

        # Count the NaNs of all columns in a single vectorised pass
        nan_per_column = data.isna().sum()
        num_rows = len(data)

        for column in data.columns:
            # setdefault creates the entry the first time a column is seen, so a
            # file containing a column that earlier files lacked no longer
            # raises KeyError.
            counts = nan_countsB.setdefault(column, {"nan": 0, "total": 0})
            counts["nan"] += int(nan_per_column[column])
            counts["total"] += num_rows
    # "\r" rewrites the same output line on every iteration
    print(f"\r{files_done}/{len(file_list)}", end="")

19999/20000

In [10]:
def _format_nan_row(column, counts):
    """Render one table cell group: column name, NaN count / total, NaN percentage.

    `counts` is a dict with the keys "nan" and "total"; a total of zero is
    reported as 0% instead of dividing by zero.
    """
    nan_count, total_count = counts["nan"], counts["total"]
    if total_count:
        percentage = (nan_count / total_count) * 100
    else:
        percentage = 0
    name_part = f"{column}:".ljust(17)
    nan_part = f" {nan_count} ".rjust(10)
    total_part = f"/ {total_count}".ljust(10)
    return name_part + nan_part + total_part + f"{percentage:6.2f}%"


# Two-line header (set titles, then column titles for each half) plus a rule
column_titles = "Column".ljust(16) + "num of NaN" + "  total"
heading = "\n".join([
    "Training Set A".ljust(70) + "Training Set B",
    column_titles + "  Percentage".ljust(31) + column_titles + "  Percentage ",
    "_" * 109,
])
print(heading)

# Walk both dictionaries in parallel (pairs are matched by position, not by
# column name) and print the set A row next to the set B row.
divider = "|".rjust(20)
for (columnA, countsA), (columnB, countsB) in zip(nan_countsA.items(), nan_countsB.items()):
    print(_format_nan_row(columnA, countsA) + divider + _format_nan_row(columnB, countsB))

Training Set A                                                        Training Set B
Column          num of NaN  total  Percentage                   Column          num of NaN  total  Percentage 
_____________________________________________________________________________________________________________
HR:                  61189 / 790215    7.74%                   |HR:                  92210 / 761995   12.10%
O2Sat:               95079 / 790215   12.03%                   |O2Sat:              107657 / 761995   14.13%
Temp:               523314 / 790215   66.22%                   |Temp:               503670 / 761995   66.10%
SBP:                120201 / 790215   15.21%                   |SBP:                106064 / 761995   13.92%
MAP:                 80858 / 790215   10.23%                   |MAP:                112412 / 761995   14.75%
DBP:                380297 / 790215   48.13%                   |DBP:                106257 / 761995   13.94%
Resp:                77258 / 790215    9

In [11]:
# Let us look at a specific (or random) patient file

# Change to empty string to get a random file
specific_filename = "p001146.psv"

# From training set A
if specific_filename:
    file_path = specific_filename
else:
    # Restrict the random pick to patient files so that a stray non-.psv entry
    # in the directory can never be selected.
    psv_files = [name for name in os.listdir(training_setA_path) if name.endswith(".psv")]
    file_path = random.choice(psv_files)

# NOTE: despite its name, file_path holds a bare file name; it is joined with
# the training set A directory here.
file_data = pd.read_csv(os.path.join(training_setA_path, file_path), sep='|')
print(file_path)
print(file_data['HR'])

p001146.psv
0     88.0
1     76.0
2     72.0
3     81.0
4      NaN
5     94.0
6     80.0
7     88.0
8     78.0
9     80.0
10    86.0
11    86.0
12     NaN
13     NaN
14    75.0
15    69.0
16     NaN
17    85.0
18    83.0
19    84.0
20    73.0
21    73.0
22     NaN
23    97.0
Name: HR, dtype: float64


In [12]:
# Fill the gaps in the heart-rate column by linear interpolation; for this
# signal that may already be a sufficient imputation strategy.
# NOTE(review): whether the helper returns a copy or modifies file_data in place
# is not visible here — confirm in impute_methods.
target_column = 'HR'
imputed_data = impute_linear_interpolation(file_data, target_column)

print(imputed_data[target_column])

0     88.000000
1     76.000000
2     72.000000
3     81.000000
4     87.500000
5     94.000000
6     80.000000
7     88.000000
8     78.000000
9     80.000000
10    86.000000
11    86.000000
12    82.333333
13    78.666667
14    75.000000
15    69.000000
16    77.000000
17    85.000000
18    83.000000
19    84.000000
20    73.000000
21    73.000000
22    85.000000
23    97.000000
Name: HR, dtype: float64


In [4]:
# Load the dataset through the project's get_data helper.
# NOTE(review): presumably returns a pandas DataFrame (per the "_as_df" name) — confirm against get_data.
# Needs the setup cell that imports get_data to have run successfully in the current kernel.
full_dataset = get_data.get_dataset_as_df()

NameError: name 'get_data' is not defined

In [None]:

# Simulate missing data: replace a fraction of the cells of data2 with NaN and
# plot it next to data1.
# NOTE(review): data1 and data2 are not defined in any visible cell — presumably
# two copies of the same 2-D float array; they must be created before this cell runs.
missing = .1  # fraction of cells to replace with NaN
num_cols = data2.shape[1]
num_cells = data2.shape[0] * num_cols

# Sample flat (row-major) cell positions WITHOUT replacement so that exactly
# int(num_cells * missing) distinct cells are blanked; the default sampling with
# replacement can draw the same cell repeatedly and blank fewer than requested.
flat_positions = np.random.choice(num_cells, int(num_cells * missing), replace=False)
missing_rows = flat_positions // num_cols
missing_cols = flat_positions % num_cols
# (row, col) pairs of the blanked cells, kept for inspection
missing_data = list(zip(missing_rows.tolist(), missing_cols.tolist()))
data2[missing_rows, missing_cols] = np.nan

# plot
hyp.plot([data1, data2], linestyle=['-',':'], legend=['Original', 'PPCA'])