In [2]:
import numpy as np
import pandas as pd

In [12]:
def prep_signs():
    df = pd.read_csv('train/train_signs.csv')

    df['charttime'] = pd.to_datetime(df['charttime'])

    #Creates new column 'firsttime' which is the time of first row for each patient
    first_time_row = df.groupby('patient_id')['charttime'].first()
    df['firsttime'] = df['patient_id'].map(first_time_row)
    # Sets the index as the time from the first reading so all patients start at 0 and go toward 24 hours
    df = df.set_index(df['charttime'] - df['firsttime'])
    df = df.drop(['charttime','firsttime'],axis=1)
    # Resamples data so all patients have exactly 24 hours
    df = df.groupby('patient_id').resample('h').mean()
    df = df.reindex(pd.MultiIndex.from_product([df.index.levels[0],pd.timedelta_range(start='00:00:00', end='23:00:00', freq='1h')]))
    df = df.groupby(level=['patient_id']).ffill().bfill()
    # Fills NA with zero, na here means a patient that never had a certain measurment taken
    df[df.isna()] = 0
    return df

In [13]:
df = prep_signs()

In [14]:
df.shape

(328992, 30)

In [15]:
num_patients = len(df.index.levels[0])
num_patients * 24

328992

In [20]:
num_patients = len(df.index.levels[0])
num_cols = len(df.columns)
d = np.array(df)
# patients * cols (30) * hours (24)
d=np.reshape(d,(num_patients,num_cols,-1))
display(df)
d[0,:,0]

Unnamed: 0_level_0,Unnamed: 1_level_0,aniongap,bicarbonate,calcium,chloride,creatinine,gcseye,gcsmotor,gcsverbal,glucose,heartrate,...,phosphate,platelet,potassium,rdw,rbc,resp,sodium,ureanitro,vancomycin,wbc
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
p100001,0 days 00:00:00,14.0,25.0,7.1,101.0,1.5,4.0,6.0,5.0,106.0,85.0,...,1.4,133.0,3.9,12.6,4.33,23.333333,136.0,27.0,11.8,9.4
p100001,0 days 01:00:00,14.0,25.0,7.1,101.0,1.5,4.0,6.0,5.0,106.0,79.0,...,1.4,133.0,3.9,12.6,4.33,31.000000,136.0,27.0,11.8,9.4
p100001,0 days 02:00:00,14.0,25.0,7.1,101.0,1.5,4.0,6.0,5.0,106.0,84.0,...,1.4,133.0,3.9,12.6,4.33,29.000000,136.0,27.0,11.8,9.4
p100001,0 days 03:00:00,14.0,25.0,7.1,101.0,1.5,4.0,6.0,5.0,106.0,86.0,...,1.4,133.0,3.9,12.6,4.33,22.000000,136.0,27.0,11.8,9.4
p100001,0 days 04:00:00,14.0,25.0,7.1,101.0,1.5,4.0,6.0,5.0,106.0,89.0,...,1.4,133.0,3.9,12.6,4.33,26.000000,136.0,27.0,11.8,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
p117134,0 days 19:00:00,14.0,26.0,10.1,103.0,0.7,4.0,6.0,5.0,114.0,89.0,...,3.0,380.0,3.7,13.1,3.66,18.000000,139.0,11.0,0.0,11.1
p117134,0 days 20:00:00,14.0,26.0,10.1,103.0,0.7,4.0,6.0,5.0,114.0,89.0,...,3.0,380.0,3.7,13.1,3.66,18.000000,139.0,11.0,0.0,11.1
p117134,0 days 21:00:00,14.0,26.0,10.1,103.0,0.7,4.0,6.0,5.0,114.0,89.0,...,3.0,380.0,3.7,13.1,3.66,18.000000,139.0,11.0,0.0,11.1
p117134,0 days 22:00:00,14.0,26.0,10.1,103.0,0.7,4.0,6.0,5.0,114.0,89.0,...,3.0,380.0,3.7,13.1,3.66,18.000000,139.0,11.0,0.0,11.1


array([ 14.  ,   4.33,  93.  ,  31.3 ,   6.  ,  14.  ,   4.33, 106.  ,
        31.3 ,   6.  ,  14.  ,   4.33, 112.  ,  31.3 ,   6.  ,  14.  ,
         4.33, 113.  ,  31.3 ,   6.  ,  14.  ,   4.33, 105.  ,  31.1 ,
         6.  ,  13.  ,   3.96, 114.  ,  31.1 ,   6.  ])

In [3]:
initial_df = pd.read_csv('train/train_signs.csv')

In [11]:
# Percentage of entries that actually existed, weren't filled or smoothed over actual more than I thought 36% might be enough
(initial_df.shape[0] * initial_df.shape[1] - initial_df.isna().sum().sum()) / (13708 * 30 * 24)

0.362565959050676

9869760

In [None]:
357