In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit

In [6]:
# Read the population file and keep only the HADM_ID and partition columns.
df_pop = pd.read_csv('./data/population.csv').loc[:, ['HADM_ID', 'partition']]
# Read the label file and keep only the HADM_ID, mortality_day and label columns.
df_label = pd.read_csv('./data/label.csv').loc[:, ['HADM_ID', 'mortality_day', 'label']]

In [10]:
# Join the label table with the population table and order rows by HADM_ID.
# The join key and join type are spelled out so that a shared column added to
# either frame later cannot silently change the merge. With the columns kept
# at load time, HADM_ID is the only shared column, so the result is the same
# as the default merge.
df_pop_label = (
    df_label
    .merge(df_pop, on='HADM_ID', how='inner')
    .sort_values(by='HADM_ID')
)

In [21]:
# Write the merged table and the ID-only population file. The population file
# is the one the FIDDLE commands in this notebook read via
# --population='./input/pop.csv'.
# to_csv does not create missing parent directories, so make sure 'input/'
# exists first; exist_ok=True keeps this cell safe to re-run.
os.makedirs('input', exist_ok=True)
df_pop_label.to_csv('input/label.csv', index=False)
# The population file has a single column: HADM_ID renamed to 'ID'.
df_pop_label[['HADM_ID']].rename(columns={'HADM_ID': 'ID'}).to_csv('input/pop.csv', index=False)

In [14]:
# Load the object stored in ./data/clinical.p into memory.
# Unpickling can execute arbitrary code, so only load this file if it comes
# from a trusted source.
# NOTE(review): df_data is not referenced by any other cell visible here, and
# the FIDDLE run on this same file reports 112592300 original rows — confirm
# that this in-memory load is actually needed.
df_data = pd.read_pickle('./data/clinical.p')

In [None]:
# Read the ICD input file; this is the same path passed as --input_fname to
# the FIDDLE_icd commands in this notebook.
# NOTE(review): df_ICD is not referenced by any other cell visible here —
# confirm that this load is needed.
df_ICD = pd.read_csv('./data/icd_data.csv')

## Run FIDDLE - ICD

In [28]:
# Run the FIDDLE_icd.run module on ./data/icd_data.csv for the population in
# ./input/pop.csv, with the current directory appended to PYTHONPATH.
# Output goes under ./output/ (the log reports ./output/value_types.csv).
# NOTE(review): the same command appears three times in this notebook, always
# writing to ./output/; only the trailing bracket tag differs. The tag
# presumably records a setting changed outside the notebook — confirm, and
# consider a separate --data_path per variant so runs do not overwrite each other.
! PYTHONPATH="$PYTHONPATH:./" \
python -m FIDDLE_icd.run \
    --data_path='./output/' \
    --input_fname='./data/icd_data.csv' \
    --population='./input/pop.csv' \
    --T=4320.0 --dt=1.0 \
    --theta_1=0.01 --theta_2=0.01 --theta_freq=1
# [0]

Input data file: ./data/icd_data.csv

Input arguments:
    T      = 4320
    dt     = 1.0
    θ₁     = 0.01
    θ₂     = 0.01
    θ_freq = 1.0
    k      = 3 ['min', 'max', 'mean']
binarize = yes

N = 19723
L = 4320


1) Pre-filter
Remove rows not in population
Remove rows with t outside of [0, 4320]
Remove rare variables (<= 0.01)
Total variables     : 1
Rare variables      : 0
Remaining variables : 1
# rows (original)   : 280540
# rows (filtered)   : 280540

2) Transform; 3) Post-filter

--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./output/value_types.csv
2020-03-13 19:33:56,600: Note: detected 112 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2020-03-13 19:33:56,600: Note: NumExpr detected 112 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-0

In [30]:
# Run the FIDDLE_icd.run module on ./data/icd_data.csv for the population in
# ./input/pop.csv; output goes under ./output/.
# NOTE(review): the arguments are identical to the other ICD runs in this
# notebook and the output directory is the same, so each run replaces the
# previous results. The trailing bracket tag presumably marks a variant
# configured outside the notebook — confirm what it refers to.
! PYTHONPATH="$PYTHONPATH:./" \
python -m FIDDLE_icd.run \
    --data_path='./output/' \
    --input_fname='./data/icd_data.csv' \
    --population='./input/pop.csv' \
    --T=4320.0 --dt=1.0 \
    --theta_1=0.01 --theta_2=0.01 --theta_freq=1
# [0,1]

Input data file: ./data/icd_data.csv

Input arguments:
    T      = 4320
    dt     = 1.0
    θ₁     = 0.01
    θ₂     = 0.01
    θ_freq = 1.0
    k      = 3 ['min', 'max', 'mean']
binarize = yes

N = 19723
L = 4320


1) Pre-filter
Remove rows not in population
Remove rows with t outside of [0, 4320]
Remove rare variables (<= 0.01)
Total variables     : 1
Rare variables      : 0
Remaining variables : 1
# rows (original)   : 280540
# rows (filtered)   : 280540

2) Transform; 3) Post-filter

--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./output/value_types.csv
2020-03-13 19:36:08,912: Note: detected 112 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2020-03-13 19:36:08,912: Note: NumExpr detected 112 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-0

In [25]:
# Run the FIDDLE_icd.run module on ./data/icd_data.csv for the population in
# ./input/pop.csv; output goes under ./output/.
# NOTE(review): the arguments are identical to the other ICD runs in this
# notebook and the output directory is the same. The log timestamp of this
# cell (13:18) is earlier than those of the two ICD cells placed before it
# (19:33, 19:36), so the cells were not executed in document order — confirm
# which variant the files in ./output/ actually correspond to.
! PYTHONPATH="$PYTHONPATH:./" \
python -m FIDDLE_icd.run \
    --data_path='./output/' \
    --input_fname='./data/icd_data.csv' \
    --population='./input/pop.csv' \
    --T=4320.0 --dt=1.0 \
    --theta_1=0.01 --theta_2=0.01 --theta_freq=1
# [0,1,2]

Input data file: ./data/icd_data.csv

Input arguments:
    T      = 4320
    dt     = 1.0
    θ₁     = 0.01
    θ₂     = 0.01
    θ_freq = 1.0
    k      = 3 ['min', 'max', 'mean']
binarize = yes

N = 19723
L = 4320


1) Pre-filter
Remove rows not in population
Remove rows with t outside of [0, 4320]
Remove rare variables (<= 0.01)
Total variables     : 1
Rare variables      : 0
Remaining variables : 1
# rows (original)   : 280540
# rows (filtered)   : 280540

2) Transform; 3) Post-filter

--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./output/value_types.csv
2020-03-13 13:18:08,557: Note: detected 112 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2020-03-13 13:18:08,557: Note: NumExpr detected 112 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2020-0

## Run FIDDLE - clinical data

In [None]:
# Run the pipeline on ./data/clinical.p for the population in ./input/pop.csv;
# output goes under ./output_clinical/.
# --dt is set equal to --T (4320.0), and the log accordingly reports L = 1.
# NOTE(review): this invokes the FIDDLE_icd.run module on the clinical pickle
# rather than on ICD data — confirm this is the intended entry point.
! PYTHONPATH="$PYTHONPATH:./" \
python -m FIDDLE_icd.run \
    --data_path='./output_clinical/' \
    --input_fname='./data/clinical.p' \
    --population='./input/pop.csv' \
    --T=4320.0 --dt=4320.0 \
    --theta_1=0.01 --theta_2=0.01 --theta_freq=1

Input data file: ./data/clinical.p

Input arguments:
    T      = 4320
    dt     = 4320.0
    θ₁     = 0.01
    θ₂     = 0.01
    θ_freq = 1.0
    k      = 3 ['min', 'max', 'mean']
binarize = yes

N = 19723
L = 1


1) Pre-filter
Remove rows not in population
Remove rows with t outside of [0, 4320]
Remove rare variables (<= 0.01)
Total variables     : 4836
Rare variables      : 2941
Remaining variables : 1895
# rows (original)   : 112592300
# rows (filtered)   : 111280602

2) Transform; 3) Post-filter

--------------------------------------------------------------------------------
*) Detecting and parsing value types
--------------------------------------------------------------------------------
Saved as: ./output_clinical/value_types.csv
2020-03-13 13:48:16,136: Note: detected 112 virtual cores but NumExpr set to maximum of 64, check "NUMEXPR_MAX_THREADS" environment variable.
2020-03-13 13:48:16,136: Note: NumExpr detected 112 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing s