In [1]:
import numpy as np
import pandas as pd

In [2]:
# Input CSVs, addressed relative to the notebook's working directory.
# file1 is loaded into f1 (events table); file2 is loaded into f2 (OTU table).
file1 = "../data/taur-events-0925.csv"
file2 = "../data/taur-otu-table-15tpts-0925.csv"

In [3]:
# Events table: the first CSV row supplies the column names (e.g. 'patientID').
f1 = pd.read_csv(file1)
# header=None keeps every row of the OTU file as data, so row 0 (patient ids),
# row 1 (dates) and column 0 (row labels) can be sliced positionally later on.
f2 = pd.read_csv(file2, header=None)

In [4]:
# Plain ndarray versions of both tables, used for positional slicing below.
data1, data2 = f1.to_numpy(), f2.to_numpy()

In [5]:
# Patient id of every row in the events table (column 'patientID' of f1).
pid = np.array(f1['patientID'])

# Observations (obs)

In [6]:
# Row 0 of the OTU table, minus the label column, holds the patient id of each measurement.
measure_pid = np.array(f2.iloc[0, 1:], dtype=int)
# Sorted distinct patient ids that have at least one measurement.
f2pid = np.unique(measure_pid)

In [7]:
# Report how many distinct patient ids appear in the OTU table and list them.
print("{} patients with id {}".format(len(f2pid), f2pid))

108 patients with id [ 12  16  25  26  31  35  45  51  54  59  68  74  75  81  85  88  93  95
  97 111 126 139 146 161 164 167 169 174 218 235 239 244 250 254 255 259
 261 280 284 294 306 311 312 315 320 331 336 338 347 366 367 374 377 383
 386 410 415 429 436 447 449 460 463 468 480 495 501 502 504 515 523 525
 530 531 536 537 551 565 566 580 593 602 605 608 642 645 666 683 694 697
 698 704 708 710 715 718 721 725 738 746 747 748 750 753 756 763 765 769]


In [8]:
# Rows 2 .. second-to-last of the OTU table are per-species counts; transpose so
# that each row of `count` is one measurement and each column one species.
# NOTE(review): the final table row is excluded by the `2:-1` slice — confirm it
# is not a species row.
count = data2[2:-1, 1:].astype(int).T
print("count shape (measures, microbio speicies):", count.shape)
# Per-measurement total, kept 2-D so the division broadcasts row-wise.
count_sum_across_spieces = count.sum(axis=1, keepdims=True)
percentage = count / count_sum_across_spieces

count shape (measures, microbio speicies): (1068, 50)


In [9]:
# Rank species by relative abundance summed over all measurements and keep the
# k largest, most abundant first.
percentage_sum_across_measures = percentage.sum(axis=0)
spieces_name = data2[2:-1, 0]
k = 10
top_k_spieces_idx = np.argsort(percentage_sum_across_measures)[::-1][:k]
print("top_k_spieces_idx:", top_k_spieces_idx)
for rank, species_idx in enumerate(top_k_spieces_idx):
    print("top {:>2}, idx {:>4}, percentage_sum {:>5.2f}, name {}".format(
        rank,
        species_idx,
        percentage_sum_across_measures[species_idx],
        spieces_name[species_idx],
    ))

top_k_spieces_idx: [0 1 2 3 4 6 5 7 8 9]
top  0, idx    0, percentage_sum 246.22, name Bacteria;Firmicutes;Bacilli;Lactobacillales;Enterococcaceae;Enterococcus
top  1, idx    1, percentage_sum 140.79, name Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus
top  2, idx    2, percentage_sum 105.49, name Bacteria;Firmicutes;Erysipelotrichia;Erysipelotrichales;Erysipelotrichaceae;Erysipelatoclostridium
top  3, idx    3, percentage_sum 87.96, name Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Blautia
top  4, idx    4, percentage_sum 61.11, name Bacteria;Firmicutes;Bacilli;Lactobacillales;Lactobacillaceae;Lactobacillus
top  5, idx    6, percentage_sum 34.62, name Bacteria;Bacteroidetes;Bacteroidia;Bacteroidales;Bacteroidaceae;Bacteroides
top  6, idx    5, percentage_sum 34.29, name Bacteria;Verrucomicrobia;Verrucomicrobiae;Verrucomicrobiales;Akkermansiaceae;Akkermansia
top  7, idx    7, percentage_sum 27.33, name Bacteria;Firmicutes;Negativicutes;Veillo

In [10]:
# Build the per-measurement observation tables.
# Column layout of both tables: [date, top-k species ..., all remaining species].
dates = data2[1, 1:].astype(int)
top_k_count = count[:, top_k_spieces_idx]

count_obs = np.zeros((count.shape[0], k + 2))
count_obs[:, 0] = dates
count_obs[:, 1:-1] = top_k_count
# Last column lumps every species outside the top k together.
count_obs[:, -1] = count.sum(axis=-1) - top_k_count.sum(axis=-1)

# Relative abundances: normalise ALL k + 1 abundance columns by the row total so
# the "others" column is a fraction too. The previous version normalised only
# the top-k columns among themselves and left the last column as a raw count,
# which mixed fractions and counts in one table.
percentage_obs = np.copy(count_obs)
percentage_obs[:, 1:] = count_obs[:, 1:] / np.sum(count_obs[:, 1:], axis=1, keepdims=True)

In [11]:
# Sanity check: one patient id per measurement row, and matching table shapes.
measure_pid.shape, count_obs.shape, percentage_obs.shape

((1068,), (1068, 12), (1068, 12))

In [12]:
# Split the flat measurement tables into one array per patient.
#
# Measurements of the same patient are assumed to be stored contiguously in the
# OTU table: a new segment starts whenever the patient id changes. When
# `use_split` is True, a segment is also cut wherever two consecutive
# measurements are more than 8 days apart. Segments with two or fewer
# measurements are discarded.
#
# NOTE: this cell rebinds `percentage_obs` from a 2-D array to a list of
# arrays, so it is not idempotent — re-run the cell that builds
# `count_obs` / `percentage_obs` before re-running this one.
use_split = False

Y = []             # per-segment count observations
percentage_Y = []  # per-segment relative-abundance observations
obs_pid = []       # patient id owning each kept segment

n_measures = len(measure_pid)
segment_start = 0
for i in range(1, n_measures + 1):
    at_end = i == n_measures
    new_patient = (not at_end) and measure_pid[i] != measure_pid[segment_start]
    gap_split = (
        use_split
        and not at_end
        and not new_patient
        and dates[i] - dates[i - 1] > 8
    )
    if not (at_end or new_patient or gap_split):
        continue

    # Rows [segment_start, i) form one finished segment.
    if i - segment_start > 2:
        Y.append(count_obs[segment_start:i].copy())
        percentage_Y.append(percentage_obs[segment_start:i].copy())
        # Record the id of the segment's own first row. The previous version
        # appended the id only after it had been advanced to the NEXT patient,
        # so every segment but the last was labelled with the wrong patient.
        obs_pid.append(measure_pid[segment_start])
    segment_start = i

obs, percentage_obs = Y, percentage_Y
print(len(obs), obs[0].shape)

108 (5, 12)


# Inputs

In [13]:
# The events table has exactly four columns: patient id, event name, start, end.
event_pid, event, event_start, event_end = data1.T
# The unpacked columns come from a mixed-type table; cast the numeric ones to int.
event_pid = event_pid.astype(int)
event_start = event_start.astype(int)
event_end = event_end.astype(int)
# Sorted list of distinct event names; an event's position here fixes its input column.
unique_event = list(np.unique(event))
num_event = len(unique_event)
print("total {} kinds of events: {}".format(num_event, unique_event))

total 16 kinds of events: ['PCP prophylaxis agents', 'anti-VRE agents', 'anti-anaerobic agent', 'beta-lactamase inhibitors', 'carbapenems', 'first/second generation cephalosporins', 'fourth/fifth generation cephalosporins', 'glycopeptide', 'macrolides', 'miscellaneous antibiotics', 'monobactams', 'penicillins', 'quinolones', 'surgery', 'tetracyclines', 'third generation cephalosporins']


In [14]:
# Build one dense daily input matrix per patient segment.
# Each matrix has one row per day from the segment's first to its last
# observation (inclusive). Column 0 is the day; column j + 1 is a 0/1 indicator
# for unique_event[j] being active on that day.
Input = []
for patient_obs, patient_id in zip(obs, obs_pid):
    first_day = int(patient_obs[0, 0])
    last_day = int(patient_obs[-1, 0])
    n_days = last_day - first_day + 1

    patient_input = np.zeros((n_days, num_event + 1))
    patient_input[:, 0] = np.arange(first_day, last_day + 1)

    for event_idx in np.where(event_pid == patient_id)[0]:
        event_name = event[event_idx]
        column = unique_event.index(event_name) + 1
        if event_name == "surgery":
            # Surgery is switched on from day 0 through the end of the window;
            # the event row's own start/end values are not consulted.
            surgery_row = min(n_days, max(0, -first_day))
            patient_input[surgery_row:, column] = 1.0
        else:
            # Mark only the part of [start, end] that overlaps the window.
            active_from = max(event_start[event_idx], first_day)
            active_to = min(event_end[event_idx], last_day)
            if active_from <= active_to:
                patient_input[active_from - first_day:active_to - first_day + 1, column] = 1.0
    Input.append(patient_input)
print(len(Input), Input[0].shape)

108 (20, 17)


# explore inputs and observations

In [15]:
# Shape of the first patient's input matrix: (days in window, 1 + number of event kinds).
Input[0].shape

(20, 17)

In [16]:
# Shape of the first patient's observation matrix: (measurements, 1 + k + 1).
obs[0].shape

(5, 12)

In [17]:
# Day axis of the first input matrix: every consecutive day in the observation window.
Input[0][:,0]

array([-4., -3., -2., -1.,  0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,
        9., 10., 11., 12., 13., 14., 15.])

In [18]:
# Days on which the first patient was actually measured.
obs[0][:,0]

array([-4., -1.,  6., 10., 15.])

# counts

In [19]:
# Abundance columns only (the day column is dropped): top-k species plus "others".
obs[0][:,1:].shape

(5, 11)

In [20]:
# Total count per measurement for every patient: sum over all abundance columns
# (column 0 is the day and is excluded).
counts = [patient_obs[:, 1:].sum(axis=-1) for patient_obs in obs]

In [21]:
# One total per measurement of the second patient.
counts[1].shape

(7,)

In [22]:
# Number of patient segments for which totals were computed.
len(counts)

108

# split into train & test and save

In [23]:
import os, pickle

# The first n_train patient segments form the training set, the rest the test
# set (order as stored; no shuffling).
n_train = 90
file_suffix = "_split" if use_split else ""

# Count-valued observations together with the event inputs.
data = {
    "Ytrain": obs[:n_train],
    "Ytest": obs[n_train:],
    "Vtrain": Input[:n_train],
    "Vtest": Input[n_train:],
}
count_path = os.path.join("..", "data", "count_microbio{}.p".format(file_suffix))
with open(count_path, "wb") as f:
    pickle.dump(data, f)

# Same inputs, but with the relative-abundance observations swapped in.
data.update(Ytrain=percentage_obs[:n_train], Ytest=percentage_obs[n_train:])
percentage_path = os.path.join("..", "data", "microbio{}.p".format(file_suffix))
with open(percentage_path, "wb") as f:
    pickle.dump(data, f)