In [1]:
import numpy as np
import pandas as pd

In [2]:
# Relative paths of the two input CSVs: the events table and the OTU table.
file1, file2 = "data/taur-events.csv", "data/taur-otu-table-15tpts.csv"

In [3]:
# f1: events table, parsed with its first row as the header.
# f2: OTU table, parsed with header=None so every row (including the first)
#     stays in the data and columns get integer labels.
f1, f2 = pd.read_csv(file1), pd.read_csv(file2, header=None)

In [4]:
# Plain ndarray views of both tables, used for positional slicing in later cells.
data1, data2 = (frame.to_numpy() for frame in (f1, f2))

In [5]:
# Values of the 'patientID' column of the events table, one per event row.
pid = np.array(f1.loc[:, 'patientID'])

### Observations (obs)

In [6]:
# Row 0 of the OTU table (every column after the first) is read as the
# patient id attached to each measurement column, flattened to 1-D ints.
measure_pid = np.array(f2.loc[0:0, 1:], dtype=int).ravel()
# Sorted distinct patient ids that have at least one measurement.
f2pid = np.unique(measure_pid)

In [7]:
# Report how many distinct patients have measurements, and their ids.
patient_summary = "{} patients with id {}".format(len(f2pid), f2pid)
print(patient_summary)

21 patients with id [ 26  31  97 447 515 530 531 537 551 642 694 697 698 704 708 721 747 753
 763 765 769]


In [8]:
# Rows 2..second-to-last of the OTU table hold the per-taxon counts (the last
# row is left out); transpose so that rows are measurements and columns are taxa.
count = np.transpose(data2[2:-1, 1:]).astype(int)
print("count shape (measures, microbio speicies):", count.shape)
# Total count per measurement, kept 2-D so it broadcasts across the taxa axis.
count_sum_across_spieces = count.sum(axis=1, keepdims=True)
# Relative abundance of every taxon within each measurement.
percentage = np.true_divide(count, count_sum_across_spieces)

count shape (measures, microbio speicies): (405, 770)


In [9]:
# Rank taxa by their relative abundance summed over all measurements.
percentage_sum_across_measures = percentage.sum(axis=0)
# Taxon names live in column 0 of the same rows the counts were read from.
spieces_name = data2[2:-1, 0]
k = 10
# Sort ascending, flip to descending, keep the k largest.
top_k_spieces_idx = np.argsort(percentage_sum_across_measures)[::-1][:k]
print("top_k_spieces_idx:", top_k_spieces_idx)
row_template = "top {:>2}, idx {:>4}, percentage_sum {:>5.2f}, name {}"
for i, idx in enumerate(top_k_spieces_idx):
    print(row_template.format(i, idx, percentage_sum_across_measures[idx], spieces_name[idx]))

top_k_spieces_idx: [281 291 411 326 284 441 731  15  25 261]
top  0, idx  281, percentage_sum 98.14, name Bacteria;Firmicutes;Bacilli;Lactobacillales;Enterococcaceae;Enterococcus
top  1, idx  291, percentage_sum 60.05, name Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus
top  2, idx  411, percentage_sum 34.52, name Bacteria;Firmicutes;Erysipelotrichia;Erysipelotrichales;Erysipelotrichaceae;Erysipelatoclostridium
top  3, idx  326, percentage_sum 31.39, name Bacteria;Firmicutes;Clostridia;Clostridiales;Lachnospiraceae;Blautia
top  4, idx  284, percentage_sum 18.89, name Bacteria;Firmicutes;Bacilli;Lactobacillales;Lactobacillaceae;Lactobacillus
top  5, idx  441, percentage_sum 12.88, name Bacteria;Firmicutes;Negativicutes;Veillonellales;Veillonellaceae;Veillonella
top  6, idx  731, percentage_sum 12.48, name Bacteria;Verrucomicrobia;Verrucomicrobiae;Verrucomicrobiales;Akkermansiaceae;Akkermansia
top  7, idx   15, percentage_sum 11.00, name Bacteria;Actinobacteri

In [12]:
# Row 1 of the OTU table (every column after the first), cast to int: one
# value per measurement column (presumably a day offset -- confirm against the data).
# `dates` is defined here so the cell also runs on a fresh kernel; the cell
# that originally created it appears further down in the notebook.
dates = data2[1, 1:].astype(int)
print(dates)

[ -5  -3   5   6  14  16  18  19  20  22  25  26  27  28  29  30  32  33
  34  36  -3   0   4   5   6   7   8  10  11  13  19  21  22  31  32  33
  34  35  36  37  38  39  40  41  42  43  -6  -2   1   2   3   4   5   6
   8   9  10  12  13  14  15  16  22  -9  -8  -7  -6  -5  -4  -3  -2  -1
   0   2   3   4   5   7   9  10  11  12  13  -7  -6  -4  -3  -2  -1   0
   2   3   4   5   6   7   8   9  10  12  14  15  20  21  -3  -2  -1   0
   2   3   5   6   7  10  12  13  14  22  28  29  31  -5   0   1   4   8
  10  12  13  14  16  18  25  31  33  34  36  37  39  41  42  43  44  47
  48  50  51  52   1   2   3   4   7   8  10  12  13  16  17  18  20  21
  25  27  28  -4  -3  -2  -1   0   1   2   3   4   5   6   7   8   9  10
  11  13  16  20  21  26 -13  -6  -2  -1   0   1   2   4   5   6   7   8
   9  11  16  18  19  -6  -3  -2  -1   0   1   3   4   5   6   7  10  11
  12  13  -4  -2  -1   0   4   5   8  10  11  13  15  16  19  21  23  27
  -7  -6  -4  -3  -2  -1   0   1   4   5   6   7  1

In [11]:
# Build one observation matrix per patient.
# Column layout of every row:
#   [date, share of each of the k selected taxa ..., share of everything else]
dates = data2[1, 1:].astype(int)  # row 1 of the OTU table, one entry per measurement
percentage_obs = np.zeros((percentage.shape[0], k + 2))
percentage_obs[:, 0] = dates
percentage_obs[:, 1:-1] = percentage[:, top_k_spieces_idx]
# Remainder column: whatever share is not covered by the k selected taxa.
percentage_obs[:, -1] = 1 - np.sum(percentage_obs[:, 1:-1], axis=-1)

# Pick each patient's rows with a boolean mask instead of a running cursor.
# The mask keeps the rows in their original order, does not require the
# measurements to be grouped by ascending patient id, and the comprehension
# variable does not overwrite the module-level `pid` array.
obs = [percentage_obs[measure_pid == patient_id] for patient_id in f2pid]
print(len(obs), obs[0].shape)

21 (20, 12)


### Input

In [None]:
# Split the events table into its four columns (one array per column) and
# cast the three numeric ones to int; `event` keeps its original values.
event_pid, event, event_start, event_end = data1.T
event_pid, event_start, event_end = (
    np.array(column, dtype=int) for column in (event_pid, event_start, event_end)
)
# Sorted list of distinct event labels; its order fixes the event column order later.
unique_event = list(np.unique(event))
num_event = len(unique_event)
event_summary = "total {} kinds of events: {}".format(num_event, unique_event)
print(event_summary)

In [None]:
# Build one event-indicator matrix per patient (same patient order as f2pid).
# Column 0 holds the time index running from the patient's earliest event
# start to latest event end (inclusive); column j >= 1 is 1 on every row where
# event unique_event[j - 1] is active, else 0.

# Event label -> column number, built once instead of calling list.index per event.
event_column = {name: col for col, name in enumerate(unique_event, start=1)}

Input = []
for patient_id in f2pid:  # distinct name: do not overwrite the global `pid`
    patient_event_idxs = np.flatnonzero(event_pid == patient_id)
    if patient_event_idxs.size == 0:
        # Fail with an explicit message rather than min()'s "empty sequence" error.
        raise ValueError("no events recorded for patient {}".format(patient_id))
    input_start = event_start[patient_event_idxs].min()
    input_end = event_end[patient_event_idxs].max()
    patient_input = np.zeros((input_end - input_start + 1, num_event + 1))
    patient_input[:, 0] = np.arange(input_start, input_end + 1)
    for event_idx in patient_event_idxs:
        first_row = event_start[event_idx] - input_start
        last_row = event_end[event_idx] - input_start  # inclusive end
        patient_input[first_row:last_row + 1, event_column[event[event_idx]]] = 1.0
    Input.append(patient_input)
print(len(Input), Input[0].shape)

### Split into train & test sets and save

In [None]:
import os
import pickle

# The first n_train patients form the training split; the rest are the test split.
n_train = 17
# Y* = per-patient observation matrices, V* = per-patient event-input matrices.
data = {
    "Ytrain": obs[:n_train],
    "Ytest": obs[n_train:],
    "Vtrain": Input[:n_train],
    "Vtest": Input[n_train:],
}
# Serialize the whole split into a single pickle file under data/.
with open(os.path.join("data", "microbio.p"), "wb") as f:
    pickle.dump(data, f)