## Imports

In [108]:
import pandas
import numpy as np
import os
import json
import pickle
from scipy.stats import expon
from sklearn.model_selection import train_test_split

## Dataset import

In [75]:
# Name of the raw table (without extension); the file is a tab-separated
# export stored under data/original_data/.
dataset = 'ft_vat19_anomaly_v20230824_genus'
raw_tsv = f'data/original_data/{dataset}.tsv'
df = pandas.read_csv(raw_tsv, sep='\t')

In [76]:
# Quick look at the loaded table to sanity-check the import.
print(df)

              id  age_days  host_id delivery_mode   sex geo_location_name  \
0     SRR8118517      56.0  E000823       vaginal  male           Finland   
1     SRR8118530      79.0  E000823       vaginal  male           Finland   
2     SRR8118533     121.0  E000823       vaginal  male           Finland   
3     SRR8118576     151.0  E000823       vaginal  male           Finland   
4     SRR8118648     173.0  E000823       vaginal  male           Finland   
...          ...       ...      ...           ...   ...               ...   
2483  SRR8116545     311.0  T029922      cesarean  male           Estonia   
2484  SRR8120282     437.0  T029922      cesarean  male           Estonia   
2485  SRR8120281     498.0  T029922      cesarean  male           Estonia   
2486  SRR8120283     591.0  T029922      cesarean  male           Estonia   
2487  SRR8120285     685.0  T029922      cesarean  male           Estonia   

     diet_milk diet_weaning  age_months_rounded  abx_any_last_t_dmonths  ..

## Pre-processing

Features

In [78]:
# Partition the dataframe columns into feature groups by naming convention:
#   - names starting with "g_"   -> microbial features
#   - names starting with "abx_" -> antibiotic-related features
# Static host descriptors and diet columns are listed by hand.
cols = df.columns
static_features = np.array(["delivery_mode", "sex", "geo_location_name"])
other_features = ["diet_milk", "diet_weaning"]

# Echo every column name so the split can be checked by eye.
for col in cols:
    print(col)

microbial_features = np.array([name for name in cols if name.startswith('g_')])
# A bare "abx_" (nothing after the prefix) is not counted as a feature.
abx_features = np.array(
    [name for name in cols if len(name) > 4 and name.startswith('abx_')]
)

nb_microbial_features = len(microbial_features)
nb_static_features = len(static_features)


id
age_days
host_id
delivery_mode
sex
geo_location_name
diet_milk
diet_weaning
age_months_rounded
abx_any_last_t_dmonths
abx_any_last_dur_days
abx_any_cumcount
abx_any_cumdur_days
abx_broad_last_t_dmonths
abx_broad_last_dur_days
abx_broad_cumcount
abx_broad_cumdur_days
abx_narrow_last_t_dmonths
abx_narrow_last_dur_days
abx_narrow_cumcount
abx_narrow_cumdur_days
abx_max_count_ever
g__Actinobacillus
g__Actinomyces
g__Agathobacter
g__Akkermansia
g__Alistipes
g__Anaerosporobacter
g__Anaerostipes
g__Bacteroides
g__Barnesiella
g__Bifidobacterium
g__Bilophila
g__Blautia
g__Butyricicoccus
g__CAG_352
g__Campylobacter
g__Chloroplast
g__Christensenellaceae_R_7_group
g__Clostridia_UCG_014
g__Clostridia_vadinBB60_group
g__Clostridioides
g__Clostridium_innocuum_group
g__Clostridium_sensu_stricto_1
g__Colidextribacter
g__Collinsella
g__Coprobacillus
g__Coprococcus
g__DTU089
g__Dialister
g__Dorea
g__Eggerthella
g__Eisenbergiella
g__Enterococcus
g__Erysipelatoclostridium
g__Erysipelotrichaceae_UCG_003


Raw data

In [79]:
# Raw value matrices, one row per sample.
microbial_data = np.array(df[microbial_features])
host_data = np.array(df[static_features])
abx_data = np.array(df[abx_features])

# Distinct categories of each static feature, in order of first appearance.
# dict.fromkeys de-duplicates like set() but keeps insertion order, so the
# one-hot column layout no longer depends on string hash randomization and is
# identical from one kernel session to the next.
# The object array is filled element by element so it stays a 1-D array of
# Python lists even when every feature has the same number of categories
# (np.array(list_of_lists, dtype=object) would silently become 2-D then).
static_feature_values = np.empty(nb_static_features, dtype=object)
for i in range(nb_static_features):
    static_feature_values[i] = list(dict.fromkeys(host_data[:, i]))
print(static_feature_values)

# Number of categories per static feature, and total width of the
# concatenated one-hot encoding.
static_feature_values_counter = np.array([len(values) for values in static_feature_values])
static_feature_digitized_size = int(static_feature_values_counter.sum())

[list(['cesarean', 'vaginal']) list(['male', 'female'])
 list(['Finland', 'Estonia', 'Russia'])]


Samples age

In [80]:
# Age of the host (in whole days) at each sample, kept as an (n_samples, 1)
# integer column; the largest age fixes the length of the time axis.
sample_age_days = df[['age_days']].to_numpy().astype(int)
sample_age_days_max = np.max(sample_age_days)

Hosts

In [81]:
# Host id of every sample (flat, one entry per dataframe row).
sample_host = np.array(df[['host_id']]).reshape(-1)
# Distinct hosts in order of first appearance in the table. A plain set()
# iterates strings in an order that changes between interpreter sessions
# (hash randomization), which would make the host -> row index mapping, and
# every index saved from it, non-reproducible.
hosts = np.array(list(dict.fromkeys(sample_host)))
nb_host = len(hosts)

Antibiotics presence

In [82]:
# Per-sample antibiotic columns as flat 1-D arrays (one entry per row of df).
abx_max_count_ever = np.array(df['abx_max_count_ever'])
abx_any_last_t_dmonths = np.array(df['abx_any_last_t_dmonths'])

## Dataset dimensions

In [83]:
# Each entry: (label, value, role that this quantity plays in the dataset).
dimension_summary = [
    ("Number of infant hosts : ", nb_host, "--> number of paths"),
    ("Number of microbial features : ", nb_microbial_features, "--> dimension"),
    ("Maximum age (in days) of an infant on which a sample was collected : ",
     sample_age_days_max, "--> number of steps"),
    ("Total number of samples (among all hosts) : ", df.shape[0],
     "--> total number of data points"),
]
for label, value, role in dimension_summary:
    print(label, value)
    print(role)

Number of infant hosts :  255
--> number of paths
Number of microbial features :  98
--> dimension
Maximum age (in days) of an infant on which a sample was collected :  1162
--> number of steps
Total number of samples (among all hosts) :  2488
--> total number of data points


## Dataset Creation

In [84]:
# The time axis has one slot per day from day 0 up to the maximum sample age.
n_days = sample_age_days_max + 1

# Per-host containers, all zero-initialised:
#   paths          (host, microbial feature, day)  feature values
#   observed_dates (host, day)                     integer observation mask
#   static         (host, one-hot width)           encoded static features
paths = np.zeros(shape=(nb_host, nb_microbial_features, n_days), dtype="float32")
observed_dates = np.zeros(shape=(nb_host, n_days), dtype="int32")
static = np.zeros(shape=(nb_host, static_feature_digitized_size), dtype="float32")

# Per-host antibiotic flags.
abx_any = np.zeros(shape=nb_host, dtype=bool)
abx_observed = np.zeros(shape=nb_host, dtype=bool)

Time Series creation

In [85]:
# Host id -> row index in the per-host arrays. Built once here instead of
# calling list(hosts).index(...) for every sample, which rebuilt the list and
# did a linear search on each iteration.
host_row = {host: row for row, host in enumerate(hosts.tolist())}

for i in range(df.shape[0]):
    idx = host_row[sample_host[i]]
    time = sample_age_days[i][0]
    # Mark the day as observed and store the sample's microbial profile.
    # NOTE(review): two samples of the same host on the same day would
    # silently overwrite each other here.
    observed_dates[idx,time]=1
    paths[idx,:,time] = microbial_data[i]

    # Host-level antibiotic flags, set as soon as any sample qualifies.
    if abx_max_count_ever[i] > 0.:
        abx_any[idx] = 1
    if not np.isnan(abx_any_last_t_dmonths[i]):
        abx_observed[idx] = 1
# Number of observed days per host.
nb_obs = np.sum(observed_dates, axis=1)

In [86]:
# Number of samples whose static features disagree with an earlier sample of
# the same host.
count = 0

# Host id -> row index, built once rather than searched for on every sample.
host_row = {host: row for row, host in enumerate(hosts.tolist())}

for i in range(df.shape[0]):
    idx = host_row[sample_host[i]]

    # Concatenated one-hot encoding of this sample's static features: each
    # feature owns a contiguous block of columns starting at offset `c`.
    host_feature = np.zeros(static_feature_digitized_size)
    c = 0
    for f in range(nb_static_features):
        feature = host_data[i,f]
        feature_idx = static_feature_values[f].index(feature)
        host_feature[c + feature_idx] = 1.
        c += static_feature_values_counter[f]

    # A non-zero row means this host was already encoded by a previous
    # sample; report any mismatch before overwriting it.
    if np.sum(static[idx]) != 0 and (static[idx] != host_feature).any():
        count += 1
        print("Problem : different host static data encountered among samples ! ")
        print(idx)
        print(static[idx])
        print(host_feature)
    static[idx] = host_feature

print(count)
        

0


In [87]:
# Hosts flagged in abx_observed versus the remainder.
nb_abx_observed = abx_observed.sum()
nb_abx_never = nb_host - nb_abx_observed
print("Number of host who took antibiotics before at least one sample was taken : ", nb_abx_observed)
print("Number of host who where never observed with antibiotics : ", nb_abx_never)

Number of host who took antibiotics before at least one sample was taken :  141
Number of host who where never observed with antibiotics :  114


## Estimation of paths initial values

In [88]:
# Estimate each host's initial value as a weighted average of its observed
# samples, with weights that decay with the sample day.
weight_method = "time_neg_exponential"
weight_param = 100.
times = np.arange(sample_age_days_max+1)
if weight_method == "time_neg_exponential":
    # Exponential density evaluated at each day, with scale weight_param.
    weights = expon.pdf(times, loc=0., scale=weight_param)
else:
    # Fail here rather than with a NameError on `weights` further down.
    raise ValueError("Unsupported weight_method: {!r}".format(weight_method))

paths_0 = np.zeros((nb_host, nb_microbial_features))
for i in range(nb_host):
    # Zero out the weights of days without a sample for this host.
    host_weights = weights * observed_dates[i]
    # Broadcasting applies the per-day weights to every feature row; the
    # division normalises the weights so they sum to one over observed days.
    paths_0[i] = np.sum(paths[i] * host_weights, axis=1) / np.sum(host_weights)

In [89]:
# Install the estimated initial values as the day-0 slice of every path and
# flag day 0 as observed for all hosts (whatever was stored at day 0 before is
# replaced), then refresh the per-host observation counts.
paths[..., 0] = paths_0
observed_dates[:, 0] = 1
nb_obs = observed_dates.sum(axis=1)

## Checks

In [90]:
print("Check whether the time of samples is more precise than days :")
# Any non-zero gap between an age and its integer cast means that sub-day
# timing information is present.
age_days_raw = np.array(df[['age_days']])
fractional_part = np.abs(age_days_raw - age_days_raw.astype(int))
print("--> Yes" if np.max(fractional_part) > 0 else "--> No")

Check whether the time of samples is more precise than days :
--> No


In [91]:
print("Check if the sum of features is 1 for each data point : ")
# Largest absolute deviation from 1 of the per-sample feature sums.
row_sum_error = np.abs(microbial_data.sum(axis=1) - 1)
print("--> No" if np.max(row_sum_error) > 1e-10 else "--> Yes")

Check if the sum of features is 1 for each data point : 
--> Yes


In [92]:
print("Check if the sum of features is 1 for each estimated S0 : ")
# Same check as for the raw samples, with a looser tolerance (1e-5).
s0_sum_error = np.abs(paths_0.sum(axis=1) - 1)
print("--> No" if np.max(s0_sum_error) > 1e-5 else "--> Yes")

Check if the sum of features is 1 for each estimated S0 : 
--> Yes


In [93]:
print("Minimal Time-difference (in days) : ")
# Smallest gap between consecutive observed days for each host; hosts with
# fewer than two observed days keep +inf.
min_dt_per_host = np.full(nb_host, np.inf)
for i in range(nb_host):
    times = np.flatnonzero(observed_dates[i] == 1.)
    if len(times) > 1:
        min_dt_per_host[i] = np.diff(times).min()
print(np.min(min_dt_per_host))

Minimal Time-difference (in days) : 
3.0


## Output paths

In [94]:
# Directory that receives the generated training files.
dataset_name = "microbial_genus/"
dataset_path = os.path.join('data/training_data/', dataset_name)
# Create the full output directory (and any missing parents). The directory
# that must exist is dataset_path, not dataset_name relative to the current
# working directory; exist_ok makes the cell safe to re-run.
os.makedirs(dataset_path, exist_ok=True)

## Saving files

In [95]:
# All arrays are written one after another into a single file, in this order:
# paths, observed_dates, nb_obs, static. A reader has to load them back in the
# same order.
data_file = os.path.join(dataset_path, 'data.npy')
with open(data_file, 'wb') as out:
    for array in (paths, observed_dates, nb_obs, static):
        np.save(out, array)

In [116]:
# Dataset description stored next to the data file. The time axis is
# normalised so that the maximum sample age maps to maturity 1, hence
# dt = 1 / (maximum age in days).
metadata_dict = dict(
    S0=None,
    dimension=nb_microbial_features,
    dt=1. / float(sample_age_days_max),
    maturity=1.,
    model_name="microbial_genus",
    nb_paths=nb_host,
    nb_steps=int(sample_age_days_max),
    period=1.,
)

# Serialised as JSON with sorted keys.
with open(dataset_path + "metadata.txt", 'w') as f:
    f.write(json.dumps(metadata_dict, sort_keys=True))

S0
<class 'NoneType'>
dimension
<class 'int'>
dt
<class 'float'>
maturity
<class 'float'>
model_name
<class 'str'>
nb_paths
<class 'int'>
nb_steps
<class 'int'>
period
<class 'float'>
{'S0': None, 'dimension': 98, 'dt': 0.0008605851979345956, 'maturity': 1.0, 'model_name': 'microbial_genus', 'nb_paths': 255, 'nb_steps': 1162, 'period': 1.0}


## Create dataset subdivision train/test/val

Train and validation sets, restricted to hosts never observed with antibiotics (abx)

In [106]:
# Fraction of hosts held out for validation, and the seed fixing the split.
test_size = 0.2
seed = 398

# Only hosts whose abx_observed flag is False take part in the split.
no_abx_hosts = np.flatnonzero(~abx_observed)
train_idx, val_idx = train_test_split(no_abx_hosts, test_size=test_size, random_state=seed)

# The index files live in a "no_abx/" sub-directory of the dataset folder.
idx_dataset_path = os.path.join(dataset_path, "no_abx/")
if not os.path.isdir(idx_dataset_path):
    os.mkdir(idx_dataset_path)

for file_name, indices in (('train_idx.npy', train_idx), ('val_idx.npy', val_idx)):
    with open(idx_dataset_path + file_name, 'wb') as f:
        np.save(f, indices)
    