# Longitudinal data preparation
This script:
* Selects only the metabolomics data (plasma protein markers)
* Drops subjects that do not have a marker acquisition at both m0 and m12
* Saves to disk in the same format so that CIMLR can work seamlessly

In [19]:
# do imports
import pandas as pd
import numpy as np

# Input tables, read from the current working directory.
# ADNIMERGE includes general information about the patients;
# the second file holds the plasma multiplex measurements.
adnimerge_csv = 'ADNIMERGE.csv'
plasma_csv = 'adni_plasma_qc_multiplex_11Nov2010.csv'

df_data = pd.read_csv(adnimerge_csv)
df_plasma_full = pd.read_csv(plasma_csv)

Prepare the data in the same format that we know and love:

In [24]:
# Columns kept from ADNIMERGE
info = [
    "RID", "PTID", "MMSE", "VISCODE", "EXAMDATE",
    "AGE", "PTGENDER", "APOE4", "DX_bl", "PTEDUCAT",
]

# Keep the baseline ("bl") rows only, restricted to the columns above
is_baseline = df_data["VISCODE"] == "bl"
df_data = df_data.loc[is_baseline, info]

# -1 and -4 are missing-value indicators: turn them into NaN,
# then discard every row that is incomplete
missing_codes = [-1, -4]
df_data = df_data.replace(to_replace=missing_codes,
                          value=[np.nan] * len(missing_codes))
df_data = df_data.dropna()

# ADD Plasma biomarkers
# A '.' entry is treated as a missing value. Note that how='any' discards
# every column containing at least one missing entry, not only the columns
# that are entirely empty.
df_plasma_full = (
    df_plasma_full
    .replace(to_replace=['.'], value=[np.nan])
    .dropna(axis='columns', how='any')
)

# Split the plasma table by visit: one frame for m0 ('bl'), one for m12
plasma_visit = df_plasma_full['Visit_Code']
df_plasma_bl = df_plasma_full.loc[plasma_visit == 'bl']
df_plasma_m12 = df_plasma_full.loc[plasma_visit == 'm12']

def _attach_plasma(df_clinical, df_plasma):
    """Join the clinical rows with the plasma markers of one visit.

    The frames are inner-joined on ``RID``. The first plasma column is
    skipped (presumably a record/row identifier - confirm against the CSV),
    the sample bookkeeping columns are removed, and rows with any missing
    value are dropped.

    Returns a new DataFrame; neither input is modified.
    """
    merged = pd.merge(df_clinical, df_plasma.iloc[:, 1:], how='inner', on="RID")
    merged = merged.drop(
        columns=['Visit_Code', 'RBM Sample ID', 'Sample_Received_Date'])
    return merged.dropna()


# Add all plasma columns to the clinical data, once per visit
df_data_bl = _attach_plasma(df_data, df_plasma_bl)
df_data_m12 = _attach_plasma(df_data, df_plasma_m12)

# The clinical columns of the m12 frame are copied from the baseline rows of
# df_data, so the visit label has to be overwritten explicitly.
df_data_m12['VISCODE'] = "m12"

# Keep only the subjects measured at BOTH visits. Filtering a single side
# would leave subjects without a counterpart in the other table.
rid_both_visits = set(df_data_bl['RID']) & set(df_data_m12['RID'])
df_data_bl = df_data_bl[df_data_bl['RID'].isin(rid_both_visits)].copy()
df_data_m12 = df_data_m12[df_data_m12['RID'].isin(rid_both_visits)].copy()

# Sanity check: cohort size and per-diagnosis counts for both visits
for banner, frame in (('For baseline!', df_data_bl),
                      ('For followup!', df_data_m12)):
    print(banner)
    print('Total samples: ' + str(len(frame)))
    for dx in ('AD', 'LMCI', 'CN'):
        print(dx + " samples: " + str(len(frame[frame['DX_bl'] == dx])))

# Dump the LMCI rows of each visit to CSV for manual inspection
is_lmci_m12 = df_data_m12['DX_bl'] == 'LMCI'
df_data_m12[is_lmci_m12].to_csv('test.csv')
is_lmci_bl = df_data_bl['DX_bl'] == 'LMCI'
df_data_bl[is_lmci_bl].to_csv('test2.csv')

For baseline!
Total samples: 496
AD samples: 97
LMCI samples: 345
CN samples: 54
For followup!
Total samples: 496
AD samples: 97
LMCI samples: 345
CN samples: 54


Create a new dataset with the longitudinal gradient of each marker, i.e. the m12 value minus the m0 (baseline) value

In [28]:
# Marker columns: everything after the five leading identifier/visit columns,
# minus the clinical/demographic fields that must not be differenced.
non_marker = {'DX_bl', 'APOE4', 'PTGENDER', 'AGE', 'PTEDUCAT'}
cov_names = [col for col in df_data_bl.columns[5:] if col not in non_marker]

# Each subject must appear exactly once per visit to be paired unambiguously.
for visit, frame in (('baseline', df_data_bl), ('m12', df_data_m12)):
    if frame['RID'].duplicated().any():
        raise ValueError('Duplicated RID in the ' + visit +
                         ' table: cannot pair the two visits')

# Work on a copy so that the baseline table keeps its original values.
df_data_grad = df_data_bl.copy()

# pandas arithmetic aligns on row labels, and the two frames come from
# separate merges, so their labels do not identify the same subject.
# Pair the rows on RID instead, in the row order of the baseline table.
bl_markers = df_data_grad.set_index('RID')[cov_names]
m12_markers = df_data_m12.set_index('RID')[cov_names]
unpaired = bl_markers.index.difference(m12_markers.index)
if len(unpaired) > 0:
    raise ValueError('Baseline subjects without an m12 row: ' +
                     str(list(unpaired)))
m12_markers = m12_markers.reindex(bl_markers.index)

# Gradient = m12 - m0; .values drops the RID index so the result is written
# back positionally onto the rows of df_data_grad.
df_data_grad[cov_names] = (m12_markers - bl_markers).values

df_data_grad.to_csv('covariates_long.csv')
df_data_grad['PTID'].to_csv('subjects_experiment_v2.csv')