# Biospecimen data preparation
This script prepares biospecimen marker data from multiple sources, applies the preprocessing steps each type of data needs, and combines everything into a single file.
Output should be a file named biospecimen_data.csv with:

| RID | DX | Gender | Age | Ed. level | all the other markers... |


In [None]:
# do imports
import pandas as pd
import numpy as np

**Load all relevant data tables**

First, we load all relevant tables in pandas DataFrames.

In [None]:
# Read every raw table into its own DataFrame. Nothing is filtered or merged
# here; the files are expected in the current working directory.

# ADNIMERGE: general information about the patients
df_data = pd.read_csv("ADNIMERGE.csv")

# Plasma multiplex panel
df_plasma_full = pd.read_csv("adni_plasma_qc_multiplex_11Nov2010.csv")

# Plasma NFL
df_nfl = pd.read_csv("ADNI_BLENNOWPLASMANFL.csv")

# Plasma ABETA (UPENN)
df_abeta = pd.read_csv("UPENNPLASMA.csv")

# Homocysteine
df_homocysteine = pd.read_csv("HCRES.csv")

# ADMC data: P180 metabolites
df_metabolytes = pd.read_csv("ADMCDUKEP180UPLC_01_15_16.csv")

In [None]:
# Columns of ADNIMERGE that are kept for the analysis
info = ["RID", "PTID", "MMSE", "VISCODE", "EXAMDATE",
        "AGE", "PTGENDER", "APOE4", "DX_bl", "PTEDUCAT"]

df_data = df_data[info].copy()

# Select only baseline data
df_data = df_data[(df_data.VISCODE == "bl")]

# -1 and -4 are missing-value indicators: turn them into NaN, then drop every
# subject whose record is incomplete
df_data = df_data.replace(to_replace=[-1, -4], value=[np.nan, np.nan]).dropna()

# ADD plasma biomarkers
# Treat '.' entries as missing values
df_plasma_full = df_plasma_full.replace(to_replace=['.'], value=[np.nan])
# Drop every column that contains at least one missing value. NOTE: this is
# evaluated over all visits, because the baseline filter only comes next.
df_plasma_full = df_plasma_full.dropna(axis=1, how='any')

# Select only baselines
df_plasma_full = df_plasma_full[df_plasma_full.Visit_Code == 'bl']

# Add all remaining plasma columns to the subject table. The first plasma
# column is skipped (presumably a row identifier -- TODO confirm).
df_data = pd.merge(df_data, df_plasma_full.iloc[:, 1:], how='inner', on="RID")

# Bookkeeping columns of the plasma table that are not markers
df_data = df_data.drop(
    columns=['Visit_Code', 'RBM Sample ID', 'Sample_Received_Date'])

# Drop rows with missing data. The result has to be assigned back: dropna()
# returns a new frame and leaves df_data itself untouched.
df_data = df_data.dropna()

# Show the resulting table as the cell output
df_data

Select only the subset of the data where:
* No missing data values.
* Data quality is good across all markers

We also want to check whether enough subjects remain for the tests, so we try different combinations of data sources.

In [None]:
# First, without the metabolites.
# Keep adding marker tables to the original data; the inner merges keep only
# the subjects present in every table, so each step can shrink the cohort.


def _baseline_markers(table, columns):
    """Return the baseline rows of *table*, limited to *columns*, without NaNs.

    The number of missing values per column is printed before the incomplete
    rows are removed.
    """
    baseline = table[table.VISCODE == 'bl']
    baseline = baseline[columns]
    print(baseline.isnull().sum())
    # dropna() returns a new frame, so its result must be the one handed back
    return baseline.dropna()


# ADD homocysteine
df_homocysteine = _baseline_markers(df_homocysteine, ["RID", "HCAMPLAS"])
df_useddata_homo = pd.merge(df_data, df_homocysteine, how='inner', on="RID")

# ADD plasma ABETA
df_abeta = _baseline_markers(df_abeta, ["RID", "AB40", "AB42"])
df_useddata_homo_abeta = pd.merge(df_useddata_homo, df_abeta, how='inner', on="RID")

# ADD NFL data
df_nfl = _baseline_markers(df_nfl, ["RID", "PLASMA_NFL"])
df_useddata_homo_abeta_plasma = pd.merge(df_useddata_homo_abeta, df_nfl, how='inner', on="RID")

In [None]:
# Cell for testing: print the cohort size and the number of subjects in each
# baseline diagnosis group
cohort = df_useddata_homo_abeta_plasma
print('Total samples: ' + str(len(cohort)))
for diagnosis in ("AD", "LMCI", "CN"):
    in_group = cohort[cohort.DX_bl == diagnosis]
    print(diagnosis + " samples: " + str(len(in_group)))

In [None]:
# Try to add the metabolites

# Load the P180 data dictionary. It is meant to supply the column names;
# it is only loaded here.
df_metabolytes_DICT = pd.read_csv("ADMCDUKEP180UPLC_DICT.csv")

# Keep a single row per subject: the first occurrence of each RID
df_metabolytes = df_metabolytes.drop_duplicates(subset="RID")

# Convert to NA non-numeric values
def isnumber(x):
    """Return True if *x* can be converted with ``float()``, False otherwise.

    Only the errors ``float()`` raises for unconvertible input are treated as
    "not a number"; anything else (e.g. ``KeyboardInterrupt``) propagates
    instead of being silently swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Blank out (NaN) every cell that cannot be read as a number. The mask is
# built column by column with Series.map, which avoids DataFrame.applymap
# (deprecated in recent pandas releases).
numeric_mask = df_metabolytes.apply(lambda column: column.map(isnumber))
df_metabolytes = df_metabolytes.where(numeric_mask)

# Drop columns that have more than 10% NA values. (Dropping every column with
# a single NA, as before, contradicted this rule and left nothing for the
# per-sample step below to remove.)
max_na_fraction = 0.10
na_fraction = df_metabolytes.isnull().mean()
df_metabolytes = df_metabolytes.loc[:, na_fraction <= max_na_fraction]
print(len(df_metabolytes))
print(df_metabolytes)

# After this, remove the individual samples that still have a missing value
df_metabolytes = df_metabolytes.dropna()
print(len(df_metabolytes))
df_useddata_homo_abeta_plasma_meta = pd.merge(df_useddata_homo_abeta_plasma, df_metabolytes, how='inner', on="RID")
# TODO: assign readable column names from df_metabolytes_DICT (not done yet)

# Check statistics: cohort size and subjects per baseline diagnosis group
print('Total samples: ' + str(len(df_useddata_homo_abeta_plasma_meta)))
for diagnosis in ("AD", "LMCI", "CN"):
    in_group = df_useddata_homo_abeta_plasma_meta[
        df_useddata_homo_abeta_plasma_meta.DX_bl == diagnosis]
    print(diagnosis + " samples: " + str(len(in_group)))

# Write the combined table (the DataFrame index is written as the first column)
df_useddata_homo_abeta_plasma_meta.to_csv("useddata_homo_abeta_plasma_meta.csv")