In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the raw export and keep only the rows whose MetaStudy is 'BioFINDER-2'.
# `optional` switches the exploratory cells further down on or off.
optional = False
load_df = (
    pd.read_csv('csv/raw_data_20231113.csv', low_memory=False)
    .loc[lambda raw: raw['MetaStudy'].eq('BioFINDER-2')]
)
load_df.info(verbose=True)

In [None]:
# Feature configuration.
#   group == 'FS': features used in model development and feature engineering, i.e., section 3.2
#   group == 'R' : features used for the rest results, i.e., section 3.3-3.7
#
# `f` lists the raw column names selected from the loaded data ('sid' and 'Visit' first,
# because they are the merge keys used later in this notebook).
# `new_fea_names` lists the display names that are applied *positionally* after the merge
# with the CSF Lilly file: the first entry names the merged CSF column, and the remaining
# entries name the columns of `f` after 'sid' and 'Visit', in the same order.
group = 'FS'
if group == 'FS':
    f = [
        'sid', 'Visit',
        # 'visit_date',
        'PL_pT217T217percentmean_WashU_2023', 'PL_pT217levelmean_WashU_2023',
        'PL_pT181T181percentmean_WashU_2023', 'PL_pT181levelmean_WashU_2023',
        'PL_pT205T205percentmean_WashU_2023', 'PL_pT205levelmean_WashU_2023',
        'PL_tau195210levelmean_WashU_2023', 'PL_tau181190levelmean_WashU_2023',
        'PL_tau212221levelmean_WashU_2023', 'Plasma_ptau231_pgml_UGOT_2023',
        'PL_Abeta4240Ratio_Standardized_WashU2023', 'PL_NTAadjusted_pgmL_Simoa_UGOT_2022',
        'PL_GFAP_pgmL_Simoa_UGOT_2022', 'PL_NFlight_pgmL_Simoa_UGOT_2022',
        'CSF_Ab42_Ab40_ratio_imputed_Elecsys_2020_2022', 'CSF_Ptau_pgml_imputed_Elecsys_2020_2022',
        'age', 'apoe_genotype_baseline_variable', 'education_level_years_baseline_variable',
        'gender_baseline_variable', 'adas_delayed_word_recall',
        'Abnormal_CSF_Ab42_Ab40_Ratio', 'animal_fluency', 'cognitive_status_baseline_variable',
        'mPACC_v2', 'mmse_score', 'symbol_digit', 'trailmaking_b', 'fnc_ber_com_composite']
    new_fea_names = [
        'CSF P-tau217',
        'Plasma %P-tau217',
        'Plasma P-tau217',
        'Plasma %P-tau181',
        'Plasma P-tau181',
        'Plasma %P-tau205',
        'Plasma P-tau205',
        'Plasma tau195-210',
        'Plasma tau181-190',
        'Plasma tau212-221',
        'Plasma P-tau231',
        'Plasma Aβ42/Aβ40',
        'Plasma NTAadjusted',
        'Plasma GFAP',
        'Plasma NFlight',
        'CSF Aβ42/Aβ40',
        'CSF P-tau181',
        'Age',
        'APOE',
        'Education',
        'Sex',
        'ADAS',
        'CSF Abnormal Ratio',
        'Animal fluency',
        'Cognitive status',
        'PACC',
        'MMSE',
        'Symbol digit',
        'Trailmaking',
        'fnc_ber_com_composite']
elif group == 'R':
    # Features used for the rest results, i.e., section 3.3-3.7
    f = [
        'sid', 'Visit',
        'PL_pT217T217percentmean_WashU_2023', 'PL_ptau217_pgml_Lilly_2022',
        'CSF_09pT217T217mean_WashU_2022',
        # 'CSF_ptau217_pgml_Lilly_2019','CSF_Tau_212_221_p217_UGOT_2022','PL_Abeta4240Ratio_Standardized_WashU2023',
        'CSF_Ab42_Ab40_ratio_imputed_Elecsys_2020_2022',
        'age', 'apoe_genotype_baseline_variable', 'adas_delayed_word_recall',
        'education_level_years_baseline_variable', 'gender_baseline_variable',
        'cognitive_status_baseline_variable', 'mmse_score',
        'Abnormal_CSF_Ab42_Ab40_Ratio', 'diagnosis_baseline_variable', 'fnc_ber_com_composite']
    new_fea_names = [
        'CSF Lilly P-tau217',
        'Plasma WashU %P-tau217',
        'Plasma Lilly P-tau217',
        'CSF WashU P-tau217',
        'CSF Aβ42/Aβ40',
        'Age',
        'APOE',
        'ADAS',
        'Education',
        'Sex',
        'Cognitive status',
        'MMSE',
        'CSF Abnormal Ratio',
        'Diagnosis status',
        'fnc_ber_com_composite']
else:
    # Fail here instead of later with a NameError, or (in a live kernel) with lists
    # silently left over from a previous run of this cell.
    raise ValueError(f"Unknown feature group {group!r}; expected 'FS' or 'R'.")

# The display names are assigned by position, so the two lists must line up:
# every entry of `f` except the two merge keys, plus the one merged CSF column.
assert len(new_fea_names) == len(f) - 2 + 1, (
    f"new_fea_names has {len(new_fea_names)} entries but {len(f) - 2 + 1} columns "
    f"are expected for group {group!r}"
)

In [None]:
# Keep only the columns named in `f` and renumber the rows from 0,
# discarding the index left over from the MetaStudy filter.
select_df = load_df.loc[:, f].reset_index(drop=True)
select_df.info()

### Optional

In [None]:
# Check specific biomarkers (optional)
# Collects, per substring, the names of the load_df columns containing it,
# then prints the columns whose name contains 'date', one per line.
if optional:
    def _columns_containing(token):
        """Return the load_df column names that contain `token` as a substring."""
        return [name for name in load_df.columns if token in name]

    biomarker_231 = _columns_containing('231')
    biomarker_217 = _columns_containing('217')
    biomarker_212 = _columns_containing('212')
    biomarker_205 = _columns_containing('205')
    biomarker_195 = _columns_containing('195')
    biomarker_181 = _columns_containing('181')
    biomarker_ptau = _columns_containing('Ptau')
    biomarker_GFAP = _columns_containing('GFAP')
    biomarker_NFL = _columns_containing('NFl')
    biomarker_Ab = _columns_containing('Ab')
    fnc = _columns_containing('fnc')
    date = _columns_containing('date')
    print(*date, sep="\n")

In [None]:
# Check the data date range (optional)
# 'visit_date' is not part of the selected features (it is commented out of the 'FS'
# list and absent from the 'R' list), so it is read from load_df rather than select_df.
# The values are printed explicitly because a bare expression inside an `if` block is
# not displayed as cell output.
if optional:
    if 'visit_date' in load_df.columns:
        visit_dates = load_df['visit_date'].dropna()
        # NOTE(review): if the column was read as strings, min/max compare
        # lexicographically; confirm the stored date format sorts chronologically.
        print('visit_date range:', visit_dates.min(), 'to', visit_dates.max())
    else:
        print("Column 'visit_date' not found in load_df")

In [None]:
# Check the histograms of biomarkers (optional)
# A pre-built 3x5 axes grid only works when exactly 15 columns are plotted; pandas
# raises a ValueError when the number of passed axes differs from the number of
# histograms. Letting pandas build the grid (5 plots per row, as many rows as needed)
# works for any feature list.
if optional:
    plots_per_row = 5
    # Only used to size the figure; pandas itself decides which columns are plotted.
    n_numeric = select_df.select_dtypes(include='number').shape[1]
    n_rows = max(1, -(-n_numeric // plots_per_row))  # ceiling division
    select_df.hist(
        bins=200,
        layout=(-1, plots_per_row),                # -1: infer the number of rows
        figsize=(6 * plots_per_row, 10 / 3 * n_rows),  # 30x10 inches for a 3x5 grid
    )
    plt.show()

### Merge with CSF files

In [None]:
# Read the CSF Lilly workbook, cast 'Visit' to float64, and keep only the two
# merge keys plus the normalised CSF P-tau217 column.
extra_df = (
    pd.read_excel('csv/BF2_CSF_Lilly_2024.xlsx')
    .astype({'Visit': 'float64'})
    .loc[:, ['sid', 'Visit', 'Norm_CSF_ptau217_pgml_Lilly_2019_2024']]
)
extra_df.info()

In [None]:
# Attach the CSF Lilly column to the selected features. how='right' keeps every row of
# select_df; rows without a matching (sid, Visit) in extra_df get NaN in the CSF column.
merged = extra_df.merge(select_df, on=['sid', 'Visit'], how='right')

# With a right merge the row count can only grow when a (sid, Visit) pair of select_df
# matches several extra_df rows; that would silently duplicate rows in the exported file.
assert len(merged) == len(select_df), (
    f"merge changed the row count from {len(select_df)} to {len(merged)}: "
    "extra_df contains duplicate (sid, Visit) keys"
)

# Drop the merge keys, then rename by position: the CSF column comes first, followed by
# the remaining select_df columns in their original order (the order of new_fea_names).
result = merged.drop(columns=['sid', 'Visit']).set_axis(new_fea_names, axis=1)
result.info()

In [None]:
# Write the merged table to the CSV that belongs to the active feature group;
# any other group value writes nothing.
output_paths = {
    'FS': 'csv/BF2_FS.csv',
    'R': 'csv/BF2_R.csv',
}
if group in output_paths:
    result.to_csv(output_paths[group], index=False)