In [None]:
#versions: pandas==2.0.3, scikit-survival==0.22.2

In [None]:
import pandas as pd
from collections import OrderedDict
from sklearn.model_selection import train_test_split

In [None]:
# Input/output locations used throughout the notebook (relative to the working directory).
# Directory with the FastSurfer stats tables (aseg, wmparc, lh/rh DKT volumes).
OUTPUT_FASTSURFER_DIR = "./data/ADNI/output_fastsurfer/"
# Directory with the ADNIMERGE Stata (.dta) tables read below.
ADNIMERGE_STATA_DIR="./data/ADNI/ADNIMERGE_Stata/"
# CSV read as the `texture` table; it must contain a `path` column.
RESULTS_FILE_SUMMARY = "./data/ADNI/summary_scan_level.csv"
# Semicolon-separated lookup table used to append a region name to each texture column.
FASTSURFER_MAPPING = "./data/Freesurfer-LUT_new.txt"
MPRAGE_METADATA = "./data/ADNI/MPRAGEMETA_09Jul2024.csv"# extracted from LONI, contains columns "Orig/Proc","SubjectID","Visit","MagStrength","Sequence","ScanDate","StudyID","SeriesID","ImageUID" for all ADNI scans
# Prefix under which the final train/test CSV files are written.
SAVEDIR = "./data/"

In [None]:
# Run options: FEATURE_SET picks the feature groups that are exported,
# DATASET_SEL picks the cohort filter and the train/test sample files that are read.
FEATURE_SET = "BL+VOL"#one of "BL", "BL+VOL", "BL+RAD", "BL+VOL+RAD", "rBL", "rBL+VOL", "rBL+RAD", "rBL+VOL+RAD"
DATASET_SEL = "CN+MCI" #one of "CN+MCI", "MCI"

In [None]:
# CSV files whose IMAGEUID column lists the scans of the training / test split
# for the selected cohort.
TRAINING_SET = f"./data/train_{DATASET_SEL}.csv"
TEST_SET = f"./data/test_{DATASET_SEL}.csv"

In [None]:
# Load the four FastSurfer volume tables.
data_aseg = pd.read_table(OUTPUT_FASTSURFER_DIR+"aseg_stats.txt")
data_wmparc = pd.read_table(OUTPUT_FASTSURFER_DIR+"wmparc_stats.txt")
data_lhDKT = pd.read_table(OUTPUT_FASTSURFER_DIR+"lh.aparc.DKTatlas.mapped.volume.txt")
data_rhDKT = pd.read_table(OUTPUT_FASTSURFER_DIR+"rh.aparc.DKTatlas.mapped.volume.txt")

# Each table names its scan-identifier column differently; unify it as "filename"
# so the tables can be joined on it.
data_lhDKT = data_lhDKT.rename(columns={"lh.aparc.DKTatlas.mapped.volume":"filename"})
data_rhDKT = data_rhDKT.rename(columns={"rh.aparc.DKTatlas.mapped.volume":"filename"})
data_aseg = data_aseg.rename(columns={"Measure:volume":"filename"})
data_wmparc = data_wmparc.rename(columns={"Measure:volume":"filename"})

# Join lh -> rh -> wmparc -> aseg. A column that already exists on the left side
# keeps its name; the right-hand duplicate gets the "_y" suffix and is dropped below.
df = pd.merge(data_lhDKT,data_rhDKT,on="filename",suffixes=("", "_y"))
df = pd.merge(df,data_wmparc,on="filename",suffixes=("", "_y"))
df = pd.merge(df,data_aseg,on="filename",suffixes=("", "_y"))

cols = [c for c in df.columns if not c.endswith("_y")]
# Explicit copy: df_vol gets new columns below, and assigning into a selection of
# `df` can otherwise raise SettingWithCopyWarning.
df_vol = df[cols].copy()
# Subject ID of the form 123_S_4567, taken from the file name.
df_vol["PTID"] = df_vol.filename.str.extract(r"(\d{3}_S_\d{4})")
# Text after the last "_I" in the file name, minus its final character
# (presumably a trailing path separator - confirm against the stats files).
df_vol["IMAGEUID"] = df_vol.filename.str.split("_I").str[-1].str[:-1]

df_vol = df_vol.astype({"IMAGEUID": "str"})
# Normalise the volume columns by the estimated total intracranial volume.
# NOTE(review): the column range 1:191 is hard-coded; it presumably spans every volume
# column after "filename" - confirm it matches the merged table layout, and that
# EstimatedTotalIntraCranialVol itself lies outside the range (otherwise it becomes 1).
df_vol.iloc[:,1:191]=df_vol.iloc[:,1:191].div(df_vol.EstimatedTotalIntraCranialVol, axis=0)

In [None]:
# Load the scan-level texture summary and discard every row whose path mentions AIBL.
texture = pd.read_csv(RESULTS_FILE_SUMMARY)
texture = texture[~texture.path.str.contains("AIBL")]

# Both identifiers are parsed out of the path: the subject ID (123_S_4567) and the
# image UID (text after the last "_I", up to the next "/").
texture["PTID"] = texture.path.str.extract(r"(\d{3}_S_\d{4})")
texture["IMAGEUID"] = texture.path.str.split("_I").str[-1].str.split("/").str[0]

# The int round-trip normalises the UID text before it is stored as a string join key.
texture = (
    texture.drop(columns=["path"])
    .astype({"IMAGEUID": "int"})
    .astype({"IMAGEUID": "str"})
)

# Append a region name to every feature column. The numeric suffix after the last "_"
# of a feature name is looked up in the LUT, whose columns "0" and "Unknown" hold the
# region number and region name respectively.
freeSurfer_LUT = pd.read_table(FASTSURFER_MAPPING, sep=";")
feature_cols = texture.columns.drop(["PTID", "IMAGEUID"])
df = pd.DataFrame({
    "feature_name": feature_cols,
    "region": feature_cols.str.split("_").str[-1],
    "feature_name_without_region": feature_cols.str.rsplit("_", n=1).str.get(0),
})
df["region"] = pd.to_numeric(df["region"])
dfmerged = df.merge(freeSurfer_LUT, left_on="region", right_on="0", how="left")

# PTID and IMAGEUID were appended last, so they remain the two trailing columns.
cnames = (dfmerged.feature_name + "_" + dfmerged.Unknown).tolist() + ["PTID", "IMAGEUID"]
texture.columns = cnames

In [None]:
# Read the ADNIMERGE visit table and keep only visits that carry a diagnosis (DX).
adnimerge = pd.read_stata(ADNIMERGE_STATA_DIR + "adnimerge.dta")
adnimerge = adnimerge[adnimerge.DX.notna()]

In [None]:
def _load_visit_score(filename, score_column):
    """Read one ADNIMERGE Stata table and return its VISCODE, RID and `score_column`.

    Visit code "sc" is relabelled "bl" so those rows join to the "bl" visit of adnimerge.
    """
    table = pd.read_stata(ADNIMERGE_STATA_DIR + filename)
    table.loc[table.VISCODE == "sc", "VISCODE"] = "bl"
    return table.filter(["VISCODE", "RID", score_column])


# Attach CDGLOBAL (cdr.dta) and LIMMTOTAL (neurobat.dta) to the visit table.
adni_cdr = _load_visit_score("cdr.dta", "CDGLOBAL")
adnimerge = adnimerge.merge(adni_cdr, on=["VISCODE", "RID"], how="left")
adni_limm = _load_visit_score("neurobat.dta", "LIMMTOTAL")
adnimerge = adnimerge.merge(adni_limm, on=["VISCODE", "RID"], how="left")

# AGE becomes the age at the visit: the stored AGE plus Years_bl, rounded to one decimal.
adnimerge["AGE"] = adnimerge["AGE"] + adnimerge["Years_bl"]
adnimerge = adnimerge.round({"AGE": 1})

In [None]:
def _latest_age(visits, diagnosis):
    """Return the highest AGE among the rows of `visits` with DX == `diagnosis`, or 0 if none."""
    rows = visits[visits.DX == diagnosis]
    if rows.shape[0] == 0:
        return 0
    return rows.AGE.max()


def _classify_trajectory(df_PTID):
    """Derive (status, time, ref_age) for one subject's visits, sorted by AGE.

    status  -- label of the diagnosis trajectory; "" if none of the handled patterns applies:
               sMCI / sCN            only MCI / only CN diagnoses
               pMCI / pCN            reaches Dementia, visits already in DX sort order
               u{MCI,CN}ADClear      reaches Dementia out of DX order, but every CN and MCI
                                     visit precedes the first Dementia visit
               u{MCI,CN}ADUnclear    as above, with a CN or MCI visit at/after the first Dementia visit
               CNtoMCI               no Dementia, starts CN, visits in DX sort order
               u{MCI,CN}NoAD         no Dementia, visits out of DX sort order
    time    -- first Dementia AGE minus first AGE when a Dementia visit exists, otherwise
               last AGE minus first AGE; stays 0.0 when no branch sets it.
    ref_age -- AGE of the earliest visit; NaN when no branch sets it. It is reset for every
               subject so a value can never leak over from the previous one.
    """
    status = ""
    time = 0.0
    ref_age = float("nan")

    first_age = df_PTID.AGE.min()
    last_age = df_PTID.AGE.max()
    first_dx = df_PTID.iloc[0].DX
    diagnoses = df_PTID.DX.unique()

    # A single diagnosis over all visits: stable MCI or stable CN.
    if len(diagnoses) == 1:
        if diagnoses[0] == "MCI":
            status = "sMCI"
        elif diagnoses[0] == "CN":
            status = "sCN"
        if status != "":
            time = last_age - first_age
            ref_age = first_age
        return status, time, ref_age

    # True when ordering the visits by DX gives the same AGE sequence as the
    # chronological order, i.e. the diagnosis never steps back along DX's sort order.
    # NOTE(review): for the pMCI branch to be reachable DX must sort MCI before Dementia
    # (e.g. an ordered categorical) - confirm. sort_values defaults to quicksort, which is
    # not guaranteed stable; kind="stable" would make this check independent of tie order,
    # but may change Status labels, so it is left as is.
    in_dx_order = all(
        df_PTID.sort_values("DX").reset_index().AGE == df_PTID.reset_index().AGE
    )

    if "Dementia" in df_PTID.DX.tolist():
        onset_age = df_PTID[df_PTID.DX == "Dementia"].AGE.min()
        if in_dx_order:
            if first_dx == "MCI":
                status = "pMCI"
            elif first_dx == "CN":
                status = "pCN"
        elif first_dx == "MCI" or first_dx == "CN":
            # "Clear" means the last CN and the last MCI visit both precede the first Dementia visit.
            clear = (_latest_age(df_PTID, "CN") < onset_age
                     and _latest_age(df_PTID, "MCI") < onset_age)
            prefix = "uMCIAD" if first_dx == "MCI" else "uCNAD"
            status = prefix + ("Clear" if clear else "Unclear")
        time = onset_age - first_age
        ref_age = first_age
    elif in_dx_order:
        if first_dx == "CN":
            status = "CNtoMCI"
            time = last_age - first_age
            ref_age = first_age
    else:
        if first_dx == "MCI":
            status = "uMCINoAD"
        elif first_dx == "CN":
            status = "uCNNoAD"
        time = last_age - first_age
        ref_age = first_age
    return status, time, ref_age


# One row per subject. Rows are collected in a list and converted once, instead of
# growing the frame with pd.concat inside the loop.
records = []
for PTID in adnimerge.PTID.unique():
    df_PTID = adnimerge[adnimerge.PTID == PTID].sort_values("AGE")
    # Skip subjects diagnosed with Dementia at every visit and subjects without any IMAGEUID.
    dementia_only = len(df_PTID.DX.unique()) == 1 and df_PTID.iloc[0].DX == "Dementia"
    has_scan = df_PTID.IMAGEUID.notna().any()
    if dementia_only or not has_scan:
        continue
    status, time, ref_age = _classify_trajectory(df_PTID)
    records.append({"PTID": PTID, "Status": status, "Time": time, "Ref_age": ref_age})

df_tte = pd.DataFrame(records, columns=["PTID","Status","Time","Ref_age"])

In [None]:
# Trajectory labels kept for the chosen cohort; any DATASET_SEL other than "MCI"
# selects the combined CN+MCI list.
if DATASET_SEL == "MCI":
    kept_statuses = ["sMCI","pMCI","uMCINoAD","uMCIADClear"]
else:
    kept_statuses = ["sCN","sMCI","pMCI","uMCINoAD","uCNNoAD","uMCIADClear","uCNADClear","CNtoMCI","pCN"]
df_tte = df_tte[df_tte.Status.isin(kept_statuses)]

# Inner join: only visits of the retained subjects survive, each now carrying
# Status, Time and Ref_age.
adnimerge = adnimerge.merge(df_tte, on="PTID")

In [None]:
# Keep visits that have an image; IMAGEUID goes through int before str so it is stored
# as an integer-formatted string join key.
adnimerge = (
    adnimerge[adnimerge.IMAGEUID.notna()]
    .astype({"IMAGEUID": "int"})
    .astype({"IMAGEUID": "str"})
)

In [None]:
# Inner joins on IMAGEUID: a scan is kept only if it appears in the volume table,
# the clinical table and the texture table.
df_ges = df_vol.merge(adnimerge, on="IMAGEUID", how="inner")
df_ges = texture.merge(df_ges, on="IMAGEUID", how="inner")

In [None]:
# Re-express Time relative to each scan: subtract the time already elapsed between
# the reference age and the age at the scan, then keep scans with more than 0.1 left.
elapsed_since_reference = df_ges["AGE"] - df_ges["Ref_age"]
df_ges["Time"] = df_ges["Time"] - elapsed_since_reference
df_ges = df_ges[df_ges["Time"] > 0.1]

In [None]:
# Attach the scanner field strength (MagStrength) to each scan via its image UID.
mappingData = (
    pd.read_csv(MPRAGE_METADATA)
    .filter(items=["ImageUID", "MagStrength"])
    .rename(columns={"ImageUID": "IMAGEUID"})
    .astype({"IMAGEUID": "str"})
)
df_ges = df_ges.merge(mappingData, on="IMAGEUID", how="left")

In [None]:
# Keep the first row for every IMAGEUID (the merges above can repeat a scan).
df_ges = df_ges.loc[~df_ges.duplicated(subset="IMAGEUID")]

In [None]:
# Boolean Status_new: False for the statuses listed here, True for every other status
# (presumably the event indicator of the time-to-event analysis - confirm).
# It must be created with item assignment: `df_ges.Status_new = ...` only sets a Python
# attribute on the object, so no column exists afterwards and the later
# `.filter(selected_features)` would silently omit "Status_new".
non_event_statuses = ["sCN", "sMCI", "CNtoMCI", "uMCINoAD", "uCNNoAD"]
df_ges["Status_new"] = (~df_ges["Status"].isin(non_event_statuses)).tolist()

# Add the columns of desikanlab.dta per subject (RID), without its bookkeeping columns.
gen = pd.read_stata(ADNIMERGE_STATA_DIR+"desikanlab.dta")
gen = gen.drop(columns=["ORIGPROT","USERDATE"])
df_ges = pd.merge(df_ges,gen,on="RID",how="left")

In [None]:
# Column names of the full baseline ("BL") feature group; PTID and IMAGEUID are identifiers.
features_bl=["PTID","PTGENDER","PTEDUCAT","PTETHCAT","PTRACCAT","PTMARRY","APOE4","ADAS11","ADAS13","ADASQ4","MMSE","CDRSB","CDGLOBAL","LIMMTOTAL","RAVLT_immediate","RAVLT_learning","RAVLT_forgetting","RAVLT_perc_forgetting","mPACCdigit","mPACCtrailsB","LDELTOT","DIGIT","TRAB","FAQ","MOCA","EcogPtMem","EcogPtLang","EcogPtVisspat","EcogPtPlan","EcogPtOrgan","EcogPtDivatt","EcogPtTotal","EcogSPMem","EcogSPLang","EcogSPVisspat","EcogSPPlan","EcogSPOrgan","EcogSPDivatt","EcogSPTotal","AGE","IMAGEUID","PHS","CIR"]

# Column names of the reduced baseline ("rBL") feature group.
features_rbl=["PTID","PTGENDER","APOE4","MMSE","CDGLOBAL","AGE","IMAGEUID"]

# Column names of the volume ("VOL") feature group: identifiers, subcortical and
# lh_/rh_ cortical volumes, plus MagStrength and EstimatedTotalIntraCranialVol.
features_vol=["PTID","IMAGEUID","lhCerebralWhiteMatterVol","Left-Lateral-Ventricle","Left-Inf-Lat-Vent","Left-Cerebellum-White-Matter","Left-Cerebellum-Cortex","Left-Thalamus","Left-Caudate","Left-Putamen","Left-Pallidum","3rd-Ventricle","4th-Ventricle","Brain-Stem","Left-Hippocampus","Left-Amygdala","CSF","Left-Accumbens-area",
              "Left-VentralDC","Left-choroid-plexus","rhCerebralWhiteMatterVol","Right-Lateral-Ventricle","Right-Inf-Lat-Vent","Right-Cerebellum-White-Matter","Right-Cerebellum-Cortex",
              "Right-Thalamus","Right-Caudate","Right-Putamen","Right-Pallidum","Right-Hippocampus","Right-Amygdala","Right-Accumbens-area","Right-VentralDC","Right-choroid-plexus",
              "WM-hypointensities","lh_caudalanteriorcingulate_volume","lh_caudalmiddlefrontal_volume","lh_cuneus_volume","lh_entorhinal_volume","lh_fusiform_volume","lh_inferiorparietal_volume",
              "lh_inferiortemporal_volume","lh_isthmuscingulate_volume","lh_lateraloccipital_volume","lh_lateralorbitofrontal_volume","lh_lingual_volume","lh_medialorbitofrontal_volume",
              "lh_middletemporal_volume","lh_parahippocampal_volume","lh_paracentral_volume","lh_parsopercularis_volume","lh_parsorbitalis_volume","lh_parstriangularis_volume","lh_pericalcarine_volume",
              "lh_postcentral_volume","lh_posteriorcingulate_volume","lh_precentral_volume","lh_precuneus_volume","lh_rostralanteriorcingulate_volume","lh_rostralmiddlefrontal_volume",
              "lh_superiorfrontal_volume","lh_superiorparietal_volume","lh_superiortemporal_volume","lh_supramarginal_volume","lh_transversetemporal_volume","lh_insula_volume",
              "rh_caudalanteriorcingulate_volume","rh_caudalmiddlefrontal_volume","rh_cuneus_volume","rh_entorhinal_volume","rh_fusiform_volume","rh_inferiorparietal_volume",
              "rh_inferiortemporal_volume","rh_isthmuscingulate_volume","rh_lateraloccipital_volume","rh_lateralorbitofrontal_volume","rh_lingual_volume","rh_medialorbitofrontal_volume",
              "rh_middletemporal_volume","rh_parahippocampal_volume","rh_paracentral_volume","rh_parsopercularis_volume","rh_parsorbitalis_volume","rh_parstriangularis_volume",
              "rh_pericalcarine_volume","rh_postcentral_volume","rh_posteriorcingulate_volume","rh_precentral_volume","rh_precuneus_volume","rh_rostralanteriorcingulate_volume",
              "rh_rostralmiddlefrontal_volume","rh_superiorfrontal_volume","rh_superiorparietal_volume","rh_superiortemporal_volume","rh_supramarginal_volume","rh_transversetemporal_volume","rh_insula_volume","MagStrength","EstimatedTotalIntraCranialVol"]

# Texture ("RAD") feature group: every column of the texture table (this includes PTID
# and IMAGEUID) plus the two scan-level covariates.
features_texture = list(texture.columns) + ["MagStrength", "EstimatedTotalIntraCranialVol"]


In [None]:
# Outcome columns that are exported for every feature set.
selected_features = ["Time", "Status", "Status_new"]


In [None]:
# Each "+"-separated token of FEATURE_SET names one feature group.
_FEATURE_GROUPS = {
    "BL": features_bl,
    "rBL": features_rbl,
    "VOL": features_vol,
    "RAD": features_texture,
}
# Only these exact spellings are supported (same set as the original if-chain).
_VALID_FEATURE_SETS = (
    "BL", "BL+VOL", "BL+RAD", "BL+VOL+RAD",
    "rBL", "rBL+VOL", "rBL+RAD", "rBL+VOL+RAD",
)

# Fail loudly on a mistyped option instead of silently exporting only the outcome columns.
if FEATURE_SET not in _VALID_FEATURE_SETS:
    raise ValueError(
        "Unknown FEATURE_SET %r; expected one of %s" % (FEATURE_SET, ", ".join(_VALID_FEATURE_SETS))
    )

# Groups are appended in the order they appear in FEATURE_SET (baseline, VOL, RAD).
for _group in FEATURE_SET.split("+"):
    selected_features.extend(_FEATURE_GROUPS[_group])


In [None]:
# Keep the selected columns once each, in first-seen order; DataFrame.filter ignores
# requested names that are not present in df_ges.
unique_features = list(OrderedDict.fromkeys(selected_features))
df_ges_new = df_ges.filter(unique_features)

# One-hot encode the categorical columns that were actually selected.
categorical_candidates = ["PTETHCAT","PTRACCAT","PTMARRY","APOE4","MagStrength","PTGENDER"]
dummy_features = [c for c in df_ges_new.columns if c in categorical_candidates]
df_ges_new = pd.get_dummies(df_ges_new, columns=dummy_features)

# Drop columns with fewer non-missing values than half the number of rows.
min_non_missing = int(len(df_ges) * 0.5)
df_ges_new = df_ges_new.dropna(thresh=min_non_missing, axis=1)


In [None]:
# Read the predefined training / test sample lists.
df_training_samples, df_test_samples = (
    pd.read_csv(split_file) for split_file in (TRAINING_SET, TEST_SET)
)

In [None]:
# Make IMAGEUID a string in all three tables so the lookups below compare like with like.
imageuid_as_str = {"IMAGEUID": "str"}
df_training_samples = df_training_samples.astype(imageuid_as_str)
df_test_samples = df_test_samples.astype(imageuid_as_str)
df_ges_new = df_ges_new.astype(imageuid_as_str)


In [None]:
# Pick the rows of each split by IMAGEUID, in the order given by the sample files.
# .loc with a list raises KeyError if a listed IMAGEUID is missing from df_ges_new.
by_imageuid = df_ges_new.set_index("IMAGEUID")
train = by_imageuid.loc[df_training_samples.IMAGEUID.tolist()].reset_index()
test = by_imageuid.loc[df_test_samples.IMAGEUID.tolist()].reset_index()

In [None]:
# Write both splits as <SAVEDIR><DATASET_SEL>_<FEATURE_SET>_{train,test}.csv without the index.
output_prefix = SAVEDIR + DATASET_SEL + "_" + FEATURE_SET
train.to_csv(output_prefix + "_train.csv", index=False)
test.to_csv(output_prefix + "_test.csv", index=False)

In [None]:
# Show the exported test split as the cell output for a quick visual check.
test