## Combbine all diagnosis

This method take diagnosis from images, clinical, and diagnosis sheet, and creates one ground truth (where all three agree) and one majority vote (where two agree) diagnosis files. 

In [16]:
import pandas as pd
import math
clinical = pd.read_csv("ADSP_PHC_COGN.csv").rename(columns={"PHASE":"Phase"})
#this file is the metadata file that one can get from downloading MRI images from ADNI
img = pd.read_csv("ADNI1_Screening_ImagingMetadata.csv")
comb = pd.read_csv("DXSUM_PDXCONV_ADNIALL.csv")[["RID", "PTID" , "Phase"]]

In [6]:
def read_diagnose(file_path: str = 'DXSUM_PDXCONV_ADNIALL.csv', verbose=False):
    # Read diagnostic summary
    diagnostic_summary = pd.read_csv(file_path, index_col='PTID')
    diagnostic_summary = diagnostic_summary.sort_values(by=["update_stamp"], ascending=True)
    # Create dictionary
    diagnostic_dict: dict = {}
    for key, data in diagnostic_summary.iterrows():
        # Iterate for each row of the document
        phase: str = data['Phase']
        diagnosis: float = -1.
        if phase == "ADNI1":
            diagnosis = data['DXCURREN']
        elif phase == "ADNI2" or phase == "ADNIGO":
            dxchange = data['DXCHANGE']
            if dxchange == 1 or dxchange == 7 or dxchange == 9:
                diagnosis = 1.
            if dxchange == 2 or dxchange == 4 or dxchange == 8:
                diagnosis = 2.
            if dxchange == 3 or dxchange == 5 or dxchange == 6:
                diagnosis = 3.
        elif phase == "ADNI3":
            diagnosis = data['DIAGNOSIS']
        else:
            print(f"ERROR: Not recognized study phase {phase}")
            exit(1)
        # Update dictionary
        if not math.isnan(diagnosis):
            diagnostic_dict[key] = diagnosis
    if verbose:
        print_diagnostic_dict_summary(diagnostic_dict)
    return diagnostic_dict


def print_diagnostic_dict_summary(diagnostic_dict: dict):
    print(f"Number of diagnosed patients: {len(diagnostic_dict.items())}\n")
    n_NL = 0
    n_MCI = 0
    n_AD = 0
    for (key, data) in diagnostic_dict.items():
        if data == 1:
            n_NL += 1
        if data == 2:
            n_MCI += 1
        if data == 3:
            n_AD += 1
    print(f"Number of NL patients: {n_NL}\n"
          f"Number of MCI patients: {n_MCI}\n"
          f"Number of AD patients: {n_AD}\n")

In [7]:
d = read_diagnose()
print_diagnostic_dict_summary(d)

Number of diagnosed patients: 2955

Number of NL patients: 1094
Number of MCI patients: 980
Number of AD patients: 881



In [8]:
new = pd.DataFrame.from_dict(d, orient='index').reset_index()

In [9]:
clinical.head()

Unnamed: 0,RID,SUBJID,Phase,VISCODE,VISCODE2,EXAMDATE,PHC_VISIT,PHC_Sex,PHC_Education,PHC_Ethnicity,PHC_Race,PHC_AGE,PHC_Diagnosis,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP,update_stamp
0,2,ADNI_011_S_0002,ADNI1,bl,bl,2005-09-08,1,1.0,16.0,2.0,5.0,74.439425,1.0,0.277,0.254,0.293,,2022-10-18 09:15:45.0
1,2,ADNI_011_S_0002,ADNI1,m06,m06,2006-03-06,2,1.0,16.0,2.0,5.0,74.9295,1.0,0.237,0.278,0.535,-0.333,2022-10-18 09:15:45.0
2,2,ADNI_011_S_0002,ADNI1,m36,m36,2008-08-27,3,1.0,16.0,2.0,5.0,77.407255,1.0,0.344,0.419,0.535,,2022-10-18 09:15:45.0
3,2,ADNI_011_S_0002,ADNIGO,m60,m60,2010-09-22,4,1.0,16.0,2.0,5.0,79.47707,1.0,0.101,0.066,0.62,0.264,2022-10-18 09:15:45.0
4,2,ADNI_011_S_0002,ADNI2,v06,m72,2011-09-19,5,1.0,16.0,2.0,5.0,80.468172,1.0,0.09,0.002,0.46,0.264,2022-10-18 09:15:45.0


In [10]:
clinical["year"] = clinical["EXAMDATE"].str[:4]

In [12]:
clinical["Subject"] = clinical["SUBJID"].str.replace("ADNI_", "").str.replace("s", "S")

In [13]:
c = comb.merge(clinical, on = ["RID", "Phase"])

In [14]:
c = c.drop("Subject", axis =1)

In [48]:
c = c.rename(columns = {"PTID":"Subject"})
c

Unnamed: 0,RID,Subject,Phase,SUBJID,VISCODE,VISCODE2,EXAMDATE,PHC_VISIT,PHC_Sex,PHC_Education,PHC_Ethnicity,PHC_Race,PHC_AGE,PHC_Diagnosis,PHC_MEM,PHC_EXF,PHC_LAN,PHC_VSP,update_stamp,year
0,2,011_S_0002,ADNI1,ADNI_011_S_0002,bl,bl,2005-09-08,1,1.0,16.0,2.0,5.0,74.439425,1.0,0.277,0.254,0.293,,2022-10-18 09:15:45.0,2005
1,2,011_S_0002,ADNI1,ADNI_011_S_0002,m06,m06,2006-03-06,2,1.0,16.0,2.0,5.0,74.929500,1.0,0.237,0.278,0.535,-0.333,2022-10-18 09:15:45.0,2006
2,2,011_S_0002,ADNI1,ADNI_011_S_0002,m36,m36,2008-08-27,3,1.0,16.0,2.0,5.0,77.407255,1.0,0.344,0.419,0.535,,2022-10-18 09:15:45.0,2008
3,2,011_S_0002,ADNI1,ADNI_011_S_0002,bl,bl,2005-09-08,1,1.0,16.0,2.0,5.0,74.439425,1.0,0.277,0.254,0.293,,2022-10-18 09:15:45.0,2005
4,2,011_S_0002,ADNI1,ADNI_011_S_0002,m06,m06,2006-03-06,2,1.0,16.0,2.0,5.0,74.929500,1.0,0.237,0.278,0.535,-0.333,2022-10-18 09:15:45.0,2006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50397,4795,073_S_4795,ADNI3,G-ADNI-AN000201,init,m84,2019-11-05,4,1.0,16.0,2.0,2.0,68.681725,,0.455,1.461,0.432,,2022-10-18 09:15:45.0,2019
50398,4795,073_S_4795,ADNI3,G-ADNI-AN000201,init,m84,2019-11-05,4,1.0,16.0,2.0,2.0,68.681725,,0.455,1.461,0.432,,2022-10-18 09:15:45.0,2019
50399,5167,073_S_5167,ADNI3,G-ADNI-AN000368,init,m72,2019-11-26,4,1.0,20.0,2.0,5.0,73.234771,,1.186,1.127,0.956,,2022-10-18 09:15:45.0,2019
50400,5167,073_S_5167,ADNI3,G-ADNI-AN000368,init,m72,2019-11-26,4,1.0,20.0,2.0,5.0,73.234771,,1.186,1.127,0.956,,2022-10-18 09:15:45.0,2019


In [29]:
img["year"] = img["Acq Date"].str[5:].str.replace("/", "")

In [46]:
img = img.replace(["CN", "MCI", "AD"], [ 0, 1, 2])


In [54]:

c["DX"] = c["PHC_Diagnosis"] -1

In [55]:
new[0] = new[0].astype(int) -1

KeyError: 0

In [56]:
new = new.rename(columns = {"index":"Subject", 0:"GroupN"})

In [57]:
m = new.merge(c, on = "Subject", how = "outer").merge(img, on = "Subject", how = "outer")

In [58]:
m[["GroupN", "DX", "Group"]]

Unnamed: 0,GroupN,DX,Group
0,1.0,-1.0,0.0
1,1.0,-1.0,0.0
2,1.0,-1.0,0.0
3,1.0,-1.0,0.0
4,1.0,-1.0,0.0
...,...,...,...
58355,1.0,,
58356,1.0,,
58357,0.0,,
58358,,,


In [59]:
m = m[["Subject", "GroupN", "Group", "DX", "Phase"]].drop_duplicates()

In [60]:
m = m.dropna(subset = ["GroupN", "Group", "DX"], how="all").drop_duplicates()
m

Unnamed: 0,Subject,GroupN,Group,DX,Phase
0,011_S_0002,1.0,0.0,-1.0,ADNI1
9,011_S_0002,1.0,0.0,-1.0,ADNIGO
10,011_S_0002,1.0,0.0,-1.0,ADNI2
11,011_S_0002,1.0,0.0,0.0,ADNI2
13,011_S_0002,1.0,0.0,,ADNI2
...,...,...,...,...,...
58353,082_S_7122,0.0,,,
58354,116_S_6453,1.0,,,
58355,123_S_7125,1.0,,,
58356,016_S_6949,1.0,,,


In [61]:
m.loc[m["DX"].isna() & m["Group"].isna(), "Group"] = m.loc[m["DX"].isna() & m["Group"].isna(), "GroupN"]
m.loc[m["DX"].isna() & m["Group"].isna(), "DX"] = m.loc[m["DX"].isna() & m["Group"].isna(), "GroupN"]

In [62]:
m1 = m[m["GroupN"] == m["Group"]]
m3 = m[m["GroupN"] == m["DX"]]
m4 = m[m["Group"] == m["DX"]]
m2 = m1[m1["Group"] == m1["DX"]]

In [63]:
m1 = m1[["Subject", "GroupN", "Group", "DX", "Phase"]]
m1

Unnamed: 0,Subject,GroupN,Group,DX,Phase
41,011_S_0003,2.0,2.0,1.0,ADNI1
57,011_S_0005,0.0,0.0,-1.0,ADNI1
124,022_S_0007,2.0,2.0,1.0,ADNI1
238,023_S_0031,0.0,0.0,-1.0,ADNI1
310,023_S_0031,0.0,0.0,-1.0,ADNIGO
...,...,...,...,...,...
58353,082_S_7122,0.0,0.0,,
58354,116_S_6453,1.0,1.0,,
58355,123_S_7125,1.0,1.0,,
58356,016_S_6949,1.0,1.0,,


In [64]:
m1.loc[m1["DX"].isna(), "DX"] = m1.loc[m1["DX"].isna(), "Group"]

In [65]:
m3 = m3[["Subject", "GroupN", "Group", "DX", "Phase"]]
m3

Unnamed: 0,Subject,GroupN,Group,DX,Phase
5896,021_S_0178,0.0,1.0,0.0,ADNI1
5945,021_S_0178,0.0,1.0,0.0,ADNIGO
5946,021_S_0178,0.0,1.0,0.0,ADNI2
5977,128_S_0205,0.0,1.0,0.0,ADNI1
6786,128_S_0167,1.0,2.0,1.0,ADNI1
...,...,...,...,...,...
57736,006_S_6681,0.0,,0.0,ADNI3
57739,006_S_6682,0.0,,0.0,ADNI3
57742,130_S_6688,0.0,,0.0,ADNI3
57858,067_S_6525,0.0,,0.0,ADNI3


In [66]:
m3.loc[m3["Group"].isna(), "Group"] = m3.loc[m3["Group"].isna(), "GroupN"]

In [67]:
m4 = m4[["Subject", "GroupN", "Group", "DX", "Phase"]]
m4

Unnamed: 0,Subject,GroupN,Group,DX,Phase
11,011_S_0002,1.0,0.0,0.0,ADNI2
40,011_S_0002,1.0,0.0,0.0,ADNI3
111,011_S_0008,1.0,0.0,0.0,ADNI2
165,100_S_0015,1.0,0.0,0.0,ADNI2
168,023_S_0030,2.0,1.0,1.0,ADNI1
...,...,...,...,...,...
31400,127_S_1427,2.0,1.0,1.0,ADNI1
31423,127_S_1427,2.0,1.0,1.0,ADNIGO
31424,127_S_1427,2.0,1.0,1.0,ADNI2
31460,127_S_1427,2.0,1.0,1.0,ADNI3


In [68]:
m4[m4["GroupN"] != m4["DX"]]

Unnamed: 0,Subject,GroupN,Group,DX,Phase
11,011_S_0002,1.0,0.0,0.0,ADNI2
40,011_S_0002,1.0,0.0,0.0,ADNI3
111,011_S_0008,1.0,0.0,0.0,ADNI2
165,100_S_0015,1.0,0.0,0.0,ADNI2
168,023_S_0030,2.0,1.0,1.0,ADNI1
...,...,...,...,...,...
31400,127_S_1427,2.0,1.0,1.0,ADNI1
31423,127_S_1427,2.0,1.0,1.0,ADNIGO
31424,127_S_1427,2.0,1.0,1.0,ADNI2
31460,127_S_1427,2.0,1.0,1.0,ADNI3


In [69]:
m2[["Subject", "GroupN", "Group", "DX", "Phase"]]

Unnamed: 0,Subject,GroupN,Group,DX,Phase
13960,136_S_0429,1.0,1.0,1.0,ADNI1
14866,053_S_0507,1.0,1.0,1.0,ADNI1
19405,141_S_0697,1.0,1.0,1.0,ADNI1
19426,141_S_0697,1.0,1.0,1.0,ADNI2
20883,009_S_0842,0.0,0.0,0.0,ADNI2
22147,020_S_0883,0.0,0.0,0.0,ADNI1
23600,012_S_1009,0.0,0.0,0.0,ADNI1
27142,130_S_1200,0.0,0.0,0.0,ADNI1
27807,094_S_1241,0.0,0.0,0.0,ADNI1


In [70]:
m5 = pd.concat([m1,m3,m4])
i = m5[m5["Group"] == m5["GroupN"]]
i = i[i["Group"] == i["DX"]]

In [71]:
i = i.drop_duplicates()

In [72]:
i

Unnamed: 0,Subject,GroupN,Group,DX,Phase
9989,037_S_0303,0.0,0.0,0.0,ADNI2
10227,114_S_0416,0.0,0.0,0.0,ADNI3
13960,136_S_0429,1.0,1.0,1.0,ADNI1
14866,053_S_0507,1.0,1.0,1.0,ADNI1
19122,137_S_0722,1.0,1.0,1.0,ADNI2
...,...,...,...,...,...
57736,006_S_6681,0.0,0.0,0.0,ADNI3
57739,006_S_6682,0.0,0.0,0.0,ADNI3
57742,130_S_6688,0.0,0.0,0.0,ADNI3
57858,067_S_6525,0.0,0.0,0.0,ADNI3


In [73]:
i[["Subject", "Group", "Phase"]].to_csv("ground_truth.csv")

In [74]:
m.update(m5[~m5.index.duplicated(keep='first')])

In [75]:
indexes = m.index

In [76]:
#if none of the three diagnosis agree, then we set the value to -1
m["GROUP"] = -1

In [77]:
for i in indexes:
    row = m.loc[i]
    if (row["GroupN"] == row["Group"]):
        val = row["GroupN"]
        
        m.loc[i, "GROUP"] = val
    elif (row["GroupN"] == row["DX"]):
        val = row["GroupN"]
        m.loc[i, "GROUP"] = val
        
    elif (row["Group"] == row["DX"]):
        val = row["Group"]
        m.loc[i, "GROUP"] = val
        

In [78]:
m5 = m5[~m5.index.duplicated(keep='first')]
m5

Unnamed: 0,Subject,GroupN,Group,DX,Phase
41,011_S_0003,2.0,2.0,1.0,ADNI1
57,011_S_0005,0.0,0.0,-1.0,ADNI1
124,022_S_0007,2.0,2.0,1.0,ADNI1
238,023_S_0031,0.0,0.0,-1.0,ADNI1
310,023_S_0031,0.0,0.0,-1.0,ADNIGO
...,...,...,...,...,...
31400,127_S_1427,2.0,1.0,1.0,ADNI1
31423,127_S_1427,2.0,1.0,1.0,ADNIGO
31424,127_S_1427,2.0,1.0,1.0,ADNI2
31460,127_S_1427,2.0,1.0,1.0,ADNI3


In [79]:
m[m["GROUP"] != -1]

Unnamed: 0,Subject,GroupN,Group,DX,Phase,GROUP
11,011_S_0002,1.0,0.0,0.0,ADNI2,0
40,011_S_0002,1.0,0.0,0.0,ADNI3,0
41,011_S_0003,2.0,2.0,1.0,ADNI1,2
57,011_S_0005,0.0,0.0,-1.0,ADNI1,0
111,011_S_0008,1.0,0.0,0.0,ADNI2,0
...,...,...,...,...,...,...
58353,082_S_7122,0.0,0.0,0.0,,0
58354,116_S_6453,1.0,1.0,1.0,,1
58355,123_S_7125,1.0,1.0,1.0,,1
58356,016_S_6949,1.0,1.0,1.0,,1


In [None]:
m[["Subject", "GroupN", "Group", "DX", "GROUP", "Phase"]].to_csv("diagnosis_full.csv")