In [132]:
# Import Pandas Library for Data Preprocessing
import pandas as pd

In [133]:
# Load the OASIS cross-sectional patient table into a DataFrame
patients = pd.read_csv(filepath_or_buffer='./oasis_cross-sectional.csv')

In [134]:
# Get the minimum, maximum, and count of each column for all the patients.
# A list (not a set) is passed so the summary rows always come back in this order;
# the iteration order of a set of strings can differ between kernel sessions.
patients.agg(["min", "max", "count"])

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
min,OAS1_0001_MR1,F,R,18,1.0,1.0,14.0,0.0,1123,0.644,0.881,1.0
max,OAS1_0457_MR1,M,R,96,5.0,5.0,30.0,2.0,1992,0.893,1.563,89.0
count,436,436,436,436,235.0,216.0,235.0,235.0,436,436.0,436.0,20.0


In [135]:
# Remove Gender, Delay, and Hand as they seem insignificant.
# Reassign instead of dropping in place, and ignore columns that are already gone,
# so that re-running this cell does not raise a KeyError.
patients = patients.drop(columns=["M/F", "Hand", "Delay"], errors="ignore")

In [136]:
# Filter the patients diagnosed with Alzheimer's Disease (AD): rows where CDR == 1
ad = patients[patients['CDR'] == 1]

# Get the minimum, maximum, and count of each column for AD patients.
# A list (not a set) keeps the order of the summary rows deterministic.
ad.agg(["min", "max", "count"])

Unnamed: 0,ID,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
min,OAS1_0028_MR1,65,1.0,1.0,15.0,1.0,1274,0.655,1.013
max,OAS1_0452_MR1,96,5.0,5.0,29.0,1.0,1732,0.763,1.377
count,28,28,28.0,24.0,28.0,28.0,28,28.0,28.0


In [137]:
# Filter the patients labelled Typical Development (TD): rows where CDR == 0
td = patients[patients['CDR'] == 0]

# Get the minimum, maximum, and count of each column for TD patients.
# A list (not a set) keeps the order of the summary rows deterministic.
td.agg(["min", "max", "count"])

Unnamed: 0,ID,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
min,OAS1_0001_MR1,33,1.0,1.0,25.0,0.0,1123,0.645,0.965
max,OAS1_0457_MR1,94,5.0,5.0,30.0,0.0,1818,0.847,1.563
count,135,135,135.0,133.0,135.0,135.0,135,135.0,135.0


In [138]:
# Keep TD patients aged 65 or older; 65 is the minimum age among the AD patients
td = td[td['Age'] >= 65]

# Keep TD patients with an eTIV of at least 1480.
# NOTE(review): the AD summary above reports a minimum eTIV of 1274, not 1480 —
# confirm how this cutoff was chosen.
td = td[td['eTIV'] >= 1480]

# Keep TD patients with an ASF of at least 1.
# NOTE(review): the AD summary above reports a minimum ASF of 1.013 — confirm this cutoff.
td = td[td['ASF'] >= 1]

# Get the minimum, maximum, and count of each column for the filtered TD patients.
# A list (not a set) keeps the order of the summary rows deterministic.
td.agg(["min", "max", "count"])

Unnamed: 0,ID,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
min,OAS1_0010_MR1,67,1.0,1.0,26.0,0.0,1483,0.676,1.003
max,OAS1_0428_MR1,91,5.0,4.0,30.0,0.0,1750,0.805,1.183
count,28,28,28.0,27.0,28.0,28.0,28,28.0,28.0


The AD and TD groups now contain the same number of patients: 28 each.

In [140]:
# Stack the AD rows and the TD rows into a single dataframe
filtered_patients = pd.concat(objs=[ad, td])

# Pull out the ID column: these are the patients used for network analysis
filtered_patients_ids = filtered_patients.loc[:, 'ID']

# Write the patient ids to a csv file, leaving out the index
filtered_patients_ids.to_csv(path_or_buf='patient_ids.csv', columns=['ID'], index=False)

In [None]:
# Load the raw OASIS table here: no earlier cell in this notebook defines `oasis`,
# so on a fresh kernel the filter below would raise a NameError.
# NOTE(review): assumes `oasis` is the unmodified './oasis_cross-sectional.csv' — confirm.
oasis = pd.read_csv('./oasis_cross-sectional.csv')

# Keep rows with a CDR of at most 1; rows whose CDR is NaN fail the comparison
# and are therefore dropped as well.
diagnosed_and_clear_df = oasis[oasis["CDR"] <= 1]

In [5]:
# Display the rows kept by the CDR <= 1 selection
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.810,1.531,
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
8,OAS1_0010_MR1,M,R,74,5.0,2.0,30.0,0.0,1636,0.689,1.073,
9,OAS1_0011_MR1,F,R,52,3.0,2.0,30.0,0.0,1321,0.827,1.329,
...,...,...,...,...,...,...,...,...,...,...,...,...
411,OAS1_0453_MR1,F,R,70,1.0,4.0,29.0,0.5,1295,0.748,1.355,
412,OAS1_0454_MR1,F,R,73,3.0,2.0,23.0,0.5,1536,0.730,1.142,
413,OAS1_0455_MR1,F,R,61,2.0,4.0,28.0,0.0,1354,0.825,1.297,
414,OAS1_0456_MR1,M,R,61,5.0,2.0,30.0,0.0,1637,0.780,1.072,


In [6]:
# (rows, columns) of the CDR <= 1 selection
diagnosed_and_clear_df.shape

(233, 12)

In [7]:
# Keep only rows whose CDR is non-negative and not equal to 0.5 (both conditions in one mask)
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[
    lambda frame: frame["CDR"].ge(0) & frame["CDR"].ne(0.5)
]

In [8]:
# Display the frame after removing the CDR == 0.5 rows
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.810,1.531,
8,OAS1_0010_MR1,M,R,74,5.0,2.0,30.0,0.0,1636,0.689,1.073,
9,OAS1_0011_MR1,F,R,52,3.0,2.0,30.0,0.0,1321,0.827,1.329,
11,OAS1_0013_MR1,F,R,81,5.0,2.0,30.0,0.0,1664,0.679,1.055,
...,...,...,...,...,...,...,...,...,...,...,...,...
407,OAS1_0449_MR1,F,R,71,3.0,4.0,29.0,0.0,1264,0.818,1.388,
410,OAS1_0452_MR1,M,R,75,1.0,4.0,22.0,1.0,1656,0.762,1.060,
413,OAS1_0455_MR1,F,R,61,2.0,4.0,28.0,0.0,1354,0.825,1.297,
414,OAS1_0456_MR1,M,R,61,5.0,2.0,30.0,0.0,1637,0.780,1.072,


In [9]:
# Select the rows of the OASIS table whose CDR equals 1
diagnosed_df = oasis.loc[oasis["CDR"].eq(1)]

In [10]:
# Display the CDR == 1 rows
diagnosed_df

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
25,OAS1_0028_MR1,F,R,86,2.0,4.0,27.0,1.0,1449,0.738,1.211,
28,OAS1_0031_MR1,M,R,88,1.0,4.0,26.0,1.0,1419,0.674,1.236,
32,OAS1_0035_MR1,F,R,84,3.0,2.0,28.0,1.0,1402,0.695,1.252,
47,OAS1_0052_MR1,F,R,78,1.0,5.0,23.0,1.0,1462,0.697,1.2,
48,OAS1_0053_MR1,F,R,83,1.0,4.0,21.0,1.0,1384,0.699,1.268,
51,OAS1_0056_MR1,F,R,72,3.0,3.0,15.0,1.0,1324,0.668,1.325,
62,OAS1_0067_MR1,F,R,71,4.0,1.0,27.0,1.0,1549,0.73,1.133,
68,OAS1_0073_MR1,F,R,69,2.0,4.0,21.0,1.0,1495,0.655,1.174,
113,OAS1_0122_MR1,F,R,83,5.0,2.0,22.0,1.0,1377,0.715,1.274,
124,OAS1_0134_MR1,M,R,80,2.0,4.0,20.0,1.0,1494,0.665,1.175,


In [11]:
# (rows, columns) of the CDR == 1 selection
diagnosed_df.shape

(28, 12)

In [12]:
# NOTE: earlier draft of the filtering steps, left commented out; nothing in this cell executes.
# # Remove Delay and Hand as they seem insignificant
# df.drop(["Hand", "Delay"], axis = 1, inplace = True)

# # remove ID, N/A rows, and CDR of 0.5 and 2 
# df.dropna(inplace = True)
# df.drop(["ID"], axis = 1, inplace = True)
# df = df[df["CDR"] != 0.5]
# df = df[df["CDR"] != 2]

# # It's very unlikely for people below age 65 to get AD
# df = df[df["Age"] >= 65]

# # I noticed none with 30.0 MMSE have CDR 1, so let's drop it
# df = df[df["MMSE"] < 30]

# # Not sure if this is correct
# df = df[df["eTIV"] < 1750]
# df = df[df["nWBV"] < 0.8]
# df = df[df["ASF"] < 1.4]
# df # 68 x 9

In [13]:
# Drop the Hand and Delay columns.
# Reassign rather than using inplace=True: this frame is a filtered selection of `oasis`,
# so an in-place drop can emit SettingWithCopyWarning, and running the cell a second
# time would raise a KeyError once the columns are gone.
diagnosed_and_clear_df = diagnosed_and_clear_df.drop(columns=["Hand", "Delay"], errors="ignore")

In [14]:
# Display the frame without the Hand and Delay columns
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
1,OAS1_0002_MR1,F,55,4.0,1.0,29.0,0.0,1147,0.810,1.531
8,OAS1_0010_MR1,M,74,5.0,2.0,30.0,0.0,1636,0.689,1.073
9,OAS1_0011_MR1,F,52,3.0,2.0,30.0,0.0,1321,0.827,1.329
11,OAS1_0013_MR1,F,81,5.0,2.0,30.0,0.0,1664,0.679,1.055
...,...,...,...,...,...,...,...,...,...,...
407,OAS1_0449_MR1,F,71,3.0,4.0,29.0,0.0,1264,0.818,1.388
410,OAS1_0452_MR1,M,75,1.0,4.0,22.0,1.0,1656,0.762,1.060
413,OAS1_0455_MR1,F,61,2.0,4.0,28.0,0.0,1354,0.825,1.297
414,OAS1_0456_MR1,M,61,5.0,2.0,30.0,0.0,1637,0.780,1.072


In [15]:
# Restrict to subjects aged 65 or older
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["Age"].ge(65)]

In [16]:
# Display the frame after the Age >= 65 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
8,OAS1_0010_MR1,M,74,5.0,2.0,30.0,0.0,1636,0.689,1.073
11,OAS1_0013_MR1,F,81,5.0,2.0,30.0,0.0,1664,0.679,1.055
17,OAS1_0019_MR1,F,89,5.0,1.0,30.0,0.0,1536,0.715,1.142
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
...,...,...,...,...,...,...,...,...,...,...
396,OAS1_0438_MR1,F,66,5.0,2.0,29.0,0.0,1191,0.787,1.474
403,OAS1_0445_MR1,F,90,1.0,2.0,29.0,0.0,1362,0.673,1.289
404,OAS1_0446_MR1,F,80,2.0,4.0,30.0,0.0,1390,0.748,1.263
407,OAS1_0449_MR1,F,71,3.0,4.0,29.0,0.0,1264,0.818,1.388


In [17]:
# Minimum and maximum of every column among the CDR == 1 rows.
# A list (not a set) keeps the order of the two summary rows deterministic.
diagnosed_df.agg(["min", "max"])

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
min,OAS1_0028_MR1,F,R,65,1.0,1.0,15.0,1.0,1274,0.655,1.013,
max,OAS1_0452_MR1,M,R,96,5.0,5.0,29.0,1.0,1732,0.763,1.377,


In [18]:
# Lower MMSE bound: 15 is the minimum MMSE reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["MMSE"].ge(15)]

In [19]:
# Upper MMSE bound: 29 is the maximum MMSE reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["MMSE"].le(29)]

In [20]:
# Display the frame after limiting MMSE to the 15-29 range
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
27,OAS1_0030_MR1,F,65,2.0,3.0,29.0,0.0,1392,0.764,1.261
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
...,...,...,...,...,...,...,...,...,...,...
389,OAS1_0430_MR1,M,71,4.0,1.0,17.0,1.0,1562,0.687,1.123
396,OAS1_0438_MR1,F,66,5.0,2.0,29.0,0.0,1191,0.787,1.474
403,OAS1_0445_MR1,F,90,1.0,2.0,29.0,0.0,1362,0.673,1.289
407,OAS1_0449_MR1,F,71,3.0,4.0,29.0,0.0,1264,0.818,1.388


In [21]:
# Lower ASF bound: 1.013 is the minimum ASF reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["ASF"].ge(1.013)]

In [22]:
# Display the frame after the ASF >= 1.013 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
27,OAS1_0030_MR1,F,65,2.0,3.0,29.0,0.0,1392,0.764,1.261
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
...,...,...,...,...,...,...,...,...,...,...
389,OAS1_0430_MR1,M,71,4.0,1.0,17.0,1.0,1562,0.687,1.123
396,OAS1_0438_MR1,F,66,5.0,2.0,29.0,0.0,1191,0.787,1.474
403,OAS1_0445_MR1,F,90,1.0,2.0,29.0,0.0,1362,0.673,1.289
407,OAS1_0449_MR1,F,71,3.0,4.0,29.0,0.0,1264,0.818,1.388


In [23]:
# Upper ASF bound: 1.377 is the maximum ASF reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["ASF"].le(1.377)]

In [24]:
# Display the frame after the ASF <= 1.377 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
27,OAS1_0030_MR1,F,65,2.0,3.0,29.0,0.0,1392,0.764,1.261
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
...,...,...,...,...,...,...,...,...,...,...
386,OAS1_0426_MR1,F,82,5.0,2.0,29.0,0.0,1316,0.791,1.334
387,OAS1_0428_MR1,F,84,4.0,3.0,28.0,0.0,1500,0.751,1.170
389,OAS1_0430_MR1,M,71,4.0,1.0,17.0,1.0,1562,0.687,1.123
403,OAS1_0445_MR1,F,90,1.0,2.0,29.0,0.0,1362,0.673,1.289


In [25]:
# Re-check the minimum and maximum of every column among the CDR == 1 rows.
# A list (not a set) keeps the order of the two summary rows deterministic.
diagnosed_df.agg(["min", "max"])

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
min,OAS1_0028_MR1,F,R,65,1.0,1.0,15.0,1.0,1274,0.655,1.013,
max,OAS1_0452_MR1,M,R,96,5.0,5.0,29.0,1.0,1732,0.763,1.377,


In [26]:
# Upper eTIV bound: 1732 is the maximum eTIV reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["eTIV"].le(1732)]

In [27]:
# Display the frame after the eTIV <= 1732 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
27,OAS1_0030_MR1,F,65,2.0,3.0,29.0,0.0,1392,0.764,1.261
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
...,...,...,...,...,...,...,...,...,...,...
386,OAS1_0426_MR1,F,82,5.0,2.0,29.0,0.0,1316,0.791,1.334
387,OAS1_0428_MR1,F,84,4.0,3.0,28.0,0.0,1500,0.751,1.170
389,OAS1_0430_MR1,M,71,4.0,1.0,17.0,1.0,1562,0.687,1.123
403,OAS1_0445_MR1,F,90,1.0,2.0,29.0,0.0,1362,0.673,1.289


In [28]:
# Lower eTIV bound: 1274 is the minimum eTIV reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["eTIV"].ge(1274)]

In [29]:
# Display the frame after the eTIV >= 1274 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
27,OAS1_0030_MR1,F,65,2.0,3.0,29.0,0.0,1392,0.764,1.261
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
...,...,...,...,...,...,...,...,...,...,...
386,OAS1_0426_MR1,F,82,5.0,2.0,29.0,0.0,1316,0.791,1.334
387,OAS1_0428_MR1,F,84,4.0,3.0,28.0,0.0,1500,0.751,1.170
389,OAS1_0430_MR1,M,71,4.0,1.0,17.0,1.0,1562,0.687,1.123
403,OAS1_0445_MR1,F,90,1.0,2.0,29.0,0.0,1362,0.673,1.289


In [30]:
# Upper nWBV bound: 0.763 is the maximum nWBV reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["nWBV"].le(0.763)]

In [31]:
# Display the frame after the nWBV <= 0.763 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
30,OAS1_0033_MR1,F,80,4.0,2.0,29.0,0.0,1323,0.735,1.326
32,OAS1_0035_MR1,F,84,3.0,2.0,28.0,1.0,1402,0.695,1.252
47,OAS1_0052_MR1,F,78,1.0,5.0,23.0,1.0,1462,0.697,1.2
48,OAS1_0053_MR1,F,83,1.0,4.0,21.0,1.0,1384,0.699,1.268
51,OAS1_0056_MR1,F,72,3.0,3.0,15.0,1.0,1324,0.668,1.325
60,OAS1_0065_MR1,M,90,2.0,3.0,25.0,0.0,1301,0.645,1.349


In [32]:
# Lower nWBV bound: 0.655 is the minimum nWBV reported for diagnosed_df
diagnosed_and_clear_df = diagnosed_and_clear_df.loc[diagnosed_and_clear_df["nWBV"].ge(0.655)]

In [33]:
# Display the frame after the nWBV >= 0.655 filter
diagnosed_and_clear_df

Unnamed: 0,ID,M/F,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS1_0001_MR1,F,74,2.0,3.0,29.0,0.0,1344,0.743,1.306
25,OAS1_0028_MR1,F,86,2.0,4.0,27.0,1.0,1449,0.738,1.211
28,OAS1_0031_MR1,M,88,1.0,4.0,26.0,1.0,1419,0.674,1.236
29,OAS1_0032_MR1,M,89,4.0,1.0,28.0,0.0,1631,0.682,1.076
30,OAS1_0033_MR1,F,80,4.0,2.0,29.0,0.0,1323,0.735,1.326
32,OAS1_0035_MR1,F,84,3.0,2.0,28.0,1.0,1402,0.695,1.252
47,OAS1_0052_MR1,F,78,1.0,5.0,23.0,1.0,1462,0.697,1.2
48,OAS1_0053_MR1,F,83,1.0,4.0,21.0,1.0,1384,0.699,1.268
51,OAS1_0056_MR1,F,72,3.0,3.0,15.0,1.0,1324,0.668,1.325
62,OAS1_0067_MR1,F,71,4.0,1.0,27.0,1.0,1549,0.73,1.133


In [34]:
# (rows, columns) of the selection after all of the filters above
diagnosed_and_clear_df.shape

(51, 10)

In [35]:
# Write the selected rows to a csv file, leaving out the index
diagnosed_and_clear_df.to_csv(path_or_buf='selected_data.csv', index=False)