In [1]:
# Import necessary modules for authenticating and creating the PyDrive client
# To access Google Drive in Google Colab

from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client

# Authenticate the current Colab user before any credentials are requested.
auth.authenticate_user()
gauth = GoogleAuth()
# Attach the application-default credentials to the PyDrive auth object.
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): the name `drive` is rebound by `from google.colab import drive`
# in the next cell, so this PyDrive client is no longer reachable as `drive`
# once that cell has run.
drive = GoogleDrive(gauth)

In [2]:
# Mount Google Drive at /content/drive
# NOTE(review): this import rebinds the name `drive`, replacing the PyDrive
# client that the authentication cell assigned to the same name.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Read the ADNI table from the mounted Drive into a DataFrame
import pandas as pd

adni_csv_path = "/content/drive/MyDrive/ADNI.csv"
df = pd.read_csv(adni_csv_path)

In [4]:
# Summarise the frame: index range, number of columns, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Columns: 224 entries, RID to APOE4.2
dtypes: float64(185), int64(39)
memory usage: 978.4 KB


In [5]:
# Per-column tally of missing (NaN) entries
missing_counts = df.isna().sum()

# Grand total of missing entries, obtained by adding up the per-column tallies
total_missing = missing_counts.sum()

print("Missing value counts for each column:", missing_counts, sep="\n")
print("\nTotal missing values in the dataset:", total_missing)

Missing value counts for each column:
RID                           0
lh.Cerebellum.White.Matter    0
lh.Cerebellum.Cortex          0
lh.Thalamus.Proper            0
lh.Caudate                    0
                             ..
adas_Q12SCORE                 0
adas_Q13SCORE                 0
APOE4.0                       0
APOE4.1                       0
APOE4.2                       0
Length: 224, dtype: int64

Total missing values in the dataset: 0


In [6]:
# Collect every row that exactly repeats an earlier row
is_repeat = df.duplicated()
Duplicates = df.loc[is_repeat]
print("Duplicated rows:")
print(Duplicates)


Duplicated rows:
Empty DataFrame
Columns: [RID, lh.Cerebellum.White.Matter, lh.Cerebellum.Cortex, lh.Thalamus.Proper, lh.Caudate, lh.Putamen, lh.Pallidum, X3rd.Ventricle, X4th.Ventricle, Brain.Stem, lh.Hippocampus, lh.Amygdala, CSF, lh.Accumbens.area, lh.VentralDC, lh.choroid.plexus, rh.Cerebellum.White.Matter, rh.Cerebellum.Cortex, rh.Thalamus.Proper, rh.Caudate, rh.Putamen, rh.Pallidum, rh.Hippocampus, rh.Amygdala, rh.Accumbens.area, rh.VentralDC, rh.choroid.plexus, CC_Posterior, CC_Mid_Posterior, CC_Central, CC_Mid_Anterior, CC_Anterior, ctx.lh.unknown, ctx.lh.bankssts, ctx.lh.caudalanteriorcingulate, ctx.lh.caudalmiddlefrontal, ctx.lh.corpuscallosum, ctx.lh.cuneus, ctx.lh.entorhinal, ctx.lh.fusiform, ctx.lh.inferiorparietal, ctx.lh.inferiortemporal, ctx.lh.isthmuscingulate, ctx.lh.lateraloccipital, ctx.lh.lateralorbitofrontal, ctx.lh.lingual, ctx.lh.medialorbitofrontal, ctx.lh.middletemporal, ctx.lh.parahippocampal, ctx.lh.paracentral, ctx.lh.parsopercularis, ctx.lh.parsorbitalis, 

In [7]:
# Keep an independent snapshot of the frame as loaded, before any column is
# dropped, shifted or normalized
master_df = df.copy(deep=True)
master_df.head(5)

Unnamed: 0,RID,lh.Cerebellum.White.Matter,lh.Cerebellum.Cortex,lh.Thalamus.Proper,lh.Caudate,lh.Putamen,lh.Pallidum,X3rd.Ventricle,X4th.Ventricle,Brain.Stem,...,adas_Q7SCORE,adas_Q8SCORE,adas_Q9SCORE,adas_Q10SCORE,adas_Q11SCORE,adas_Q12SCORE,adas_Q13SCORE,APOE4.0,APOE4.1,APOE4.2
0,21,1.122404,1.182973,1.817158,1.391494,1.799036,2.5379,0.965265,2.742845,1.074118,...,8,8,5,5,5,5,5,1,0,0
1,31,1.021761,1.11275,1.303878,1.070731,1.652041,1.982748,1.250516,1.489708,1.178413,...,8,11,5,5,5,5,5,1,0,0
2,56,0.933901,1.04013,1.540324,1.340539,1.969141,3.298904,0.83005,1.337372,0.983483,...,8,11,5,5,5,5,5,1,0,0
3,59,1.003087,1.070947,1.503807,1.328975,1.878053,2.308336,0.919911,1.819666,0.984816,...,8,8,5,5,5,5,5,1,0,0
4,69,1.042776,1.112264,1.267889,1.346736,2.233189,5.768378,1.24842,0.774037,1.073024,...,8,9,5,5,5,5,4,1,0,0


In [8]:
# Remove the irrelevant RID column (ID column) from the analysis.
# errors='ignore' keeps the cell re-runnable: `del df['RID']` raises KeyError
# on a second execution because the column is already gone.
df = df.drop(columns=['RID'], errors='ignore')

In [9]:
# Gender column: subtract 1 from 'PTGENDER' to encode male as 0 and female as 1.
# The values are read from master_df (the untouched copy of the loaded data)
# rather than from df itself, so re-running this cell cannot subtract 1 a
# second time and push the codes to -1/0.
df['PTGENDER'] = master_df['PTGENDER'] - 1

In [10]:
# Names of the columns whose smallest value is below zero
column_minima = df.min()
negative_cols = column_minima[column_minima < 0].index.tolist()
negative_cols

['PHC_MEM', 'PHC_EXF', 'PHC_LAN', 'COMP_MEM_SCORE', 'COMP_EXEC_FUNC_SCORE']

In [11]:
# Shift each column listed in negative_cols upward by the absolute value of its
# minimum, so the smallest entry becomes 0.
# The offset is computed once per column and added in a single vectorised step;
# the previous per-element lambda re-evaluated v.min() for every row.
for col in negative_cols:
    offset = abs(df[col].min())
    df[col] = df[col] + offset

In [12]:
# Min-max scale features into [0, 1].
# Only columns whose maximum exceeds 1 are rescaled; the columns named in
# `excluded` are always left untouched.
excluded = ['AD_LABEL', 'CDR']
cols_to_normalize = [k for k, v in df.items() if v.max() > 1 and k not in excluded]

normalized_df = df[cols_to_normalize]
col_min = normalized_df.min()
numer = normalized_df - col_min
denom = normalized_df.max() - col_min
# A constant column has zero range, and 0/0 would fill it with NaN.
# Dividing by 1 instead maps such a column to 0.
denom = denom.replace(0, 1)

df[cols_to_normalize] = (numer / denom)

# Sanity check: per-column NaN counts, then the (row, column) positions of any
# NaN left in the frame. An empty list means the scaling introduced none.
count_nan_in_df = df.isnull().sum()
import scipy.sparse as sp

x,y = sp.coo_matrix(df.isnull()).nonzero()
print(list(zip(x,y)))

[]
