Google Colab (https://colab.google/), a cloud-based Jupyter notebook environment provided by Google, was utilised for conducting the study, which included data preprocessing, feature selection, missing data generation, imputation, classification and validation tasks.

In [3]:
# Loading the ADNI dataset
import pandas as pd
df=pd.read_csv("/content/drive/MyDrive/ADNI.csv")

In [None]:
# Deriving information from the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Columns: 224 entries, RID to APOE4.2
dtypes: float64(185), int64(39)
memory usage: 978.4 KB


In [None]:
# Count missing values for each column
missing_counts = df.isna().sum()

# Total count of missing values in the dataset
total_missing = df.isna().sum().sum()

print("Missing value counts for each column:")
print(missing_counts)
print("\nTotal missing values in the dataset:", total_missing)

Missing value counts for each column:
RID                           0
lh.Cerebellum.White.Matter    0
lh.Cerebellum.Cortex          0
lh.Thalamus.Proper            0
lh.Caudate                    0
                             ..
adas_Q12SCORE                 0
adas_Q13SCORE                 0
APOE4.0                       0
APOE4.1                       0
APOE4.2                       0
Length: 224, dtype: int64

Total missing values in the dataset: 0


In [None]:
# No missing values found in the dataset

In [None]:
# copy original dataframe before normalizing
master_df = df.copy()
master_df.head()

Unnamed: 0,RID,lh.Cerebellum.White.Matter,lh.Cerebellum.Cortex,lh.Thalamus.Proper,lh.Caudate,lh.Putamen,lh.Pallidum,X3rd.Ventricle,X4th.Ventricle,Brain.Stem,...,adas_Q7SCORE,adas_Q8SCORE,adas_Q9SCORE,adas_Q10SCORE,adas_Q11SCORE,adas_Q12SCORE,adas_Q13SCORE,APOE4.0,APOE4.1,APOE4.2
0,21,1.122404,1.182973,1.817158,1.391494,1.799036,2.5379,0.965265,2.742845,1.074118,...,8,8,5,5,5,5,5,1,0,0
1,31,1.021761,1.11275,1.303878,1.070731,1.652041,1.982748,1.250516,1.489708,1.178413,...,8,11,5,5,5,5,5,1,0,0
2,56,0.933901,1.04013,1.540324,1.340539,1.969141,3.298904,0.83005,1.337372,0.983483,...,8,11,5,5,5,5,5,1,0,0
3,59,1.003087,1.070947,1.503807,1.328975,1.878053,2.308336,0.919911,1.819666,0.984816,...,8,8,5,5,5,5,5,1,0,0
4,69,1.042776,1.112264,1.267889,1.346736,2.233189,5.768378,1.24842,0.774037,1.073024,...,8,9,5,5,5,5,4,1,0,0


In [None]:
# Removing irrelavant variable; RID column (ID column) from the analysis
del df['RID']

In [None]:
# Gender column
df['PTGENDER'] = df['PTGENDER'] - 1

# Subtracting 1 from the 'PTGENDER' column to encode male as 0 and female as 1.

In [None]:
# Identifying negative columns
negative_cols = [k for k,v in df.items() if v.min() < 0]
negative_cols

['PHC_MEM', 'PHC_EXF', 'PHC_LAN', 'COMP_MEM_SCORE', 'COMP_EXEC_FUNC_SCORE']

In [None]:
for k, v in df[negative_cols].items():    # iterating and adding absolute minimum values
    df[k] = df[k].map(lambda x: x + abs(v.min()))

In [None]:
# Normalizing all features by the method min-max scaling
excluded = ['AD_LABEL', 'CDR']
cols_to_normalize = [k for k, v in df.items() if v.max() > 1 and k not in excluded]


normalized_df = df[cols_to_normalize]
numer = normalized_df - normalized_df.min()
denom = normalized_df.max() - normalized_df.min()

df[cols_to_normalize] = (numer / denom)
df.head()
df.isnull().sum().sum()
count_nan_in_df = df.isnull().sum()
max(count_nan_in_df)
import scipy.sparse as sp

x,y = sp.coo_matrix(df.isnull()).nonzero()
print(list(zip(x,y)))

[]


In [None]:
# Feature selection using the methods;
# Boruta, Logistic regression (Lasso) and autoenoder
# To identify the key features in the prediction of AD and then to introduce missingness and employ imputation

In [None]:
# Feature selection using Boruta
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Setting random seed for reproducibility
np.random.seed(42)

# Assuming df is your DataFrame and AD_LABEL is the target column
X = df.drop('AD_LABEL', axis=1)
y = df['AD_LABEL']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def get_X_boruta(X_train):
    X_shadow = X_train.apply(np.random.permutation)
    X_shadow.columns = ['shadow_' + feat for feat in X_train.columns]
    X_boruta = pd.concat([X_train, X_shadow], axis=1)
    return X_boruta

def get_random_bar(X_boruta, y, X_train, model):
    model.fit(X_boruta, np.array(y).ravel())
    feat_imp_X = model.feature_importances_[:len(X_train.columns)]
    feat_imp_shadow = model.feature_importances_[len(X_train.columns):]
    hits = feat_imp_X > feat_imp_shadow.max()
    return feat_imp_X, feat_imp_shadow, hits

def get_relevant_features(X_train, feat_imp_X, hits):
    features = X_train.columns.values
    relevant_features = []
    relevant_importances = []
    for index, value in enumerate(hits):
        if value:
            relevant_features.append(features[index])
            relevant_importances.append(feat_imp_X[index])
    # Create a DataFrame for sorting
    feature_importance_df = pd.DataFrame({
        'Feature': relevant_features,
        'Importance': relevant_importances
    })
    # Sort by importance
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
    return feature_importance_df

# Applying the functions
X_boruta = get_X_boruta(X_train)
model = RandomForestClassifier(random_state=42)
feat_imp_X, feat_imp_shadow, hits = get_random_bar(X_boruta, y_train, X_train, model)
relevant_features_df = get_relevant_features(X_train, feat_imp_X, hits)

# Printing the relevant features with their importances
print("Relevant features ranked by importance:\n", relevant_features_df)


Relevant features ranked by importance:
                     Feature  Importance
22                      CDR    0.063548
17                  PHC_MEM    0.037664
26           COMP_MEM_SCORE    0.027073
25        LOG_MEM_DEL_TOTAL    0.025930
20               ADAS_TOTAL    0.020902
24        LOG_MEM_IMM_TOTAL    0.019943
0               lh.Amygdala    0.017832
21              CBB_SCORE_.    0.012465
14         wm.rh.entorhinal    0.011318
18                  PHC_EXF    0.010990
4         ctx.rh.entorhinal    0.010526
27     COMP_EXEC_FUNC_SCORE    0.010302
9    wm.lh.inferiorparietal    0.010010
1         ctx.lh.entorhinal    0.009866
7          wm.lh.entorhinal    0.009484
19                  PHC_LAN    0.007510
28             adas_Q1SCORE    0.007168
11          wm.lh.precuneus    0.006839
12   wm.lh.superiorparietal    0.006772
2           ctx.lh.fusiform    0.006441
5     ctx.rh.middletemporal    0.006199
16   wm.rh.inferiorparietal    0.006136
10    wm.lh.parahippocampal    0.006047

In [None]:
# (ii) Applying Logistic regression for feature selection using 'AD_LABEL' as the target variable
import pandas as pd
from sklearn.linear_model import LogisticRegression

X = df.drop(columns=['AD_LABEL'])  # Features, excluding target variable
y = df['AD_LABEL']  # Target variable

# Initialize the Logistic Regression model with L1 regularization
lasso_model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

# Fit the model to the data
lasso_model.fit(X, y)

# Get the coefficients of the features after L1 regularization
lasso_coefficients = lasso_model.coef_

# Extract selected features based on non-zero coefficients
selected_features = X.columns[lasso_coefficients[0] != 0]

# Print the selected features
print("Selected features based on Lasso (L1 regularization):")
print(selected_features)


Selected features based on Lasso (L1 regularization):
Index(['AGE', 'PTGENDER', 'MOTHDEM', 'APOE4.1', 'APOE4.2', 'PHC_MEM',
       'PHC_EXF', 'ADAS_TOTAL', 'CDR', 'MMSE', 'LOG_MEM_IMM_TOTAL',
       'COMP_EXEC_FUNC_SCORE', 'NPIGTOT', 'adas_Q5SCORE', 'adas_Q11SCORE',
       'lh.Cerebellum.White.Matter', 'lh.choroid.plexus', 'rh.Caudate',
       'rh.Pallidum', 'ctx.lh.transversetemporal', 'wm.lh.corpuscallosum',
       'wm.rh.entorhinal'],
      dtype='object')


In [None]:
!pip install tensorflow



In [None]:
!pip install keras



In [None]:
# (iii) Feature selection using autoencoder
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split


# Define hyperparameters
input_dim = 222
# Number of input features excluding the target variables
encoding_dim = 50  # Number of neurons in the encoding layer
epochs = 100
batch_size = 100

# Define the autoencoder architecture
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='relu')(encoded)

# Create the autoencoder model
autoencoder = Model(input_layer, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')


# Split the data into training and testing sets
X = df.drop(columns=['AD_LABEL'])  # Features (excluding target variable)
Y = df['AD_LABEL']  # Target variable
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=epochs, batch_size=batch_size, shuffle=True, validation_data=(X_test, X_test))

# Extract latent features
encoder = Model(input_layer, encoded)
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoder = Model(encoded_input, decoder_layer(encoded_input))

# Get the latent features
encoded_X_train = encoder.predict(X_train)
encoded_X_test = encoder.predict(X_test)

Epoch 1/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 168ms/step - loss: 0.1285 - val_loss: 0.1162
Epoch 2/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step - loss: 0.1129 - val_loss: 0.0985
Epoch 3/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 0.0944 - val_loss: 0.0775
Epoch 4/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 0.0731 - val_loss: 0.0569
Epoch 5/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.0540 - val_loss: 0.0414
Epoch 6/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - loss: 0.0385 - val_loss: 0.0321
Epoch 7/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 0.0308 - val_loss: 0.0271
Epoch 8/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.0270 - val_loss: 0.0244
Epoch 9/100
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
# Extract the weights of the connections between input and encoding layer
weights = autoencoder.layers[1].get_weights()[0]

# Normalize the weights
normalized_weights = np.linalg.norm(weights, axis=0)

# Sort features by importance
sorted_indices = np.argsort(normalized_weights)[::-1]

X = df.drop(columns=['AD_LABEL'])
# Extract the feature names
feature_names = X.columns

# Print or visualize the sorted feature importance
for idx in sorted_indices:
    print(f"Feature: {feature_names[idx]}, Importance: {normalized_weights[idx]}")

Feature: FATHDEM, Importance: 2.6887803077697754
Feature: NPIATOT, Importance: 2.5944018363952637
Feature: MOTHDEM, Importance: 2.5006532669067383
Feature: adas_Q5SCORE, Importance: 2.25744891166687
Feature: PTGENDER, Importance: 2.0938720703125
Feature: LOG_MEM_IMM_TOTAL, Importance: 1.8243061304092407
Feature: lh.Thalamus.Proper, Importance: 1.8202064037322998
Feature: NPITOTAL, Importance: 1.820050597190857
Feature: CBB_SCORE_., Importance: 1.7178853750228882
Feature: NPILTOT, Importance: 1.6425931453704834
Feature: adas_Q9SCORE, Importance: 1.6400963068008423
Feature: LOG_MEM_DEL_TOTAL, Importance: 1.5958350896835327
Feature: ADAS_TOTAL, Importance: 1.5933635234832764
Feature: PHC_EXF, Importance: 1.5582596063613892
Feature: adas_Q6SCORE, Importance: 1.5575151443481445
Feature: adas_Q7SCORE, Importance: 1.5488641262054443
Feature: NPIITOT, Importance: 1.5028412342071533
Feature: adas_Q10SCORE, Importance: 1.4856277704238892
Feature: NPIBTOT, Importance: 1.4831969738006592
Feature: 

In [None]:
# The common features extracted from the feature selection methods used here are;
# MOTHDEM, APOE4.0, APOE4.1, APOE4.2,Lh.Amygdala, wm.lh.entorhinal,wm.lh.inferiortemporal,CDR and PT_Gender.
# From the features selected, we have chosen the columns 'MOTHDEM', 'APOE' and 'CDR' to introduce MAR and imputing it using denoising autoencoder