In [2]:
from tensorflow.keras.models import load_model
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator


val_dir = '/kaggle/input/augmented-alzheimer-mri-dataset-v2/data/val'


image_size = (176, 208)
batch_size = 32
SEED = 42

val_datagen = ImageDataGenerator(rescale=1./255)
val_gen = val_datagen.flow_from_directory(
    val_dir,
    target_size=image_size,
    color_mode='grayscale',
    class_mode='sparse',
    batch_size=batch_size,
    shuffle=False,
    seed=SEED
)


y_true = val_gen.classes
np.save('y_true.npy', y_true)


model_paths = {
    'baseline':'/kaggle/input/cnn_base_latest/keras/default/1/alzheimers_augmented_model_latest.h5',
    'gan85': '/kaggle/input/cnn_augmented_dcgan/keras/default/1/cnn_model_augmented_85.h5',
    'gan75': '/kaggle/input/cnn_augmented_dcgan/keras/default/1/cnn_model_augmented_75.h5',
    'gan72': '/kaggle/input/cnn_augmented_dcgan/keras/default/1/cnn_model_augmented_72.h5',
    'gan70': '/kaggle/input/cnn_augmented_dcgan/keras/default/1/cnn_model_augmented_70.h5',
    'gan65': '/kaggle/input/cnn_augmented_dcgan/keras/default/1/cnn_model_augmented_65.h5'
}


for name, path in model_paths.items():
    print(f"\nPredicting with {name} model...")
    model = load_model(path)
    val_gen.reset()
    preds = model.predict(val_gen, verbose=1)
    y_pred = np.argmax(preds, axis=1)
    np.save(f'y_pred_{name}.npy', y_pred)
    print(f"Saved predictions: y_pred_{name}.npy")


Found 6400 images belonging to 4 classes.

Predicting with baseline model...


I0000 00:00:1745641305.489124      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1745641305.489964      31 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5
  self._warn_if_super_not_called()
I0000 00:00:1745641308.511749     100 service.cc:148] XLA service 0x7bddec003530 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745641308.512781     100 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1745641308.512823     100 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1745641308.639429     100 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  3/200[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m18s[0m 94ms/step

I0000 00:00:1745641310.494486     100 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 177ms/step
Saved predictions: y_pred_baseline.npy

Predicting with gan85 model...
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 35ms/step
Saved predictions: y_pred_gan85.npy

Predicting with gan75 model...
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 37ms/step
Saved predictions: y_pred_gan75.npy

Predicting with gan72 model...
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 37ms/step
Saved predictions: y_pred_gan72.npy

Predicting with gan70 model...
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 36ms/step
Saved predictions: y_pred_gan70.npy

Predicting with gan65 model...
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step
Saved predictions: y_pred_gan65.npy


In [4]:
from statsmodels.stats.contingency_tables import mcnemar
import numpy as np


y_true = np.load('y_true.npy')


model_names = ['gan85', 'gan75', 'gan72', 'gan70', 'gan65']


y_pred_baseline = np.load('y_pred_baseline.npy')

print("\n McNemar’s Test Results (vs. Baseline Model):")


for name in model_names:
    y_pred_gan = np.load(f'y_pred_{name}.npy')

        
    baseline_correct = (y_pred_baseline == y_true)
    gan_correct = (y_pred_gan == y_true)

    # McNemar contingency table values
    b = np.sum((baseline_correct == True) & (gan_correct == False))  # Baseline only correct
    c = np.sum((baseline_correct == False) & (gan_correct == True))  # GAN only correct

    # Contingency table: [ [a, b], [c, d] ] but a & d not needed
    table = [[0, b], [c, 0]]

    # Run McNemar’s test
    result = mcnemar(table, exact=False, correction=True)

    print(f"\nComparing Baseline vs {name.upper()}:")
    print(f"  Contingency: [[0, {b}], [{c}, 0]]")
    print(f"  χ² = {result.statistic:.4f}, p = {result.pvalue:.4f}")

    if result.pvalue < 0.05:
        print("Statistically significant improvement (p < 0.05)")
    else:
        print("Not statistically significant")



 McNemar’s Test Results (vs. Baseline Model):

Comparing Baseline vs GAN85:
  Contingency: [[0, 116], [80, 0]]
  χ² = 6.2500, p = 0.0124
Statistically significant improvement (p < 0.05)

Comparing Baseline vs GAN75:
  Contingency: [[0, 117], [81, 0]]
  χ² = 6.1869, p = 0.0129
Statistically significant improvement (p < 0.05)

Comparing Baseline vs GAN72:
  Contingency: [[0, 28], [93, 0]]
  χ² = 33.8512, p = 0.0000
Statistically significant improvement (p < 0.05)

Comparing Baseline vs GAN70:
  Contingency: [[0, 261], [66, 0]]
  χ² = 115.0948, p = 0.0000
Statistically significant improvement (p < 0.05)

Comparing Baseline vs GAN65:
  Contingency: [[0, 247], [86, 0]]
  χ² = 76.8769, p = 0.0000
Statistically significant improvement (p < 0.05)
