In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import cross_val_score

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model

# Set seed for reproducibility
# This seeds NumPy's global random state only; no TensorFlow seed is set in this
# cell. The classifiers defined later pass random_state=42 explicitly.
np.random.seed(42)


2025-06-05 22:17:19.908438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749161840.439246      75 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749161840.567771      75 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
# Dataset location (adjust if needed)
base_path = '/kaggle/input/best-alzheimer-mri-dataset-99-accuracy/Combined Dataset'
train_path = os.path.join(base_path, 'train')
test_path = os.path.join(base_path, 'test')

# Options common to every directory iterator created below
flow_options = dict(
    target_size=(176, 208),
    batch_size=32,
    class_mode='categorical',
)

# Pixel values are rescaled to [0, 1]; the training generator additionally
# reserves 20% of the train directory as a validation subset
train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)
test_datagen = ImageDataGenerator(rescale=1./255)

# Training subset (shuffled) and validation subset (fixed order), both read
# from the train directory
train_generator = train_datagen.flow_from_directory(
    train_path, subset='training', shuffle=True, **flow_options)
val_generator = train_datagen.flow_from_directory(
    train_path, subset='validation', shuffle=False, **flow_options)

# Held-out test set, read in a fixed order
test_generator = test_datagen.flow_from_directory(
    test_path, shuffle=False, **flow_options)


Found 8192 images belonging to 4 classes.
Found 2048 images belonging to 4 classes.
Found 1279 images belonging to 4 classes.


In [11]:
# ImageNet-pretrained VGG16 with the classification head left off
base_model = VGG16(include_top=False, weights='imagenet', input_shape=(176, 208, 3))

# Features are read from the 'block5_pool' layer
block5_pool_output = base_model.get_layer('block5_pool').output
feature_extractor = Model(inputs=base_model.input, outputs=block5_pool_output)


In [12]:
def extract_features(generator, sample_count):
    """Run batches from ``generator`` through ``feature_extractor`` and collect the results.

    Parameters
    ----------
    generator : iterable
        Yields ``(inputs_batch, labels_batch)`` pairs and exposes ``num_classes``.
        Iteration is stopped explicitly once ``sample_count`` rows are filled,
        so an endlessly looping generator is fine.
    sample_count : int
        Number of samples to collect.

    Returns
    -------
    features : np.ndarray
        Array of shape ``(sample_count, *feature_shape)`` where ``feature_shape``
        is the per-sample output shape of ``feature_extractor``.
    labels : np.ndarray
        Array of shape ``(sample_count, generator.num_classes)`` holding the
        label rows yielded by the generator.

    Notes
    -----
    Relies on the notebook-level ``feature_extractor`` model. Rows the
    generator never supplies stay zero-filled.
    """
    # Take the per-sample feature shape from the extractor rather than
    # hard-coding (5, 6, 512), so another input size or backbone still works.
    feature_shape = tuple(feature_extractor.output_shape[1:])

    features = np.zeros((sample_count,) + feature_shape)
    labels = np.zeros((sample_count, generator.num_classes))

    if sample_count <= 0:
        return features, labels

    filled = 0
    for inputs_batch, labels_batch in generator:
        features_batch = feature_extractor.predict(inputs_batch, verbose=0)

        # Use only what still fits, so a final batch that overshoots
        # sample_count is truncated instead of raising a broadcast error.
        take = min(features_batch.shape[0], sample_count - filled)
        features[filled:filled + take] = features_batch[:take]
        labels[filled:filled + take] = labels_batch[:take]

        filled += take
        if filled >= sample_count:
            break

    return features, labels


In [13]:
# Number of images available in each split
train_samples, val_samples, test_samples = (
    gen.samples for gen in (train_generator, val_generator, test_generator)
)

# Feature maps and one-hot labels for each split
train_features, train_labels = extract_features(train_generator, train_samples)
val_features, val_labels = extract_features(val_generator, val_samples)
test_features, test_labels = extract_features(test_generator, test_samples)

# Collapse each feature map into a single row vector
train_features_flat = np.reshape(train_features, (train_samples, -1))
val_features_flat = np.reshape(val_features, (val_samples, -1))
test_features_flat = np.reshape(test_features, (test_samples, -1))

# One-hot rows -> integer class indices
train_labels_cat = train_labels.argmax(axis=1)
val_labels_cat = val_labels.argmax(axis=1)
test_labels_cat = test_labels.argmax(axis=1)


I0000 00:00:1749161937.475489     161 service.cc:148] XLA service 0x7b3600087900 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1749161937.476882     161 service.cc:156]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1749161937.476902     161 service.cc:156]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1749161937.691864     161 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1749161946.819900     161 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [None]:
# Standardise features using statistics fitted on the training split only,
# then apply the same transform to validation and test data.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_features_flat)
val_scaled = scaler.transform(val_features_flat)
test_scaled = scaler.transform(test_features_flat)

# Reduce to 50 principal components, again fitted on the training split only.
# random_state is set explicitly: without it, a randomized SVD solver (which
# scikit-learn may select automatically for large inputs) draws from the
# global NumPy RNG, so the projection would depend on what ran before this cell.
pca = PCA(n_components=50, random_state=42)
train_pca = pca.fit_transform(train_scaled)
val_pca = pca.transform(val_scaled)
test_pca = pca.transform(test_scaled)

print(f"PCA explained variance ratio (first 50 components): {np.sum(pca.explained_variance_ratio_):.2f}")


In [None]:
# Hyperparameters shared by the scikit-learn tree-based classifiers
tree_kwargs = dict(max_depth=5, min_samples_split=10, class_weight='balanced', random_state=42)
# The two ensemble variants add the same ensemble size and parallelism
forest_kwargs = dict(tree_kwargs, n_estimators=100, n_jobs=-1)

# Candidate classifiers, keyed by the display name used in reports and file names
# NOTE(review): tree_method='gpu_hist' is reported as deprecated in XGBoost >= 2.0
# (replacement: tree_method='hist' with device='cuda') — confirm against the
# installed version before changing.
models = {
    "Decision Tree": DecisionTreeClassifier(min_samples_leaf=5, **tree_kwargs),
    "Random Forest": RandomForestClassifier(**forest_kwargs),
    "Extra Trees": ExtraTreesClassifier(**forest_kwargs),
    "XGBoost": XGBClassifier(
        max_depth=5,
        learning_rate=0.1,
        n_estimators=100,
        objective='multi:softmax',
        random_state=42,
        tree_method='gpu_hist',
    ),
}


In [None]:
import matplotlib.pyplot as plt
import json

def evaluate_model(model, X_train, y_train, X_val, y_val, model_name,
                   save_dir='/kaggle/working/model_results', class_names=None):
    """Fit ``model``, score it on train/validation data and save the artefacts.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train : array-like
        Training features and integer class labels.
    X_val, y_val : array-like
        Validation features and integer class labels.
    model_name : str
        Display name; also used as the prefix of every file written.
    save_dir : str
        Directory receiving the report, confusion-matrix image and metrics JSON.
        Created if missing.
    class_names : sequence of str, optional
        Labels for the report and confusion matrix. When omitted, falls back to
        the keys of the notebook-level ``test_generator.class_indices``.

    Returns
    -------
    dict
        Keys ``model``, ``train_accuracy``, ``val_accuracy``,
        ``cv_mean_accuracy``, ``cv_std_accuracy`` and ``training_time``.

    Side effects
    ------------
    Writes ``<model_name>_classification_report.txt``,
    ``<model_name>_confusion_matrix.png`` and ``<model_name>_metrics.json``
    into ``save_dir`` and prints the validation classification report.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(save_dir, exist_ok=True)

    if class_names is None:
        # Backward-compatible default: label names come from the global generator
        class_names = list(test_generator.class_indices.keys())
    else:
        class_names = list(class_names)

    # Time the fit only
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # Predictions
    val_pred = model.predict(X_val)
    train_pred = model.predict(X_train)

    # Cross-validation (5-fold) on the training data
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    # Cast to built-in floats so the dict serialises to JSON regardless of the
    # scalar type the metric functions return
    metrics = {
        'model': model_name,
        'train_accuracy': float(accuracy_score(y_train, train_pred)),
        'val_accuracy': float(accuracy_score(y_val, val_pred)),
        'cv_mean_accuracy': float(np.mean(cv_scores)),
        'cv_std_accuracy': float(np.std(cv_scores)),
        'training_time': float(train_time)
    }

    # Save classification report as text file
    class_report = classification_report(y_val, val_pred, target_names=class_names)
    with open(f"{save_dir}/{model_name}_classification_report.txt", "w") as f:
        f.write(class_report)

    # Save confusion matrix plot
    cm = confusion_matrix(y_val, val_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap='Blues')
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"{save_dir}/{model_name}_confusion_matrix.png")
    plt.close()

    # Save metrics as JSON for later easy parsing
    with open(f"{save_dir}/{model_name}_metrics.json", "w") as f:
        json.dump(metrics, f)

    print(f"\n{model_name} Performance:")
    print("Validation Set Classification Report:")
    print(class_report)

    return metrics


In [None]:
save_directory = '/kaggle/working/model_results'

# One metrics dict per candidate, in the order the models were declared
results = [
    evaluate_model(clf, train_pca, train_labels_cat, val_pca, val_labels_cat,
                   label, save_dir=save_directory)
    for label, clf in models.items()
]

# Best validation accuracy first
results_df = pd.DataFrame(results).sort_values(by='val_accuracy', ascending=False)

print("\nModel Performance Comparison:")
display(results_df)

# Persist the comparison table next to the per-model artefacts
summary_csv = f"{save_directory}/model_comparison_summary.csv"
results_df.to_csv(summary_csv, index=False)
print(f"Saved overall results summary to {summary_csv}")


In [None]:
# Pick best model by validation accuracy (results_df is sorted best-first)
best_model_name = results_df.iloc[0]['model']
best_model = models[best_model_name]

# Retrain on full train+val data.
# Note: this refits the same estimator instance stored in `models`.
X_full_train = np.vstack([train_pca, val_pca])
y_full_train = np.hstack([train_labels_cat, val_labels_cat])

best_model.fit(X_full_train, y_full_train)

# Evaluate on test set
test_pred = best_model.predict(test_pca)
test_acc = accuracy_score(test_labels_cat, test_pred)

# Materialise the label names as a list: scikit-learn documents these
# arguments as array-likes, and evaluate_model passes a list as well.
class_names = list(test_generator.class_indices.keys())

print(f"\nBest Model: {best_model_name}")
print(f"Test Set Accuracy: {test_acc:.4f}")
print(classification_report(test_labels_cat, test_pred, target_names=class_names))

cm = confusion_matrix(test_labels_cat, test_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap='Blues')
plt.title(f'{best_model_name} Test Set Confusion Matrix')
plt.xticks(rotation=45)
# Keep the rotated tick labels inside the figure, as in evaluate_model
plt.tight_layout()
plt.show()


In [None]:
# Write the comparison table (without the index column) to the working directory.
results_df.to_csv('/kaggle/working/model_evaluation_results.csv', index=False)


In [None]:
import shutil

# Directory holding the saved results, and the archive to produce from it
final_results_dir = "/kaggle/working/model_results"
zip_file_path = "/kaggle/working/model_results.zip"

# make_archive appends the format's extension itself, so pass the path without it
archive_base = os.path.splitext(zip_file_path)[0]
shutil.make_archive(archive_base, "zip", final_results_dir)

print(f"Zipped results folder created at: {zip_file_path}")


In [6]:
from graphviz import Digraph

# Directed graph describing the pipeline, rendered as PNG
dot = Digraph(comment='Alzheimer MRI Classification Pipeline', format='png')

# Layout and default node/edge styling
dot.attr(rankdir='TB', size='8,10', dpi='300')
dot.attr('node', shape='box', style='filled', fillcolor='lightblue', fontsize='12')
dot.attr('edge', fontsize='10')

# Pipeline steps in execution order: node id -> label
pipeline_steps = {
    'A': 'Load MRI Images\n(ImageDataGenerator)',
    'B': 'Feature Extraction\n(VGG16 without top layers)',
    'C': 'Flatten Features\n(Reshape to 1D)',
    'D': 'Scale Features\n(StandardScaler)',
    'E': 'Dimensionality Reduction\n(PCA, 50 components)',
    'F': 'Train Classifiers\n(Decision Tree, Random Forest,\nExtra Trees, XGBoost)',
    'G': 'Evaluate Models\n(Validation Accuracy,\nCross-Validation, Confusion Matrix)',
    'H': 'Select Best Model\n(Highest Validation Accuracy)',
    'I': 'Retrain on Train+Val\n(Test on Test Set)',
}
for node_id, caption in pipeline_steps.items():
    dot.node(node_id, caption)

# Connect each step to the one that follows it
step_ids = list(pipeline_steps)
dot.edges(zip(step_ids, step_ids[1:]))

# Write the PNG and remove the intermediate source file
dot.render('alzheimer_mri_pipeline', view=False, cleanup=True)

print("Pipeline plot saved as 'alzheimer_mri_pipeline.png'")

Pipeline plot saved as 'alzheimer_mri_pipeline.png'


In [7]:
import matplotlib.pyplot as plt
import numpy as np
import os

def plot_model_performance(results_df, save_dir='/kaggle/working/model_results'):
    """Save a grouped bar chart comparing accuracy metrics across models.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``model``, ``train_accuracy``,
        ``val_accuracy``, ``cv_mean_accuracy`` and ``cv_std_accuracy``.
    save_dir : str
        Directory receiving ``model_performance_comparison.png``. Created if
        missing.

    Returns
    -------
    None
        The figure is written to disk and closed; the output path is printed.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(save_dir, exist_ok=True)

    # Named model_names (not `models`) so the notebook-level `models` dict is
    # not shadowed inside this function
    model_names = results_df['model']
    x = np.arange(len(model_names))  # One group of bars per model
    width = 0.25  # Bar width

    output_path = f"{save_dir}/model_performance_comparison.png"

    # Explicit Figure/Axes: the function draws on and closes only its own figure
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # Three bars per model, side by side
        ax.bar(x - width, results_df['train_accuracy'], width, label='Train Accuracy', color='#36A2EB')
        ax.bar(x, results_df['val_accuracy'], width, label='Validation Accuracy', color='#FF6384')
        ax.bar(x + width, results_df['cv_mean_accuracy'], width, label='CV Mean Accuracy', color='#FFCE56')

        # Error bars showing the cross-validation standard deviation
        ax.errorbar(x + width, results_df['cv_mean_accuracy'], yerr=results_df['cv_std_accuracy'],
                    fmt='none', ecolor='black', capsize=3)

        ax.set_xlabel('Models')
        ax.set_ylabel('Accuracy')
        ax.set_title('Model Performance Comparison')
        ax.set_xticks(x)
        # Right-align rotated labels so each ends under its bar group
        ax.set_xticklabels(model_names, rotation=45, ha='right')
        ax.legend()
        fig.tight_layout()

        fig.savefig(output_path)
    finally:
        # Release the figure even if drawing or saving fails
        plt.close(fig)

    print(f"Model performance plot saved to {save_dir}/model_performance_comparison.png")


# Needs `results_df`, which is built in the model-comparison cell; the NameError
# recorded as this cell's output shows it was run while that name was undefined.
plot_model_performance(results_df)

NameError: name 'results_df' is not defined