<a href="https://colab.research.google.com/github/MokidiSrinidhi/XAI/blob/main/XAI_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing import image_dataset_from_directory
import matplotlib.pyplot as plt
import numpy as np
import os
import zipfile
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
import seaborn as sns

# --- 1. Setup (Run this in Google Colab) ---
# This section is for downloading and unzipping the data.
# You will need to get the Kaggle API credentials.

# !pip install kaggle --quiet

# # Set up Kaggle API token
# # 1. Go to your Kaggle account, click your profile picture
# # 2. Go to "Account" -> "API" -> "Create New API Token"
# # 3. This will download 'kaggle.json'. Upload it to your Colab session.
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json

# # --- IMPORTANT ---
# # The dataset link you provided (pacificrm/skindiseasedataset) seems broken or private.
# # I will use a different, popular skin disease dataset as an example:
# # "Skin Cancer MNIST: HAM10000" (URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000)
# # This dataset is in CSV format with image IDs, so the loading logic is different.

# # Let's use a dataset that is already in image folders, as your instructions imply.
# # Example: "Skin Disease Dataset" (URL: https://www.kaggle.com/datasets/prashantmishra158/skin-disease-dataset)
# print("Downloading dataset...")
# !kaggle datasets download -d prashantmishra158/skin-disease-dataset --unzip -q

print("--- Setup Complete (Simulated for this environment) ---")
print("In a real environment (Colab), the above commands would download and unzip the data.")

# --- 2. Data Preprocessing & Loading ---

# Define paths and parameters
# ASSUMPTION: The data is unzipped into 'train_set' and 'test_set' folders
# Please adjust these paths based on how your data unzips.
# train_dir = 'train_set'
# test_dir = 'test_set'

# For demonstration, I will create dummy directories and images.
# --- START: DUMMY DATA CREATION (Remove this in your real project) ---
def create_dummy_data(base_dir, num_classes=3, num_images=20):
    np.random.seed(42)
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    for i in range(num_classes):
        class_dir = os.path.join(base_dir, f'class_{i}')
        if not os.path.exists(class_dir):
            os.makedirs(class_dir)
        for j in range(num_images):
            # Create a small dummy image
            img_array = np.random.randint(0, 256, (64, 64, 3), dtype=np.uint8)
            img = tf.keras.preprocessing.image.array_to_img(img_array)
            img.save(os.path.join(class_dir, f'img_{j}.png'))

train_dir = 'dummy_train'
test_dir = 'dummy_test'
create_dummy_data(train_dir, num_classes=3, num_images=50) # 3 classes, 50 images each for train
create_dummy_data(test_dir, num_classes=3, num_images=20)  # 3 classes, 20 images each for test
print(f"Created dummy data in {train_dir} and {test_dir}")
# --- END: DUMMY DATA CREATION ---

IMG_SIZE = (64, 64)
BATCH_SIZE = 32

# Load datasets from directories
train_dataset = image_dataset_from_directory(
    train_dir,
    label_mode='categorical', # Use 'categorical' for one-hot encoding
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True
)

test_dataset = image_dataset_from_directory(
    test_dir,
    label_mode='categorical',
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False # Keep test data in order for evaluation
)

class_names = train_dataset.class_names
NUM_CLASSES = len(class_names)
print(f"Found classes: {class_names}")

# --- 3. Visualization (Select Techniques) ---
print("\n--- Visualizing Data ---")

# Technique 1: Show a batch of images (like a Scatter Plot for images)
plt.figure(figsize=(10, 10))
for images, labels in train_dataset.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        plt.title(class_names[np.argmax(labels[i])])
        plt.axis("off")
plt.suptitle("Batch of Training Images")
plt.savefig("image_batch_visualization.png")
plt.close()
print("Saved image_batch_visualization.png")

# Technique 2: Bar Chart of Class Distribution
# Note: This is harder with image_dataset_from_directory.
# A simpler way is to count files in subdirectories.
class_counts = []
for class_name in class_names:
    class_counts.append(len(os.listdir(os.path.join(train_dir, class_name))))

plt.figure(figsize=(10, 5))
plt.bar(class_names, class_counts)
plt.title("Training Data Class Distribution")
plt.ylabel("Number of Images")
plt.savefig("class_distribution_bar_chart.png")
plt.close()
print("Saved class_distribution_bar_chart.png")

# --- 4. Preprocessing: Normalization & Imbalance Handling ---

# 1. Normalization: Done as a layer in the model for efficiency
normalization_layer = layers.Rescaling(1./255)

# 2. Imbalance / Balance Handling (SMOTE/Oversampling)
# This is complex with image generators.
# A simpler method is to use 'class_weight' during model training.
total_samples = sum(class_counts)
class_weights = {}
for i, count in enumerate(class_counts):
    class_weights[i] = (1 / count) * (total_samples / NUM_CLASSES)
print(f"Calculated Class Weights for Imbalance: {class_weights}")

# Configure dataset for performance
AUTOTUNE = tf.data.AUTOTUNE
train_dataset = train_dataset.cache().prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().prefetch(buffer_size=AUTOTUNE)

# --- 5. Machine Learning / Deep Learning Model (CNN) ---
# A CNN is the correct choice for this image data.

print("\n--- Building CNN Model ---")

model = models.Sequential([
    # Input layer with normalization
    layers.Input(shape=(IMG_SIZE[0], IMG_SIZE[1], 3)),
    normalization_layer,

    # Block 1
    layers.Conv2D(32, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),

    # Block 2
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),

    # Block 3
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2)),

    # Classifier Head
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5), # Regularization
    layers.Dense(NUM_CLASSES, activation='softmax') # Output layer
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

model.summary()

# --- 6. Train the Model ---
print("\n--- Training Model ---")
# Note: EPOCHS should be higher (e.g., 20-50) for a real dataset
EPOCHS = 5
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=EPOCHS,
    class_weight=class_weights, # Handle data imbalance
    verbose=1 # Set to 1 to see progress
)
print("Model training complete.")

# --- 7. Evaluate Model & Metrics ---
print("\n--- Evaluating Model ---")

# Technique 3: Line Plot (Training/Validation Accuracy & Loss)
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(range(EPOCHS), acc, label='Training Accuracy')
plt.plot(range(EPOCHS), val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')

plt.subplot(1, 2, 2)
plt.plot(range(EPOCHS), loss, label='Training Loss')
plt.plot(range(EPOCHS), val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.suptitle("Model Training History")
plt.savefig("model_history_plot.png")
plt.close()
print("Saved model_history_plot.png")

# Get predictions
y_pred_probs = model.predict(test_dataset)
y_pred = np.argmax(y_pred_probs, axis=1)

# Get true labels
y_true = []
for images, labels in test_dataset:
    y_true.extend(np.argmax(labels.numpy(), axis=1))

# --- Metrics: Accuracy, Precision, Recall, F1-Score ---
print("\nClassification Report:")
report = classification_report(y_true, y_pred, target_names=class_names, output_dict=True)
print(classification_report(y_true, y_pred, target_names=class_names))

# Overall Metrics
print(f"Overall Accuracy: {report['accuracy']:.4f}")
print(f"Macro Avg Precision: {report['macro avg']['precision']:.4f}")
print(f"Macro Avg Recall: {report['macro avg']['recall']:.4f}")
print(f"Macro Avg F1-Score: {report['macro avg']['f1-score']:.4f}")

# --- Metrics: Confusion Matrix (Technique 4: Heatmap) ---
print("\nGenerating Confusion Matrix...")
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

fig, ax = plt.subplots(figsize=(8, 8))
disp.plot(ax=ax, cmap='Blues', xticks_rotation='vertical')
plt.title("Confusion Matrix")
plt.tight_layout()
plt.savefig("confusion_matrix.png")
plt.close()
print("Saved confusion_matrix.png")


# --- Metrics: AUC-ROC Curve (Technique 5: Line Plot) ---
# Binarize the labels for multiclass ROC
y_true_bin = label_binarize(y_true, classes=range(NUM_CLASSES))

# Calculate AUC
try:
    # 'macro' average is good for imbalance
    auc_score = roc_auc_score(y_true_bin, y_pred_probs, average='macro', multi_class='ovr')
    print(f"\nMacro-Average AUC Score: {auc_score:.4f}")
except ValueError as e:
    print(f"Could not calculate AUC score: {e}")

# Plot ROC Curve for each class
plt.figure(figsize=(10, 7))
for i in range(NUM_CLASSES):
    fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_pred_probs[:, i])
    plt.plot(fpr, tpr, label=f'Class {class_names[i]} (AUC = {roc_auc_score(y_true_bin[:, i], y_pred_probs[:, i]):.2f})')

plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve (One-vs-Rest)')
plt.legend()
plt.grid(True)
plt.savefig("roc_auc_curve.png")
plt.close()
print("Saved roc_auc_curve.png")


# --- 8. Next Steps (Ensemble, SHAP, Advanced DL) ---
print("\n--- Project Complete ---")
print("This script provides the foundation.")
print("Your next steps would be:")
print("1. Ensemble Models: Train other models (e.g., ResNet50, VGG16) and combine them.")
print("   - You can load pre-trained models from tf.keras.applications.")
print("2. Correlation/SHAP: SHAP for CNNs is very computationally expensive.")
print("   - It involves masking parts of images and re-running predictions thousands of times.")
print("   - Look into the 'shap' library's DeepExplainer for this advanced step.")

--- Setup Complete (Simulated for this environment) ---
In a real environment (Colab), the above commands would download and unzip the data.
Created dummy data in dummy_train and dummy_test
Found 150 files belonging to 3 classes.
Found 60 files belonging to 3 classes.
Found classes: ['class_0', 'class_1', 'class_2']

--- Visualizing Data ---
Saved image_batch_visualization.png
Saved class_distribution_bar_chart.png
Calculated Class Weights for Imbalance: {0: 1.0, 1: 1.0, 2: 1.0}

--- Building CNN Model ---



--- Training Model ---
Epoch 1/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 406ms/step - accuracy: 0.3005 - loss: 1.1693 - precision: 0.2872 - recall: 0.0927 - val_accuracy: 0.3333 - val_loss: 1.1018 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 2/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 316ms/step - accuracy: 0.3060 - loss: 1.1106 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3167 - val_loss: 1.0990 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 3/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 437ms/step - accuracy: 0.4103 - loss: 1.1003 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3000 - val_loss: 1.0983 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Epoch 4/5
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 253ms/step - accuracy: 0.3484 - loss: 1.0991 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_accuracy: 0.3333 - val_loss: 1.0981 - val_pre

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Saved confusion_matrix.png

Macro-Average AUC Score: 0.5904
Saved roc_auc_curve.png

--- Project Complete ---
This script provides the foundation.
Your next steps would be:
1. Ensemble Models: Train other models (e.g., ResNet50, VGG16) and combine them.
   - You can load pre-trained models from tf.keras.applications.
2. Correlation/SHAP: SHAP for CNNs is very computationally expensive.
   - It involves masking parts of images and re-running predictions thousands of times.
   - Look into the 'shap' library's DeepExplainer for this advanced step.
