In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

In [10]:
# Ensure reproducibility
np.random.seed(42)
tf.random.set_seed(42)
print("Libraries loaded successfully.")

Libraries loaded successfully.


In [38]:
df = pd.read_csv('Cancer_Data.csv')

df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [11]:
# --- 3. DATA PREPROCESSING AND FEATURE ENGINEERING (10% WEIGHT) ---

# Identify Features (X) and Target (Y)
# Drop the non-predictive 'id' column and the target variable 'diagnosis' from features [1]
X = data.drop(['id', 'diagnosis'], axis=1)
Y = data['diagnosis']


In [12]:
# 3.1. Target Encoding (Malignant/Benign -> 1/0)
# Deep Learning models require numerical input for the target variable [2]
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y) # 'M' -> 1, 'B' -> 0

In [14]:
# 3.2. Feature Standardization (Scaling)
# Scaling is crucial for optimizing gradient descent convergence in deep networks [2]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("\nData preprocessing complete. Features standardized, Target encoded (M=1, B=0).")


Data preprocessing complete. Features standardized, Target encoded (M=1, B=0).


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


# 4. DATA PREPARATION (TRAIN, VALIDATION, TEST SPLIT)

In [16]:
# Split the data into Training (70%), Validation (15%), and Test (15%) sets.
# We use stratification to ensure the class distribution is maintained across all subsets.
TEST_SIZE_FINAL = 0.15 # 15% for final, unbiased evaluation
VAL_SIZE_RATIO = 0.1764 # (0.15 / 0.85) to get 15% of the total in the validation set

In [17]:
# Step 1: Split Full Training Set (85%) and Test Set (15%)
X_train_full, X_test, Y_train_full, Y_test = train_test_split(
    X_scaled, encoded_Y, test_size=TEST_SIZE_FINAL, random_state=42, stratify=encoded_Y
)


In [19]:
# Step 2: Split Training Set into Training (70%) and Validation (15%)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_full, Y_train_full, test_size=VAL_SIZE_RATIO, random_state=42, stratify=Y_train_full
)

In [20]:
input_dim = X_train.shape[1]

print(f"Total Samples: {data.shape}")
print(f"Training Samples (70%): {X_train.shape}")
print(f"Validation Samples (15%): {X_val.shape}")
print(f"Test Samples (15%): {X_test.shape}")

Total Samples: (569, 33)
Training Samples (70%): (397, 31)
Validation Samples (15%): (86, 31)
Test Samples (15%): (86, 31)


In [23]:
# --- 5. MODEL SELECTION AND ARCHITECTURE (10% WEIGHT) ---

# Member 1: Standard Deep Multi-Layer Perceptron (MLP) using Sequential API.
# This serves as the deep learning baseline model.

def create_baseline_dnn(input_dim):
    model = Sequential([
        Dense(1, activation='sigmoid')
    ], name="Baseline_DNN_M1")
    return model

model_m1 = create_baseline_dnn(input_dim)
print("\nModel Architecture (DNN Baseline):")
model_m1.summary()


Model Architecture (DNN Baseline):


In [24]:
# --- 6. MODEL COMPILATION ---

# Use Adam optimizer and Binary Cross-Entropy loss for binary classification [2]
# Include AUC-ROC as a critical performance metric (AUC is robust against class imbalance)
model_m1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', keras.metrics.AUC(name='auc')]
)

In [25]:
# --- 7. MODEL TRAINING ---

# Use Early Stopping to halt training if validation loss plateaus, restoring the best weights [3]
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,          # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True
)

In [26]:
print("\nStarting Model Training...")
history_m1 = model_m1.fit(
    X_train, Y_train,
    epochs=200,             # Set a high number of epochs, relying on EarlyStopping
    batch_size=32,
    validation_data=(X_val, Y_val),
    callbacks=[early_stopping],
    verbose=2               # Display training progress per epoch
)
print("Model training finished.")


Starting Model Training...
Epoch 1/200
13/13 - 4s - 285ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0.6279 - val_auc: 0.0000e+00 - val_loss: nan
Epoch 2/200
13/13 - 0s - 27ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0.6279 - val_auc: 0.0000e+00 - val_loss: nan
Epoch 3/200
13/13 - 1s - 54ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0.6279 - val_auc: 0.0000e+00 - val_loss: nan
Epoch 4/200
13/13 - 1s - 51ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0.6279 - val_auc: 0.0000e+00 - val_loss: nan
Epoch 5/200
13/13 - 0s - 31ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0.6279 - val_auc: 0.0000e+00 - val_loss: nan
Epoch 6/200
13/13 - 1s - 50ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0.6279 - val_auc: 0.0000e+00 - val_loss: nan
Epoch 7/200
13/13 - 1s - 49ms/step - accuracy: 0.6272 - auc: 0.0000e+00 - loss: nan - val_accuracy: 0

In [28]:
# --- 8. MODEL EVALUATION (5% WEIGHT) ---

print("\nEvaluating Model on the held-out Test Set (15%):")
loss_m1, accuracy_m1, auc_m1 = model_m1.evaluate(X_test, Y_test, verbose=0)

print(f"\n--- Member 1 (Baseline DNN) Final Test Results ---")
print(f"Test Loss: {loss_m1:.4f}")
print(f"Test Accuracy: {accuracy_m1:.4f}")
print(f"Test AUC-ROC: {auc_m1:.4f}")


Evaluating Model on the held-out Test Set (15%):

--- Member 1 (Baseline DNN) Final Test Results ---
Test Loss: nan
Test Accuracy: 0.6279
Test AUC-ROC: 0.0000


In [29]:
# Generate detailed classification report and confusion matrix
Y_pred_prob = model_m1.predict(X_test)
Y_pred_class = (Y_pred_prob > 0.5).astype("int32")

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step 


In [30]:
# Calculate Confusion Matrix
cm = confusion_matrix(Y_test, Y_pred_class)
print("\nConfusion Matrix:")
print(cm)


Confusion Matrix:
[[54  0]
 [32  0]]


In [31]:
# Calculate Classification Report (Precision, Recall, F1-Score)
target_names = encoder.classes_
report = classification_report(Y_test, Y_pred_class, target_names=target_names)
print("\nClassification Report (Key Metrics for Comparison):")
print(report)


Classification Report (Key Metrics for Comparison):
              precision    recall  f1-score   support

           B       0.63      1.00      0.77        54
           M       0.00      0.00      0.00        32

    accuracy                           0.63        86
   macro avg       0.31      0.50      0.39        86
weighted avg       0.39      0.63      0.48        86



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
