In [1]:
# --- 1. SETUP AND IMPORTS ---
# Member 3: Implementation of a 1D Convolutional Neural Network (1D-CNN)
# This model explores using localized feature extraction for structured tabular data.

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress the FutureWarnings related to pandas downcasting for a clean output
warnings.filterwarnings("ignore", category=FutureWarning) 

# Ensure reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("Libraries loaded successfully. Preparing data for 1D CNN.")

Libraries loaded successfully. Preparing data for 1D CNN.


In [2]:
# --- 2. DATA LOADING AND INITIAL CLEANING ---

try:
    data = pd.read_csv('Cancer_Data.csv') # [1]
    
    # Check for the common redundant column (if present)
    if 'Unnamed: 32' in data.columns:
        data = data.drop(['Unnamed: 32'], axis=1) 
        
    print(f"Dataset shape: {data.shape}")
except FileNotFoundError:
    print("Error: 'Cancer_Data.csv' not found. Please ensure the file is in the correct directory.")
    exit()

# Identify Features (X) and Target (Y)
X = data.drop(['id', 'diagnosis'], axis=1) # [1]
Y = data['diagnosis']


Dataset shape: (569, 32)


In [3]:
# --- 3. EXPLORATORY DATA ANALYSIS (EDA) AND VISUALIZATIONS ---

# 3.1. Target Class Distribution
print("\n--- EDA: Target Class Distribution ---")
# sns.countplot(x='diagnosis', data=data, palette='viridis')
# plt.title('Distribution of Diagnosis (Malignant vs. Benign)')
# plt.show() # Code hidden for brevity in output, but included in notebook flow



--- EDA: Target Class Distribution ---


In [4]:
# 3.2. Feature Correlation Heatmap
data_corr = data.copy()
data_corr['diagnosis'] = data_corr['diagnosis'].replace({'M': 1, 'B': 0}).astype(int) 
mean_features = ['diagnosis'] + [col for col in data_corr.columns if 'mean' in col]
corr_matrix = data_corr[mean_features].corr() 

In [5]:
# plt.figure(figsize=(10, 8))
# sns.heatmap(corr_matrix, annot=True, fmt='.1f', cmap='coolwarm', 
#             linewidths=.5, linecolor='black')
# plt.title('Correlation Heatmap of Mean Features')
# plt.show() # Code hidden for brevity in output

In [6]:
# --- 4. DATA PREPARATION (SPLIT FIRST) ---

# 4.1. Target Encoding 
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y) # 'M' -> 1, 'B' -> 0 [2]

TEST_SIZE_FINAL = 0.15 
VAL_SIZE_RATIO = 0.1764 # (~15% of total samples for validation)

# Split 1: Training Pool (85%) and Test Set (15%), stratified
X_train_full, X_test, Y_train_full, Y_test = train_test_split(
    X.values, encoded_Y, test_size=TEST_SIZE_FINAL, random_state=42, stratify=encoded_Y
)

# Split 2: Training Set (70%) and Validation Set (15%)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_full, Y_train_full, test_size=VAL_SIZE_RATIO, random_state=42, stratify=Y_train_full
)


In [7]:
# --- 5. DATA PREPROCESSING (SCALE AND RESHAPE) (10% Grading Weight) ---

scaler = StandardScaler()

# 5.1. Fit ONLY on Training Data
X_train_scaled = scaler.fit_transform(X_train) # [3]

# 5.2. Transform Validation and Test Data
X_val_scaled = scaler.transform(X_val)       
X_test_scaled = scaler.transform(X_test)     

# 5.3. CRITICAL STEP: Reshape 2D data (N, 30) into 3D (N, 30, 1) for Conv1D [4]
X_train_cnn = np.expand_dims(X_train_scaled, axis=2)
X_val_cnn = np.expand_dims(X_val_scaled, axis=2)
X_test_cnn = np.expand_dims(X_test_scaled, axis=2)

input_shape_cnn = X_train_cnn.shape[1:] # Should be (30, 1)

print(f"\nData successfully scaled and reshaped.")
print(f"1D CNN Input Shape: {input_shape_cnn}")



Data successfully scaled and reshaped.
1D CNN Input Shape: (30, 1)


In [10]:
# --- 6. MODEL ARCHITECTURE DEFINITION (1D-CNN CORE CONTRIBUTION) ---

def create_1d_cnn(input_shape):
    # Sequential API requires layers as a list or use .add() method
    model = Sequential([
        # First layer needs input_shape parameter
        BatchNormalization(input_shape=input_shape, name='BN_1'),
        MaxPooling1D(pool_size=2, name='MaxPool_1'), # Reduces feature dimensionality [6]
        Dropout(0.3, name='Dropout_1'),
        
        Conv1D(filters=32, kernel_size=3, activation='relu', name='Conv1D_2'),
        BatchNormalization(name='BN_2'),
        
        # 2. Flatten Layer: Converts 3D output to 1D vector for Dense layer
        Flatten(name='Flatten_Features'), 
        
        # 3. Dense Classification Head
        Dense(32, activation='relu', name='Dense_Classifier'),
        Dropout(0.3, name='Dropout_2'),
        Dense(1, activation='sigmoid', name='Output_Layer') 
    ], name="1D_CNN_M3")
    return model

model_m3 = create_1d_cnn(input_shape_cnn)
print("\nModel Architecture (1D-CNN - Local Feature Extractor):")
model_m3.summary()


Model Architecture (1D-CNN - Local Feature Extractor):


  super().__init__(**kwargs)


In [11]:
# --- 7. MODEL COMPILATION AND TRAINING ---

model_m3.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy', 
    metrics=['accuracy', keras.metrics.AUC(name='auc')] # AUC is vital for comparison
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=25,          # Slightly higher patience for Conv1D stability
    restore_best_weights=True # [7]
)

print("\nStarting Model Training...")
history_m3 = model_m3.fit(
    X_train_cnn, Y_train,
    epochs=300,             
    batch_size=32,
    validation_data=(X_val_cnn, Y_val),
    callbacks=[early_stopping],
    verbose=2
)
print("Model training finished.")



Starting Model Training...
Epoch 1/300
13/13 - 8s - 623ms/step - accuracy: 0.7834 - auc: 0.8519 - loss: 0.4561 - val_accuracy: 0.9070 - val_auc: 0.9806 - val_loss: 0.5086
Epoch 2/300
13/13 - 1s - 102ms/step - accuracy: 0.9118 - auc: 0.9688 - loss: 0.2171 - val_accuracy: 0.9186 - val_auc: 0.9878 - val_loss: 0.4192
Epoch 3/300
13/13 - 1s - 47ms/step - accuracy: 0.9043 - auc: 0.9622 - loss: 0.2369 - val_accuracy: 0.9302 - val_auc: 0.9925 - val_loss: 0.3789
Epoch 4/300
13/13 - 0s - 34ms/step - accuracy: 0.9194 - auc: 0.9712 - loss: 0.2056 - val_accuracy: 0.9419 - val_auc: 0.9945 - val_loss: 0.3544
Epoch 5/300
13/13 - 0s - 33ms/step - accuracy: 0.9194 - auc: 0.9776 - loss: 0.1842 - val_accuracy: 0.9535 - val_auc: 0.9925 - val_loss: 0.3323
Epoch 6/300
13/13 - 0s - 35ms/step - accuracy: 0.9118 - auc: 0.9675 - loss: 0.2183 - val_accuracy: 0.9535 - val_auc: 0.9933 - val_loss: 0.3157
Epoch 7/300
13/13 - 0s - 34ms/step - accuracy: 0.9270 - auc: 0.9690 - loss: 0.2089 - val_accuracy: 0.9535 - val_

In [12]:
# --- 8. MODEL EVALUATION (5% Grading Weight) ---

print("\nEvaluating Model on the held-out Test Set (15%):")
loss_m3, accuracy_m3, auc_m3 = model_m3.evaluate(X_test_cnn, Y_test, verbose=0) 

print(f"\n--- Member 3 (1D-CNN) Final Test Results ---")
print(f"Test Loss: {loss_m3:.4f}")
print(f"Test Accuracy: {accuracy_m3:.4f}")
print(f"Test AUC-ROC: {auc_m3:.4f}")

# Generate detailed classification report
Y_pred_prob = model_m3.predict(X_test_cnn)
Y_pred_class = (Y_pred_prob > 0.5).astype("int32")

report = classification_report(Y_test, Y_pred_class, target_names=encoder.classes_)
print("\nClassification Report (Key Metrics for Comparison):")
print(report)


Evaluating Model on the held-out Test Set (15%):

--- Member 3 (1D-CNN) Final Test Results ---
Test Loss: 0.0894
Test Accuracy: 0.9535
Test AUC-ROC: 0.9971
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 155ms/step

Classification Report (Key Metrics for Comparison):
              precision    recall  f1-score   support

           B       0.95      0.98      0.96        54
           M       0.97      0.91      0.94        32

    accuracy                           0.95        86
   macro avg       0.96      0.94      0.95        86
weighted avg       0.95      0.95      0.95        86

