In [None]:
# Breast Cancer Detection Using Machine Learning (Binary Classification)

#This notebook builds a breast cancer classifier using:
#- Logistic Regression
#- Decision Tree
#- Random Forest
#- Artificial Neural Network (ANN)

#We:
#1. Load and explore the data  
#2. Preprocess it (encode labels, handle missing values, scale features)  
#3. Train and evaluate models  
#4. Optionally explore PCA and K-Means clustering  
#5. Predict on a new (synthetic) patient sample

## CNN on BreakHis Histopathology Images

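#The final section of this notebook uses the BreakHis breast histopathology image
#dataset, organized into `train/val/test` and `benign/malignant` folders.
#A Convolutional Neural Network (CNN) trained on those images is loaded from a saved
#model file and used to classify tumor images; the training cells are kept commented out.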


In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    silhouette_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

from keras.layers import Dense
from keras.models import Sequential, load_model
from keras.utils import set_random_seed


In [None]:
# Read the breast-cancer table from the CSV file in the working directory.
dataset = pd.read_csv("breast-cancer.csv")

# Quick orientation: leading rows, then column names, then overall dimensions.
print("First 5 rows:")
display(dataset.head(5))

print("\nColumns:")
print(list(dataset.columns))

print("\nShape (rows, columns):", dataset.shape)


In [None]:
# Descriptive statistics for the numeric columns, rendered as a rich table.
print("Summary statistics for numeric features:")
dataset.describe().pipe(display)


In [None]:
# Bar chart of how many rows carry each diagnosis label.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(x='diagnosis', data=dataset, ax=ax)
ax.set_title("Count of Benign (B) vs Malignant (M)")
plt.show()


In [None]:
# One histogram per numeric column, drawn on a single large figure.
dataset.hist(figsize=(20, 20))
# The title is placed slightly above the grid (y > 1) so it does not overlap the top row.
plt.gcf().suptitle("Feature Distributions", y=1.02)
plt.show()


In [None]:
# Data Preprocessing
# Turns the raw `dataset` frame into scaled train/test matrices.
# Both the imputer and the scaler are fitted on the TRAINING rows only, so no
# statistic computed from the held-out test rows can leak into training.

# === 3.1 Drop useless columns ===
# 'id' is not useful for prediction, we drop it if present
if 'id' in dataset.columns:
    dataset = dataset.drop('id', axis=1)

# === 3.2 Encode target label ===
# diagnosis: M/B --> 1/0
# LabelEncoder numbers classes in sorted order, hence B -> 0 and M -> 1.
label_encoder = LabelEncoder()
dataset['diagnosis'] = label_encoder.fit_transform(dataset['diagnosis'])

# === 3.3 Split into features (X) and target (y) ===
X = dataset.drop(columns=['diagnosis'])
y = dataset['diagnosis']

print("Feature columns:")
print(X.columns.tolist())
print("\nX shape:", X.shape)
print("y shape:", y.shape)

# === 3.4 Train–test split ===
# Split the RAW features first. Stratifying on y keeps the benign/malignant
# ratio the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# === 3.5 Handle missing values ===
# Column means are learned from the training rows only and then applied
# unchanged to the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full feature matrix passed through the train-fitted imputer; kept so the
# name X_imputed remains available to any cell that expects it.
X_imputed = imputer.transform(X)

print("\nTrain shape:", X_train.shape)
print("Test shape:", X_test.shape)

# === 3.6 Feature scaling ===
# Mean/std are likewise estimated on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler and imputer for later use (e.g., single-patient prediction)
joblib.dump(scaler, "scaler.save")
joblib.dump(imputer, "imputer.save")


In [None]:
# === 4.1 Logistic Regression ===

# Fit on the standardized training features, then label the held-out rows.
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

# Report overall accuracy, the confusion matrix and per-class metrics.
print("=== Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))
print("\nClassification Report:\n")
print(
    classification_report(
        y_test,
        y_pred_log,
        target_names=['Benign (0)', 'Malignant (1)'],
    )
)


In [None]:
# === 4.2 Decision Tree Classifier ===

# A fixed random_state keeps the fitted tree identical between runs.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train_scaled, y_train)
y_pred_tree = tree_model.predict(X_test_scaled)

# Report overall accuracy, the confusion matrix and per-class metrics.
print("=== Decision Tree Classifier ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_tree))
print("\nClassification Report:\n")
print(
    classification_report(
        y_test,
        y_pred_tree,
        target_names=['Benign (0)', 'Malignant (1)'],
    )
)


In [None]:
# ===============================
# SECTION A — RANDOM FOREST TRAINING & EVALUATION
# ===============================
# RandomForestClassifier and the metric helpers come from the import cell at
# the top of the notebook, so this cell carries no imports of its own.

rf_model = RandomForestClassifier(
    n_estimators=200,         # number of trees in the ensemble
    max_depth=None,           # trees may grow to full depth
    random_state=42,          # reproducible bootstrap / feature sampling
    class_weight='balanced'   # weight classes inversely to their frequency
)

# Train on the SCALED data so the forest consumes the same inputs as the
# other models; every later prediction cell scales its input before calling it.
rf_model.fit(X_train_scaled, y_train)

# Predict on the test set (also scaled)
y_pred_rf = rf_model.predict(X_test_scaled)

# Evaluate
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_conf_mat = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf, target_names=['Benign (0)', 'Malignant (1)'])

print("===== RANDOM FOREST: Training Results =====")
print("Accuracy:", rf_accuracy)
print("\nConfusion Matrix:\n", rf_conf_mat)
print("\nClassification Report:\n", rf_report)


In [None]:
# ===============================
# SECTION B — RANDOM FOREST PREDICTION ON NEW DATA
# ===============================
# Runs one hand-written patient record through the same imputer -> scaler ->
# model chain used by the other prediction cells in this notebook.

# ---- Synthetic Benign Example ----
example_patient = pd.DataFrame({
    'radius_mean': [12.0],
    'texture_mean': [15.0],
    'perimeter_mean': [78.0],
    'area_mean': [450.0],
    'smoothness_mean': [0.08],
    'compactness_mean': [0.07],
    'concavity_mean': [0.04],
    'concave points_mean': [0.03],
    'symmetry_mean': [0.18],
    'fractal_dimension_mean': [0.06],
    'radius_se': [0.3],
    'texture_se': [1.1],
    'perimeter_se': [2.5],
    'area_se': [20.0],
    'smoothness_se': [0.005],
    'compactness_se': [0.01],
    'concavity_se': [0.005],
    'concave points_se': [0.004],
    'symmetry_se': [0.015],
    'fractal_dimension_se': [0.003],
    'radius_worst': [14.0],
    'texture_worst': [20.0],
    'perimeter_worst': [95.0],
    'area_worst': [550.0],
    'smoothness_worst': [0.09],
    'compactness_worst': [0.10],
    'concavity_worst': [0.05],
    'concave points_worst': [0.04],
    'symmetry_worst': [0.20],
    'fractal_dimension_worst': [0.07]
})

# Align columns to X so the values line up with the training feature order.
example_patient = example_patient[X.columns]

# Impute first (a no-op for this fully populated record). This matches the
# other prediction cells and hands the scaler a plain array, which is what it
# was fitted on.
example_imputed = imputer.transform(example_patient)

# Scale with the SAME scaler that was fitted on the training data.
example_scaled = scaler.transform(example_imputed)

# Predict
rf_prediction = rf_model.predict(example_scaled)
print("\n===== RANDOM FOREST: Synthetic Patient Prediction =====")
print("Predicted Class:", rf_prediction[0])

if rf_prediction[0] == 1:
    print("Final Result: MALIGNANT (1)")
else:
    print("Final Result: BENIGN (0)")


In [None]:
# ---- Real Malignant Example from Dataset ----
# NOTE: X holds every row (train and test alike), so the row picked here may
# be one the forest already saw during training.

real_malignant_example = X.loc[y == 1].iloc[[0]]   # first row labelled 1, kept as a one-row frame

print("\n===== REAL MALIGNANT PATIENT DATA =====")
display(real_malignant_example)

# Same chain as training: fill gaps with the fitted imputer, then standardize.
mal_imputed = imputer.transform(real_malignant_example)
mal_scaled = scaler.transform(mal_imputed)

# predict() returns an array with one label per row; take the single entry.
mal_pred = rf_model.predict(mal_scaled)[0]

print("\n===== RANDOM FOREST: Real Malignant Prediction =====")
print("Predicted Class:", mal_pred)
print("Final Result: MALIGNANT (1)" if mal_pred == 1 else "Final Result: BENIGN (0)")


In [None]:
# === 4.3 Artificial Neural Network (ANN) ===
# Small fully connected network: two hidden ReLU layers of 6 units and one
# sigmoid output unit giving the probability of class 1 (malignant).

# Seed Python, NumPy and the Keras backend so weight initialization and batch
# shuffling repeat between runs, like the other models (random_state=42).
# Exact repeatability can still depend on the hardware/software stack.
set_random_seed(42)

# Define input dimension based on number of features
input_dim = X_train_scaled.shape[1]
print("Input dimension for ANN:", input_dim)

ann = Sequential()
ann.add(Dense(units=6, activation='relu', kernel_initializer='uniform', input_dim=input_dim))
ann.add(Dense(units=6, activation='relu', kernel_initializer='uniform'))
ann.add(Dense(units=1, activation='sigmoid', kernel_initializer='uniform'))

# Binary cross-entropy pairs with the single sigmoid output.
ann.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = ann.fit(
    X_train_scaled, y_train,
    batch_size=10,
    epochs=50,
    verbose=0       # set to 1 to see training progress
)

# Evaluate on test set
loss, acc = ann.evaluate(X_test_scaled, y_test, verbose=0)

print("=== ANN (Neural Network) ===")
print("Test Loss:", loss)
print("Test Accuracy:", acc)

# Save model so the prediction cell can reload it from disk.
ann.save("ann_model.h5")


In [None]:
# Training accuracy (left) and training loss (right) per epoch, side by side.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# (history key, panel title, y-axis label) for each panel, left to right.
panels = (
    ('accuracy', 'ANN Training Accuracy', 'Accuracy'),
    ('loss', 'ANN Training Loss', 'Loss'),
)
for ax, (metric_key, panel_title, y_label) in zip(axes, panels):
    ax.plot(history.history[metric_key])
    ax.set_title(panel_title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# Confusion matrix of the Decision Tree predictions on the test set, as a heatmap.
# ConfusionMatrixDisplay comes from the import cell at the top of the notebook.
conf_matrix = confusion_matrix(y_test, y_pred_tree)
disp = ConfusionMatrixDisplay(conf_matrix, display_labels=['Benign (0)', 'Malignant (1)'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix - Decision Tree')
plt.show()


In [None]:
# === 6.1 PCA (2D) === (reduce dimension)
# Project the scaled training features onto their first two principal components.

pca = PCA(n_components=2)  # 2D -> PC1, PC2
X_pca = pca.fit_transform(X_train_scaled)

print("Explained variance ratio (2 components):", pca.explained_variance_ratio_)

plt.figure(figsize=(6, 5))
plt.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.5)
plt.title("PCA Projection (Train Data)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# === 6.2 KMeans clustering on PCA-reduced data === (unsupervised learning)

k = 2  # number of clusters
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps the clustering the same across versions.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_pca)

sil_score = silhouette_score(X_pca, cluster_labels)
print("Silhouette Score (KMeans on PCA-reduced data):", sil_score)

plt.figure(figsize=(6, 5))
colors = ['red', 'green', 'blue']
for i in range(k):
    plt.scatter(
        X_pca[cluster_labels == i, 0],
        X_pca[cluster_labels == i, 1],
        color=colors[i % len(colors)],  # cycle so a larger k cannot index past the palette
        label=f"Cluster {i+1}",
        alpha=0.6
    )

centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], color='black', marker='X', s=100, label='Centers')

plt.title("KMeans Clusters (PCA-reduced data)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()


In [None]:
# Column order of X; any hand-built observation must follow this order.
feature_names = list(X.columns)
print("Feature names (in order):")
print(feature_names)


In [None]:
# === 7. Prediction for a new synthetic patient ===

# One hand-written record covering every feature column.
# Replace these numbers with realistic values or those in your report.
observation_data = {
    'radius_mean': [10.5],
    'texture_mean': [14.0],
    'perimeter_mean': [70.0],
    'area_mean': [330.0],
    'smoothness_mean': [0.08],
    'compactness_mean': [0.05],
    'concavity_mean': [0.02],
    'concave points_mean': [0.01],
    'symmetry_mean': [0.17],
    'fractal_dimension_mean': [0.06],
    'radius_se': [0.3],
    'texture_se': [1.1],
    'perimeter_se': [2.0],
    'area_se': [20.0],
    'smoothness_se': [0.005],
    'compactness_se': [0.01],
    'concavity_se': [0.005],
    'concave points_se': [0.004],
    'symmetry_se': [0.015],
    'fractal_dimension_se': [0.003],
    'radius_worst': [12.5],
    'texture_worst': [16.0],
    'perimeter_worst': [84.0],
    'area_worst': [460.0],
    'smoothness_worst': [0.095],
    'compactness_worst': [0.08],
    'concavity_worst': [0.03],
    'concave points_worst': [0.02],
    'symmetry_worst': [0.20],
    'fractal_dimension_worst': [0.07],
}

# Reorder the columns to match X so values line up with the training layout.
observation_df = pd.DataFrame(observation_data)[X.columns]

# Reload the artifacts written to disk by the preprocessing and ANN cells.
imputer_loaded = joblib.load("imputer.save")
scaler_loaded = joblib.load("scaler.save")
ann_loaded = load_model("ann_model.h5")

# Same chain as training: impute -> scale -> network.
obs_imputed = imputer_loaded.transform(observation_df)
obs_scaled = scaler_loaded.transform(obs_imputed)

# The network emits one sigmoid value per row; take the single entry.
prob = ann_loaded.predict(obs_scaled)[0][0]
pred_class = 1 if prob > 0.5 else 0

print(f"Predicted probability of malignant: {prob:.3f}")
print("Predicted class:", pred_class, "(1 = malignant, 0 = benign)")


In [None]:
# ============================
#  CNN: Load Pre-Trained Model
# ============================

from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image
import numpy as np
import matplotlib.pyplot as plt

# Input resolution (height, width) the images are resized to before prediction;
# it must match the size used when the CNN was trained.
img_height, img_width = 150, 150

# Restore the saved CNN from the working directory.
cnn = load_model("cnn_model_colab.h5")

print("Loaded CNN Model:")
cnn.summary()


In [None]:
def predict_cnn(image_path, threshold=0.5):
    """Classify one image with the loaded CNN and show it with the verdict.

    Relies on the notebook-level names ``cnn``, ``image``, ``img_height`` and
    ``img_width`` defined in the CNN loading cell.

    Parameters
    ----------
    image_path : str
        Path of the image file to classify.
    threshold : float, default 0.5
        The image is labelled malignant when the predicted probability is
        strictly greater than this value.

    Returns
    -------
    tuple
        ``(prob, label)``: the model's output for the image and the matching
        text label, either ``"Malignant (1)"`` or ``"Benign (0)"``.
    """
    # Model input: resized to the network's resolution and scaled to [0, 1].
    img = image.load_img(image_path, target_size=(img_height, img_width))
    img_array = image.img_to_array(img) / 255.0
    # Add the leading batch axis expected by predict().
    img_array = np.expand_dims(img_array, axis=0)

    prob = cnn.predict(img_array)[0][0]
    label = "Malignant (1)" if prob > threshold else "Benign (0)"

    # Display the image at its original resolution, not the resized copy.
    plt.imshow(image.load_img(image_path))
    plt.title(f"Prediction: {label}\nProbability of malignant = {prob:.4f}")
    plt.axis("off")
    plt.show()

    return prob, label


In [None]:
# ============================
#  CNN: Run Prediction on an Image
# ============================

# The default points at a malignant (ductal carcinoma) sample on the original
# author's machine. Set the BREAKHIS_IMAGE_PATH environment variable to classify
# a different file without editing this cell.
# Benign (adenosis) alternative:
#image_path = "C:\\Users\\User\\Desktop\\Computer Science\\Artificial Intelligence\\AI Project\\Dataset\\benign\\SOB\\adenosis\\SOB_B_A_14-22549AB\\40X\\SOB_B_A-14-22549AB-40-014.png"
DEFAULT_IMAGE_PATH = "C:\\Users\\User\\Desktop\\Computer Science\\Artificial Intelligence\\AI Project\\Dataset\\malignant\\SOB\\ductal_carcinoma\\SOB_M_DC_14-2523\\40X\\SOB_M_DC-14-2523-40-010.png"
image_path = os.environ.get("BREAKHIS_IMAGE_PATH", DEFAULT_IMAGE_PATH)

# Fail early with an actionable message instead of an error from inside the image loader.
if not os.path.isfile(image_path):
    raise FileNotFoundError(
        f"Image not found: {image_path!r}. "
        "Set BREAKHIS_IMAGE_PATH to an existing image file."
    )

prob, label = predict_cnn(image_path)

print("\nRaw malignant probability:", prob)
print("Predicted class:", label)


In [None]:
#from tensorflow.keras.preprocessing.image import ImageDataGenerator

#img_height = 150
#img_width = 150
#batch_size = 32  # each batch has 32 images

#train_dir = "Dataset/breakhis_dataset/train"
#val_dir   = "Dataset/breakhis_dataset/val"
#test_dir  = "Dataset/breakhis_dataset/test"

#Normalization:
#train_datagen = ImageDataGenerator(rescale=1./255)
#val_datagen   = ImageDataGenerator(rescale=1./255)
#test_datagen  = ImageDataGenerator(rescale=1./255)

#train_generator = train_datagen.flow_from_directory(
 #   train_dir,
  #  target_size=(img_height, img_width),
   # batch_size=batch_size,
    #class_mode='binary'
#)

#val_generator = val_datagen.flow_from_directory(
 #   val_dir,
  #  target_size=(img_height, img_width),
   # batch_size=batch_size,
    #class_mode='binary'
#)

#test_generator = test_datagen.flow_from_directory(
 #   test_dir,
  #  target_size=(img_height, img_width),
   # batch_size=batch_size,
    #class_mode='binary'
#)


In [None]:
#epochs = 10  # increase to 20 if you want better accuracy

#history_cnn = cnn.fit(
 #   train_generator,
  #  epochs=epochs,
   # validation_data=val_generator
#)


In [None]:
#test_loss, test_acc = cnn.evaluate(test_generator)
#print("CNN Test Accuracy:", test_acc)
#print("CNN Test Loss:", test_loss)


In [None]:
#import matplotlib.pyplot as plt

#plt.figure(figsize=(10,4))

#plt.subplot(1,2,1)
#plt.plot(history_cnn.history['accuracy'], label='train acc')
#plt.plot(history_cnn.history['val_accuracy'], label='val acc')
#plt.title('CNN Accuracy')
#plt.legend()

#plt.subplot(1,2,2)
#plt.plot(history_cnn.history['loss'], label='train loss')
#plt.plot(history_cnn.history['val_loss'], label='val loss')
#plt.title('CNN Loss')
#plt.legend()
#plt.tight_layout()
#plt.show()


In [None]:
#from tensorflow.keras.preprocessing import image
#import numpy as np

#img_path = "TestImages/myTestImageB.png"  # <- adjust if in a different folder

# 1) Load image and resize to same size used in training
#img = image.load_img(img_path, target_size=(img_height, img_width))

# 2) Convert to array
#img_array = image.img_to_array(img)

# 3) Scale like training data (rescale=1./255)
#img_array = img_array / 255.0

# 4) Add batch dimension: shape (1, img_height, img_width, 3)
#img_array = np.expand_dims(img_array, axis=0)

# 5) Predict
#prob = cnn.predict(img_array)[0][0]  # probability of class 1 (malignant)

#print("Raw predicted probability of malignant:", prob)

#if prob > 0.5:
 #   print("Predicted class: 1 (Malignant)")
#else:
 #   print("Predicted class: 0 (Benign)")
