In [3]:
# ðŸ“˜ Breast Cancer Image Classification using Random Forest
# Author: Omowumi Akindehinde (Techgod)
# Program: Power Learn Project (PLP) â€“ July Cohort
# Deliverable: Jupyter Notebook + Performance Metrics

# --- Step 1: Import Required Libraries ---
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tensorflow.keras.preprocessing import image_dataset_from_directory
import joblib

# --- Step 2: Load Image Dataset ---
# Directory structure:
# complete_set/
# â”œâ”€â”€ training_set/
# â”‚   â”œâ”€â”€ benign/
# â”‚   â”œâ”€â”€ malignant/

train_dir = "complete_set/training_set"

# Automatically load all images and labels from folders
dataset = image_dataset_from_directory(
    train_dir,
    image_size=(128, 128),  # Resize images for uniformity
    batch_size=32           # Process in small batches
)

# --- Step 3: Inspect Classes ---
class_names = dataset.class_names
print("âœ… Classes found:", class_names)

# --- Step 4: Convert Images and Labels into NumPy Arrays ---
X = []
y = []

for images, labels in dataset:
    X.extend(images.numpy())   # Convert each image batch to NumPy
    y.extend(labels.numpy())   # Convert labels to NumPy

X = np.array(X)
y = np.array(y)

print("âœ… Dataset loaded. Total samples:", len(X))

# --- Step 5: Preprocess Data for Machine Learning Model ---
# Flatten images: (128x128x3 â†’ single vector)
X_flat = X.reshape(len(X), -1)
print("âœ… Feature matrix shape:", X_flat.shape)

# Encode labels: 'benign' â†’ 0, 'malignant' â†’ 1
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# --- Step 6: Split Dataset into Train/Test ---
X_train, X_test, y_train, y_test = train_test_split(
    X_flat, y_encoded, test_size=0.2, random_state=42
)
print("âœ… Data split completed:")
print("   Training samples:", len(X_train))
print("   Testing samples:", len(X_test))

# --- Step 7: Train Random Forest Model ---
model = RandomForestClassifier(
    n_estimators=100,       # Number of decision trees
    random_state=42,
    n_jobs=-1               # Use all CPU cores for faster training
)
model.fit(X_train, y_train)

print("âœ… Model training completed!")

# --- Step 8: Make Predictions ---
y_pred = model.predict(X_test)

# --- Step 9: Evaluate Model Performance ---
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nðŸ“Š PERFORMANCE METRICS ðŸ“Š")
print("Accuracy:", round(accuracy, 4))
print("F1-score:", round(f1, 4))
print("\nDetailed Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=class_names))

# --- Step 10: Save Model for Future Use ---
joblib.dump(model, "breast_cancer_rf_model.pkl")
print("\nðŸ’¾ Model saved as 'breast_cancer_rf_model.pkl'")



Found 1112 files belonging to 2 classes.
âœ… Classes found: ['benign', 'malignant']
âœ… Dataset loaded. Total samples: 1112
âœ… Feature matrix shape: (1112, 49152)
âœ… Data split completed:
   Training samples: 889
   Testing samples: 223
âœ… Model training completed!

ðŸ“Š PERFORMANCE METRICS ðŸ“Š
Accuracy: 0.7713
F1-score: 0.7615

Detailed Classification Report:

              precision    recall  f1-score   support

      benign       0.82      0.88      0.85       164
   malignant       0.59      0.46      0.51        59

    accuracy                           0.77       223
   macro avg       0.70      0.67      0.68       223
weighted avg       0.76      0.77      0.76       223


ðŸ’¾ Model saved as 'breast_cancer_rf_model.pkl'
