# In [None]:
import os
import numpy as np
import cv2 # OpenCV for image processing
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm

# --- Step 1: Set the Local Dataset Path for Colab ---
# Location of the training images on the mounted Google Drive. The feature
# extraction step below treats every sub-directory of this folder as one class.
train_data_path = '/content/drive/MyDrive/training_set'

# Fail fast when the dataset directory is missing. `isdir` (rather than
# `exists`) also rejects a plain file, which would otherwise only fail later
# inside os.listdir().
if not os.path.isdir(train_data_path):
    print(f"Error: The specified path '{train_data_path}' does not exist or is not a directory.")
    print("Please ensure your Google Drive is mounted and the path is correct.")
    print("The command `from google.colab import drive; drive.mount('/content/drive')` must be run first.")
    # `exit()` is an interactive helper: it reports success (status 0) and, in
    # a notebook kernel, asks the kernel to shut down. Raising SystemExit
    # stops execution with a non-zero status instead.
    raise SystemExit(1)

print(f"Loading dataset from: {train_data_path}")

# --- Step 2: Feature Extraction from Images ---

def extract_features_from_images(folder_path, feature_extractor, batch_size=32):
    """
    Extract flattened CNN features for every readable image under a folder.

    ``folder_path`` must contain one sub-directory per class; the name of the
    sub-directory is used as the label of every image inside it. Entries of
    ``folder_path`` that are not directories are ignored, and files that
    OpenCV cannot decode are skipped silently.

    Directory listings are sorted so that the order of the returned samples
    (and therefore any seeded split performed on them) is reproducible.

    Args:
        folder_path: Root directory holding one sub-directory per class.
        feature_extractor: Model object exposing ``predict(batch, verbose=0)``.
            It is called with batches of images resized to 224x224 and already
            passed through ``preprocess_input``.
        batch_size: Maximum number of images sent to the extractor per
            ``predict`` call.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays: one flattened feature
        vector per image and the matching class-folder name. Both are empty
        when no image could be read.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    features = []
    labels = []
    # (width, height) passed to cv2.resize; matches the 224x224 input the
    # VGG16 extractor in this script is built with.
    target_size = (224, 224)

    # Images (and their labels) waiting to be pushed through the extractor.
    pending_images = []
    pending_labels = []

    def _flush_pending():
        """Run the queued images through the extractor and store the results."""
        if not pending_images:
            return
        batch = preprocess_input(np.stack(pending_images).astype(np.float32))
        outputs = np.asarray(feature_extractor.predict(batch, verbose=0))
        # One flattened feature row per image in the batch.
        features.extend(outputs.reshape(len(pending_images), -1))
        labels.extend(pending_labels)
        pending_images.clear()
        pending_labels.clear()

    for class_label in sorted(os.listdir(folder_path)):
        class_path = os.path.join(folder_path, class_label)
        if not os.path.isdir(class_path):
            continue

        print(f"Extracting features for class: {class_label}")
        for img_name in tqdm(sorted(os.listdir(class_path))):
            img = cv2.imread(os.path.join(class_path, img_name))
            if img is None:
                # Not an image, or unreadable: skip it.
                continue

            # cv2.imread yields BGR, whereas preprocess_input expects RGB
            # (it performs the RGB->BGR swap itself). Without this conversion
            # the network would see images with red and blue exchanged.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            pending_images.append(cv2.resize(img, target_size))
            pending_labels.append(class_label)

            # Predicting in batches avoids the per-call overhead of running
            # the model on one image at a time.
            if len(pending_images) >= batch_size:
                _flush_pending()

    _flush_pending()
    return np.array(features), np.array(labels)

# Build VGG16 with ImageNet weights and without its classification head, so
# the model's output can be used as image features.
print("\nLoading VGG16 model for feature extraction...")
feature_extractor = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

# Run every image found under the local training directory through the model.
X, y = extract_features_from_images(
    folder_path=train_data_path,
    feature_extractor=feature_extractor,
)

# --- Step 3: Train a Random Forest Model ---

# Map the class-folder names to the integer targets used for training.
# 'malignant' is encoded as 1 so that it is the positive class for the
# binary F1-score computed below.
label_mapping = {'malignant': 1, 'benign': 0}

# Report unexpected class folders explicitly instead of failing with a bare
# KeyError during encoding.
unknown_labels = sorted({str(label) for label in y} - set(label_mapping))
if unknown_labels:
    raise ValueError(
        f"Unexpected class folder(s) {unknown_labels}; "
        f"expected only {sorted(label_mapping)}."
    )
y_encoded = np.array([label_mapping[label] for label in y])

# Split the data into training and testing sets
# The test set is used for final evaluation; stratifying keeps the class
# proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")

# Initialize and train the Random Forest Classifier
print("\nTraining Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# --- Step 4: Evaluate the Model ---
print("\nEvaluating the model on the test set...")
y_pred = rf_model.predict(X_test)

# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
# F1-score for binary classification, which is appropriate for this task
f1 = f1_score(y_test, y_pred)

print("--- Performance Metrics ---")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")

# You can also get a detailed report.
# classification_report pairs `target_names` with the labels in order, so the
# names must be listed by encoded value (0 -> 'benign', 1 -> 'malignant').
# Using the dict's insertion order would swap the two rows of the report.
report_labels = sorted(label_mapping.values())
report_names = sorted(label_mapping, key=label_mapping.get)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=report_labels, target_names=report_names))