In [None]:
import os

# Define the dataset path (adjust according to your Kaggle dataset path)
dataset_path = "/kaggle/input/breakhis"

# Display directory structure
for root, dirs, files in os.walk(dataset_path):
    print(root)


In [None]:
!pip install opencv-python-headless


In [None]:
import cv2
import numpy as np

# Preprocessing functions
def resize_image(image, target_size=(250, 250)):
    """
    Resize the image to the target size (250x250) while maintaining all content.
    """
    resized_image = cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)
    return resized_image


def apply_gaussian_filter(image, kernel_size=5, sigma=1.0):
    """Apply Gaussian filter for denoising."""
    denoised_image = cv2.GaussianBlur(image, (kernel_size, kernel_size), sigma)
    return denoised_image

def apply_clahe(image):
    """Apply CLAHE to enhance contrast while retaining original color."""
    lab_image = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)  # Convert to LAB color space
    l_channel, a_channel, b_channel = cv2.split(lab_image)  # Split into L, A, and B channels
    
    # Apply CLAHE only to the L channel (lightness)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l_channel = clahe.apply(l_channel)
    
    # Merge the CLAHE enhanced L channel with the original A and B channels
    clahe_image = cv2.merge((l_channel, a_channel, b_channel))
    
    # Convert back to BGR color space
    enhanced_image = cv2.cvtColor(clahe_image, cv2.COLOR_LAB2BGR)
    final_image = cv2.cvtColor(enhanced_image, cv2.COLOR_BGR2RGB)
    return final_image

def preprocess_image(image_path, target_size=(250, 250), kernel_size=5, sigma=1.0):
    """Complete preprocessing pipeline for a single image."""
    image = cv2.imread(image_path)
    
    # Step 1: Resize image
    resized_image = resize_image(image, target_size)
    
    # Step 2: Apply Gaussian filter (denoising)
    denoised_image = apply_gaussian_filter(resized_image, kernel_size, sigma)
    
    # Step 3: Apply CLAHE (contrast enhancement while retaining colors)
    preprocessed_image = apply_clahe(denoised_image)
    
    return preprocessed_image

def preprocess_dataset(input_dir, output_dir, target_size=(250, 250)):
    """Preprocess all images in the dataset."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if file.endswith((".png", ".jpg", ".jpeg")):
                # Input and output paths
                image_path = os.path.join(root, file)
                relative_path = os.path.relpath(root, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                
                # Ensure output directory exists
                os.makedirs(output_path, exist_ok=True)
                
                # Preprocess and save the image
                preprocessed_image = preprocess_image(image_path, target_size)
                output_file_path = os.path.join(output_path, file)
                cv2.imwrite(output_file_path, preprocessed_image)
                print(f"Processed: {output_file_path}")




In [None]:
import cv2
import numpy as np

# Preprocessing functions
def resize_image(image, target_size=(250, 250)):
    """
    Resize the image to the target size (250x250) while maintaining all content.
    """
    resized_image = cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)
    return resized_image


def apply_gaussian_filter(image, kernel_size=5, sigma=1.0):
    """Apply Gaussian filter for denoising."""
    denoised_image = cv2.GaussianBlur(image, (kernel_size, kernel_size), sigma)
    return denoised_image

def apply_clahe(image):
    """Apply CLAHE to enhance contrast while retaining original color."""
    lab_image = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)  # Convert to LAB color space
    l_channel, a_channel, b_channel = cv2.split(lab_image)  # Split into L, A, and B channels
    
    # Apply CLAHE only to the L channel (lightness)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    l_channel = clahe.apply(l_channel)
    
    # Merge the CLAHE enhanced L channel with the original A and B channels
    clahe_image = cv2.merge((l_channel, a_channel, b_channel))
    
    # Convert back to BGR color space
    enhanced_image = cv2.cvtColor(clahe_image, cv2.COLOR_LAB2BGR)
    final_image = cv2.cvtColor(enhanced_image, cv2.COLOR_BGR2RGB)
    return final_image

def preprocess_image(image_path, target_size=(250, 250), kernel_size=5, sigma=1.0):
    """Complete preprocessing pipeline for a single image."""
    image = cv2.imread(image_path)
    
    # Step 1: Resize image
    resized_image = resize_image(image, target_size)
    
    # Step 2: Apply Gaussian filter (denoising)
    denoised_image = apply_gaussian_filter(resized_image, kernel_size, sigma)
    
    # Step 3: Apply CLAHE (contrast enhancement while retaining colors)
    preprocessed_image = apply_clahe(denoised_image)
    
    return preprocessed_image

def preprocess_dataset(input_dir, output_dir, target_size=(250, 250)):
    """Preprocess all images in the dataset."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    count = 0
    for root, dirs, files in os.walk(input_dir):
        
        for file in files:
            if file.endswith((".png", ".jpg", ".jpeg")):
                # Input and output paths
                image_path = os.path.join(root, file)
                relative_path = os.path.relpath(root, input_dir)
                output_path = os.path.join(output_dir, relative_path)
                
                # Ensure output directory exists
                os.makedirs(output_path, exist_ok=True)
                
                # Preprocess and save the image
                preprocessed_image = preprocess_image(image_path, target_size)
                output_file_path = os.path.join(output_path, file)
                cv2.imwrite(output_file_path, preprocessed_image)
                print(f"{count}", end = " ")
                count += 1




In [None]:
# Example usage
input_dir = dataset_path
output_dir = "/kaggle/working/"
preprocess_dataset(input_dir, output_dir)

In [None]:
#**Visualisation of preprecessed images**


import matplotlib.pyplot as plt

# Function for visualization
def visualize_preprocessing(input_dir, target_size=(250, 250), kernel_size=5, sigma=1.0, num_samples=10):
    """Visualize the preprocessing pipeline with original and processed images."""
    # Collect a list of image paths
    image_paths = []
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.endswith((".png", ".jpg", ".jpeg")):
                image_paths.append(os.path.join(root, file))
    
    # Randomly select samples for visualization
    selected_paths = np.random.choice(image_paths, size=num_samples, replace=False)
    
    # Prepare visualization
    fig, axes = plt.subplots(num_samples, 2, figsize=(10, 5 * num_samples))
    for i, image_path in enumerate(selected_paths):
        # Read and preprocess image
        original_image = cv2.imread(image_path)
        preprocessed_image = preprocess_image(image_path, target_size, kernel_size, sigma)
        
        # Convert BGR to RGB for visualization
        original_image_rgb = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
        preprocessed_image_rgb = cv2.cvtColor(preprocessed_image, cv2.COLOR_BGR2RGB)
        
        # Display original and preprocessed images
        axes[i, 0].imshow(original_image_rgb)
        axes[i, 0].set_title("Original Image")
        axes[i, 0].axis("off")
        
        axes[i, 1].imshow(preprocessed_image_rgb)
        axes[i, 1].set_title("Preprocessed Image")
        axes[i, 1].axis("off")
    
    plt.tight_layout()
    plt.show()

# Example usage
input_dir = "/kaggle/input/breakhis"
visualize_preprocessing(input_dir)


In [None]:
import cv2
import matplotlib.pyplot as plt


def preprocess_image_with_visualization(image_path, target_size=(250, 250), kernel_size=5, sigma=1.0):
    """Complete preprocessing pipeline for a single image with output visualization."""
    image = cv2.imread(image_path)
    plt.figure(figsize=(12, 8))
    plt.subplot(1, 4, 1)
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title("Original Image")
    plt.axis('off')

    # Step 1: Resize image
    resized_image = resize_image(image, target_size)
    plt.figure(figsize=(12, 8))
    plt.subplot(1, 4, 2)
    plt.imshow(cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB))
    plt.title("Step 1: Resized Image")
    plt.axis('off')

    # Step 2: Apply Gaussian filter (denoising)
    denoised_image = apply_gaussian_filter(resized_image, kernel_size, 1.0)
    plt.subplot(1, 4, 3)
    plt.imshow(cv2.cvtColor(denoised_image, cv2.COLOR_BGR2RGB))
    plt.title("Step 2: Denoised Image")
    plt.axis('off')

    # Step 3: Apply CLAHE (contrast enhancement while retaining colors)
    preprocessed_image = apply_clahe(denoised_image)
    plt.subplot(1, 4, 4)
    plt.imshow(cv2.cvtColor(preprocessed_image, cv2.COLOR_BGR2RGB))
    plt.title("Step 3: CLAHE Enhanced Image")
    plt.axis('off')

    plt.tight_layout()
    plt.show()

    return preprocessed_image

# Example usage
image_path = "/kaggle/input/breakhis/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/benign/SOB/adenosis/SOB_B_A_14-22549AB/100X/SOB_B_A-14-22549AB-100-001.png"
preprocessed_image = preprocess_image_with_visualization(image_path)


In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf

# Import pre-trained models and their preprocessing functions
from tensorflow.keras.applications import ResNet50, VGG16, Xception
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess
from tensorflow.keras.applications.xception import preprocess_input as xception_preprocess

# --- 1. Load Pre-trained Models (without top layers) ---
resnet_model = ResNet50(weights="/kaggle/input/transefermodels/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False, pooling="avg")
vgg_model    = VGG16(weights="/kaggle/input/transefermodels/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False, pooling="avg")
xception_model = Xception(weights="/kaggle/input/transefermodels/xception_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False, pooling="avg")

# --- 2. Define Helper Functions ---

def split_image(image):
    """
    Given a 250x250 image, split it into 4 equal quadrants
    and then resize each quadrant to 224x224.
    """
    h, w, _ = image.shape
    mid_h, mid_w = h // 2, w // 2
    quadrants = [
        image[0:mid_h, 0:mid_w],
        image[0:mid_h, mid_w:w],
        image[mid_h:h, 0:mid_w],
        image[mid_h:h, mid_w:w]
    ]
    return [cv2.resize(q, (224, 224)) for q in quadrants]

def extract_features_from_sub_images(sub_images):
    """
    Process each sub-image through the three models and
    concatenate the resulting feature vectors.
    Returns a NumPy array of shape (4, combined_feature_length).
    """
    features_list = []
    for sub_img in sub_images:
        sub_img_batch = np.expand_dims(sub_img, axis=0)
        feat_resnet   = resnet_model.predict(resnet_preprocess(sub_img_batch), verbose=0)
        feat_vgg      = vgg_model.predict(vgg_preprocess(sub_img_batch), verbose=0)
        feat_xception = xception_model.predict(xception_preprocess(sub_img_batch), verbose=0)
        combined = np.concatenate([feat_resnet, feat_vgg, feat_xception], axis=1)
        features_list.append(combined)
    return np.array(features_list)

def process_subclass(subclass_dir, main_class, tumor_type, output_base):
    """
    Process all images in a subclass folder.
    For each image:
      - Load (and convert to RGB), ensure size 250x250,
      - Split into 4 sub-images,
      - Extract features,
      - Append the features and label.
    Finally, save the features and labels as NumPy files.
    """
    features_list = []
    labels_list = []
    for root, dirs, files in os.walk(subclass_dir):
        for file in files:
            if file.lower().endswith((".png", ".jpg", ".jpeg")):
                image_path = os.path.join(root, file)
                image = cv2.imread(image_path)
                if image is None:
                    print(f"Skipping {image_path}: could not load.")
                    continue
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
                if image.shape[0] != 250 or image.shape[1] != 250:
                    image = cv2.resize(image, (250, 250))
                sub_imgs = split_image(image)
                feats = extract_features_from_sub_images(sub_imgs)
                features_list.append(feats)
                labels_list.append(f"{main_class}_{tumor_type}")
                print(f"Processed: {image_path}")
    if features_list:
        features_array = np.array(features_list, dtype=np.float32)
        labels_array = np.array(labels_list)
        os.makedirs(output_base, exist_ok=True)
        np.save(os.path.join(output_base, f"features_{main_class}_{tumor_type}.npy"), features_array)
        np.save(os.path.join(output_base, f"labels_{main_class}_{tumor_type}.npy"), labels_array)
        print(f"Saved {len(features_list)} images for subclass '{main_class}_{tumor_type}'")
    else:
        print(f"No images found in {main_class}_{tumor_type}")

# --- 3. Process All Benign Images ---

base_dir = "/kaggle/working/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/"
benign_dir = os.path.join(base_dir, "benign")
output_base = "/kaggle/working/benign_features/"

if os.path.isdir(benign_dir):
    # Assume structure: benign/SOB/<tumor_type>/
    for sob_folder in os.listdir(benign_dir):
        sob_path = os.path.join(benign_dir, sob_folder)
        if not os.path.isdir(sob_path): continue
        for tumor_type in os.listdir(sob_path):
            tumor_dir = os.path.join(sob_path, tumor_type)
            print(tumor_dir, "  ", type(tumor_dir))
            if not os.path.isdir(tumor_dir): continue
            if tumor_dir != "/kaggle/working/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/benign/SOB/fibroadenoma" : continue
            print(f"\n--- Processing benign subclass: {tumor_type} ---")
            process_subclass(tumor_dir, "benign", tumor_type, output_base)
else:
    print("Benign directory not found.")

print("✅ Benign extraction complete!")


In [1]:
import numpy as np

features = np.load("breakhis_features.npy")
labels = np.load("breakhis_labels.npy")

print("Feature Array Shape:", features.shape)
print("Labels Shape:", labels.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'breakhis_features.npy'

In [None]:
print("First Image Features:\n", features[0])
print("First Image Label:", labels[0])


In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf
from tqdm import tqdm  # progress bar

# Import pre-trained models and their preprocessing functions
from tensorflow.keras.applications import ResNet50, VGG16, Xception
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess
from tensorflow.keras.applications.xception import preprocess_input as xception_preprocess

# --- 1. Load Pre-trained Models ---
resnet_model = ResNet50(weights="/kaggle/input/transefermodels/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False, pooling="avg")
vgg_model    = VGG16(weights="/kaggle/input/transefermodels/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False, pooling="avg")
xception_model = Xception(weights="/kaggle/input/transefermodels/xception_weights_tf_dim_ordering_tf_kernels_notop.h5", include_top=False, pooling="avg")

# --- 2. Helper Functions (same as in Code 1) ---
def split_image(image):
    h, w, _ = image.shape
    mid_h, mid_w = h // 2, w // 2
    quadrants = [
        image[0:mid_h, 0:mid_w],
        image[0:mid_h, mid_w:w],
        image[mid_h:h, 0:mid_w],
        image[mid_h:h, mid_w:w]
    ]
    return [cv2.resize(q, (224, 224)) for q in quadrants]

def extract_features_from_sub_images(sub_images):
    features_list = []
    for sub_img in sub_images:
        sub_img_batch = np.expand_dims(sub_img, axis=0)
        feat_resnet = resnet_model.predict(resnet_preprocess(sub_img_batch), verbose=0)
        feat_vgg = vgg_model.predict(vgg_preprocess(sub_img_batch), verbose=0)
        feat_xception = xception_model.predict(xception_preprocess(sub_img_batch), verbose=0)
        combined = np.concatenate([feat_resnet, feat_vgg, feat_xception], axis=1)
        features_list.append(combined)
    return np.array(features_list)

def process_subclass(subclass_dir, main_class, tumor_type, output_base):
    image_paths = []
    for root, dirs, files in os.walk(subclass_dir):
        for file in files:
            if file.lower().endswith((".png", ".jpg", ".jpeg")):
                image_paths.append(os.path.join(root, file))
    features_list = []
    labels_list = []
    for image_path in tqdm(image_paths, desc=f"Processing {main_class}_{tumor_type}"):
        image = cv2.imread(image_path)
        if image is None:
            print(f"Skipping {image_path}: could not load.")
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if image.shape[0] != 250 or image.shape[1] != 250:
            image = cv2.resize(image, (250, 250))
        sub_imgs = split_image(image)
        feats = extract_features_from_sub_images(sub_imgs)
        features_list.append(feats)
        labels_list.append(f"{main_class}_{tumor_type}")
    if features_list:
        features_array = np.array(features_list, dtype=np.float32)
        labels_array = np.array(labels_list)
        os.makedirs(output_base, exist_ok=True)
        np.save(os.path.join(output_base, f"features_{main_class}_{tumor_type}.npy"), features_array)
        np.save(os.path.join(output_base, f"labels_{main_class}_{tumor_type}.npy"), labels_array)
        print(f"Saved {len(features_list)} images for subclass '{main_class}_{tumor_type}'")
    else:
        print(f"No images found in {main_class}_{tumor_type}")

# --- 3. Process Malignant Ductal and Lobular Subtypes ---
base_dir = "/kaggle/working/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/"
malignant_dir = os.path.join(base_dir, "malignant")
output_base = "/kaggle/working/malignant_features/"

if os.path.isdir(malignant_dir):
    # Assume structure: malignant/SOB/<tumor_type>/
    for sob_folder in os.listdir(malignant_dir):
        sob_path = os.path.join(malignant_dir, sob_folder)
        if not os.path.isdir(sob_path): continue
        for tumor_type in os.listdir(sob_path):
            # Process only ductal and lobular carcinoma folders
            if tumor_type.lower() in ['ductal_carcinoma']:
                tumor_dir = os.path.join(sob_path, tumor_type)
                if not os.path.isdir(tumor_dir): continue
                print(f"\n--- Processing malignant subtype: {tumor_type} ---")
                process_subclass(tumor_dir, "malignant", tumor_type, output_base)
else:
    print("Malignant directory not found.")

print("✅ Malignant ductal and lobular extraction complete!")


In [None]:
!zip -r malignant_features.zip /kaggle/working/malignant_features

In [None]:
!zip -r /kaggle/working/output.zip /kaggle/working/


In [None]:
import os
import glob
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
import pickle

def merge_features_labels(features_dir):
    """
    Given a directory where feature and label .npy files are stored,
    this function searches for all files starting with 'features_' and then
    loads each corresponding labels file (which is expected to have the same name
    with 'features_' replaced by 'labels_').
    
    The arrays from all files are concatenated along the first axis.
    
    Returns:
        all_features: NumPy array of shape (N, 4, D)
        all_labels: NumPy array of shape (N,)
    """
    features_files = glob.glob(os.path.join(features_dir, "features_*.npy"))
    all_features_list = []
    all_labels_list = []
    
    for f_file in features_files:
        # Construct the corresponding label file name.
        l_file = f_file.replace("features_", "labels_")
        print(l_file, " before")
        if l_file == "/kaggle/input/featuresnlabels/features/features_malignant_mucinous_carcinoma (1).npy":
            l_file = "/kaggle/input/featuresnlabels/features/features_malignant_mucinous_carcinoma.npy"
            print(l_file, " after")
        if os.path.exists(l_file):
            feats = np.load(f_file)
            labs = np.load(l_file)
            all_features_list.append(feats)
            all_labels_list.append(labs)
        else:
            print(f"Warning: Corresponding label file for {f_file} not found.")
    
    if all_features_list:
        all_features = np.concatenate(all_features_list, axis=0)
        all_labels = np.concatenate(all_labels_list, axis=0)
        return all_features, all_labels
    else:
        return None, None

def main():
    # ------------------------------
    # 1. Merge Features and Labels
    # ------------------------------
    features_dir = "/kaggle/input/featss/features/"
    all_features, all_labels = merge_features_labels(features_dir)
    
    if all_features is None or all_labels is None:
        print("No features or labels found! Please check your file paths.")
        return

    # all_features is expected to have shape (N, 4, D)
    print("Merged features shape:", all_features.shape)
    print("Merged labels shape:", all_labels.shape)
    
    # ------------------------------
    # 2. Feature Fusion
    # ------------------------------
    # Fuse the features from the 4 sub-images by taking the mean along axis 1.
    # This yields an image-level feature vector for each image, with shape (N, D).
    fused_features = np.mean(all_features, axis=1)
    print("After fusion, fused features shape:", fused_features.shape)
    
    # Check if fused_features is 2D. ExtraTreesClassifier expects a 2D array.
    if fused_features.ndim != 2:
        print("Fused features are not 2D (ndim =", fused_features.ndim, "). Reshaping now.")
        fused_features = fused_features.reshape(fused_features.shape[0], -1)
    print("Final fused features shape (for classifier):", fused_features.shape)
    
    # ------------------------------
    # 3. Feature Selection Using ExtraTreesClassifier
    # ------------------------------
    etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
    etc.fit(fused_features, all_labels)
    
    # Use SelectFromModel to select features with importance above the median.
    sfm = SelectFromModel(etc, threshold="median", prefit=True)
    selected_features = sfm.transform(fused_features)
    
    print("Original fused feature dimension:", fused_features.shape[1])
    print("Dimension after feature selection:", selected_features.shape[1])
    
    # ------------------------------
    # 4. Save the Selected Features and the Feature Selection Model
    # ------------------------------
    selected_features_path = "/kaggle/working/selected_features.npy"
    model_path = "/kaggle/working/feature_selection_model.pkl"
    
    np.save(selected_features_path, selected_features)
    print("Saved selected features to:", selected_features_path)
    
    with open(model_path, "wb") as f:
        pickle.dump(sfm, f)
    print("Saved feature selection model to:", model_path)

if __name__ == "__main__":
    main()


In [None]:
import os
import glob
import numpy as np
import pickle
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Scikit-learn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, log_loss, classification_report, roc_auc_score)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier

# Additional models
from xgboost import XGBClassifier

def merge_features_labels(features_dir):
    """
    Searches the given directory for all files starting with 'features_'.
    For each such file, it loads the corresponding 'labels_' file.
    It then concatenates all feature arrays and label arrays along axis 0.
    
    Returns:
        all_features: NumPy array of shape (N, 4, D) or (N, 4, 1, D)
        all_labels: NumPy array of shape (N,)
    """
    print("Starting to merge feature and label files...")
    features_files = glob.glob(os.path.join(features_dir, "features_*.npy"))
    features_list = []
    labels_list = []
    
    for f_file in tqdm(features_files, desc="Merging feature files"):
        # Construct the corresponding labels file name.
        l_file = f_file.replace("features_", "labels_")
        if os.path.exists(l_file):
            feat = np.load(f_file)
            lab = np.load(l_file)
            features_list.append(feat)
            labels_list.append(lab)
        else:
            print(f"Warning: Label file for {f_file} not found.")
    
    if features_list:
        all_features = np.concatenate(features_list, axis=0)
        all_labels = np.concatenate(labels_list, axis=0)
        return all_features, all_labels
    else:
        return None, None

def main():
    # -------------------------------------------------------
    # 1. Merge Features and Labels from the Folder
    # -------------------------------------------------------
    features_dir = "/kaggle/input/featss/features/"
    all_features, all_labels = merge_features_labels(features_dir)
    
    if all_features is None or all_labels is None:
        print("No features or labels found! Please check your file paths.")
        return

    print("Merged features shape:", all_features.shape)  # e.g., (N, 4, 1, D)
    print("Merged labels shape:", all_labels.shape)      # e.g., (N,)
    
    # -------------------------------------------------------
    # 2. Feature Fusion: Average the sub-image features for each image
    # -------------------------------------------------------
    print("Starting feature fusion...")
    if all_features.ndim == 4 and all_features.shape[2] == 1:
        all_features = np.squeeze(all_features, axis=2)  # Now shape: (N, 4, D)
    fused_features = np.mean(all_features, axis=1)  # Shape: (N, D)
    if fused_features.ndim != 2:
        fused_features = fused_features.reshape(fused_features.shape[0], -1)
    print("Fused features shape (for classifier):", fused_features.shape)
    
    # -------------------------------------------------------
    # 3. Encode Class Labels into Numeric Values
    # -------------------------------------------------------
    print("Encoding class labels...")
    le = LabelEncoder()
    y_encoded = le.fit_transform(all_labels)
    print("Unique classes:", le.classes_)  # Expected 8 unique class names
    
    # -------------------------------------------------------
    # 4. Split Data into Training and Testing Sets (Stratified)
    # -------------------------------------------------------
    print("Splitting data into training and testing sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        fused_features, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
    )
    print("Training set shape:", X_train.shape)
    print("Test set shape:", X_test.shape)
    
    # -------------------------------------------------------
    # 5. Train Individual Base Models
    # -------------------------------------------------------
    print("Training individual base models...")
    clf_lr = LogisticRegression(max_iter=1000, random_state=42)
    clf_svc = SVC(probability=True, random_state=42)
    clf_extra = ExtraTreesClassifier(n_estimators=200, random_state=42)
    clf_ridge = RidgeClassifier(random_state=42)
    clf_xgb = XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='mlogloss')

    
    base_models = [("Logistic Regression", clf_lr),
                   ("SVC", clf_svc),
                   ("Extra Trees", clf_extra),
                   ("Ridge Classifier", clf_ridge),
                   ("XGBoost", clf_xgb),]
    
    for name, clf in tqdm(base_models, desc="Training base models"):
        clf.fit(X_train, y_train)
    
    # -------------------------------------------------------
    # 6. Build a Voting Ensemble Model (Soft Voting)
    # -------------------------------------------------------
    print("Building voting ensemble...")
    voting_clf = VotingClassifier(estimators=[
        ('lr', clf_lr),
        ('svc', clf_svc),
        ('extra', clf_extra),
        ('ridge', clf_ridge),
        ('xgb', clf_xgb)
    ], voting='soft')
    
    print("Evaluating ensemble with cross-validation...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=skf, scoring='accuracy')
    print("\nVoting Ensemble CV Accuracy: {:.4f}".format(cv_scores.mean()))
    
    print("Training voting ensemble on full training set...")
    voting_clf.fit(X_train, y_train)
    
    # -------------------------------------------------------
    # 7. Evaluate Model Performance on the Test Set
    # -------------------------------------------------------
    print("Evaluating ensemble on test set...")
    y_pred = voting_clf.predict(X_test)
    y_prob = voting_clf.predict_proba(X_test)
    
    # Compute evaluation metrics (using macro averaging for multi-class)
    metrics = {}
    metrics['accuracy'] = accuracy_score(y_test, y_pred)
    metrics['precision'] = precision_score(y_test, y_pred, average='macro')
    metrics['recall'] = recall_score(y_test, y_pred, average='macro')
    metrics['f1_score'] = f1_score(y_test, y_pred, average='macro')
    metrics['mcr'] = 1 - metrics['accuracy']
    metrics['log_loss'] = log_loss(y_test, y_prob)
    try:
        metrics['auc_roc'] = roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')
    except Exception as e:
        metrics['auc_roc'] = None

    cm = confusion_matrix(y_test, y_pred)
    metrics['confusion_matrix'] = cm
    
    print("\nEvaluation Metrics:")
    print("  Accuracy:      {:.4f}".format(metrics['accuracy']))
    print("  Precision:     {:.4f}".format(metrics['precision']))
    print("  Recall:        {:.4f}".format(metrics['recall']))
    print("  F1-score:      {:.4f}".format(metrics['f1_score']))
    print("  Misclassification Rate: {:.4f}".format(metrics['mcr']))
    print("  Log Loss:      {:.4f}".format(metrics['log_loss']))
    print("  AUC-ROC:       {:.4f}".format(metrics['auc_roc'] if metrics['auc_roc'] is not None else -1))
    
    print("\nConfusion Matrix:")
    print(cm)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    # -------------------------------------------------------
    # 8. Save the Final Ensemble Model
    # -------------------------------------------------------
    model_path = "/kaggle/working/voting_ensemble.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(voting_clf, f)
    print("Voting ensemble model saved to:", model_path)
    
if __name__ == "__main__":
    main()


In [None]:
import numpy as np

def check_feature_fusion(features_array):
    """
    Checks if the feature array is fused or not.
    
    - If fused, the expected shape is (N, D).
    - If not fused, the shape might be (N, 4, D) or (N, 4, 1, D).
    """
    shape = features_array.shape
    print("Feature array shape:", shape)
    
    if features_array.ndim == 2:
        print("The features are fused (2D array: (N, D)).")
    elif features_array.ndim == 3:
        if shape[1] == 4:
            print("The features are NOT fused (3D array: (N, 4, D)).")
        else:
            print("The features are 3D but do not match expected unfused shape.")
    elif features_array.ndim == 4:
        if shape[1] == 4 and shape[2] == 1:
            print("The features are NOT fused (4D array: (N, 4, 1, D)).")
        else:
            print("The features are 4D but do not match expected unfused shape.")
    else:
        print("Unexpected feature array shape.")

# Example usage:
# Replace 'path/to/features.npy' with your actual feature file path.
features_path = "/kaggle/input/featss/features/features_benign_fibroadenoma.npy"
features = np.load(features_path)

check_feature_fusion(features)


In [None]:
import os
import glob
import numpy as np
import pickle
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Scikit-learn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV

# Additional model
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def merge_fused_features_labels(input_dir):
    """
    Scans the given directory for files starting with 
    'fused_selected_features_' and loads their corresponding
    'fused_selected_labels_' files.
    
    Returns:
        all_features: NumPy array of shape (N, D)
        all_labels: NumPy array of shape (N,)
    """
    feat_files = glob.glob(os.path.join(input_dir, "fused_selected_features_*.npy"))
    features_list = []
    labels_list = []
    
    for feat_file in tqdm(feat_files, desc="Merging fused feature files"):
        label_file = feat_file.replace("fused_selected_features_", "fused_selected_labels_")
        if os.path.exists(label_file):
            feats = np.load(feat_file)
            labs = np.load(label_file)
            features_list.append(feats)
            labels_list.append(labs)
        else:
            print(f"Warning: Label file for {feat_file} not found.")
    
    if features_list:
        all_features = np.concatenate(features_list, axis=0)
        all_labels = np.concatenate(labels_list, axis=0)
        return all_features, all_labels
    else:
        return None, None

def main():
    # Use your input directory containing fused selected features and labels
    input_dir = "/kaggle/input/selected-feats/fused_selected_features/"
    all_features, all_labels = merge_fused_features_labels(input_dir)
    
    if all_features is None or all_labels is None:
        print("No features or labels found. Please check the directory.")
        return
    
    print("Merged fused features shape:", all_features.shape)  # e.g., (7909, D)
    print("Merged labels shape:", all_labels.shape)            # e.g., (7909,)
    
    # Encode labels (if not already numeric)
    le = LabelEncoder()
    y_encoded = le.fit_transform(all_labels)
    
    # Split data into training and testing sets (80/20, stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        all_features, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
    )
    
    # Define your base models
    clf_lr = LogisticRegression(max_iter=1000, random_state=42)
    clf_svc = SVC(probability=True, random_state=42)
    clf_extra = ExtraTreesClassifier(n_estimators=200, random_state=42)
    ridge = RidgeClassifier(random_state=42)
    clf_ridge = CalibratedClassifierCV(ridge, cv=5)
    clf_xgb = XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    clf_lgbm = LGBMClassifier(n_estimators=200, random_state=42)
    
    base_models = [
        ("SVC", clf_svc)
    ]
    
    # Create directory to save base models
    base_model_dir = "/kaggle/working/base_models/"
    os.makedirs(base_model_dir, exist_ok=True)
    
    print("Training and saving base models...")
    for name, clf in tqdm(base_models, desc="Training base models"):
        clf.fit(X_train, y_train)
        model_file = os.path.join(base_model_dir, f"{name}.pkl")
        with open(model_file, "wb") as f:
            pickle.dump(clf, f)
        print(f"Saved {name} model to {model_file}")
    
    print("All base models have been trained and saved.")


if __name__ == "__main__":
    main()


Merging fused feature files: 100%|██████████| 8/8 [00:00<00:00, 97.57it/s]

Merged fused features shape: (7909, 4608)
Merged labels shape: (7909,)





Training and saving base models...


Training base models:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
import os
import glob
import numpy as np
import pickle
import warnings
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Scikit-learn imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.calibration import CalibratedClassifierCV

# Additional models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def merge_fused_features_labels(input_dir):
    """
    Scans the given directory for files starting with 
    'fused_selected_features_' and loads their corresponding
    'fused_selected_labels_' files.
    
    Returns:
        all_features: NumPy array of shape (N, D)
        all_labels: NumPy array of shape (N,)
    """
    feat_files = glob.glob(os.path.join(input_dir, "fused_selected_features_*.npy"))
    features_list = []
    labels_list = []
    
    for feat_file in tqdm(feat_files, desc="Merging fused feature files"):
        label_file = feat_file.replace("fused_selected_features_", "fused_selected_labels_")
        if os.path.exists(label_file):
            feats = np.load(feat_file)
            labs = np.load(label_file)
            features_list.append(feats)
            labels_list.append(labs)
        else:
            print(f"Warning: Label file for {feat_file} not found.")
    
    if features_list:
        all_features = np.concatenate(features_list, axis=0)
        all_labels = np.concatenate(labels_list, axis=0)
        return all_features, all_labels
    else:
        return None, None

def main():
    # -------------------------------------------------------
    # 1. Merge All Fused Selected Feature Files and Labels
    # -------------------------------------------------------
    print("Step 1: Merging fused selected features and labels...")
    input_dir = "/kaggle/input/selected-feats/fused_selected_features/"
    all_features, all_labels = merge_fused_features_labels(input_dir)
    
    if all_features is None or all_labels is None:
        print("No features or labels found. Please check the directory.")
        return
    
    print("Merged fused features shape:", all_features.shape)
    print("Merged labels shape:", all_labels.shape)
    
    # -------------------------------------------------------
    # 2. Encode Class Labels into Numeric Values
    # -------------------------------------------------------
    print("Step 2: Encoding class labels...")
    le = LabelEncoder()
    y_encoded = le.fit_transform(all_labels)
    print("Unique classes:", le.classes_)
    
    # -------------------------------------------------------
    # 3. Split Data into Training and Testing Sets (Stratified)
    # -------------------------------------------------------
    print("Step 3: Splitting data into training and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        all_features, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
    )
    print("Training set shape:", X_train.shape)
    print("Test set shape:", X_test.shape)
    
    # -------------------------------------------------------
    # 4. Load Pre-trained Base Models from Disk
    # -------------------------------------------------------
    print("Step 4: Loading pre-trained base models...")
    base_model_dir = "/kaggle/input/basemodels"
    model_names = ["LogisticRegression", "SVC", "XGBoost", "LightGBM"]
    
    def load_base_models(model_dir, model_names):
        models = []
        for name in model_names:
            model_path = os.path.join(model_dir, f"{name}.pkl")
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            models.append((name, model))
            print(f"Loaded {name} model from {model_path}")
        return models
    
    base_models = load_base_models(base_model_dir, model_names)
    
    # -------------------------------------------------------
    # 5. Build a Voting Ensemble Model (Soft Voting)
    # -------------------------------------------------------
    print("Step 5: Building voting ensemble...")
    voting_clf = VotingClassifier(estimators=base_models, voting='soft')
    
    # -------------------------------------------------------
    # 6. Evaluate Ensemble with Manual Cross-Validation
    # -------------------------------------------------------
    print("Step 6: Evaluating ensemble with cross-validation...")
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    fold_idx = 1
    for train_idx, val_idx in tqdm(skf.split(X_train, y_train), total=skf.get_n_splits(), desc="CV Folds"):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        # Train ensemble on the fold training data
        voting_clf.fit(X_tr, y_tr)
        score = accuracy_score(y_val, voting_clf.predict(X_val))
        cv_scores.append(score)
        print(f"Fold {fold_idx} accuracy: {score:.4f}")
        fold_idx += 1
    print("\nVoting Ensemble CV Accuracy: {:.4f}".format(np.mean(cv_scores)))
    
    print("Step 7: Training voting ensemble on full training set...")
    voting_clf.fit(X_train, y_train)
    
    # -------------------------------------------------------
    # 7. Evaluate Model Performance on the Test Set
    # -------------------------------------------------------
    print("Step 8: Evaluating ensemble on test set...")
    y_pred = voting_clf.predict(X_test)
    y_prob = voting_clf.predict_proba(X_test)
    
    test_acc = accuracy_score(y_test, y_pred)
    print("\nTest Accuracy: {:.4f}".format(test_acc))
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    try:
        auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')
    except Exception as e:
        auc = None
    print("AUC-ROC:", auc)
    
    test_ll = log_loss(y_test, y_prob)
    print("Log Loss:", test_ll)
    
    # -------------------------------------------------------
    # 8. Check if Model is Online (Sample Prediction)
    # -------------------------------------------------------
    print("Step 9: Checking if model is online...")
    sample_input = X_test[0].reshape(1, -1)
    sample_pred = voting_clf.predict(sample_input)
    print("Sample prediction for first test sample:", sample_pred)
    print("Model is online and ready for predictions!")
    
    # -------------------------------------------------------
    # 9. Save the Final Ensemble Model
    # -------------------------------------------------------
    print("Step 10: Saving final ensemble model...")
    ensemble_model_path = "/kaggle/working/voting_ensemble.pkl"
    with open(ensemble_model_path, "wb") as f:
        pickle.dump(voting_clf, f)
    print("Voting ensemble model saved to:", ensemble_model_path)
    
if __name__ == "__main__":
    main()


Step 1: Merging fused selected features and labels...


Merging fused feature files: 100%|██████████| 8/8 [00:01<00:00,  4.60it/s]


Merged fused features shape: (7909, 4608)
Merged labels shape: (7909,)
Step 2: Encoding class labels...
Unique classes: ['benign_adenosis' 'benign_fibroadenoma' 'benign_phyllodes_tumor'
 'benign_tubular_adenoma' 'malignant_ductal_carcinoma'
 'malignant_lobular_carcinoma' 'malignant_mucinous_carcinoma'
 'malignant_papillary_carcinoma']
Step 3: Splitting data into training and test sets...
Training set shape: (6327, 4608)
Test set shape: (1582, 4608)
Step 4: Loading pre-trained base models...
Loaded LogisticRegression model from /kaggle/input/basemodels/LogisticRegression.pkl
Loaded SVC model from /kaggle/input/basemodels/SVC.pkl
Loaded ExtraTrees model from /kaggle/input/basemodels/ExtraTrees.pkl
Loaded CalibratedRidge model from /kaggle/input/basemodels/CalibratedRidge.pkl
Loaded XGBoost model from /kaggle/input/basemodels/XGBoost.pkl
Loaded LightGBM model from /kaggle/input/basemodels/LightGBM.pkl
Step 5: Building voting ensemble...
Step 6: Evaluating ensemble with cross-validation...

CV Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.556908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1174919
[LightGBM] [Info] Number of data points in the train set: 5061, number of used features: 4608
[LightGBM] [Info] Start training from score -2.880345
[LightGBM] [Info] Start training from score -2.055429
[LightGBM] [Info] Start training from score -2.862893
[LightGBM] [Info] Start training from score -2.632166
[LightGBM] [Info] Start training from score -0.829477
[LightGBM] [Info] Start training from score -2.535358
[LightGBM] [Info] Start training from score -2.298838
[LightGBM] [Info] Start training from score -2.645997


In [3]:
import os
import glob
import numpy as np
import pickle
import warnings
import time
from tqdm import tqdm
from imblearn.over_sampling import SMOTE

# Scikit-learn imports
from sklearn.preprocessing import RobustScaler, PowerTransformer, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

import logging, sys
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

def merge_fused_features_labels(input_dir):
    """
    Scans the given directory for files starting with 'fused_selected_features_'
    and loads their corresponding 'fused_selected_labels_' files.
    Returns:
        all_features: NumPy array of shape (N, D)
        all_labels: NumPy array of shape (N,)
    """
    print(f"Merging fused feature files from: {input_dir}")
    feat_files = glob.glob(os.path.join(input_dir, "fused_selected_features_*.npy"))
    features_list = []
    labels_list = []
    for feat_file in tqdm(feat_files, desc="Merging fused feature files"):
        label_file = feat_file.replace("fused_selected_features_", "fused_selected_labels_")
        if os.path.exists(label_file):
            feats = np.load(feat_file)
            labs = np.load(label_file)
            features_list.append(feats)
            labels_list.append(labs)
            print(f"Loaded {os.path.basename(feat_file)} with shape {feats.shape}")
        else:
            logger.warning(f"Label file for {feat_file} not found.")
    if features_list:
        all_features = np.concatenate(features_list, axis=0)
        all_labels = np.concatenate(labels_list, axis=0)
        print("Merging completed.")
        return all_features, all_labels
    else:
        return None, None

def preprocess_features(X, feature_threshold=0.95):
    """
    Preprocess features by removing quasi-constant features, robust scaling,
    and applying Yeo-Johnson transformation.
    """
    print("Preprocessing: Removing quasi-constant features...")
    selector = VarianceThreshold(threshold=0.01)
    X = selector.fit_transform(X)
    
    print("Preprocessing: Applying RobustScaler...")
    scaler = RobustScaler()
    X = scaler.fit_transform(X)
    
    print("Preprocessing: Applying PowerTransformer (Yeo-Johnson)...")
    power = PowerTransformer(method='yeo-johnson')
    X = power.fit_transform(X)
    
    return X

# Module A: Execute Data Loading and Preprocessing
print("Module A: Data Loading and Preprocessing started.")
input_dir = "/kaggle/input/selected-feats/fused_selected_features/"
all_features, all_labels = merge_fused_features_labels(input_dir)

if all_features is None or all_labels is None:
    logger.error("No features or labels found. Check the directory and file naming.")
    sys.exit(1)

print(f"Merged features shape: {all_features.shape}")
print(f"Merged labels shape: {all_labels.shape}")

print("Preprocessing features...")
all_features = preprocess_features(all_features)
print(f"Preprocessed feature shape: {all_features.shape}")

print("Encoding class labels...")
le = LabelEncoder()
y_encoded = le.fit_transform(all_labels)
print(f"Unique classes: {le.classes_}")

print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    all_features, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
)
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

print("Applying SMOTE for class balancing on training data...")
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)
print(f"Balanced training set shape: {X_train_bal.shape}")

# Save preprocessed data if desired
with open("/kaggle/working/preprocessed_data.pkl", "wb") as f:
    pickle.dump((X_train_bal, y_train_bal, X_test, y_test, le), f)

print("Module A completed successfully.")


Merging fused feature files: 100%|██████████| 8/8 [00:00<00:00, 78.62it/s]


In [2]:
from IPython.display import FileLink
FileLink("merged_fused_selected_features.npy")


In [None]:
import os
import glob
import numpy as np
import pickle
import warnings
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import logging, sys

warnings.filterwarnings("ignore")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

def merge_fused_features_labels(input_dir):
    """
    Scans the given directory for files starting with 
    'fused_selected_features_' and loads their corresponding
    'fused_selected_labels_' files.
    
    Returns:
        all_features: NumPy array of shape (N, D)
        all_labels: NumPy array of shape (N,)
    """
    print(f"Merging fused feature files from: {input_dir}")
    feat_files = glob.glob(os.path.join(input_dir, "fused_selected_features_*.npy"))
    features_list = []
    labels_list = []
    
    for feat_file in tqdm(feat_files, desc="Merging fused feature files"):
        label_file = feat_file.replace("fused_selected_features_", "fused_selected_labels_")
        if os.path.exists(label_file):
            feats = np.load(feat_file)
            labs = np.load(label_file)
            features_list.append(feats)
            labels_list.append(labs)
            print(f"Loaded {os.path.basename(feat_file)} with shape {feats.shape}")
        else:
            logger.warning(f"Label file for {feat_file} not found.")
    
    if features_list:
        all_features = np.concatenate(features_list, axis=0)
        all_labels = np.concatenate(labels_list, axis=0)
        print("Merging completed.")
        return all_features, all_labels
    else:
        return None, None

def main():
    # -------------------------------------------------------
    # 1. Merge All Fused Selected Feature Files and Labels
    # -------------------------------------------------------
    input_dir = "/kaggle/input/selected-feats/fused_selected_features/"
    all_features, all_labels = merge_fused_features_labels(input_dir)
    
    if all_features is None or all_labels is None:
        logger.error("No features or labels found. Please check the directory.")
        return
    
    # Save merged features and labels for later reference (optional)
    merged_feat_path = "/kaggle/working/merged_fused_selected_features.npy"
    merged_labels_path = "/kaggle/working/merged_labels.npy"
    np.save(merged_feat_path, all_features)
    np.save(merged_labels_path, all_labels)
    print(f"Merged features saved to: {merged_feat_path}")
    print(f"Merged labels saved to: {merged_labels_path}")
    
    print(f"Merged fused features shape: {all_features.shape}")
    print(f"Merged labels shape: {all_labels.shape}")
    
    # -------------------------------------------------------
    # 2. Encode Class Labels into Numeric Values
    # -------------------------------------------------------
    print("Encoding class labels...")
    le = LabelEncoder()
    y_encoded = le.fit_transform(all_labels)
    print(f"Unique classes: {le.classes_}")
    
    # -------------------------------------------------------
    # 3. Split Data into Training and Testing Sets (Stratified)
    # -------------------------------------------------------
    print("Splitting data into training and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        all_features, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
    )
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    
    # -------------------------------------------------------
    # 4. Load Pre-trained Base Models from Disk
    # -------------------------------------------------------
    base_model_dir = "/kaggle/input/basemodels"
    model_names = ["LogisticRegression", "SVC", "ExtraTrees", "CalibratedRidge", "XGBoost"]
    def load_base_models(model_dir, model_names):
        models = []
        for name in model_names:
            model_path = os.path.join(model_dir, f"{name}.pkl")
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            models.append((name, model))
            print(f"Loaded model {name} from {model_path}")
        return models
    base_models = load_base_models(base_model_dir, model_names)
    
    # -------------------------------------------------------
    # 5. Build a Voting Ensemble Model (Soft Voting)
    # -------------------------------------------------------
    print("Building voting ensemble...")
    voting_clf = VotingClassifier(estimators=base_models, voting='soft', n_jobs=-1)
    
    # Evaluate using 5-fold stratified cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=skf, scoring='accuracy', n_jobs=-1)
    print(f"\nVoting Ensemble CV Accuracy: {cv_scores.mean():.4f}")
    
    print("Training voting ensemble on full training set...")
    voting_clf.fit(X_train, y_train)
    
    # -------------------------------------------------------
    # 6. Evaluate Model Performance on the Test Set
    # -------------------------------------------------------
    print("Evaluating ensemble on test set...")
    y_pred = voting_clf.predict(X_test)
    y_prob = voting_clf.predict_proba(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"\nTest Accuracy: {acc:.4f}")
    
    print("\nClassification Report:")
    print("\n" + classification_report(y_test, y_pred, target_names=le.classes_))
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print("\n" + str(cm))
    
    try:
        auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')
    except Exception as e:
        auc = None
    print(f"AUC-ROC: {auc}")
    
    ll = log_loss(y_test, y_prob)
    print(f"Log Loss: {ll}")
    
    # -------------------------------------------------------
    # 7. Check if Model is Online (Sample Prediction)
    # -------------------------------------------------------
    sample_input = X_test[0].reshape(1, -1)
    sample_pred = voting_clf.predict(sample_input)
    print(f"\nSample prediction for first test sample: {sample_pred}")
    print("Model is online and ready for predictions!")
    
    # -------------------------------------------------------
    # 8. Save the Final Ensemble Model
    # -------------------------------------------------------
    ensemble_model_path = "/kaggle/working/voting_ensemble.pkl"
    with open(ensemble_model_path, "wb") as f:
        pickle.dump(voting_clf, f)
    print(f"Voting ensemble model saved to: {ensemble_model_path}")

if __name__ == "__main__":
    main()


Merging fused feature files from: /kaggle/input/selected-feats/fused_selected_features/


Merging fused feature files: 100%|██████████| 8/8 [00:00<00:00, 89.66it/s]

Loaded fused_selected_features_benign_fibroadenoma.npy with shape (1014, 4608)
Loaded fused_selected_features_benign_tubular_adenoma.npy with shape (569, 4608)
Loaded fused_selected_features_malignant_lobular_carcinoma.npy with shape (626, 4608)
Loaded fused_selected_features_malignant_ductal_carcinoma.npy with shape (3451, 4608)
Loaded fused_selected_features_benign_adenosis.npy with shape (444, 4608)
Loaded fused_selected_features_benign_phyllodes_tumor.npy with shape (453, 4608)
Loaded fused_selected_features_malignant_mucinous_carcinoma.npy with shape (792, 4608)
Loaded fused_selected_features_malignant_papillary_carcinoma.npy with shape (560, 4608)
Merging completed.





Merged features saved to: /kaggle/working/merged_fused_selected_features.npy
Merged labels saved to: /kaggle/working/merged_labels.npy
Merged fused features shape: (7909, 4608)
Merged labels shape: (7909,)
Encoding class labels...
Unique classes: ['benign_adenosis' 'benign_fibroadenoma' 'benign_phyllodes_tumor'
 'benign_tubular_adenoma' 'malignant_ductal_carcinoma'
 'malignant_lobular_carcinoma' 'malignant_mucinous_carcinoma'
 'malignant_papillary_carcinoma']
Splitting data into training and test sets...
Training set shape: (6327, 4608)
Test set shape: (1582, 4608)
Loaded model LogisticRegression from /kaggle/input/basemodels/LogisticRegression.pkl
Loaded model SVC from /kaggle/input/basemodels/SVC.pkl
Loaded model ExtraTrees from /kaggle/input/basemodels/ExtraTrees.pkl
Loaded model CalibratedRidge from /kaggle/input/basemodels/CalibratedRidge.pkl
Loaded model XGBoost from /kaggle/input/basemodels/XGBoost.pkl
Building voting ensemble...

Voting Ensemble CV Accuracy: 0.8132
Training vot

In [2]:
import os
import glob
import numpy as np
import pickle
import warnings
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, log_loss, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import logging, sys

warnings.filterwarnings("ignore")
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

def merge_fused_features_labels(input_dir):
    """
    Scans the given directory for files starting with 
    'fused_selected_features_' and loads their corresponding
    'fused_selected_labels_' files.
    
    Returns:
        all_features: NumPy array of shape (N, D)
        all_labels: NumPy array of shape (N,)
    """
    print(f"Merging fused feature files from: {input_dir}")
    feat_files = glob.glob(os.path.join(input_dir, "fused_selected_features_*.npy"))
    features_list = []
    labels_list = []
    
    for feat_file in tqdm(feat_files, desc="Merging fused feature files"):
        label_file = feat_file.replace("fused_selected_features_", "fused_selected_labels_")
        if os.path.exists(label_file):
            feats = np.load(feat_file)
            labs = np.load(label_file)
            features_list.append(feats)
            labels_list.append(labs)
            print(f"Loaded {os.path.basename(feat_file)} with shape {feats.shape}")
        else:
            logger.warning(f"Label file for {feat_file} not found.")
    
    if features_list:
        all_features = np.concatenate(features_list, axis=0)
        all_labels = np.concatenate(labels_list, axis=0)
        print("Merging completed.")
        return all_features, all_labels
    else:
        return None, None

def main():
    # -------------------------------------------------------
    # 1. Merge All Fused Selected Feature Files and Labels
    # -------------------------------------------------------
    input_dir = "/kaggle/input/selected-feats/fused_selected_features/"
    all_features, all_labels = merge_fused_features_labels(input_dir)
    
    if all_features is None or all_labels is None:
        logger.error("No features or labels found. Please check the directory.")
        return
    
    # Save merged features and labels for later reference (optional)
    merged_feat_path = "/kaggle/working/merged_fused_selected_features.npy"
    merged_labels_path = "/kaggle/working/merged_labels.npy"
    np.save(merged_feat_path, all_features)
    np.save(merged_labels_path, all_labels)
    print(f"Merged features saved to: {merged_feat_path}")
    print(f"Merged labels saved to: {merged_labels_path}")
    
    print(f"Merged fused features shape: {all_features.shape}")
    print(f"Merged labels shape: {all_labels.shape}")
    
    # -------------------------------------------------------
    # 2. Encode Class Labels into Numeric Values
    # -------------------------------------------------------
    print("Encoding class labels...")
    le = LabelEncoder()
    y_encoded = le.fit_transform(all_labels)
    print(f"Unique classes: {le.classes_}")
    
    # -------------------------------------------------------
    # 3. Split Data into Training and Testing Sets (Stratified)
    # -------------------------------------------------------
    print("Splitting data into training and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        all_features, y_encoded, test_size=0.20, random_state=42, stratify=y_encoded
    )
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    
    # -------------------------------------------------------
    # 4. Load Pre-trained Base Models from Disk
    # -------------------------------------------------------
    base_model_dir = "/kaggle/input/basemodels"
    model_names = ["LogisticRegression", "SVC", "ExtraTrees", "CalibratedRidge", "XGBoost"]
    def load_base_models(model_dir, model_names):
        models = []
        for name in model_names:
            model_path = os.path.join(model_dir, f"{name}.pkl")
            with open(model_path, "rb") as f:
                model = pickle.load(f)
            models.append((name, model))
            print(f"Loaded model {name} from {model_path}")
        return models
    base_models = load_base_models(base_model_dir, model_names)
    
    # -------------------------------------------------------
    # 5. Build a Voting Ensemble Model (Soft Voting)
    # -------------------------------------------------------
    print("Building voting ensemble...")
    voting_clf = VotingClassifier(estimators=base_models, voting='soft', n_jobs=-1)

    print("Training voting ensemble on full training set...")
    voting_clf.fit(X_train, y_train)
    
    # -------------------------------------------------------
    # 6. Evaluate Model Performance on the Test Set
    # -------------------------------------------------------
    print("Evaluating ensemble on test set...")
    y_pred = voting_clf.predict(X_test)
    y_prob = voting_clf.predict_proba(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    print(f"\nTest Accuracy: {acc:.4f}")
    
    print("\nClassification Report:")
    print("\n" + classification_report(y_test, y_pred, target_names=le.classes_))
    
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print("\n" + str(cm))
    
    try:
        auc = roc_auc_score(y_test, y_prob, multi_class='ovr', average='macro')
    except Exception as e:
        auc = None
    print(f"AUC-ROC: {auc}")
    
    ll = log_loss(y_test, y_prob)
    print(f"Log Loss: {ll}")
    
    # -------------------------------------------------------
    # 7. Check if Model is Online (Sample Prediction)
    # -------------------------------------------------------
    sample_input = X_test[0].reshape(1, -1)
    sample_pred = voting_clf.predict(sample_input)
    print(f"\nSample prediction for first test sample: {sample_pred}")
    print("Model is online and ready for predictions!")
    
    # -------------------------------------------------------
    # 8. Save the Final Ensemble Model
    # -------------------------------------------------------
    ensemble_model_path = "/kaggle/working/voting_ensemble.pkl"
    with open(ensemble_model_path, "wb") as f:
        pickle.dump(voting_clf, f)
    print(f"Voting ensemble model saved to: {ensemble_model_path}")

if __name__ == "__main__":
    main()


Merging fused feature files from: /kaggle/input/selected-feats/fused_selected_features/


Merging fused feature files:  25%|██▌       | 2/8 [00:00<00:01,  5.26it/s]

Loaded fused_selected_features_benign_fibroadenoma.npy with shape (1014, 4608)
Loaded fused_selected_features_benign_tubular_adenoma.npy with shape (569, 4608)


Merging fused feature files:  38%|███▊      | 3/8 [00:00<00:00,  6.23it/s]

Loaded fused_selected_features_malignant_lobular_carcinoma.npy with shape (626, 4608)


Merging fused feature files:  75%|███████▌  | 6/8 [00:01<00:00,  4.96it/s]

Loaded fused_selected_features_malignant_ductal_carcinoma.npy with shape (3451, 4608)
Loaded fused_selected_features_benign_adenosis.npy with shape (444, 4608)
Loaded fused_selected_features_benign_phyllodes_tumor.npy with shape (453, 4608)


Merging fused feature files: 100%|██████████| 8/8 [00:01<00:00,  5.42it/s]


Loaded fused_selected_features_malignant_mucinous_carcinoma.npy with shape (792, 4608)
Loaded fused_selected_features_malignant_papillary_carcinoma.npy with shape (560, 4608)
Merging completed.
Merged features saved to: /kaggle/working/merged_fused_selected_features.npy
Merged labels saved to: /kaggle/working/merged_labels.npy
Merged fused features shape: (7909, 4608)
Merged labels shape: (7909,)
Encoding class labels...
Unique classes: ['benign_adenosis' 'benign_fibroadenoma' 'benign_phyllodes_tumor'
 'benign_tubular_adenoma' 'malignant_ductal_carcinoma'
 'malignant_lobular_carcinoma' 'malignant_mucinous_carcinoma'
 'malignant_papillary_carcinoma']
Splitting data into training and test sets...
Training set shape: (6327, 4608)
Test set shape: (1582, 4608)
Loaded model LogisticRegression from /kaggle/input/basemodels/LogisticRegression.pkl
Loaded model SVC from /kaggle/input/basemodels/SVC.pkl
Loaded model ExtraTrees from /kaggle/input/basemodels/ExtraTrees.pkl
Loaded model CalibratedRi

In [1]:
!pip freeze > requirements_kaggle.txt
