In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
)
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [2]:
RAW_DATA_PATH = "data/Brain Tumor.csv"
IMAGES_DIR = "data/images"
IMAGE_SIZE = (64, 64)
RANDOM_STATE = None

# Load raw dataset
df_raw = pd.read_csv(RAW_DATA_PATH)
print(f"Raw dataset shape: {df_raw.shape}")
print(df_raw.head(2))

# Define feature categories
FIRST_ORDER_FEATURES = ["Mean", "Variance", "Standard Deviation", "Skewness", "Kurtosis"]
SECOND_ORDER_FEATURES = ["Contrast", "Energy", "ASM", "Entropy", "Homogeneity", "Dissimilarity", "Correlation", "Coarseness"]

# Verify all features exist in the dataset
all_features = FIRST_ORDER_FEATURES + SECOND_ORDER_FEATURES
missing = [f for f in all_features if f not in df_raw.columns]
if missing:
    raise ValueError(f"Missing expected features in dataset: {missing}")

available_features = [col for col in df_raw.columns if col not in ["Image", "Class"]]
print(f"\nFirst order features: {FIRST_ORDER_FEATURES}")
print(f"Second order features: {SECOND_ORDER_FEATURES}")
print(f"\nAvailable features in dataset: {available_features}")


Raw dataset shape: (3762, 15)
    Image  Class      Mean    Variance  Standard Deviation   Entropy  \
0  Image1      0  6.535339  619.587845           24.891522  0.109059   
1  Image2      0  8.749969  805.957634           28.389393  0.266538   

   Skewness   Kurtosis   Contrast    Energy       ASM  Homogeneity  \
0  4.276477  18.900575  98.613971  0.293314  0.086033     0.530941   
1  3.718116  14.464618  63.858816  0.475051  0.225674     0.651352   

   Dissimilarity  Correlation     Coarseness  
0       4.473346     0.981939  7.458341e-155  
1       3.220072     0.988834  7.458341e-155  

First order features: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis']
Second order features: ['Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']

Available features in dataset: ['Mean', 'Variance', 'Standard Deviation', 'Entropy', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity', 'Dissimilarity', 'Correlation', 

In [3]:
# Helper functions for data preparation and model training

def prepare_tabular_data(df, feature_list):
    """Prepare tabular data with specified features."""
    X = df[feature_list].copy()
    y = df["Class"].copy()
    
    # Remove rows with infinite values
    inf_mask = np.isinf(X).any(axis=1)
    if inf_mask.any():
        print(f"Removing {inf_mask.sum()} rows containing infinite values.")
        X = X.loc[~inf_mask].reset_index(drop=True)
        y = y.loc[~inf_mask].reset_index(drop=True)
    
    # Note: NaN imputation is handled in the pipeline to avoid data leakage
    
    print(f"Tabular data shape: {X.shape}")
    print(f"Features used: {feature_list}")
    print(f"Class counts:\n{y.value_counts()}\n")
    
    return X, y


def load_and_preprocess_image(image_path, Class_size=IMAGE_SIZE):
    """Load and preprocess an image."""
    try:
        img = Image.open(image_path)
        if img.mode != "L":
            img = img.convert("L")
        img = img.resize(Class_size)
        return (np.array(img, dtype=np.float32) / 255.0).flatten()
    except Exception as exc:
        print(f"Failed to load {image_path}: {exc}")
        return None


def build_combined_dataset(df, feature_list, images_dir=IMAGES_DIR, Class_size=IMAGE_SIZE):
    """Build dataset combining tabular features with image features."""
    X_tab = df[feature_list].copy()
    y = df["Class"].copy()
    
    # Remove rows with infinite values
    inf_mask = np.isinf(X_tab).any(axis=1)
    if inf_mask.any():
        print(f"Removing {inf_mask.sum()} rows with inf values.")
        df = df.loc[~inf_mask].reset_index(drop=True)
        X_tab = X_tab.loc[~inf_mask].reset_index(drop=True)
        y = y.loc[~inf_mask].reset_index(drop=True)
    
    # Note: NaN imputation is handled in the pipeline to avoid data leakage
    
    # Load image features
    image_features = []
    valid_indices = []
    for idx, image_name in enumerate(df["Image"]):
        if not isinstance(image_name, str):
            continue
        filename = image_name if image_name.endswith(".jpg") else f"{image_name}.jpg"
        path = os.path.join(images_dir, filename)
        if not os.path.exists(path):
            continue
        features = load_and_preprocess_image(path, Class_size)
        if features is not None:
            image_features.append(features)
            valid_indices.append(idx)
    
    if not image_features:
        raise ValueError("No images could be processed.")
    
    X_tab_valid = X_tab.iloc[valid_indices].reset_index(drop=True)
    y_valid = y.iloc[valid_indices].reset_index(drop=True)
    X_image = np.vstack(image_features)
    X_combined = np.hstack([X_tab_valid.values, X_image])
    
    print(f"Combined dataset: {X_combined.shape[0]} samples, {X_tab_valid.shape[1]} tabular + {X_image.shape[1]} image features")
    print(f"Features used: {feature_list}")
    print(f"Class counts:\n{y_valid.value_counts()}\n")
    
    return X_combined, y_valid


def build_image_only_dataset(df, images_dir=IMAGES_DIR, Class_size=IMAGE_SIZE):
    """Build dataset with only image features (no tabular features)."""
    y = df["Class"].copy()
    
    # Load image features
    image_features = []
    valid_indices = []
    for idx, image_name in enumerate(df["Image"]):
        if not isinstance(image_name, str):
            continue
        filename = image_name if image_name.endswith(".jpg") else f"{image_name}.jpg"
        path = os.path.join(images_dir, filename)
        if not os.path.exists(path):
            continue
        features = load_and_preprocess_image(path, Class_size)
        if features is not None:
            image_features.append(features)
            valid_indices.append(idx)
    
    if not image_features:
        raise ValueError("No images could be processed.")
    
    y_valid = y.iloc[valid_indices].reset_index(drop=True)
    X_image = np.vstack(image_features)
    
    print(f"Image-only dataset: {X_image.shape[0]} samples, {X_image.shape[1]} image features")
    print(f"Class counts:\n{y_valid.value_counts()}\n")
    
    return X_image, y_valid


def train_and_evaluate_model(X_train, X_test, y_train, y_test, run_name):
    """Train and evaluate a logistic regression model."""
    param_grid = [
        {
            "classifier__C": [0.01, 0.1, 1, 10],
            "classifier__penalty": ["l1"],
            "classifier__solver": ["liblinear", "saga"],
        },
        {
            "classifier__C": [0.01, 0.1, 1, 10],
            "classifier__penalty": ["l2"],
            "classifier__solver": ["lbfgs", "saga"],
        },
    ]
    
    pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
    ])
    
    print(f"Train/Test split sizes: {len(X_train)} / {len(X_test)}")
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    
    search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring="accuracy",
        n_jobs=-1,
        verbose=1,
    )
    
    search.fit(X_train, y_train)
    print(f"Best params: {search.best_params_}")
    print(f"Mean CV accuracy (best params): {search.best_score_:.4f}")
    
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("Confusion matrix (test):")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification report (test):")
    print(classification_report(y_test, y_pred, digits=4))
    
    return best_model, search.best_params_, search.best_score_, accuracy_score(y_test, y_pred)


In [4]:
# ============================================================================
# RUN 1: Only First Order Features (Tabular Only)
# ============================================================================
print("=" * 70)
print("RUN 1: Only First Order Features (Tabular Only)")
print("=" * 70)

X_first, y_first = prepare_tabular_data(df_raw, FIRST_ORDER_FEATURES)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_first, y_first, test_size=0.2, random_state=RANDOM_STATE, stratify=y_first
)

model_1, params_1, cv_acc_1, test_acc_1 = train_and_evaluate_model(
    X_train_1, X_test_1, y_train_1, y_test_1, "Run 1: First Order Only"
)


RUN 1: Only First Order Features (Tabular Only)
Tabular data shape: (3762, 5)
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis']
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy (best params): 0.8807
Test accuracy: 0.8818
Confusion matrix (test):
[[375  41]
 [ 48 289]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.8865    0.9014    0.8939       416
           1     0.8758    0.8576    0.8666       337

    accuracy                         0.8818       753
   macro avg     0.8811    0.8795    0.8802       753
weighted avg     0.8817    0.8818    0.8817       753





In [5]:
# ============================================================================
# RUN 2: Only Second Order Features (Tabular Only)
# ============================================================================
print("=" * 70)
print("RUN 2: Only Second Order Features (Tabular Only)")
print("=" * 70)

X_second, y_second = prepare_tabular_data(df_raw, SECOND_ORDER_FEATURES)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_second, y_second, test_size=0.2, random_state=RANDOM_STATE, stratify=y_second
)

model_2, params_2, cv_acc_2, test_acc_2 = train_and_evaluate_model(
    X_train_2, X_test_2, y_train_2, y_test_2, "Run 2: Second Order Only"
)


RUN 2: Only Second Order Features (Tabular Only)
Tabular data shape: (3762, 8)
Features used: ['Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy (best params): 0.9787
Test accuracy: 0.9801
Confusion matrix (test):
[[411   5]
 [ 10 327]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9762    0.9880    0.9821       416
           1     0.9849    0.9703    0.9776       337

    accuracy                         0.9801       753
   macro avg     0.9806    0.9792    0.9798       753
weighted avg     0.9801    0.9801    0.9801       753





In [6]:
# ============================================================================
# RUN 3: Both First and Second Order Features (Tabular Only)
# ============================================================================
print("=" * 70)
print("RUN 3: Both First and Second Order Features (Tabular Only)")
print("=" * 70)

X_both, y_both = prepare_tabular_data(df_raw, FIRST_ORDER_FEATURES + SECOND_ORDER_FEATURES)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    X_both, y_both, test_size=0.2, random_state=RANDOM_STATE, stratify=y_both
)

model_3, params_3, cv_acc_3, test_acc_3 = train_and_evaluate_model(
    X_train_3, X_test_3, y_train_3, y_test_3, "Run 3: Both Orders"
)


RUN 3: Both First and Second Order Features (Tabular Only)
Tabular data shape: (3762, 13)
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best params: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Mean CV accuracy (best params): 0.9837
Test accuracy: 0.9841
Confusion matrix (test):
[[413   3]
 [  9 328]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9787    0.9928    0.9857       416
           1     0.9909    0.9733    0.9820       337

    accuracy                         0.9841       753
   macro avg     0.9848    0.9830    0.9839       753
weighted avg     0.9842    0.9841    0.9840       753





In [7]:
# ============================================================================
# RUN 4: First Order Features + Images
# ============================================================================
print("=" * 70)
print("RUN 4: First Order Features + Images")
print("=" * 70)

X_combined_4, y_combined_4 = build_combined_dataset(df_raw, FIRST_ORDER_FEATURES)
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(
    X_combined_4, y_combined_4, test_size=0.2, random_state=RANDOM_STATE, stratify=y_combined_4
)

model_4, params_4, cv_acc_4, test_acc_4 = train_and_evaluate_model(
    X_train_4, X_test_4, y_train_4, y_test_4, "Run 4: First Order + Images"
)


RUN 4: First Order Features + Images
Combined dataset: 3762 samples, 5 tabular + 4096 image features
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis']
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best params: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy (best params): 0.9691
Test accuracy: 0.9748
Confusion matrix (test):
[[407   9]
 [ 10 327]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9760    0.9784    0.9772       416
           1     0.9732    0.9703    0.9718       337

    accuracy                         0.9748       753
   macro avg     0.9746    0.9743    0.9745       753
weighted avg     0.9748    0.9748    0.9748       753



In [8]:
# ============================================================================
# RUN 5: Second Order Features + Images
# ============================================================================
print("=" * 70)
print("RUN 5: Second Order Features + Images")
print("=" * 70)

X_combined_5, y_combined_5 = build_combined_dataset(df_raw, SECOND_ORDER_FEATURES)
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    X_combined_5, y_combined_5, test_size=0.2, random_state=RANDOM_STATE, stratify=y_combined_5
)

model_5, params_5, cv_acc_5, test_acc_5 = train_and_evaluate_model(
    X_train_5, X_test_5, y_train_5, y_test_5, "Run 5: Second Order + Images"
)


RUN 5: Second Order Features + Images
Combined dataset: 3762 samples, 8 tabular + 4096 image features
Features used: ['Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best params: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy (best params): 0.9877
Test accuracy: 0.9867
Confusion matrix (test):
[[412   4]
 [  6 331]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9856    0.9904    0.9880       416
           1     0.9881    0.9822    0.9851       337

    accuracy                         0.9867       753
   macro avg     0.9869    0.9863    0.9866       753
weighted avg     0.9867    0.9867    0.9867       753



In [9]:
# ============================================================================
# RUN 6: Both First and Second Order Features + Images
# ============================================================================
print("=" * 70)
print("RUN 6: Both First and Second Order Features + Images")
print("=" * 70)

X_combined_6, y_combined_6 = build_combined_dataset(df_raw, FIRST_ORDER_FEATURES + SECOND_ORDER_FEATURES)
X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(
    X_combined_6, y_combined_6, test_size=0.2, random_state=RANDOM_STATE, stratify=y_combined_6
)

model_6, params_6, cv_acc_6, test_acc_6 = train_and_evaluate_model(
    X_train_6, X_test_6, y_train_6, y_test_6, "Run 6: Both Orders + Images"
)


RUN 6: Both First and Second Order Features + Images
Combined dataset: 3762 samples, 13 tabular + 4096 image features
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best params: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy (best params): 0.9887
Test accuracy: 0.9894
Confusion matrix (test):
[[414   2]
 [  6 331]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9857    0.9952    0.9904       416
           1     0.9940    0.9822    0.9881       337

    accuracy                         0.9894       753
   macro avg     0.9899    0.9887    0.9892       753
weighted avg     0.9894    0.9894    0.9894       753



In [10]:
# ============================================================================
# RUN 7: Images Only (No Tabular Features)
# ============================================================================
print("=" * 70)
print("RUN 7: Images Only (No Tabular Features)")
print("=" * 70)

X_image_only, y_image_only = build_image_only_dataset(df_raw)
X_train_7, X_test_7, y_train_7, y_test_7 = train_test_split(
    X_image_only, y_image_only, test_size=0.2, random_state=RANDOM_STATE, stratify=y_image_only
)

model_7, params_7, cv_acc_7, test_acc_7 = train_and_evaluate_model(
    X_train_7, X_test_7, y_train_7, y_test_7, "Run 7: Images Only"
)


RUN 7: Images Only (No Tabular Features)
Image-only dataset: 3762 samples, 4096 image features
Class counts:
Class
0    2079
1    1683
Name: count, dtype: int64

Train/Test split sizes: 3009 / 753
Fitting 5 folds for each of 16 candidates, totalling 80 fits




Best params: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Mean CV accuracy (best params): 0.8873
Test accuracy: 0.8831
Confusion matrix (test):
[[369  47]
 [ 41 296]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.9000    0.8870    0.8935       416
           1     0.8630    0.8783    0.8706       337

    accuracy                         0.8831       753
   macro avg     0.8815    0.8827    0.8820       753
weighted avg     0.8834    0.8831    0.8832       753





In [11]:
# ============================================================================
# Summary of All Runs
# ============================================================================
print("=" * 70)
print("SUMMARY OF ALL RUNS")
print("=" * 70)

results = {
    "Run 1: First Order Only": {"CV Accuracy": cv_acc_1, "Test Accuracy": test_acc_1},
    "Run 2: Second Order Only": {"CV Accuracy": cv_acc_2, "Test Accuracy": test_acc_2},
    "Run 3: Both Orders": {"CV Accuracy": cv_acc_3, "Test Accuracy": test_acc_3},
    "Run 4: First Order + Images": {"CV Accuracy": cv_acc_4, "Test Accuracy": test_acc_4},
    "Run 5: Second Order + Images": {"CV Accuracy": cv_acc_5, "Test Accuracy": test_acc_5},
    "Run 6: Both Orders + Images": {"CV Accuracy": cv_acc_6, "Test Accuracy": test_acc_6},
    "Run 7: Images Only": {"CV Accuracy": cv_acc_7, "Test Accuracy": test_acc_7},
}

summary_df = pd.DataFrame(results).T
print("\n" + summary_df.to_string())
print("\n" + "=" * 70)


SUMMARY OF ALL RUNS

                              CV Accuracy  Test Accuracy
Run 1: First Order Only          0.880687       0.881806
Run 2: Second Order Only         0.978732       0.980080
Run 3: Both Orders               0.983716       0.984064
Run 4: First Order + Images      0.969096       0.974768
Run 5: Second Order + Images     0.987704       0.986720
Run 6: Both Orders + Images      0.988700       0.989376
Run 7: Images Only               0.887341       0.883134

