In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    cross_val_score,
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [None]:
# Notebook-wide configuration: where the feature table and the images live,
# and the size every image is resized to before being flattened into pixels.
RAW_DATA_PATH = "data/bt_dataset_t3.csv"
IMAGES_DIR = "data/images"
IMAGE_SIZE = (64, 64)

# Load raw dataset
df_raw = pd.read_csv(RAW_DATA_PATH)
print(f"Raw dataset shape: {df_raw.shape}")
print(df_raw.head(2))

# Define feature categories
FIRST_ORDER_FEATURES = ["Mean", "Variance", "Standard Deviation", "Skewness", "Kurtosis"]
SECOND_ORDER_FEATURES = ["Contrast", "Energy", "ASM", "Entropy", "Homogeneity", "Dissimilarity", "Correlation", "Coarseness"]

# Verify all features exist in the dataset.
# Fail here, with the offending names, rather than with a bare KeyError deep
# inside one of the later data-preparation helpers.
all_features = FIRST_ORDER_FEATURES + SECOND_ORDER_FEATURES
available_features = [col for col in df_raw.columns if col not in ["Image", "Target"]]
missing_features = [name for name in all_features if name not in df_raw.columns]
if missing_features:
    raise KeyError(f"Features missing from {RAW_DATA_PATH}: {missing_features}")

print(f"\nFirst order features: {FIRST_ORDER_FEATURES}")
print(f"Second order features: {SECOND_ORDER_FEATURES}")
print(f"\nAvailable features in dataset: {available_features}")


Raw dataset shape: (1644, 19)
    Image       Mean     Variance  Standard Deviation   Entropy  Skewness  \
0  Image1  23.448517  2538.985627           50.388348  0.651174  1.984202   
1  Image2   4.398331   834.853030           28.893823  0.953532  6.495203   

    Kurtosis    Contrast    Energy       ASM  Homogeneity  Dissimilarity  \
0   5.421042  181.467713  0.781557  0.610831     0.847033       2.765411   
1  43.349355   76.745886  0.972770  0.946281     0.980762       0.548605   

   Correlation     Coarseness        PSNR      SSIM       MSE        DC  \
0     0.968576  7.458341e-155   97.974630  0.777011  0.171163  0.303989   
1     0.959751  7.458341e-155  110.346597  0.977953  0.009913  0.839019   

   Target  
0       1  
1       1  

First order features: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis']
Second order features: ['Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']

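Available features in dataset: ['Mean', 'Variance', 'Standard Deviation', 'Entropy', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness', 'PSNR', 'SSIM', 'MSE', 'DC']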

In [None]:
# Helper functions for data preparation and model training

def prepare_tabular_data(df, feature_list):
    """Select the requested feature columns and the ``Target`` column from ``df``.

    Rows where any selected feature is +/-inf are dropped (and the index is
    reset only in that case). Remaining NaNs are replaced by the per-column
    median, computed over all rows that survive the inf filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns in ``feature_list`` plus ``"Target"``.
    feature_list : list of str
        Names of the feature columns to keep.

    Returns
    -------
    tuple
        ``(X, y)``: the cleaned feature frame and the matching target series.
    """
    features = df[feature_list].copy()
    target = df["Target"].copy()

    # Drop every row that has at least one infinite feature value.
    rows_with_inf = np.isinf(features).any(axis=1)
    n_inf_rows = rows_with_inf.sum()
    if n_inf_rows > 0:
        print(f"Removing {n_inf_rows} rows containing infinite values.")
        keep = ~rows_with_inf
        features = features[keep].reset_index(drop=True)
        target = target[keep].reset_index(drop=True)

    # Impute what is still missing with each column's median.
    column_medians = features.median(numeric_only=True)
    features = features.fillna(column_medians)

    print(f"Tabular data shape: {features.shape}")
    print(f"Features used: {feature_list}")
    print(f"Class counts:\n{target.value_counts()}\n")

    return features, target


def load_and_preprocess_image(image_path, target_size=IMAGE_SIZE):
    """Load an image as a flattened grayscale pixel vector scaled to [0, 1].

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    target_size : tuple of int
        Size passed to ``Image.resize`` before flattening.

    Returns
    -------
    numpy.ndarray or None
        1-D float32 array of pixel values divided by 255, or ``None`` when the
        image could not be read or processed (the failure is printed).
    """
    try:
        # The context manager releases the file handle deterministically
        # instead of leaving it to garbage collection.
        with Image.open(image_path) as source:
            grayscale = source if source.mode == "L" else source.convert("L")
            # resize() returns a new, fully loaded image, so it stays valid
            # after the source file is closed.
            resized = grayscale.resize(target_size)
        return (np.array(resized, dtype=np.float32) / 255.0).flatten()
    except Exception as exc:
        # Deliberately best-effort: callers skip images that return None.
        print(f"Failed to load {image_path}: {exc}")
        return None


def _collect_image_features(image_names, images_dir, target_size):
    """Load pixel vectors for every entry of ``image_names`` that has a usable file.

    An entry is skipped when it is not a string, when its ``.jpg`` file does
    not exist under ``images_dir``, or when ``load_and_preprocess_image``
    returns ``None``. The number of skipped entries is printed when non-zero.

    Returns
    -------
    tuple
        ``(X_image, kept_positions)``: the stacked 2-D pixel matrix and the
        positional indices (into ``image_names``) of the rows that were kept.

    Raises
    ------
    ValueError
        If no image at all could be processed.
    """
    feature_rows = []
    kept_positions = []
    for position, image_name in enumerate(image_names):
        if not isinstance(image_name, str):
            continue
        # Names may be stored with or without the extension.
        filename = image_name if image_name.endswith(".jpg") else f"{image_name}.jpg"
        path = os.path.join(images_dir, filename)
        if not os.path.exists(path):
            continue
        pixels = load_and_preprocess_image(path, target_size)
        if pixels is not None:
            feature_rows.append(pixels)
            kept_positions.append(position)

    if not feature_rows:
        raise ValueError("No images could be processed.")

    # Make dropped rows visible instead of silently shrinking the dataset.
    n_skipped = len(image_names) - len(kept_positions)
    if n_skipped:
        print(f"Skipped {n_skipped} rows without a usable image.")

    return np.vstack(feature_rows), kept_positions


def build_combined_dataset(df, feature_list, images_dir=IMAGES_DIR, target_size=IMAGE_SIZE):
    """Build a dataset that concatenates tabular features with image pixels.

    Rows with an infinite value in any selected feature are removed, remaining
    NaNs are filled with the column median, and only rows whose image could be
    loaded are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``"Image"``, ``"Target"`` and the columns in ``feature_list``.
    feature_list : list of str
        Tabular feature columns to include.
    images_dir : str
        Directory containing the ``.jpg`` files named in ``df["Image"]``.
    target_size : tuple of int
        Size each image is resized to before flattening.

    Returns
    -------
    tuple
        ``(X_combined, y_valid)``: a 2-D array with the tabular columns first
        and the image pixels after them, and the matching target series.

    Raises
    ------
    ValueError
        If no image could be processed.
    """
    X_tab = df[feature_list].copy()
    y = df["Target"].copy()

    # Remove rows with infinite values; df is filtered too so that positions
    # in df["Image"] keep lining up with X_tab and y.
    inf_mask = np.isinf(X_tab).any(axis=1)
    if inf_mask.any():
        print(f"Removing {inf_mask.sum()} rows with inf values.")
        df = df.loc[~inf_mask].reset_index(drop=True)
        X_tab = X_tab.loc[~inf_mask].reset_index(drop=True)
        y = y.loc[~inf_mask].reset_index(drop=True)

    X_tab = X_tab.fillna(X_tab.median(numeric_only=True))

    X_image, valid_indices = _collect_image_features(df["Image"], images_dir, target_size)

    X_tab_valid = X_tab.iloc[valid_indices].reset_index(drop=True)
    y_valid = y.iloc[valid_indices].reset_index(drop=True)
    X_combined = np.hstack([X_tab_valid.values, X_image])

    print(f"Combined dataset: {X_combined.shape[0]} samples, {X_tab_valid.shape[1]} tabular + {X_image.shape[1]} image features")
    print(f"Features used: {feature_list}")
    print(f"Class counts:\n{y_valid.value_counts()}\n")

    return X_combined, y_valid


def build_image_only_dataset(df, images_dir=IMAGES_DIR, target_size=IMAGE_SIZE):
    """Build a dataset made only of flattened image pixels (no tabular features).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``"Image"`` and ``"Target"`` columns.
    images_dir : str
        Directory containing the ``.jpg`` files named in ``df["Image"]``.
    target_size : tuple of int
        Size each image is resized to before flattening.

    Returns
    -------
    tuple
        ``(X_image, y_valid)``: the 2-D pixel matrix and the target series for
        the rows whose image could be loaded.

    Raises
    ------
    ValueError
        If no image could be processed.
    """
    y = df["Target"].copy()

    X_image, valid_indices = _collect_image_features(df["Image"], images_dir, target_size)
    y_valid = y.iloc[valid_indices].reset_index(drop=True)

    print(f"Image-only dataset: {X_image.shape[0]} samples, {X_image.shape[1]} image features")
    print(f"Class counts:\n{y_valid.value_counts()}\n")

    return X_image, y_valid


def train_and_evaluate_model(X_train, X_test, y_train, y_test, run_name,
                             scoring="accuracy", class_weight=None):
    """Grid-search a scaled logistic regression and report CV and test metrics.

    A ``StandardScaler`` + ``LogisticRegression`` pipeline is tuned over ``C``,
    penalty and solver with 5-fold stratified cross-validation on the training
    split. The refit best pipeline is then evaluated on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Matching target labels.
    run_name : str
        Label printed at the top of this run's log output.
    scoring : str, default "accuracy"
        Metric name used by the grid search to rank candidates. Plain accuracy
        can be dominated by the majority class on imbalanced data; a metric
        such as ``"balanced_accuracy"`` may be passed instead.
    class_weight : dict, "balanced" or None, default None
        Forwarded to ``LogisticRegression``. ``None`` keeps every class at
        weight one.

    Returns
    -------
    tuple
        ``(best_model, best_params, best_cv_score, test_accuracy)``.
    """
    # Two sub-grids because each penalty is only paired with the solvers
    # listed alongside it.
    param_grid = [
        {
            "classifier__C": [0.01, 0.1, 1, 10],
            "classifier__penalty": ["l1"],
            "classifier__solver": ["liblinear", "saga"],
        },
        {
            "classifier__C": [0.01, 0.1, 1, 10],
            "classifier__penalty": ["l2"],
            "classifier__solver": ["lbfgs", "saga"],
        },
    ]

    # Scaling lives inside the pipeline so it is re-fit on each CV training fold.
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(random_state=42, max_iter=1000, class_weight=class_weight)),
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    print(f"Run: {run_name}")
    print(f"Train/Test split sizes: {len(X_train)} / {len(X_test)}")

    search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        verbose=1,
    )

    search.fit(X_train, y_train)
    print(f"Best params: {search.best_params_}")
    print(f"Mean CV {scoring}: {search.best_score_:.4f}")

    best_model = search.best_estimator_

    # The grid search already scored the winning candidate on every fold, so
    # read its mean/std from cv_results_ instead of refitting it five more times.
    best_index = search.best_index_
    cv_mean = search.cv_results_["mean_test_score"][best_index]
    cv_std = search.cv_results_["std_test_score"][best_index]
    print(f"CV mean ± 2*std: {cv_mean:.4f} ± {2 * cv_std:.4f}")

    y_pred = best_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test accuracy: {test_accuracy:.4f}")
    print("Confusion matrix (test):")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification report (test):")
    print(classification_report(y_test, y_pred, digits=4))

    return best_model, search.best_params_, search.best_score_, test_accuracy


In [4]:
# ----------------------------------------------------------------------------
# Run 1 -- tabular model restricted to the first-order features
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 1: Only First Order Features (Tabular Only)", banner, sep="\n")

X_first, y_first = prepare_tabular_data(df=df_raw, feature_list=FIRST_ORDER_FEATURES)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_first,
    y_first,
    test_size=0.2,
    stratify=y_first,
    random_state=42,
)

model_1, params_1, cv_acc_1, test_acc_1 = train_and_evaluate_model(
    X_train=X_train_1,
    X_test=X_test_1,
    y_train=y_train_1,
    y_test=y_test_1,
    run_name="Run 1: First Order Only",
)


RUN 1: Only First Order Features (Tabular Only)
Tabular data shape: (1644, 5)
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis']
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
Mean CV accuracy: 0.8829
CV mean ± 2*std: 0.8829 ± 0.0075
Test accuracy: 0.8754
Confusion matrix (test):
[[  2  37]
 [  4 286]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.3333    0.0513    0.0889        39
           1     0.8854    0.9862    0.9331       290

    accuracy                         0.8754       329
   macro avg     0.6094    0.5187    0.5110       329
weighted avg     0.8200    0.8754    0.8330       329



In [5]:
# ----------------------------------------------------------------------------
# Run 2 -- tabular model restricted to the second-order features
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 2: Only Second Order Features (Tabular Only)", banner, sep="\n")

X_second, y_second = prepare_tabular_data(df=df_raw, feature_list=SECOND_ORDER_FEATURES)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_second,
    y_second,
    test_size=0.2,
    stratify=y_second,
    random_state=42,
)

model_2, params_2, cv_acc_2, test_acc_2 = train_and_evaluate_model(
    X_train=X_train_2,
    X_test=X_test_2,
    y_train=y_train_2,
    y_test=y_test_2,
    run_name="Run 2: Second Order Only",
)


RUN 2: Only Second Order Features (Tabular Only)
Tabular data shape: (1644, 8)
Features used: ['Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.8882
CV mean ± 2*std: 0.8882 ± 0.0091
Test accuracy: 0.8784
Confusion matrix (test):
[[  1  38]
 [  2 288]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.3333    0.0256    0.0476        39
           1     0.8834    0.9931    0.9351       290

    accuracy                         0.8784       329
   macro avg     0.6084    0.5094    0.4913       329
weighted avg     0.8182    0.8784    0.8299       329



In [6]:
# ----------------------------------------------------------------------------
# Run 3 -- tabular model using first- and second-order features together
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 3: Both First and Second Order Features (Tabular Only)", banner, sep="\n")

X_both, y_both = prepare_tabular_data(
    df=df_raw,
    feature_list=FIRST_ORDER_FEATURES + SECOND_ORDER_FEATURES,
)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(
    X_both,
    y_both,
    test_size=0.2,
    stratify=y_both,
    random_state=42,
)

model_3, params_3, cv_acc_3, test_acc_3 = train_and_evaluate_model(
    X_train=X_train_3,
    X_test=X_test_3,
    y_train=y_train_3,
    y_test=y_test_3,
    run_name="Run 3: Both Orders",
)


RUN 3: Both First and Second Order Features (Tabular Only)
Tabular data shape: (1644, 13)
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.8867
CV mean ± 2*std: 0.8867 ± 0.0057
Test accuracy: 0.8754
Confusion matrix (test):
[[  2  37]
 [  4 286]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.3333    0.0513    0.0889        39
           1     0.8854    0.9862    0.9331       290

    accuracy                         0.8754       329
   macro avg     0.6094    0.5187    0.5110       329
weighted avg     0.8200    0.8754    0.8330       329

In [7]:
# ----------------------------------------------------------------------------
# Run 4 -- first-order features concatenated with flattened image pixels
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 4: First Order Features + Images", banner, sep="\n")

X_combined_4, y_combined_4 = build_combined_dataset(df=df_raw, feature_list=FIRST_ORDER_FEATURES)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(
    X_combined_4,
    y_combined_4,
    test_size=0.2,
    stratify=y_combined_4,
    random_state=42,
)

model_4, params_4, cv_acc_4, test_acc_4 = train_and_evaluate_model(
    X_train=X_train_4,
    X_test=X_test_4,
    y_train=y_train_4,
    y_test=y_test_4,
    run_name="Run 4: First Order + Images",
)


RUN 4: First Order Features + Images
Combined dataset: 1644 samples, 5 tabular + 4096 image features
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis']
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.8814
CV mean ± 2*std: 0.8814 ± 0.0030
Test accuracy: 0.8815
Confusion matrix (test):
[[  0  39]
 [  0 290]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        39
           1     0.8815    1.0000    0.9370       290

    accuracy                         0.8815       329
   macro avg     0.4407    0.5000    0.4685       329
weighted avg     0.7770    0.8815    0.8259       329



In [8]:
# ----------------------------------------------------------------------------
# Run 5 -- second-order features concatenated with flattened image pixels
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 5: Second Order Features + Images", banner, sep="\n")

X_combined_5, y_combined_5 = build_combined_dataset(df=df_raw, feature_list=SECOND_ORDER_FEATURES)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(
    X_combined_5,
    y_combined_5,
    test_size=0.2,
    stratify=y_combined_5,
    random_state=42,
)

model_5, params_5, cv_acc_5, test_acc_5 = train_and_evaluate_model(
    X_train=X_train_5,
    X_test=X_test_5,
    y_train=y_train_5,
    y_test=y_test_5,
    run_name="Run 5: Second Order + Images",
)


RUN 5: Second Order Features + Images
Combined dataset: 1644 samples, 8 tabular + 4096 image features
Features used: ['Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.8814
CV mean ± 2*std: 0.8814 ± 0.0030
Test accuracy: 0.8815
Confusion matrix (test):
[[  0  39]
 [  0 290]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        39
           1     0.8815    1.0000    0.9370       290

    accuracy                         0.8815       329
   macro avg     0.4407    0.5000    0.4685       329
weighted avg     0.7770    0.8815    0.8259       329



In [9]:
# ----------------------------------------------------------------------------
# Run 6 -- all tabular features concatenated with flattened image pixels
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 6: Both First and Second Order Features + Images", banner, sep="\n")

X_combined_6, y_combined_6 = build_combined_dataset(
    df=df_raw,
    feature_list=FIRST_ORDER_FEATURES + SECOND_ORDER_FEATURES,
)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_6, X_test_6, y_train_6, y_test_6 = train_test_split(
    X_combined_6,
    y_combined_6,
    test_size=0.2,
    stratify=y_combined_6,
    random_state=42,
)

model_6, params_6, cv_acc_6, test_acc_6 = train_and_evaluate_model(
    X_train=X_train_6,
    X_test=X_test_6,
    y_train=y_train_6,
    y_test=y_test_6,
    run_name="Run 6: Both Orders + Images",
)


RUN 6: Both First and Second Order Features + Images
Combined dataset: 1644 samples, 13 tabular + 4096 image features
Features used: ['Mean', 'Variance', 'Standard Deviation', 'Skewness', 'Kurtosis', 'Contrast', 'Energy', 'ASM', 'Entropy', 'Homogeneity', 'Dissimilarity', 'Correlation', 'Coarseness']
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.8814
CV mean ± 2*std: 0.8814 ± 0.0030
Test accuracy: 0.8815
Confusion matrix (test):
[[  0  39]
 [  0 290]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        39
           1     0.8815    1.0000    0.9370       290

    accuracy                         0.8815       329
   macro avg     0.4407    0.5000    0.4685       329
weighted avg     0.7770    0.8815    0.8259       329

In [16]:
# ----------------------------------------------------------------------------
# Run 7 -- flattened image pixels only, no tabular features
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "RUN 7: Images Only (No Tabular Features)", banner, sep="\n")

X_image_only, y_image_only = build_image_only_dataset(df=df_raw)

# Stratified 80/20 hold-out split with a fixed seed.
X_train_7, X_test_7, y_train_7, y_test_7 = train_test_split(
    X_image_only,
    y_image_only,
    test_size=0.2,
    stratify=y_image_only,
    random_state=42,
)

model_7, params_7, cv_acc_7, test_acc_7 = train_and_evaluate_model(
    X_train=X_train_7,
    X_test=X_test_7,
    y_train=y_train_7,
    y_test=y_test_7,
    run_name="Run 7: Images Only",
)


RUN 7: Images Only (No Tabular Features)
Image-only dataset: 1644 samples, 4096 image features
Class counts:
Target
1    1449
0     195
Name: count, dtype: int64

Train/Test split sizes: 1315 / 329
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 0.01, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.8814
CV mean ± 2*std: 0.8814 ± 0.0030
Test accuracy: 0.8815
Confusion matrix (test):
[[  0  39]
 [  0 290]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        39
           1     0.8815    1.0000    0.9370       290

    accuracy                         0.8815       329
   macro avg     0.4407    0.5000    0.4685       329
weighted avg     0.7770    0.8815    0.8259       329



In [17]:
# ----------------------------------------------------------------------------
# Summary table: best CV score and hold-out accuracy for each of the 7 runs
# ----------------------------------------------------------------------------
banner = "=" * 70
print(banner, "SUMMARY OF ALL RUNS", banner, sep="\n")

# (label, cross-validated score, test accuracy) in run order.
run_scores = [
    ("Run 1: First Order Only", cv_acc_1, test_acc_1),
    ("Run 2: Second Order Only", cv_acc_2, test_acc_2),
    ("Run 3: Both Orders", cv_acc_3, test_acc_3),
    ("Run 4: First Order + Images", cv_acc_4, test_acc_4),
    ("Run 5: Second Order + Images", cv_acc_5, test_acc_5),
    ("Run 6: Both Orders + Images", cv_acc_6, test_acc_6),
    ("Run 7: Images Only", cv_acc_7, test_acc_7),
]

results = {
    label: {"CV Accuracy": cv_acc, "Test Accuracy": test_acc}
    for label, cv_acc, test_acc in run_scores
}

# One row per run, one column per metric.
summary_df = pd.DataFrame(results).T
print(f"\n{summary_df.to_string()}")
print(f"\n{banner}")


SUMMARY OF ALL RUNS

                              CV Accuracy  Test Accuracy
Run 1: First Order Only          0.882890       0.875380
Run 2: Second Order Only         0.888213       0.878419
Run 3: Both Orders               0.886692       0.875380
Run 4: First Order + Images      0.881369       0.881459
Run 5: Second Order + Images     0.881369       0.881459
Run 6: Both Orders + Images      0.881369       0.881459
Run 7: Images Only               0.881369       0.881459

