In [1]:
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown in the
# cell outputs below.
# NOTE(review): this also hides any solver-convergence or deprecation warnings the
# libraries imported below might raise — consider narrowing the filter to specific
# categories once the notebook is stable.
warnings.filterwarnings("ignore")

import os
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    GridSearchCV,
    cross_val_score,
)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [2]:
# --- Configuration shared by the tabular and image workflows ---
RAW_DATA_PATH = "data/bt_dataset_t3.csv"
IMAGES_DIR = "data/images"
IMAGE_SIZE = (64, 64)

# Read the CSV a single time; later cells reuse `df_raw` unchanged.
df_raw = pd.read_csv(RAW_DATA_PATH)
print(f"Raw dataset shape: {df_raw.shape}")
print(df_raw.head(2))

# Every column except the image identifier and the label is a model feature.
feature_columns = df_raw.columns[~df_raw.columns.isin(["Image", "Target"])].tolist()
X_tabular = df_raw.loc[:, feature_columns].copy()
y_tabular = df_raw.loc[:, "Target"].copy()

# Flag rows in which at least one feature is +inf or -inf, then drop them from
# both the feature matrix and the label vector so the two stay aligned.
inf_mask = np.isinf(X_tabular).any(axis=1)
if not inf_mask.any():
    print("No infinite values detected in tabular features.")
else:
    print(f"Removing {inf_mask.sum()} rows containing infinite values.")
    X_tabular = X_tabular[~inf_mask].reset_index(drop=True)
    y_tabular = y_tabular[~inf_mask].reset_index(drop=True)

# Replace any remaining NaNs with the per-column median of the numeric columns.
# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed in a later cell.
X_tabular = X_tabular.fillna(value=X_tabular.median(numeric_only=True))

print(f"Clean tabular matrix shape: {X_tabular.shape}")
print("Class counts:\n", y_tabular.value_counts())


Raw dataset shape: (1644, 19)
    Image       Mean     Variance  Standard Deviation   Entropy  Skewness  \
0  Image1  23.448517  2538.985627           50.388348  0.651174  1.984202   
1  Image2   4.398331   834.853030           28.893823  0.953532  6.495203   

    Kurtosis    Contrast    Energy       ASM  Homogeneity  Dissimilarity  \
0   5.421042  181.467713  0.781557  0.610831     0.847033       2.765411   
1  43.349355   76.745886  0.972770  0.946281     0.980762       0.548605   

   Correlation     Coarseness        PSNR      SSIM       MSE        DC  \
0     0.968576  7.458341e-155   97.974630  0.777011  0.171163  0.303989   
1     0.959751  7.458341e-155  110.346597  0.977953  0.009913  0.839019   

   Target  
0       1  
1       1  
Removing 98 rows containing infinite values.
Clean tabular matrix shape: (1546, 17)
Class counts:
 Target
1    1449
0      97
Name: count, dtype: int64


In [3]:
# Tabular-only baseline: scaled logistic regression tuned with a grid search.

# Stratified 80/20 hold-out split of the cleaned tabular data.
X_train_tab, X_test_tab, y_train_tab, y_test_tab = train_test_split(
    X_tabular, y_tabular, test_size=0.2, random_state=42, stratify=y_tabular
)
print(f"Train/Test split sizes: {len(X_train_tab)} / {len(X_test_tab)}")

# One sub-grid per penalty, each paired with the solvers tried for that penalty.
param_grid_tabular = [
    {
        "classifier__C": [0.01, 0.1, 1, 10],
        "classifier__penalty": [penalty],
        "classifier__solver": solvers,
    }
    for penalty, solvers in (
        ("l1", ["liblinear", "saga"]),
        ("l2", ["lbfgs", "saga"]),
    )
]

# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(random_state=42, max_iter=1000)),
    ]
)

# Shuffled 5-fold stratified CV with a fixed seed; reused by later cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the recorded class counts are heavily skewed (1449 vs 97), so
# plain accuracy can look high on its own — read it together with the
# per-class report printed at the end of this cell.
tabular_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_tabular,
    scoring="accuracy",
    cv=cv,
    n_jobs=1,
    verbose=1,
).fit(X_train_tab, y_train_tab)
print(f"Best params: {tabular_search.best_params_}")
print(f"Mean CV accuracy: {tabular_search.best_score_:.4f}")

# Re-score the winning configuration fold by fold to report its spread.
best_tabular_model = tabular_search.best_estimator_
cv_scores = cross_val_score(
    best_tabular_model, X_train_tab, y_train_tab, scoring="accuracy", cv=cv
)
print(f"CV mean ± 2*std: {cv_scores.mean():.4f} ± {2 * cv_scores.std():.4f}")

# Final evaluation on the held-out test split.
y_pred_test = best_tabular_model.predict(X_test_tab)
print(f"Test accuracy: {accuracy_score(y_test_tab, y_pred_test):.4f}")
print("Confusion matrix (test):", confusion_matrix(y_test_tab, y_pred_test), sep="\n")
print(
    "\nClassification report (test):",
    classification_report(y_test_tab, y_pred_test, digits=4),
    sep="\n",
)


Train/Test split sizes: 1236 / 310
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params: {'classifier__C': 10, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.9814
CV mean ± 2*std: 0.9814 ± 0.0110
Test accuracy: 0.9871
Confusion matrix (test):
[[ 17   2]
 [  2 289]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.8947    0.8947    0.8947        19
           1     0.9931    0.9931    0.9931       291

    accuracy                         0.9871       310
   macro avg     0.9439    0.9439    0.9439       310
weighted avg     0.9871    0.9871    0.9871       310



In [4]:
def load_and_preprocess_image(image_path, target_size=IMAGE_SIZE):
    """Load one image file and turn it into a flat float32 feature vector.

    The image is converted to single-channel "L" mode if it is not already,
    resized to ``target_size``, divided by 255.0 and flattened to 1-D.

    Parameters
    ----------
    image_path : str or path-like
        Location of the image file on disk.
    target_size : tuple
        Size passed to ``Image.resize``; defaults to ``IMAGE_SIZE``.

    Returns
    -------
    numpy.ndarray or None
        1-D ``float32`` array of the resized pixels, or ``None`` when the
        file could not be opened or processed (a message is printed).
    """
    try:
        # Open through a context manager so the underlying file handle is
        # released deterministically, including when decoding fails part-way
        # rather than whenever the image object happens to be collected.
        with Image.open(image_path) as img:
            gray = img if img.mode == "L" else img.convert("L")
            resized = gray.resize(target_size)
            # Materialise the pixel data before the file is closed.
            return (np.array(resized, dtype=np.float32) / 255.0).flatten()
    except Exception as exc:
        # Deliberately best-effort: report the problem and let the caller
        # skip this image instead of aborting the whole dataset build.
        print(f"Failed to load {image_path}: {exc}")
        return None


def build_image_dataset(df, images_dir=IMAGES_DIR, target_size=IMAGE_SIZE):
    """Assemble a combined tabular + pixel feature matrix from ``df``.

    Rows with infinite tabular values are dropped, remaining NaNs are filled
    with per-column medians, and each row's image (looked up in ``images_dir``
    by the "Image" column, with ".jpg" appended when absent) is loaded via
    ``load_and_preprocess_image``. Rows whose image name is not a string,
    whose file is missing, or whose image fails to load are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "Image" column and a "Target" column; every other
        column is treated as a tabular feature.
    images_dir : str
        Directory that holds the image files.
    target_size : tuple
        Size forwarded to ``load_and_preprocess_image``.

    Returns
    -------
    tuple
        ``(X_combined, y_valid, info)`` where ``X_combined`` stacks the
        tabular columns followed by the flattened pixels, ``y_valid`` is the
        matching label Series, and ``info`` is a dict with the keys
        "tabular_features", "n_tabular" and "n_image".

    Raises
    ------
    ValueError
        If not a single image could be processed.
    """
    excluded = ("Image", "Target")
    feature_cols = [name for name in df.columns if name not in excluded]
    tab_frame = df[feature_cols].copy()
    labels = df["Target"].copy()

    # Drop rows with +/-inf from all three views at once so that positional
    # indices keep referring to the same sample in each of them.
    has_inf = np.isinf(tab_frame).any(axis=1)
    if has_inf.any():
        print(f"Removing {has_inf.sum()} rows with inf values for image workflow.")
        finite_rows = ~has_inf
        df, tab_frame, labels = (
            part.loc[finite_rows].reset_index(drop=True)
            for part in (df, tab_frame, labels)
        )

    tab_frame = tab_frame.fillna(tab_frame.median(numeric_only=True))

    def _extract(image_name):
        """Return the pixel vector for one "Image" entry, or None to skip it."""
        if not isinstance(image_name, str):
            return None
        filename = image_name if image_name.endswith(".jpg") else f"{image_name}.jpg"
        path = os.path.join(images_dir, filename)
        if not os.path.exists(path):
            print(f"Missing image: {path}")
            return None
        return load_and_preprocess_image(path, target_size)

    # Walk the rows in order, remembering the position of every usable image.
    attempts = ((pos, _extract(name)) for pos, name in enumerate(df["Image"]))
    kept = [(pos, pixels) for pos, pixels in attempts if pixels is not None]
    if not kept:
        raise ValueError("No images could be processed.")

    valid_indices = [pos for pos, _ in kept]
    image_features = [pixels for _, pixels in kept]

    X_tab_valid = tab_frame.iloc[valid_indices].reset_index(drop=True)
    y_valid = labels.iloc[valid_indices].reset_index(drop=True)
    X_image = np.vstack(image_features)
    # Tabular columns first, pixel columns after them.
    X_combined = np.hstack([X_tab_valid.to_numpy(), X_image])

    n_samples = X_combined.shape[0]
    n_tabular = X_tab_valid.shape[1]
    n_image = X_image.shape[1]
    print(f"Image dataset: {n_samples} samples, {n_tabular} tabular + {n_image} image features")
    print("Class counts:\n", y_valid.value_counts())

    info = {
        "tabular_features": feature_cols,
        "n_tabular": n_tabular,
        "n_image": n_image,
    }
    return X_combined, y_valid, info


# Build the combined tabular + pixel matrix from the raw frame.
X_image_all, y_image_all, image_feature_info = build_image_dataset(df_raw)

# Same stratified 80/20 hold-out scheme as the tabular workflow.
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(
    X_image_all, y_image_all, test_size=0.2, random_state=42, stratify=y_image_all
)
print(f"Train/Test split sizes (image workflow): {len(X_train_img)} / {len(X_test_img)}")

# Reuse the pipeline, CV splitter and hyper-parameter grid defined for the
# tabular workflow so the two runs are directly comparable.
image_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_tabular,
    scoring="accuracy",
    cv=cv,
    n_jobs=1,
    verbose=1,
).fit(X_train_img, y_train_img)
print(f"Best params (image workflow): {image_search.best_params_}")
print(f"Mean CV accuracy: {image_search.best_score_:.4f}")

# Re-score the winning configuration fold by fold to report its spread.
best_image_model = image_search.best_estimator_
cv_scores_img = cross_val_score(
    best_image_model, X_train_img, y_train_img, scoring="accuracy", cv=cv
)
print(f"CV mean ± 2*std: {cv_scores_img.mean():.4f} ± {2 * cv_scores_img.std():.4f}")

# Final evaluation on the held-out test split.
y_pred_img = best_image_model.predict(X_test_img)
print(f"Test accuracy: {accuracy_score(y_test_img, y_pred_img):.4f}")
print("Confusion matrix (test):", confusion_matrix(y_test_img, y_pred_img), sep="\n")
print(
    "\nClassification report (test):",
    classification_report(y_test_img, y_pred_img, digits=4),
    sep="\n",
)


Removing 98 rows with inf values for image workflow.
Image dataset: 1546 samples, 17 tabular + 4096 image features
Class counts:
 Target
1    1449
0      97
Name: count, dtype: int64
Train/Test split sizes (image workflow): 1236 / 310
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best params (image workflow): {'classifier__C': 1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Mean CV accuracy: 0.9498
CV mean ± 2*std: 0.9498 ± 0.0288
Test accuracy: 0.9548
Confusion matrix (test):
[[  9  10]
 [  4 287]]

Classification report (test):
              precision    recall  f1-score   support

           0     0.6923    0.4737    0.5625        19
           1     0.9663    0.9863    0.9762       291

    accuracy                         0.9548       310
   macro avg     0.8293    0.7300    0.7693       310
weighted avg     0.9495    0.9548    0.9508       310

