In [1]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import classification_report, confusion_matrix
from skimage.feature import hog
from skimage.color import rgb2gray

In [None]:
# ========== Feature Extractor (Color Histogram) ==========
def extract_features(image, bins=32, resize_dim=(64, 64)):
    """Return a normalised colour histogram for ``image``.

    The image is resized to ``resize_dim`` and converted to RGB. Each of the
    three channels is then binned into ``bins`` equal-width buckets over
    ``[0, 256)``, and the per-channel counts are concatenated in R, G, B order.

    Args:
        image: Object exposing ``resize`` and ``convert`` (used here like a
            PIL image) that ``np.array`` can turn into an H x W x 3 array.
        bins: Number of histogram buckets per channel.
        resize_dim: Size passed to ``image.resize``.

    Returns:
        1-D float array of length ``3 * bins`` whose entries sum to 1.
    """
    pixels = np.array(image.resize(resize_dim).convert('RGB'))
    per_channel = [
        np.histogram(pixels[:, :, channel], bins=bins, range=(0, 256))[0]
        for channel in range(3)
    ]
    counts = np.concatenate(per_channel)
    # Divide by the total count so the result does not depend on image size.
    return counts / counts.sum()

In [None]:
def extract_hog_features(image, resize_dim=(64, 64), orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2)):
    """Return the HOG descriptor of ``image``.

    The image is resized to ``resize_dim``, converted to RGB, reduced to
    grayscale with ``rgb2gray`` and passed to ``hog`` with ``L2-Hys`` block
    normalisation.

    Args:
        image: Object exposing ``resize`` and ``convert`` (used here like a
            PIL image).
        resize_dim: Size passed to ``image.resize``.
        orientations: Forwarded to ``hog``.
        pixels_per_cell: Forwarded to ``hog``.
        cells_per_block: Forwarded to ``hog``.

    Returns:
        Whatever ``hog`` returns for the grayscale image (the feature vector).
    """
    rgb_pixels = np.array(image.resize(resize_dim).convert('RGB'))
    return hog(
        rgb2gray(rgb_pixels),
        orientations=orientations,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
        block_norm='L2-Hys',
    )

def extract_combined_features(image):
    """Return the HOG descriptor followed by the colour histogram of ``image``.

    Both extractors are called with their default settings; the HOG features
    come first in the concatenated vector.
    """
    return np.concatenate([extract_hog_features(image), extract_features(image)])

In [None]:
# Parameters
image_size = (64, 64)  # Size used when raw pixels are the features (extractor_fn is None)
valid_exts = ('.jpg', '.jpeg', '.png')  # Matched against the lower-cased file name


def load_dataset(root_dir, extractor_fn=None):
    """Load a labelled image dataset laid out as ``root_dir/<class>/<image file>``.

    Every sub-directory of ``root_dir`` is treated as one class and its name is
    used as the label. Entries of ``root_dir`` that are not directories, and
    files whose extension is not in ``valid_exts``, are skipped.

    Args:
        root_dir: Directory containing one sub-directory per class.
        extractor_fn: Callable mapping an RGB PIL image to a feature vector.
            When ``None``, the image is resized to ``image_size`` and its
            pixels are flattened into a 1-D vector.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the stacked feature vectors and the
        matching class-name labels. Rows follow sorted class order and, within
        a class, sorted file-name order.

    Files that fail to load or to be processed are reported with ``print`` and
    left out of the result (best-effort loading).
    """
    X = []
    y = []
    for label in sorted(os.listdir(root_dir)):
        label_path = os.path.join(root_dir, label)
        if not os.path.isdir(label_path):
            continue
        # os.listdir returns names in arbitrary order; sorting keeps the row
        # order (and therefore any seeded shuffle applied later) reproducible.
        for fname in sorted(os.listdir(label_path)):
            if not fname.lower().endswith(valid_exts):
                continue
            img_path = os.path.join(label_path, fname)
            try:
                # convert() returns a fully loaded copy, so the underlying
                # file can be closed as soon as the with-block exits.
                with Image.open(img_path) as handle:
                    img = handle.convert('RGB')
                if extractor_fn is None:
                    # Flatten to a 1D vector (width * height * 3 values)
                    features = np.array(img.resize(image_size)).flatten()
                else:
                    features = extractor_fn(img)
            except Exception as e:
                print(f"Error loading {img_path}: {e}")
                continue
            # Append the pair only after both steps succeeded so that X and y
            # always stay aligned.
            X.append(features)
            y.append(label)
    return np.array(X), np.array(y)

# Load the training and test splits with the chosen feature extractor
print("=== Load data ===")
# Active extractor. Alternatives: extract_features, or None for raw flattened pixels.
func_name = extract_combined_features
(X_train, y_train), (X_test, y_test) = [
    load_dataset(split_dir, func_name) for split_dir in ("Training", "Test")
]

# Map class names to integer ids; the mapping is learned from the training labels only
le = LabelEncoder().fit(y_train)
y_train_enc, y_test_enc = le.transform(y_train), le.transform(y_test)

# Shuffle each split with a fixed seed (rows and labels are permuted together)
X_train, y_train_enc = shuffle(X_train, y_train_enc, random_state=42)
X_test, y_test_enc = shuffle(X_test, y_test_enc, random_state=42)

# Standardise features using statistics fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Classes: {le.classes_}")

In [None]:
# ---- SVM ---- individual models
print("\n--- SVM Training ---")
# RBF-kernel SVM. `degree` is deliberately not passed: scikit-learn uses it only
# for kernel='poly' and ignores it for every other kernel, so setting it here
# would suggest a tuning knob that has no effect.
# probability=True makes predict_proba available on the fitted model.
svm_classifier = SVC(kernel='rbf', C=1, probability=True, random_state=42)
svm_classifier.fit(X_train, y_train_enc)



In [None]:
# Score the fitted SVM on the same data it was trained on
print('=== Training Performance SVM===')
svm_preds_train = svm_classifier.predict(X_train)
# Per-class precision/recall/F1 first, then the confusion matrix
for metric in (classification_report, confusion_matrix):
    print(metric(y_train_enc, svm_preds_train))

In [None]:
# Score the fitted SVM on the held-out test split
print('=== Testing Performance SVM ===')
svm_preds = svm_classifier.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix
for metric in (classification_report, confusion_matrix):
    print(metric(y_test_enc, svm_preds))

In [None]:
# ---- Random Forest ----
print("\n--- Random Forest Training ---")
# Candidate search grid, kept for reference only (it is not used below):
# rf_params = {'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15], 'criterion': ['gini','entropy','log_loss'],'min_samples_split': [2, 5]}
rf = RandomForestClassifier(
    n_estimators=100,       # number of trees
    criterion="entropy",    # split quality measure
    max_depth=9,            # depth cap per tree
    max_features='sqrt',    # features considered at each split
    min_samples_split=5,    # minimum samples required to split a node
    min_samples_leaf=5,     # minimum samples required in a leaf
    random_state=42,        # fixed seed for repeatable forests
)
rf.fit(X_train, y_train_enc)


In [None]:
# Score the fitted random forest on the same data it was trained on
print('=== Training Performance RF ===')
rf_preds_train = rf.predict(X_train)
# Per-class precision/recall/F1 first, then the confusion matrix
for metric in (classification_report, confusion_matrix):
    print(metric(y_train_enc, rf_preds_train))

In [None]:
# Score the fitted random forest on the held-out test split
print('=== Testing Performance RF ===')
rf_preds = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix
for metric in (classification_report, confusion_matrix):
    print(metric(y_test_enc, rf_preds))

In [None]:
# ========== Semi-Supervised Tree ==========
# Self-training with a decision tree: start from a random 20% of the training
# rows treated as labeled, then repeatedly fit the tree, pseudo-label the
# unlabeled rows it is confident about, and move those rows to the labeled pool.

n_iter = 25                 # maximum number of self-training rounds
confidence_thresh = 0.95    # minimum predicted class probability to accept a pseudo-label
np.random.seed(422)
max_add_fraction = 0.05     # per round, add at most 5% of the training-set size

# Never let the per-round cap drop to 0 (small datasets), otherwise every round
# would add nothing and the loop would spin without making progress.
max_add_count = max(1, int(max_add_fraction * len(X_train)))
total_idx = np.arange(len(X_train))
labeled_idx = np.random.choice(total_idx, size=int(0.2 * len(X_train)), replace=False)
unlabeled_idx = np.setdiff1d(total_idx, labeled_idx)

# Starts as a copy of the true labels, but only entries at `labeled_idx` are
# ever used for fitting; an unlabeled entry is overwritten with the tree's
# prediction before its index joins the labeled pool.
y_pseudo = y_train_enc.copy()
clf = DecisionTreeClassifier(criterion='entropy',
                             max_depth=10,
                             min_samples_split=5,
                             min_samples_leaf=5,
                             random_state=42,
                             max_leaf_nodes=None)

print("Starting Semi-Supervised Learning:")
print(f"Initial: {len(labeled_idx)} labeled, {len(unlabeled_idx)} unlabeled")


for i in range(n_iter):
    clf.fit(X_train[labeled_idx], y_pseudo[labeled_idx])
    probs = clf.predict_proba(X_train[unlabeled_idx])
    # predict_proba columns are ordered like clf.classes_, so argmax yields a
    # column position, not a label. Map it back through classes_ so the
    # pseudo-labels stay correct even if a class is missing from the labeled pool.
    preds = clf.classes_[np.argmax(probs, axis=1)]
    max_conf = np.max(probs, axis=1)

    # Positions (within unlabeled_idx) of predictions that clear the threshold
    confident_idx = np.where(max_conf >= confidence_thresh)[0]
    if len(confident_idx) == 0:
        break

    if len(confident_idx) > max_add_count:
        # Keep the most confident candidates. The stable sort preserves
        # positional order among ties, and the result is re-sorted by position.
        ranking = np.argsort(-max_conf[confident_idx], kind='stable')
        confident_idx = np.sort(confident_idx[ranking[:max_add_count]])

    confident_unlabeled = unlabeled_idx[confident_idx]
    y_pseudo[confident_unlabeled] = preds[confident_idx]

    labeled_idx = np.concatenate([labeled_idx, confident_unlabeled])
    unlabeled_idx = np.setdiff1d(unlabeled_idx, confident_unlabeled)
    print(f"- Iteration {i + 1}: Added {len(confident_unlabeled)} pseudo-labeled samples. "
              f"{len(unlabeled_idx)} unlabeled remain.")

    if unlabeled_idx.size==0:
        break

# NOTE: unless the loop stopped because no confident samples were found, `clf`
# was last fitted before the final round's pseudo-labels were added. Uncomment
# the line below to refit on the complete labeled pool.
# clf.fit(X_train[labeled_idx], y_pseudo[labeled_idx])



In [None]:
# Score the self-trained tree on the rows in the labeled pool.
# Predictions are compared with the true encoded labels (y_train_enc),
# not with the pseudo-labels assigned during self-training.
print('=== Training Performance DT ===')
preds_train = clf.predict(X_train[labeled_idx])
for metric in (classification_report, confusion_matrix):
    print(metric(y_train_enc[labeled_idx], preds_train))

In [None]:
# Score the self-trained tree on the held-out test split
print('=== Testing Performance DT ===')
preds = clf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix
for metric in (classification_report, confusion_matrix):
    print(metric(y_test_enc, preds))

In [48]:
from sklearn.feature_selection import SelectKBest, f_classif


In [49]:
# Univariate feature selection with f_classif scores.
# Requires X_train, X_test and y_train_enc from the cells above.
# k is capped at the number of available features: the colour-histogram
# extractor alone produces fewer than 200 columns, and a k above the feature
# count is not a valid request (NOTE(review): depending on the scikit-learn
# version this presumably raises or only warns -- confirm for the pinned version).
selector = SelectKBest(score_func=f_classif, k=min(200, X_train.shape[1]))
X_train_selected = selector.fit_transform(X_train, y_train_enc)
X_test_selected = selector.transform(X_test)  # Reuse the selector fitted on the training split