# **Course:** Introduction to Computer Vision (CS231.Q11)

**Topic:** Face Mask Classification

**Members:**
- Nguyen Cong Phat - 23521143
- Nguyen Le Phong - 23521168
- Vu Viet Cuong - 23520213 

**Imports & Configuration**

In [6]:
import os
from pathlib import Path
import numpy as np
import joblib
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt

# Image Processing
from skimage.feature import local_binary_pattern
from skimage import img_as_ubyte

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# Optimization
import optuna

# ====== CONFIGURATION ======
# Dataset root. It is expected to contain one sub-folder per split, each with
# one sub-folder per class label. Point this elsewhere when running outside
# Kaggle (e.g., Colab or a local machine).
DATA_DIR = Path('/kaggle/input/face-mask-12k-images-dataset/Face Mask Dataset')

# Class folder names; they double as the string labels of the samples.
LABELS = ['WithMask', 'WithoutMask']
# Every image is resized to this size before feature extraction.
IMAGE_SIZE = (128, 128)
# Output file for the trained model and its label encoder.
MODEL_FILENAME = 'lbp_rf_face_mask_model_optuna_uniform.joblib'

# LBP Parameters
LBP_METHOD = 'uniform'
LBP_RADIUS = 1
# Eight neighbour points per unit of radius.
LBP_POINTS = LBP_RADIUS * 8

**LBP Feature Extraction Logic**

In [7]:
def get_lbp_bins(method, n_points=None):
    """Return the number of histogram bins needed for an LBP method.

    The bin count has to cover every code value the LBP method can emit,
    because the feature histogram is built over ``range=(0, n_bins)`` and
    codes outside that range are not counted.

    Args:
        method: LBP variant name: 'default', 'ror', 'uniform' or
            'nri_uniform'.
        n_points: Number of neighbour points (P) used by the LBP operator.
            Defaults to the module-level ``LBP_POINTS``.

    Returns:
        The number of histogram bins as an int.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if n_points is None:
        n_points = LBP_POINTS

    if method in ('default', 'ror'):
        # 'ror' keeps the smallest value among all bit rotations of the
        # P-bit code, so its codes are sparse but still span [0, 2**P - 1]
        # (e.g. the all-ones pattern stays 2**P - 1). It therefore needs the
        # same range as 'default'; P + 1 bins would drop most codes.
        return 2 ** n_points
    if method == 'uniform':
        # P + 1 uniform patterns plus one shared bin for non-uniform ones.
        return n_points + 2
    if method == 'nri_uniform':
        # P * (P - 1) + 2 uniform patterns plus one non-uniform bin.
        return n_points * (n_points - 1) + 3
    raise ValueError(f"Unsupported LBP method: {method}")

def extract_lbp_features(image_np, method):
    """Build the LBP code map of an image and its normalized histogram.

    Args:
        image_np: Image array handed to ``local_binary_pattern`` (callers in
            this file pass a resized grayscale image).
        method: LBP variant name, forwarded to ``local_binary_pattern`` and
            to ``get_lbp_bins``.

    Returns:
        A ``(hist, lbp)`` tuple: the histogram of LBP codes scaled by its
        total count, and the LBP code image it was computed from.
    """
    lbp_map = local_binary_pattern(image_np, LBP_POINTS, LBP_RADIUS, method)

    # One bin per possible code value, covering [0, bin_count).
    bin_count = get_lbp_bins(method)
    counts = np.histogram(lbp_map.ravel(), bins=bin_count, range=(0, bin_count))[0]

    # Counts -> relative frequencies; the epsilon avoids dividing by zero
    # should the histogram be empty.
    freqs = counts.astype("float")
    freqs = freqs / (freqs.sum() + 1e-7)

    return freqs, lbp_map

**Data Loading Logic**

In [8]:
def load_split(split: str, method='default'):
    """Load one dataset split and convert every image to an LBP histogram.

    For each label in ``LABELS`` the folder ``DATA_DIR / split / label`` is
    scanned; each file is opened, converted to grayscale, resized to
    ``IMAGE_SIZE`` and passed to ``extract_lbp_features``.

    Args:
        split: Name of the split sub-folder (e.g. 'Train').
        method: LBP variant name forwarded to ``extract_lbp_features``.

    Returns:
        A ``(X, y)`` tuple: ``X`` stacks one histogram per successfully
        processed image, ``y`` holds the matching label strings.

    Raises:
        ValueError: If no image at all could be loaded for the split.
    """
    X, y = [], []
    for label in LABELS:
        folder = DATA_DIR / split / label

        # A missing class folder is reported but does not abort the load.
        if not folder.exists():
            print(f"Warning: Folder not found: {folder}")
            continue

        # glob() yields files in filesystem-dependent order; sorting keeps
        # the sample order (and anything that depends on it) reproducible.
        img_paths = sorted(folder.glob('*.*'))
        for img_path in tqdm(img_paths, desc=f'Loading {split}/{label}'):
            try:
                # The context manager releases the file handle even when
                # decoding fails; convert()/resize() return new images that
                # stay valid after the source file is closed.
                with Image.open(img_path) as raw:
                    image = raw.convert('L').resize(IMAGE_SIZE)
                image_np = np.array(image)

                hist, _ = extract_lbp_features(image_np, method=method)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f'-- error reading {img_path}: {e}')
            else:
                X.append(hist)
                y.append(label)

    if not X:
        raise ValueError(f"No images loaded for {split}. Check DATA_DIR path!")

    return np.vstack(X), np.array(y)

**Load Training & Validation Data**

In [9]:
# Extract LBP histograms for the two splits used during model selection.
print("--- Loading Training Data ---")
X_train, y_train = load_split('Train', LBP_METHOD)

print("\n--- Loading Validation Data ---")
X_val, y_val = load_split('Validation', LBP_METHOD)

# Fit the encoder on the training labels only, then map both label arrays
# to integers with that same fitted encoder.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)

print(f"\nTraining Feature Shape: {X_train.shape}")
print(f"Validation Feature Shape: {X_val.shape}")

--- Loading Training Data ---


Loading Train/WithMask: 100%|██████████| 5000/5000 [01:00<00:00, 83.32it/s]
Loading Train/WithoutMask: 100%|██████████| 5000/5000 [00:49<00:00, 100.05it/s]



--- Loading Validation Data ---


Loading Validation/WithMask: 100%|██████████| 400/400 [00:04<00:00, 89.95it/s]
Loading Validation/WithoutMask: 100%|██████████| 400/400 [00:03<00:00, 102.85it/s]


Training Feature Shape: (10000, 10)
Validation Feature Shape: (800, 10)





**Define Optuna Objective**

In [10]:
def objective(trial):
    """Optuna objective: validation accuracy of a random forest.

    Samples forest hyperparameters from the trial, fits the classifier on
    the module-level training features and scores it on the validation
    features.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to carry
            the fitted model back to the caller.

    Returns:
        Accuracy of the fitted classifier on the validation split.
    """
    # Tunable part of the configuration (sampled in this order).
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    # Fixed part: seeded forest, trained on all CPU cores.
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **search_space)
    forest.fit(X_train, y_train_enc)

    val_predictions = forest.predict(X_val)
    val_accuracy = accuracy_score(y_val_enc, val_predictions)

    # Attach the fitted forest to the trial so the best one can be fetched
    # after the study without retraining.
    # NOTE(review): this attaches one fitted model to every trial.
    trial.set_user_attr("model", forest)

    return val_accuracy

**Run Optimization**

In [11]:
print("--- Starting Optuna Optimization ---")
# The forests inside the objective are seeded, but an unseeded sampler would
# still pick different hyperparameters on every run. Seeding the sampler
# (TPE is also Optuna's default for single-objective studies) makes the whole
# search repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # Adjust n_trials if needed

print("Best validation accuracy:", study.best_value)
print("Best parameters:", study.best_params)

[I 2025-11-29 18:26:38,950] A new study created in memory with name: no-name-a2319384-af24-4495-82d7-d18c7375fde8


--- Starting Optuna Optimization ---


[I 2025-11-29 18:26:45,361] Trial 0 finished with value: 0.8825 and parameters: {'n_estimators': 251, 'max_depth': 23, 'min_samples_split': 6, 'min_samples_leaf': 10, 'max_features': None}. Best is trial 0 with value: 0.8825.
[I 2025-11-29 18:26:47,326] Trial 1 finished with value: 0.88875 and parameters: {'n_estimators': 170, 'max_depth': 30, 'min_samples_split': 13, 'min_samples_leaf': 5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.88875.
[I 2025-11-29 18:26:48,559] Trial 2 finished with value: 0.8725 and parameters: {'n_estimators': 128, 'max_depth': 10, 'min_samples_split': 18, 'min_samples_leaf': 7, 'max_features': 'log2'}. Best is trial 1 with value: 0.88875.
[I 2025-11-29 18:26:50,109] Trial 3 finished with value: 0.885 and parameters: {'n_estimators': 137, 'max_depth': 16, 'min_samples_split': 15, 'min_samples_leaf': 3, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.88875.
[I 2025-11-29 18:26:51,961] Trial 4 finished with value: 0.885 and parameters: {'n_esti

Best validation accuracy: 0.90125
Best parameters: {'n_estimators': 235, 'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2'}


**Final Test & Save**

In [12]:
# Fetch the fitted classifier that the best-scoring trial stored on itself.
best_clf = study.best_trial.user_attrs["model"]

print("\n--- Loading Test Data ---")
X_test, y_test = load_split('Test', LBP_METHOD)
# Reuse the encoder fitted on the training labels.
y_test_enc = le.transform(y_test)

# Evaluate on the held-out test split.
y_pred = best_clf.predict(X_test)

test_accuracy = accuracy_score(y_test_enc, y_pred)
print("\nTest Accuracy:", test_accuracy)
print("\nClassification Report:")
test_report = classification_report(y_test_enc, y_pred, target_names=le.classes_)
print(test_report)

# Persist the model together with its label encoder so that predictions can
# be mapped back to class names after loading.
artifact = {'model': best_clf, 'label_encoder': le}
joblib.dump(artifact, MODEL_FILENAME)
print(f'\nModel and encoder saved to {MODEL_FILENAME}')


--- Loading Test Data ---


Loading Test/WithMask: 100%|██████████| 483/483 [00:05<00:00, 91.46it/s]
Loading Test/WithoutMask: 100%|██████████| 509/509 [00:04<00:00, 106.60it/s]



Test Accuracy: 0.9092741935483871

Classification Report:
              precision    recall  f1-score   support

    WithMask       0.92      0.89      0.90       483
 WithoutMask       0.90      0.93      0.91       509

    accuracy                           0.91       992
   macro avg       0.91      0.91      0.91       992
weighted avg       0.91      0.91      0.91       992


Model and encoder saved to lbp_rf_face_mask_model_optuna_uniform.joblib
