In [3]:
# Imports
import os, glob, time, joblib
import cv2
import numpy as np
import pandas as pd
from collections import Counter

from skimage.feature import hog
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, cohen_kappa_score
)
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Reproducibility: a single seed shared by every stochastic step in this notebook.
RANDOM_SEED = 42
# Seeds NumPy's legacy global RNG. The pandas shuffle, the train/test split and the
# classifier are seeded separately by passing random_state=RANDOM_SEED.
np.random.seed(RANDOM_SEED)

In [5]:
# USER TUNEABLE
DATA_PATH = "./Data/Scores"   # dataset root; scanned as <shape>/<score>/<image>, else <class>/<image>
IMG_SIZE = 128  # images are resized to IMG_SIZE x IMG_SIZE before feature extraction
TEST_SIZE = 0.20  # fraction of samples held out by train_test_split
N_BOOTSTRAP = 500   # number of bootstrap resamples used for the QWK confidence interval
SAVE_DIR = "gb_artifacts"  # folder that receives the saved model and label encoder
# Create the artifact folder up front; exist_ok makes re-running this cell a no-op.
os.makedirs(SAVE_DIR, exist_ok=True)

In [6]:
# Build df of image paths and labels
_IMAGE_PATTERNS = ("*.png", "*.jpg", "*.jpeg")


def _list_images(folder):
    """Return the image files directly inside `folder`, sorted by path.

    glob.glob gives no ordering guarantee, and the seeded shuffle below permutes
    row *positions*; sorting here is what makes the shuffle (and therefore the
    train/test split) identical across machines and file systems.
    The folder name is escaped so characters such as '[' or '*' in a directory
    name are matched literally instead of being read as glob syntax.
    """
    base = glob.escape(folder)
    found = []
    for pattern in _IMAGE_PATTERNS:
        found.extend(glob.glob(os.path.join(base, pattern)))
    return sorted(found)


paths, labels = [], []

# Try two-level structure: ShapeName/Score/images
two_level = False
for shape_name in sorted(os.listdir(DATA_PATH)):
    shape_path = os.path.join(DATA_PATH, shape_name)
    if not os.path.isdir(shape_path):
        continue
    for score_name in sorted(os.listdir(shape_path)):
        score_path = os.path.join(shape_path, score_name)
        if not os.path.isdir(score_path):
            continue
        # Any sub-directory marks the layout as two-level, even if it holds no images.
        two_level = True
        for p in _list_images(score_path):
            paths.append(p)
            labels.append(f"{shape_name}_{score_name}")

if not two_level:
    # fallback: each folder under DATA_PATH is a class folder
    for class_name in sorted(os.listdir(DATA_PATH)):
        class_path = os.path.join(DATA_PATH, class_name)
        if not os.path.isdir(class_path):
            continue
        for p in _list_images(class_path):
            paths.append(p)
            labels.append(class_name)

df = pd.DataFrame({"path": paths, "label": labels})
df = df.sample(frac=1.0, random_state=RANDOM_SEED).reset_index(drop=True)  # shuffle
print(f"[INFO] Total images found: {len(df)}")
print(f"[INFO] Unique classes found: {df['label'].nunique()}")
display(df.head())

[INFO] Total images found: 4297
[INFO] Unique classes found: 34


Unnamed: 0,path,label
0,./Data/Scores/Overlapped pencils/5/img1404-p-5...,Overlapped pencils_5
1,./Data/Scores/Star/3/img1903-ST-3(10011).png,Star_3
2,./Data/Scores/Overlapped pencils/6/img170-p-6.png,Overlapped pencils_6
3,./Data/Scores/Square/5/img1499-S-5.png,Square_5
4,./Data/Scores/Triangle/5/img1071-T-5.png,Triangle_5


In [7]:
# Inspect the label distribution, then discard classes that have a single sample
counts = df['label'].value_counts()
print("[INFO] Class counts (top 20):")
display(counts.head(20))

# Classes with fewer than 2 samples are removed before the stratified split
is_rare = counts < 2
rare = counts.index[is_rare.to_numpy()].tolist()
if len(rare) > 0:
    print(f"[WARNING] Removing {len(rare)} class(es) with <2 samples:", rare)
    keep_mask = ~df['label'].isin(rare)
    df = df.loc[keep_mask].reset_index(drop=True)

# Summary of what is left
print("[INFO] Final dataset size:", len(df))
print("[INFO] Final number of classes:", df['label'].nunique())


[INFO] Class counts (top 20):


label
Square_5                540
Triangle_5              539
Diagonal_5              479
Circle_4                457
Wave_4                  434
Overlapped circle_6     386
Star_4                  250
Overlapped pencils_6    226
Overlapped pencils_5    143
Overlapped circle_5     135
Circle_3                112
Wave_3                  109
Star_3                  102
Star_5                  100
Diagonal_4               70
Overlapped pencils_4     54
Square_4                 45
Triangle_4               38
Star_0                   13
Overlapped pencils_0     12
Name: count, dtype: int64

[INFO] Final dataset size: 4294
[INFO] Final number of classes: 31


In [8]:
# Feature extraction utilities
def load_and_resize(path, img_size=IMG_SIZE):
    """Read an image as grayscale and resize it to a square.

    Parameters
    ----------
    path : path handed to cv2.imread.
    img_size : side length of the output; defaults to IMG_SIZE.

    Returns
    -------
    The result of cv2.resize at (img_size, img_size) using INTER_AREA.

    Raises
    ------
    FileNotFoundError
        When cv2.imread yields None for `path`.
    """
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        target = (img_size, img_size)
        return cv2.resize(gray, target, interpolation=cv2.INTER_AREA)
    raise FileNotFoundError(path)

def hu_moments(img):
    """Return the log-scaled Hu moments of `img` as a flat array.

    Each value is mapped to -sign(h) * log10(|h| + 1e-12), which compresses the
    wide dynamic range of the raw moments while keeping their sign.
    """
    raw = cv2.HuMoments(cv2.moments(img)).flatten()
    # The small epsilon keeps log10 finite when a moment is exactly zero.
    magnitude = np.abs(raw) + 1e-12
    return -np.sign(raw) * np.log10(magnitude)

def area_ratio(img):
    """Return, as a one-element array, the share of pixels that Otsu binarisation sets to 0.

    Assumes darker strokes on a lighter background, so the zero-valued pixels of
    the binarised image are the strokes — adapt if the polarity is the opposite.
    """
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(img, 0, 255, otsu_flags)
    n_pixels = img.shape[0] * img.shape[1]
    n_stroke = np.count_nonzero(binary == 0)
    # The tiny constant in the denominator guards against a zero-sized image.
    return np.array([n_stroke / (n_pixels + 1e-9)])

def hog_vector(img):
    """Return the flattened HOG descriptor of `img`.

    The image is cast to float32 and divided by 255 before being passed to
    skimage's `hog` with 9 orientations, 8x8-pixel cells, 2x2-cell blocks and
    L2-Hys block normalisation.
    """
    scaled = img.astype("float32") / 255.0
    hog_params = dict(
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys',
        feature_vector=True,
    )
    return hog(scaled, **hog_params)

def extract_features_for_path(path):
    """Load the image at `path` and return its 1-D feature vector.

    The vector is the concatenation, in this order, of the HOG descriptor, the
    log-scaled Hu moments and the stroke-area ratio. Errors raised by
    load_and_resize propagate to the caller.
    """
    img = load_and_resize(path)
    parts = (hog_vector(img), hu_moments(img), area_ratio(img))
    return np.concatenate(parts)


In [9]:
# feature matrix
from tqdm.auto import tqdm

feats = []
labs = []
failed = 0
for p, lab in tqdm(zip(df['path'], df['label']), total=len(df), desc="Extracting"):
    try:
        vec = extract_features_for_path(p)
    except Exception as e:
        # Best effort: report the image and keep going; the label is skipped
        # together with the features so X and y stay aligned.
        failed += 1
        print("Failed:", p, e)
        continue
    feats.append(vec)
    labs.append(lab)

if failed:
    print(f"[WARN] Failed to process {failed} images")

# np.vstack([]) fails with an unhelpful message; say what actually went wrong.
if not feats:
    raise ValueError(
        f"No features could be extracted ({failed} of {len(df)} images failed); "
        "check DATA_PATH and the image files."
    )

X = np.vstack(feats)
y_labels = np.array(labs)
print("[INFO] Feature matrix shape:", X.shape)

# Label encode (fit only on labels whose image was processed successfully)
le = LabelEncoder()
y = le.fit_transform(y_labels)
print("[INFO] Classes:", list(le.classes_))


  from .autonotebook import tqdm as notebook_tqdm
Extracting: 100%|██████████| 4294/4294 [00:53<00:00, 79.59it/s]


[INFO] Feature matrix shape: (4294, 8108)
[INFO] Classes: [np.str_('Circle_0'), np.str_('Circle_3'), np.str_('Circle_4'), np.str_('Diagonal_0'), np.str_('Diagonal_3'), np.str_('Diagonal_4'), np.str_('Diagonal_5'), np.str_('Overlapped circle_0'), np.str_('Overlapped circle_4'), np.str_('Overlapped circle_5'), np.str_('Overlapped circle_6'), np.str_('Overlapped pencils_0'), np.str_('Overlapped pencils_2'), np.str_('Overlapped pencils_3'), np.str_('Overlapped pencils_4'), np.str_('Overlapped pencils_5'), np.str_('Overlapped pencils_6'), np.str_('Square_4'), np.str_('Square_5'), np.str_('Star_0'), np.str_('Star_2'), np.str_('Star_3'), np.str_('Star_4'), np.str_('Star_5'), np.str_('Triangle_0'), np.str_('Triangle_4'), np.str_('Triangle_5'), np.str_('Wave_0'), np.str_('Wave_2'), np.str_('Wave_3'), np.str_('Wave_4')]


In [10]:
# Split
# Re-check class sizes here because images that failed feature extraction were dropped.
cnts = Counter(y)
smallest_class = min(cnts.values())
if smallest_class < 2:
    raise ValueError("There is still at least one class with <2 samples; adjust dataset or remove rare classes.")

split_kwargs = dict(test_size=TEST_SIZE, stratify=y, random_state=RANDOM_SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (3435, 8108) Test shape: (859, 8108)


In [11]:
# Per-sample weights for the GB fit: each sample gets 1 / (size of its class in train),
# so every class contributes the same total weight.
class_freq = Counter(y_train)
class_weights = {cls: 1.0 / freq for cls, freq in class_freq.items()}
sample_weight = np.array(list(map(class_weights.__getitem__, y_train)))
rounded_weights = {k: round(v, 3) for k, v in class_weights.items()}
print("[INFO] Example class weights (train):", rounded_weights)


[INFO] Example class weights (train): {np.int64(26): 0.002, np.int64(22): 0.005, np.int64(2): 0.003, np.int64(9): 0.009, np.int64(23): 0.013, np.int64(18): 0.002, np.int64(29): 0.011, np.int64(6): 0.003, np.int64(17): 0.028, np.int64(11): 0.1, np.int64(10): 0.003, np.int64(16): 0.006, np.int64(25): 0.033, np.int64(30): 0.003, np.int64(3): 0.333, np.int64(1): 0.011, np.int64(21): 0.012, np.int64(5): 0.018, np.int64(15): 0.009, np.int64(20): 0.167, np.int64(19): 0.1, np.int64(14): 0.023, np.int64(13): 0.167, np.int64(8): 0.143, np.int64(27): 0.5, np.int64(0): 0.25, np.int64(4): 0.5, np.int64(28): 0.333, np.int64(7): 0.5, np.int64(24): 0.333, np.int64(12): 0.5}


In [None]:
# Train
# NOTE(review): this cell has no recorded execution count or output; with the
# 8108-column feature matrix the fit is likely slow — confirm the expected runtime.
start = time.time()

gb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=RANDOM_SEED,
)
gb = GradientBoostingClassifier(**gb_params)

# sample_weight carries the inverse-class-frequency weights computed above.
gb.fit(X=X_train, y=y_train, sample_weight=sample_weight)

elapsed = time.time() - start
print(f"[INFO] Training finished in {elapsed:.1f}s")

In [None]:
# Evaluate on the held-out split
y_pred = gb.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {acc:.4f}")

In [None]:
# Per-class report restricted to the classes that actually occur in y_test
unique_test_labels = np.unique(y_test)
report_text = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    labels=unique_test_labels,
    target_names=le.classes_[unique_test_labels],  # encoded index -> original class name
    zero_division=0,
)
print(report_text)

In [None]:
# Confusion matrix over the classes present in the test split
cm = confusion_matrix(y_test, y_pred, labels=unique_test_labels)
tick_names = le.classes_[unique_test_labels]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=tick_names,
    yticklabels=tick_names,
    cmap='Blues',
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix (GB)")
fig.tight_layout()
plt.show()

In [None]:
# Quadratic Weighted Kappa + bootstrap CI
def qwk(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

def bootstrap_qwk_ci(y_true, y_pred, n_boot=N_BOOTSTRAP, seed=RANDOM_SEED):
    rng = np.random.RandomState(seed)
    n = len(y_true)
    vals = []
    for _ in range(n_boot):
        idx = rng.randint(0, n, n)
        vals.append(qwk(y_true[idx], y_pred[idx]))
    vals = np.array(vals)
    return qwk(y_true, y_pred), np.percentile(vals, 2.5), np.percentile(vals, 97.5)

# NOTE(review): y_test / y_pred hold LabelEncoder indices of the class-name strings
# (alphabetical order), so the quadratic weights penalise distance between class
# indices rather than distance between scores — confirm this is the intended scale.
qwk_stats = bootstrap_qwk_ci(y_test, y_pred)
qwk_val, qwk_lo, qwk_hi = qwk_stats
print(f"QWK: {qwk_val:.4f} (95% CI: {qwk_lo:.4f} - {qwk_hi:.4f})")

In [None]:
# Feature importance
importances = gb.feature_importances_
topk = 20
indices = np.argsort(importances)[::-1][:topk]

plt.figure(figsize=(10,6))
plt.barh(range(topk)[::-1], importances[indices][::-1])
plt.yticks(range(topk)[::-1], [f"F{idx}" for idx in indices[::-1]])
plt.xlabel("Importance")
plt.title("Top 20 feature importances (Gradient Boosting)")
plt.tight_layout()
plt.show()

# Map block: show which indices correspond to Hu moments and Area (approx.)
hog_len = len(hog_vector(np.zeros((IMG_SIZE,IMG_SIZE), dtype=np.uint8)))
print(f"[INFO] HOG length: {hog_len}, total features: {X.shape[1]}")
print(f"[INFO] Hu indices approximately: {list(range(hog_len, hog_len+7))}")
print(f"[INFO] Area index approximately: {hog_len+7}")


In [None]:
# Persist the fitted model and its label encoder side by side in SAVE_DIR
model_path = os.path.join(SAVE_DIR, "gb_model.joblib")
le_path = os.path.join(SAVE_DIR, "label_encoder.joblib")
for artifact, target in ((gb, model_path), (le, le_path)):
    joblib.dump(artifact, target)
print("[INFO] Saved model to:", model_path)
print("[INFO] Saved label encoder to:", le_path)


In [None]:
# Predict helper
def predict_image_gb(img_path, model=gb, label_encoder=le):
    """Classify a single image file.

    Parameters
    ----------
    img_path : path of the image; features come from extract_features_for_path.
    model : fitted classifier. The default is the `gb` object that exists when
        this cell runs, so re-run the cell after retraining.
    label_encoder : encoder used to map the predicted index back to a class name;
        the default is bound the same way as `model`.

    Returns
    -------
    dict with keys "pred_idx" (int), "label" (decoded class name) and "proba"
    (the model's predict_proba row, or None if the model has no predict_proba).
    """
    row = extract_features_for_path(img_path).reshape(1, -1)
    pred_idx = model.predict(row)[0]
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(row)[0]
    else:
        proba = None
    (label,) = label_encoder.inverse_transform([pred_idx])
    return {"pred_idx": int(pred_idx), "label": label, "proba": proba}

# Example
# res = predict_image_gb("Data/Scores/Circle/2/sample1.png")
# print(res)