In [23]:
import os
import pandas as pd
import seaborn as sns
import cv2
import numpy as np 
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skimage import io, color
from skimage.feature import hog
from joblib import Parallel, delayed
import numpy as np
from skimage.transform import resize

In [3]:


images_dir = '/kaggle/input/nhapmoncv/data/images'

classes = [d for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}

data = []
for cls in classes:
    cls_folder = os.path.join(images_dir, cls)
    for fname in os.listdir(cls_folder):
        if fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            file_path = os.path.join(cls_folder, fname)
            label = label_map[cls]
            data.append((file_path, label))

classes = [d.split("-")[-1] for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}


df = pd.DataFrame(data, columns=['filepath', 'label'])
print(df.head())
print("Number of images:", len(df))
print("Number of classes:", len(classes))


                                            filepath  label
0  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
1  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
2  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
3  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
4  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
Number of images: 20580
Number of classes: 120


In [4]:
label_map = {v:k for k,v in label_map.items()}

In [5]:
df["breed"] = df["label"].map(label_map)

In [6]:
df

Unnamed: 0,filepath,label,breed
0,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
1,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
2,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
3,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
4,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
...,...,...,...
20575,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound
20576,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound
20577,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound
20578,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound


# COLOR HISTOGRAM

In [7]:
def color_feature_extractor(img):
    hist_r = cv2.calcHist([img], [0], None, [16], [0, 256])
    hist_g = cv2.calcHist([img], [1], None, [16], [0, 256])
    hist_b = cv2.calcHist([img], [2], None, [16], [0, 256])

    feature_vector = np.concatenate([hist_r, hist_g, hist_b]).flatten()

    feature_vector = feature_vector / np.sum(feature_vector)
    return feature_vector


In [8]:
color_feature_vectors = []

for _, row in df.iterrows():
    color_feature_vectors.append(color_feature_extractor(cv2.imread(row["filepath"])))

In [9]:
x = np.array(color_feature_vectors)
y = np.array(list(df["label"]))

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [11]:
rf = RandomForestClassifier()

In [12]:
rf.fit(x_train, y_train)

In [13]:
y_pred = rf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred) * 100)

Accuracy: 6.365403304178814


# HOG

In [28]:
def compute_hog(img_path, target_size=(128, 128)):
    try:
        img = io.imread(img_path)
        img = resize(img, target_size)
        img_gray = color.rgb2gray(img)

        features = hog(
            img_gray,
            orientations=8,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
            visualize=False,       
            feature_vector=True
        )
        return features
    except Exception as err:
        print(f"[WARN] Skipped {img_path}: {err}")
        return None

In [36]:
features_list = Parallel(n_jobs=-1, backend='loky')(
    delayed(compute_hog)(p) for p in df["filepath"]
)


valid_mask = [f is not None for f in features_list]

X = np.vstack([f for f in features_list if f is not None])
y = df.loc[valid_mask, "label"].values

print(f"✅ Features extracted for {len(X)} of {len(df)} images")

✅ Features extracted for 20579 of 20580 images


In [37]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred) * 100)

[WARN] Skipped /kaggle/input/nhapmoncv/data/images/n02105855-Shetland_sheepdog/n02105855_2933.jpg: the input array must have size 3 along `channel_axis`, got (128, 128, 4)
Accuracy: 3.1098153547133136
