In [15]:
import os
import cv2
import pandas as pd
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

from skimage import io, color
from skimage.feature import hog
from skimage.transform import resize

# Data loading

In [4]:
images_dir = '/kaggle/input/nhapmoncv/data/images'

classes = [d for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}

data = []
for cls in classes:
    cls_folder = os.path.join(images_dir, cls)
    for fname in os.listdir(cls_folder):
        if fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            file_path = os.path.join(cls_folder, fname)
            label = label_map[cls]
            data.append((file_path, label))

classes = [d.split("-")[-1] for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}


df = pd.DataFrame(data, columns=['filepath', 'label'])
print(df.head())
print("Number of images:", len(df))
print("Number of classes:", len(classes))

                                            filepath  label
0  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
1  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
2  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
3  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
4  /kaggle/input/nhapmoncv/data/images/n02091635-...      0
Number of images: 20580
Number of classes: 120


In [5]:
label_map = {v:k for k,v in label_map.items()}

In [6]:
df["breed"] = df["label"].map(label_map)

In [7]:
df

Unnamed: 0,filepath,label,breed
0,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
1,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
2,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
3,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
4,/kaggle/input/nhapmoncv/data/images/n02091635-...,0,otterhound
...,...,...,...
20575,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound
20576,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound
20577,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound
20578,/kaggle/input/nhapmoncv/data/images/n02088466-...,119,bloodhound


# Feature extraction using ORB

In [8]:
orb = cv2.ORB_create(nfeatures=500)

all_descriptors = []
for img_path in list(df['filepath']):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    kp, des = orb.detectAndCompute(img, None)
    if des is not None:
        all_descriptors.extend(des)

In [9]:
k = 200  # number of visual words
kmeans = MiniBatchKMeans(n_clusters=k).fit(all_descriptors)

# Represent each image as histogram of visual words
def get_bovw_vector(des, kmeans):
    hist = np.zeros(k)
    if des is not None:
        words = kmeans.predict(des)
        for w in words:
            hist[w] += 1
    return hist



In [10]:
X = [get_bovw_vector(orb.detectAndCompute(cv2.imread(p, 0), None)[1], kmeans) for p in list(df['filepath'])]
X = StandardScaler().fit_transform(X)

y = np.array(list(df["label"]))

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [13]:
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.03255587949465501
