# Rebuild the opencv library

In [2]:
# https://drive.google.com/file/d/17UjW8RuTPJr-_sr_j8lkI3KyoiPNMoBP/view?usp=sharing
# https://drive.google.com/file/d/1cb-sYWuIKxYrthCIU-0qaaZlzSJ5d7xz/view?usp=sharing

!gdown 17UjW8RuTPJr-_sr_j8lkI3KyoiPNMoBP
!gdown 1cb-sYWuIKxYrthCIU-0qaaZlzSJ5d7xz


Downloading...
From: https://drive.google.com/uc?id=17UjW8RuTPJr-_sr_j8lkI3KyoiPNMoBP
To: /content/right.jpg
100% 165k/165k [00:00<00:00, 60.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=1cb-sYWuIKxYrthCIU-0qaaZlzSJ5d7xz
To: /content/left.jpg
100% 167k/167k [00:00<00:00, 88.0MB/s]


In [3]:
# Install necessary packages
!apt-get -qq install -y cmake libopencv-dev

# Clone OpenCV repository
!git clone https://github.com/opencv/opencv.git
!git clone https://github.com/opencv/opencv_contrib.git

# Create build directory
!mkdir /content/build
%cd /content/build

# Configure OpenCV with CMake
!cmake -DOPENCV_ENABLE_NONFREE=ON -DOPENCV_EXTRA_MODULES_PATH=/content/opencv_contrib/modules /content/opencv

# Build and install OpenCV
!make -j8
!make install

Cloning into 'opencv'...
remote: Enumerating objects: 349174, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 349174 (delta 56), reused 25 (delta 25), pack-reused 349059 (from 3)[K
Receiving objects: 100% (349174/349174), 537.59 MiB | 34.16 MiB/s, done.
Resolving deltas: 100% (243735/243735), done.
Updating files: 100% (7664/7664), done.
Cloning into 'opencv_contrib'...
remote: Enumerating objects: 44178, done.[K
remote: Counting objects: 100% (6903/6903), done.[K
remote: Compressing objects: 100% (1891/1891), done.[K
remote: Total 44178 (delta 5523), reused 5017 (delta 5011), pack-reused 37275 (from 3)[K
Receiving objects: 100% (44178/44178), 152.91 MiB | 39.08 MiB/s, done.
Resolving deltas: 100% (27178/27178), done.
/content/build
  Compatibility with CMake < 3.10 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value.  Or, use the <min>...<max> syntax
  to tell CM

## Check for SURF algorithm

In [4]:
import cv2
# lets check the documentation of surf algorithm
help(cv2.xfeatures2d.SURF_create)

Help on built-in function SURF_create:

SURF_create(...)
    SURF_create([, hessianThreshold[, nOctaves[, nOctaveLayers[, extended[, upright]]]]]) -> retval
    .   @param hessianThreshold Threshold for hessian keypoint detector used in SURF.
    .       @param nOctaves Number of pyramid octaves the keypoint detector will use.
    .       @param nOctaveLayers Number of octave layers within each octave.
    .       @param extended Extended descriptor flag (true - use extended 128-element descriptors; false - use
    .       64-element descriptors).
    .       @param upright Up-right or rotated features flag (true - do not compute orientation of features;
    .       false - compute orientation).



# Classification with SURF + Random Forest



In [36]:
import os
import cv2
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

from skimage import io, color
from skimage.feature import hog
from skimage.transform import resize

## Data loading

In [16]:
!pip install kaggle



In [30]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [31]:
!cp /content/kaggle.json ~/.kaggle/

In [32]:
!chmod 600 ~/.kaggle/kaggle.json

In [33]:
!kaggle datasets download -d nbtailee22/nhapmoncv

Dataset URL: https://www.kaggle.com/datasets/nbtailee22/nhapmoncv
License(s): unknown
Downloading nhapmoncv.zip to /content/build
 98% 738M/751M [00:05<00:00, 92.5MB/s]
100% 751M/751M [00:05<00:00, 152MB/s] 


In [34]:
!unzip /content/build/nhapmoncv.zip -d /content/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/data/images/n02108089-boxer/n02108089_11875.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_122.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_12232.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_125.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_12738.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_12739.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_12827.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_13340.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_13526.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_1353.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_1355.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_1357.jpg  
  inflating: /content/data/images/n02108089-boxer/n02108089_1367.jpg  
  infla

In [37]:
images_dir = '/content/data/images'

classes = [d for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}

data = []
for cls in classes:
    cls_folder = os.path.join(images_dir, cls)
    for fname in os.listdir(cls_folder):
        if fname.lower().endswith(('.jpg', '.jpeg', '.png')):
            file_path = os.path.join(cls_folder, fname)
            label = label_map[cls]
            data.append((file_path, label))

classes = [d.split("-")[-1] for d in os.listdir(images_dir) if os.path.isdir(os.path.join(images_dir, d))]

label_map = {cls: idx for idx, cls in enumerate(classes)}


df = pd.DataFrame(data, columns=['filepath', 'label'])
print(df.head())
print("Number of images:", len(df))
print("Number of classes:", len(classes))

                                            filepath  label
0  /content/data/images/n02106030-collie/n0210603...      0
1  /content/data/images/n02106030-collie/n0210603...      0
2  /content/data/images/n02106030-collie/n0210603...      0
3  /content/data/images/n02106030-collie/n0210603...      0
4  /content/data/images/n02106030-collie/n0210603...      0
Number of images: 20580
Number of classes: 120


In [38]:
label_map = {v:k for k,v in label_map.items()}

In [39]:
df["breed"] = df["label"].map(label_map)

In [40]:
df

Unnamed: 0,filepath,label,breed
0,/content/data/images/n02106030-collie/n0210603...,0,collie
1,/content/data/images/n02106030-collie/n0210603...,0,collie
2,/content/data/images/n02106030-collie/n0210603...,0,collie
3,/content/data/images/n02106030-collie/n0210603...,0,collie
4,/content/data/images/n02106030-collie/n0210603...,0,collie
...,...,...,...
20575,/content/data/images/n02113624-toy_poodle/n021...,119,toy_poodle
20576,/content/data/images/n02113624-toy_poodle/n021...,119,toy_poodle
20577,/content/data/images/n02113624-toy_poodle/n021...,119,toy_poodle
20578,/content/data/images/n02113624-toy_poodle/n021...,119,toy_poodle


## Feature extraction using SURF

In [41]:
surf = cv2.xfeatures2d.SURF_create(hessianThreshold=400)

filenames = list(df['filepath'])
batch_size = 500
k = 200

kmeans = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=42)

for img_path in tqdm(filenames, desc="Incremental fitting"):
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        continue
    _, des = surf.detectAndCompute(img, None)
    if des is not None:
        # Sample subset to avoid too many per image
        if len(des) > 200:
            des = des[np.random.choice(len(des), 200, replace=False)]
        kmeans.partial_fit(des)

print("✅ KMeans fitted incrementally.")

Incremental fitting: 100%|██████████| 20580/20580 [37:14<00:00,  9.21it/s]

✅ KMeans fitted incrementally.





In [42]:
def get_bovw_vector(des, kmeans):
    hist = np.zeros(k)
    if des is not None:
        words = kmeans.predict(des)
        for w in words:
            hist[w] += 1
    return hist

In [44]:
X = [get_bovw_vector(surf.detectAndCompute(cv2.imread(p, 0), None)[1], kmeans) for p in list(df['filepath'])]
X = StandardScaler().fit_transform(X)

y = np.array(list(df["label"]))

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [46]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [47]:
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.047862001943634595
