In [None]:
import pandas as pd
import skimage.io
import skimage.feature
import skimage.transform
import os
import matplotlib.pyplot as plt

data_dir = "data"

# Load the semicolon-separated annotation table and report how many rows
# survive once rows containing any missing value are discarded.
items_csv = os.path.join(data_dir, "items.csv")
df = pd.read_csv(items_csv, sep=";")
print(len(df))
df = df.dropna(ignore_index=True)  # ignore_index renumbers the rows 0..n-1
print(len(df))

665
646


In [2]:
# Cast the four bounding-box coordinate columns to integers.
df = df.astype({column: int for column in ("x1", "y1", "x2", "y2")})
df

Unnamed: 0,img_dir,label,x1,y1,x2,y2
0,1/001_1.jpg,1,3085,769,3627,1311
1,1/001_10.jpg,1,10164,3168,12264,5268
2,1/001_11.jpg,1,10154,3280,12078,5204
3,1/001_12.jpg,1,5002,1602,6026,2626
4,1/001_13.jpg,1,12675,3253,14753,5331
...,...,...,...,...,...,...
641,500/500_57.jpg,500,1179,1535,2393,2749
642,500/500_58.jpg,500,574,730,1170,1326
643,500/500_6.jpg,500,243,857,865,1479
644,500/500_8.jpg,500,2445,381,3687,1623


In [3]:
# Keep only the first two rows for the cells below.
# NOTE(review): presumably a debugging subset — confirm before saving the full dataset.
df = df.iloc[:2]

In [None]:
from tqdm import tqdm
import cv2 as cv

from aoc.utils import (
    extract_coin,
    gaussuian_mask,
    convolve_mask,
)

# --- Feature-extraction settings -------------------------------------------
resize_shape = (200, 200)  # shape handed to extract_coin and gaussuian_mask
mask_sigma = 15  # sigma argument of gaussuian_mask
mask = gaussuian_mask(resize_shape, mask_sigma)
# HOG parameters forwarded unchanged to skimage.feature.hog
orientations = 8
pixels_per_cell = (16, 16)
cells_per_block = (3, 3)

# One HOG descriptor per row of df, collected in iteration order.
features_list = []

for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    bounding_box = (row["x1"], row["y1"], row["x2"], row["y2"])

    # read the image
    image_path = os.path.join(data_dir, row["img_dir"])
    image = cv.imread(image_path)
    if image is None:
        # cv.imread signals a missing or undecodable file by returning None
        # instead of raising; fail here with the offending path rather than
        # with an opaque "'NoneType' object is not subscriptable" below.
        raise OSError(f"cv.imread could not load image: {image_path!r}")

    # convert BGR to RGB by reversing the channel axis
    image = image[:, :, ::-1]

    # extract the coin
    image = extract_coin(image, bounding_box, resize_shape)

    # NOTE(review): channel_axis=None tells hog the input is single-channel,
    # so convolve_mask presumably returns a 2-D array — confirm in aoc.utils.
    image = convolve_mask(image, mask)
    hog_features = skimage.feature.hog(
        image,
        orientations=orientations,
        pixels_per_cell=pixels_per_cell,
        cells_per_block=cells_per_block,
        channel_axis=None,
    )
    features_list.append(hog_features)

# Attach the descriptors as a new column; the list length must equal len(df).
df["features"] = features_list

 50%|█████     | 8/16 [00:01<00:01,  7.23it/s]Corrupt JPEG data: 1 extraneous bytes before marker 0xd0
100%|██████████| 16/16 [00:22<00:00,  1.38s/it]


In [7]:
# Display the frame, which now carries the `features` column assigned by the
# feature-extraction cell.
df

Unnamed: 0,img_dir,label,x1,y1,x2,y2,degrees,features
0,1/001_1.jpg,1,3085,769,3627,1311,0,"[0.05223591675037199, 0.02200874164265266, 0.0..."
1,1/001_1.jpg,1,3085,769,3627,1311,45,"[1.1907090213033673e-14, 3.4587583284812287e-1..."
2,1/001_1.jpg,1,3085,769,3627,1311,90,"[0.038482497394997796, 0.022487319972457866, 0..."
3,1/001_1.jpg,1,3085,769,3627,1311,135,"[4.511220802712497e-15, 6.601067150744913e-15,..."
4,1/001_1.jpg,1,3085,769,3627,1311,180,"[0.04219645779627354, 0.024649430648316244, 0...."
5,1/001_1.jpg,1,3085,769,3627,1311,225,"[6.213299986915226e-15, 5.0754575264019e-15, 1..."
6,1/001_1.jpg,1,3085,769,3627,1311,270,"[0.027422104016316524, 0.0164303969861719, 0.0..."
7,1/001_1.jpg,1,3085,769,3627,1311,315,"[2.6066196821507658e-14, 6.412825504201677e-15..."
8,1/001_10.jpg,1,10164,3168,12264,5268,0,"[0.0621900713648915, 0.02044215881361015, 0.01..."
9,1/001_10.jpg,1,10164,3168,12264,5268,45,"[1.1403484392240953e-14, 7.684963846532448e-15..."


In [8]:
# Persist the frame (including the features column) as a Parquet file
# inside the data directory.
dataset_path = os.path.join(data_dir, "dataset.parquet")
df.to_parquet(dataset_path)

In [None]:
# Load the saved Parquet dataset back into a separate frame for inspection.
parquet_path = os.path.join(data_dir, "dataset.parquet")
read_df = pd.read_parquet(parquet_path)
read_df

Unnamed: 0,img_dir,label,x1,y1,x2,y2,degrees,features
0,1/001_1.jpg,1,3085,769,3627,1311,0,"[0.05223591675037199, 0.02200874164265266, 0.0..."
1,1/001_1.jpg,1,3085,769,3627,1311,45,"[1.1907090213033673e-14, 3.4587583284812287e-1..."
2,1/001_1.jpg,1,3085,769,3627,1311,90,"[0.038482497394997796, 0.022487319972457866, 0..."
3,1/001_1.jpg,1,3085,769,3627,1311,135,"[4.511220802712497e-15, 6.601067150744913e-15,..."
4,1/001_1.jpg,1,3085,769,3627,1311,180,"[0.04219645779627354, 0.024649430648316244, 0...."
5,1/001_1.jpg,1,3085,769,3627,1311,225,"[6.213299986915226e-15, 5.0754575264019e-15, 1..."
6,1/001_1.jpg,1,3085,769,3627,1311,270,"[0.027422104016316524, 0.0164303969861719, 0.0..."
7,1/001_1.jpg,1,3085,769,3627,1311,315,"[2.6066196821507658e-14, 6.412825504201677e-15..."
8,1/001_10.jpg,1,10164,3168,12264,5268,0,"[0.0621900713648915, 0.02044215881361015, 0.01..."
9,1/001_10.jpg,1,10164,3168,12264,5268,45,"[1.1403484392240953e-14, 7.684963846532448e-15..."


In [10]:
# Round-trip check: compare the in-memory frame with the one read back from
# Parquet; the cell displays the boolean result of DataFrame.equals.
df.equals(read_df)

True