In [50]:
import pandas as pd
import matplotlib.pyplot as plot
import matplotlib.image as mpimg
from shutil import copyfile
import pathlib
%matplotlib widget

In [2]:
# Load the dataset dataframe; the first CSV column becomes the row index.
df = pd.read_csv(
    "/users/janhr/unsup3d_extended/data/animals_dataset_dataframe.csv",
    index_col=0,
)

## Number of images in dataset

In [3]:
# One row per image, so the row count is the dataset size.
df.shape[0]

117484

## Number of classes

In [4]:
# Number of distinct values in the "class" column.
df["class"].nunique()

149

## How do images differ in size?

In [5]:
# Images with at least one dimension above 1000, then with at least one below 64.
is_large = (df["width"] > 1000) | (df["height"] > 1000)
is_small = (df["width"] < 64) | (df["height"] < 64)
print(int(is_large.sum()))
print(int(is_small.sum()))

111
4143


## How do images differ in ratio?

In [6]:
# Aspect ratio (width / height) of every image, computed column-wise in one
# vectorised operation rather than with a Python-level call per row.
df["ratio"] = df["width"] / df["height"]

# Count the images whose ratio lies outside [0.80, 1.20], then outside [0.60, 1.40].
for upper_bound, lower_bound in ((1.20, 0.80), (1.40, 0.60)):
    is_off_square = (df["ratio"] > upper_bound) | (df["ratio"] < lower_bound)
    print(int(is_off_square.sum()))

8808
1169


In [7]:
# Split the ratios into six quantile-based intervals.
out = pd.qcut(df["ratio"], q=6)

# Count the images per interval (kept in interval order) and draw a bar chart.
ratio_bin_counts = out.value_counts(sort=False)
ax = ratio_bin_counts.plot.bar(rot=0, color="b", figsize=(10, 4))
ax.set_xticklabels(out.cat.categories)
ax.set(xlabel="Ratio", ylabel="Number of images")
plot.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

=> ~18.000 images have a width/height ratio between 0.487 and 0.911, ~18.000 other images have a width/height ratio between 0.959 and 0.994, and so on

class
n02085620     913
n02085782     470
n02085936     894
n02086079     887
n02086240     792
             ... 
n02133161     604
n02134084     664
n02134418     498
n02137549     772
n02138441    1034
Name: image_name, Length: 149, dtype: int64

## Number of images per class

In [10]:
# Non-null "image_name" entries per class, i.e. the number of images in each class.
number_imgs = df.groupby("class")["image_name"].count()

# Split the classes into six quantile-based intervals by their image count.
out = pd.qcut(number_imgs, q=6)

# Count the classes per interval (kept in interval order) and draw a bar chart.
class_bin_counts = out.value_counts(sort=False)
ax = class_bin_counts.plot.bar(rot=0, color="b", figsize=(10, 4))
ax.set_xticklabels(out.cat.categories)
ax.set(xlabel="Number of images per class", ylabel="Number of classes")
plot.show()

=> 25 classes consist of 364-662 images, 25 other classes consist of 662-754 images, and so on

## Average image for each class when center cropped and resized to 64x64

In [None]:
from pathlib import Path

# Display every *.jpg found in the average-class-image directory, one figure
# per file, titled with the file name without its extension.
root_path = "/users/janhr/unsup3d_extended/data/animal_avg_class_images/"

# Sort the matches: directory listing order is not guaranteed, and sorting
# keeps the figures in the same order on every run.
for img_path in sorted(Path(root_path).glob('*.jpg')):
    img = mpimg.imread(str(img_path))

    fig, ax = plot.subplots()
    fig.suptitle(img_path.stem, fontsize=20)
    ax.imshow(img)

In [None]:
# Goal: find 5k images of dogs with a ratio close to 1

In [15]:
# Sub goal: find 10 dog classes:

In [16]:
# Preview the image counts of the first 15 classes.
number_imgs.iloc[:15]

class
n02085620     913
n02085782     470
n02085936     894
n02086079     887
n02086240     792
n02086646     886
n02086910    1014
n02087046     633
n02087394     901
n02088094     710
n02088238     875
n02088364     815
n02088466     829
n02088632     710
n02089078     364
Name: image_name, dtype: int64

In [33]:
# Class IDs of the selected dog breeds, each annotated with its breed name.
list_of_dog_classes = [
    "n02088094",  # Afghan hound
    "n02087394",  # Rhodesian ridgeback
    "n02087046",  # Toy Terrier
    "n02086910",  # Papillon
    "n02086646",  # Blenheim spaniel
    "n02086240",  # Shih-Tzu
    "n02086079",  # Pekinese (long hair)
    "n02085936",  # Maltese Dog
    "n02085620",  # Chihuahua
    "n02088238",  # basset
    "n02088364",  # beagle
]

In [40]:
# Keep the images whose ratio is within 0.1 of square (1.0) and whose class
# is one of the selected dog classes.
is_near_square = (df["ratio"] - 1).abs() < 0.1
is_selected_dog = df["class"].isin(list_of_dog_classes)
dogs_square_ratio = df[is_near_square & is_selected_dog]

In [47]:
# Root directory the images are copied from, and root directory they are copied to.
INPUT_PATH = "/scratch/local/ssd/janhr/data/animals_original/"
OUTPUT_PATH = "/scratch/local/ssd/janhr/data/dogs_square_ratio/"

# Create the destination root (and any missing parents); no error if it already exists.
output_root = pathlib.Path(OUTPUT_PATH)
output_root.mkdir(parents=True, exist_ok=True)

In [51]:
# Copy every selected image from INPUT_PATH/<class>/<image_name>.jpg to the
# same relative location under OUTPUT_PATH.
input_root = pathlib.Path(INPUT_PATH)
output_root = pathlib.Path(OUTPUT_PATH)

# Create each class directory once up front instead of once per image.
for class_name in dogs_square_ratio["class"].unique():
    (output_root / class_name).mkdir(parents=True, exist_ok=True)

# Iterate the two needed columns directly; this avoids building a Series per
# row as iterrows() does. An existing destination file is overwritten.
for class_name, img_name in zip(dogs_square_ratio["class"], dogs_square_ratio["image_name"]):
    file_name = img_name + ".jpg"
    src = input_root / class_name / file_name
    dst = output_root / class_name / file_name
    copyfile(src, dst)