In [6]:
"""Oxford Pet dataset: download, preprocessing and train/validation/test split.

"""
import os
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import numpy as np
from keras.utils.data_utils import get_file

# Locate the nearest .env by walking up the directory tree and export its
# entries as environment variables (PROJECT_DIR is read from there).
load_dotenv(find_dotenv())

# Raw Oxford Pet data lives under <PROJECT_DIR>/data/raw/oxford-pet.
raw_root = Path(os.environ.get('PROJECT_DIR')) / 'data' / 'raw'
path = raw_root / 'oxford-pet'
path.mkdir(parents=True, exist_ok=True)

origin_folder = 'http://www.robots.ox.ac.uk/~vgg/data/pets/data/'

# Fetch the images archive with extraction enabled, then delete the archive
# file itself so only the extracted content is kept on disk.
images_archive = path / 'images.tar.gz'
images_download = get_file(
    images_archive,
    origin=origin_folder + 'images.tar.gz',
    extract=True,
    cache_dir=raw_root,
    cache_subdir='oxford-pet',
)
os.remove(images_archive)

In [7]:
# Fetch the annotations archive with extraction enabled, then delete the
# archive file itself so only the extracted content is kept on disk.
annotations_archive = path / 'annotations.tar.gz'
annotations_download = get_file(
    annotations_archive,
    origin=origin_folder + 'annotations.tar.gz',
    extract=True,
    cache_dir=Path(os.environ.get('PROJECT_DIR')) / 'data' / 'raw',
    cache_subdir='oxford-pet',
)
os.remove(annotations_archive)

Downloading data from http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz


In [15]:
# Build one record per image listed in annotations/list.txt. Each record holds
# the image path, the two integer ids from the line, the bare file name and
# the path of the matching trimap PNG.
path_download = path / 'annotations'
# Loop-invariant: every trimap is looked up in the same directory.
trimaps_dir_path = os.path.join(path_download, 'trimaps')

all_data = []
with open(path_download / 'list.txt', 'r') as images_list:
    # Iterate the file lazily instead of materialising it with readlines().
    for line in images_list:
        # Skip lines containing '#' (header/comments) and blank lines; a blank
        # line would otherwise break the 4-field unpacking below.
        if '#' in line or not line.strip():
            continue
        image_name, label, species, _ = line.strip().split(' ')

        # Shift both ids down by one (presumably 1-based in the file ->
        # 0-based here; confirm against the dataset README).
        label = int(label) - 1
        species = int(species) - 1

        trimap_name = image_name + '.png'
        record = {
            "image": os.path.join(path, 'images', image_name + '.jpg'),
            "label": label,
            "species": species,
            "file_name": image_name,
            "segmentation_mask": os.path.join(trimaps_dir_path, trimap_name)
        }
        all_data.append(record)

In [63]:
from skimage import io
from skimage.transform import resize
# Destination for the preprocessed arrays:
# <PROJECT_DIR>/data/processed/oxford-pet/{images,masks}.
path = Path(os.environ.get('PROJECT_DIR')) / 'data' / 'processed' / 'oxford-pet'
os.makedirs(path, exist_ok=True)
for subdir in ('images', 'masks'):
    os.makedirs(path / subdir, exist_ok=True)

for record in all_data:
    # Image: divide by 255, resize to 128x128 with order-1 interpolation and
    # anti-aliasing, and store as float32.
    img = resize(
        io.imread(record['image']) / 255,
        (128, 128),
        order=1,
        anti_aliasing=True,
    ).astype(np.float32)
    np.save(path / 'images' / f'{record["file_name"]}', img)

    # Mask: trimap value 2 becomes 0, values 1 and 3 become 1. The resize uses
    # order 0 without anti-aliasing and keeps the original value range.
    mask = io.imread(record['segmentation_mask'])
    mask[mask == 2] = 0
    mask[np.isin(mask, (1, 3))] = 1
    mask = resize(mask, (128, 128), order=0, preserve_range=True, anti_aliasing=False)
    np.save(path / 'masks' / f'{record["file_name"]}', mask)


In [30]:
from sklearn.model_selection import StratifiedShuffleSplit

# Samples are identified by file name; the split is stratified on "label".
all_x = np.array([record["file_name"] for record in all_data])
all_y = np.array([record["label"] for record in all_data])

# Single stratified shuffle that holds out 2204 samples as the test set; the
# remainder is kept together as train+validation.
sss = StratifiedShuffleSplit(n_splits=1, test_size=2204, random_state=42)
train_index, test_index = next(sss.split(all_x, all_y))
X_train_val, X_test = all_x[train_index], all_x[test_index]
y_train_val, y_test = all_y[train_index], all_y[test_index]


In [32]:
# Split listings are written directly under <PROJECT_DIR>/data.
path = Path(os.environ.get('PROJECT_DIR')) / 'data'

# Write one "<file_name>.npy, <label>" line per test sample.
# Opened with 'w' rather than 'a' so that re-running the cell rewrites the
# listing instead of appending a duplicate copy of every entry.
with open(path / 'OxfordPet_test_data.txt', 'w') as f:
    for name, y in zip(X_test, y_test):
        f.write(f"{name}.npy, {y}\n")

In [39]:
# Carve 514 validation samples out of the train+validation pool with a single
# stratified shuffle.
sss = StratifiedShuffleSplit(n_splits=1, test_size=514, random_state=42)
for train_index, test_index in sss.split(X_train_val, y_train_val):
    # The indices returned by split() are positions within X_train_val /
    # y_train_val, so they must index those arrays. Indexing all_x / all_y
    # with them would select the wrong samples and could leak test samples
    # into the training and validation sets.
    X_train, X_val = X_train_val[train_index], X_train_val[test_index]
    y_train, y_val = y_train_val[train_index], y_train_val[test_index]


In [40]:
# Write one "<file_name>.npy, <label>" line per training sample.
# Opened with 'w' rather than 'a' so that re-running the cell rewrites the
# listing instead of appending a duplicate copy of every entry.
with open(path / 'OxfordPet_training_data.txt', 'w') as f:
    for name, y in zip(X_train, y_train):
        f.write(f"{name}.npy, {y}\n")

In [42]:
# Write one "<file_name>.npy, <label>" line per validation sample.
# Opened with 'w' rather than 'a' so that re-running the cell rewrites the
# listing instead of appending a duplicate copy of every entry.
with open(path / 'OxfordPet_validation_data.txt', 'w') as f:
    for name, y in zip(X_val, y_val):
        f.write(f"{name}.npy, {y}\n")