# Dog breed identification
## Guillermo Blanco Núñez
#### UDC International Summer School - Data Mining and Neural Networks Course
July 24th, 2025

Load the dataset from a GitHub repository; the data was previously downloaded from a public Kaggle competition.

In [5]:
!git clone --depth 1 https://github.com/GuillermoBlancoNunez/DogBreedsDataRepo.git
%cd DogBreedsDataRepo


fatal: destination path 'DogBreedsDataRepo' already exists and is not an empty directory.
/content/DogBreedsDataRepo


Import all necessary libraries, set the path variables, and show the number of images found in the training dataset.

In [1]:
import pandas as pd
import numpy as np
import glob, re
from pathlib import Path
from sklearn.model_selection import train_test_split
from PIL import Image
from tensorflow.keras.utils import to_categorical

# Dataset locations inside the cloned repository.
DATA_DIR = Path("/content/DogBreedsDataRepo")  # adjust if the repo lives elsewhere
IMG_DIR = DATA_DIR.joinpath("train")           # folder holding the training images

# Collect every .jpg of the training folder; sorting keeps the order stable
# from one run to the next.
image_paths = glob.glob(str(IMG_DIR.joinpath("*.jpg")))
image_paths.sort()
print("Total de imágenes encontradas:", len(image_paths))


Total de imágenes encontradas: 10222


Read labels.csv to build a lookup from each image ID to the dog breed it refers to.

In [6]:
# Read the id -> breed table shipped with the dataset. DATA_DIR is reused so
# the dataset location is configured in a single place.
labels_df = pd.read_csv(DATA_DIR / "labels.csv")

# Detect the relevant columns by name rather than assuming a fixed schema.
id_candidates = [c for c in labels_df.columns if c.lower() in ("id", "image_id", "label", "image")]
breed_candidates = [c for c in labels_df.columns if "breed" in c.lower()]
if not id_candidates or not breed_candidates:
    # Fail with an explicit message instead of an opaque IndexError.
    raise ValueError(
        "labels.csv must contain an id column and a breed column; "
        f"found columns: {list(labels_df.columns)}"
    )
id_col, breed_col = id_candidates[0], breed_candidates[0]

print("id_col =", id_col, "| breed_col =", breed_col)

# Lookup table {image_id: breed}; both sides are stored as strings.
id2breed = dict(zip(labels_df[id_col].astype(str),
                    labels_df[breed_col].astype(str)))

id_col = id | breed_col = breed


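Define a helper that maps each image file to its breed by looking up the file name (without extension) in the labels.csv index, then apply it to every image path.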

In [7]:

def extract_label(path: str):
    """
    Look up the breed that labels.csv assigns to an image file.

    The file name without its extension is used as the key into the
    module-level ``id2breed`` dictionary ({image_id: breed}).
    Returns 'UNKNOWN' when that id has no entry in the dictionary.
    """
    stem = Path(path).stem  # ".../abc123.jpg" -> "abc123"
    if stem in id2breed:
        return id2breed[stem]
    return "UNKNOWN"




# Breed name (not the image id) for every image, in the same order as image_paths
breeds = list(map(extract_label, image_paths))

Creates a pandas dataframe with the image path, the filename and the breed name as the label.

In [8]:
# One row per image: full path, bare file name and breed label.
df = pd.DataFrame(
    dict(
        path=image_paths,
        filename=[Path(image_path).name for image_path in image_paths],
        breed=breeds,
    )
)

df.head()


Unnamed: 0,path,filename,breed
0,/content/DogBreedsDataRepo/train/000bec180eb18...,000bec180eb18c7604dcecc8fe0dba07.jpg,boston_bull
1,/content/DogBreedsDataRepo/train/001513dfcb2ff...,001513dfcb2ffafc82cccf4d8bbaba97.jpg,dingo
2,/content/DogBreedsDataRepo/train/001cdf01b096e...,001cdf01b096e06d78e9e5112d419397.jpg,pekinese
3,/content/DogBreedsDataRepo/train/00214f311d5d2...,00214f311d5d2247d5dfe4fe24b2303d.jpg,bluetick
4,/content/DogBreedsDataRepo/train/0021f9ceb3235...,0021f9ceb3235effd7fcde7f7538ed62.jpg,golden_retriever


Check the number of breeds detected in the data, and the breeds with the fewest and the most appearances.

In [9]:
# Distinct breed names, in order of first appearance in df.
classes = df["breed"].unique()
NUM_CLASSES = len(classes)
print("Nº of detected breeds:", NUM_CLASSES)

# Images per breed, used to report the largest and smallest class.
counts = df["breed"].value_counts()
print(
    f"Breed with the most images: {counts.idxmax()} with {counts.max()}"
)
print(
    f"Breed with the least images: {counts.idxmin()} with {counts.min()}"
)

Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 126
Breed with the least images: eskimo_dog with 66


Separate the data into training, validation and test sets with a 70/15/15 split.

In [10]:

SEED = 42  # fixed seed so the split is reproducible

# Step 1: hold out 15% of all images as the test set, stratified by breed.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["breed"],
    random_state=SEED,
)

# Step 2: the validation set must be 15% of the whole dataset, which is
# 0.15 / 0.85 of the 85% left after removing the test set.
val_rel = 0.15 / 0.85

train_df, val_df = train_test_split(
    train_val_df,
    test_size=val_rel,
    stratify=train_val_df["breed"],
    random_state=SEED,
)

print(f"Tamaños -> Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

Tamaños -> Train: 7154 | Val: 1534 | Test: 1534


Statistics for training, validation and testing sets.

In [11]:
# Per-split summary: size, number of breeds and how unbalanced the classes are.
subsets = {"Training set": train_df, "Validation set": val_df, "Test set": test_df}
for name, subset in subsets.items():
    # Compute the figures first, then report them.
    num_classes = subset["breed"].nunique()
    counts = subset["breed"].value_counts()

    print(f"\n\n\nStatisctics for {name}")
    print(f"Nº of entries: {len(subset)}")
    print("Nº of detected breeds:", num_classes)
    print(f"Breed with the most images: {counts.idxmax()} with {counts.max()}")
    print(f"Breed with the least images: {counts.idxmin()} with {counts.min()}")
    # Largest class size divided by smallest class size.
    print(f"Ratio: {counts.max() / counts.min()}")




Statisctics for Training set
Nº of entries: 7154
Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 88
Breed with the least images: eskimo_dog with 46
Ratio: 1.9130434782608696



Statisctics for Validation set
Nº of entries: 1534
Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 19
Breed with the least images: german_shepherd with 10
Ratio: 1.9



Statisctics for Test set
Nº of entries: 1534
Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 19
Breed with the least images: otterhound with 10
Ratio: 1.9


Load and normalize images.

In [12]:
IMG_SIZE   = (224, 224)   # width, height


def load_images(paths, img_size=IMG_SIZE):
    """
    Load image files into a single normalized float32 batch.

    Each file is opened with PIL, converted to RGB, resized to ``img_size``
    with bilinear resampling and divided by 255 so values lie in [0, 1].

    Args:
        paths: iterable of image file paths.
        img_size: target (width, height) passed to ``Image.resize``.

    Returns:
        np.ndarray of dtype float32 with shape (N, height, width, 3), where N
        is the number of paths. An empty input yields an array of shape
        (0, height, width, 3) instead of raising.
    """
    paths = list(paths)  # accept any iterable and know the batch size up front
    width, height = img_size
    # Fill a preallocated batch rather than stacking a list of arrays, which
    # would briefly keep two full copies of the data in memory.
    batch = np.empty((len(paths), height, width, 3), dtype=np.float32)
    for i, p in enumerate(paths):
        # The context manager closes the underlying file deterministically.
        with Image.open(p) as img:
            resized = img.convert("RGB").resize(img_size, Image.BILINEAR)
        batch[i] = np.asarray(resized, dtype=np.float32) / 255.0
    return batch

Load the training, validation and test sets and split each into input and output values.

In [13]:
# Decode the image files of each split into normalized arrays (model inputs)
x_train = load_images(train_df["path"].to_list())
x_val   = load_images(val_df["path"].to_list())
x_test  = load_images(test_df["path"].to_list())

# Integer class index for every breed name, following the order of `classes`
label2idx = dict(zip(classes, range(len(classes))))

# Breed names of each split translated to their class indexes (model targets)
y_train_idx = train_df["breed"].map(label2idx).to_numpy()
y_val_idx   = val_df["breed"].map(label2idx).to_numpy()
y_test_idx  = test_df["breed"].map(label2idx).to_numpy()


Transform output values dataset to categorical classes using one-hot encoding

In [14]:
# Expand the integer class indexes into one-hot rows of width NUM_CLASSES
y_train, y_val, y_test = (
    to_categorical(class_idx, num_classes=NUM_CLASSES)
    for class_idx in (y_train_idx, y_val_idx, y_test_idx)
)

# Sanity check: inputs and targets of each split must have the same length
print("x_train:", x_train.shape, "y_train:", y_train.shape)
print("x_val:  ", x_val.shape,   "y_val:  ", y_val.shape)
print("x_test: ", x_test.shape,  "y_test: ", y_test.shape)



x_train: (7154, 224, 224, 3) y_train: (7154, 120)
x_val:   (1534, 224, 224, 3) y_val:   (1534, 120)
x_test:  (1534, 224, 224, 3) y_test:  (1534, 120)
