#Dog breed identification
##Guillermo Blanco Núñez
####UDC International Summer School - Data Mining and Neural Networks Course
July 24th, 2025

Load dataset from github repository, downloaded from public kaggle competition before.  

In [2]:
!git clone --depth 1 https://github.com/GuillermoBlancoNunez/DogBreedsDataRepo.git
%cd DogBreedsDataRepo


Cloning into 'DogBreedsDataRepo'...
remote: Enumerating objects: 20496, done.[K
remote: Counting objects: 100% (20496/20496), done.[K
remote: Compressing objects: 100% (20496/20496), done.[K
remote: Total 20496 (delta 0), reused 20496 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (20496/20496), 684.99 MiB | 35.80 MiB/s, done.
Updating files: 100% (20581/20581), done.
/content/DogBreedsDataRepo


Import all necessary libraries and set variables for path, showing the amount of images loaded in the training dataset.

In [27]:
import pandas as pd
import numpy as np
import glob, re
from pathlib import Path
from sklearn.model_selection import train_test_split
from PIL import Image
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from sklearn.metrics import f1_score

DATA_DIR = Path("/content/DogBreedsDataRepo")  # Adjust path
IMG_DIR = DATA_DIR / "train"                  #For training set
image_paths = sorted(glob.glob(str(IMG_DIR / "*.jpg")))
print("Total de imágenes encontradas:", len(image_paths))


Total de imágenes encontradas: 10222


Read labels.csv to create an index from image labels to the dog breed they reference.

In [29]:
labels_df = pd.read_csv("/content/DogBreedsDataRepo/labels.csv")

id_col    = [c for c in labels_df.columns if c.lower() in ("id","image_id","label","image")][0]
breed_col = [c for c in labels_df.columns if "breed" in c.lower()][0]

print("id_col =", id_col, "| breed_col =", breed_col)

id2breed = dict(zip(labels_df[id_col].astype(str),
                    labels_df[breed_col].astype(str)))

id_col = id | breed_col = breed


Connects indexes between labels.csv and image labels

In [30]:

def extract_label(path: str):
    """
    Returns the breed associated with the image path according to labels.csv.
    Requires a global dictionary id2breed: {image_id: breed}.
    If image_id isn´t found in the dictionary, returns 'UNKNOWN'.
    """
    image_id = Path(path).stem  # name without .jpg extension
    return id2breed.get(image_id, "UNKNOWN")




# Generates a list of breeds, not IDs
breeds = [extract_label(p) for p in image_paths]

Creates a pandas dataframe with the image path, the filename and the breed name as the label.

In [31]:
df = pd.DataFrame({
    "path": image_paths,
    "filename": [Path(p).name for p in image_paths],
    "breed": breeds
})

df.head()


Unnamed: 0,path,filename,breed
0,/content/DogBreedsDataRepo/train/000bec180eb18...,000bec180eb18c7604dcecc8fe0dba07.jpg,boston_bull
1,/content/DogBreedsDataRepo/train/001513dfcb2ff...,001513dfcb2ffafc82cccf4d8bbaba97.jpg,dingo
2,/content/DogBreedsDataRepo/train/001cdf01b096e...,001cdf01b096e06d78e9e5112d419397.jpg,pekinese
3,/content/DogBreedsDataRepo/train/00214f311d5d2...,00214f311d5d2247d5dfe4fe24b2303d.jpg,bluetick
4,/content/DogBreedsDataRepo/train/0021f9ceb3235...,0021f9ceb3235effd7fcde7f7538ed62.jpg,golden_retriever


Check nº of breeds detected in data, and the breed with the least and most apperances.

In [32]:
classes = df['breed'].unique()
NUM_CLASSES = len(classes)
label2idx = {breed:idx for idx, breed in enumerate(classes)}
print("Nº of detected breeds:", NUM_CLASSES)
counts = df['breed'].value_counts()
print(f"Breed with the most images: {counts.idxmax()} with {counts.max()}")
print(f"Breed with the least images: {counts.idxmin()} with {counts.min()}")

Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 126
Breed with the least images: eskimo_dog with 66


In [33]:



FRACTION = 0.3
RANDOM_STATE = 42

print("Total original:", len(df))


df = (
    df
    .groupby('breed', group_keys=False)        # agrupa por raza…
    .apply(lambda grp: grp.sample(
        frac=FRACTION,
        random_state=RANDOM_STATE
    ))
    .reset_index(drop=True)
)

# Verifica

print("Total subsample:", len(df))
print("Distribución subsample:")
print(df['breed'].value_counts().head())




Total original: 10222
Total subsample: 3070
Distribución subsample:
breed
scottish_deerhound      38
afghan_hound            35
maltese_dog             35
bernese_mountain_dog    34
entlebucher             34
Name: count, dtype: int64


  .apply(lambda grp: grp.sample(


Separate into training, validation and testing set. With 70/15/15 split.

In [34]:

SEED = 42

# Separate testing set (15%)
train_val_df, test_df = train_test_split(df, test_size=0.15, stratify=df["breed"], random_state=SEED)

# Separate a global 15% for validate set from the 85% remaining

val_rel = 0.15 / 0.85

train_df, val_df = train_test_split(train_val_df, test_size=val_rel, stratify=train_val_df['breed'], random_state=SEED)

print(f"Tamaños -> Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

Tamaños -> Train: 2148 | Val: 461 | Test: 461


Statistics for training, validation and testing sets.

In [35]:
subsets = {"Training set": train_df, "Validation set": val_df, "Test set": test_df}
for name, i in subsets.items():
  print(f"\n\n\nStatisctics for {name}")
  num_classes = i['breed'].nunique()
  print(f"Nº of entries: {len(i)}")
  print("Nº of detected breeds:", num_classes)
  counts = i['breed'].value_counts()

  print(f"Breed with the most images: {counts.idxmax()} with {counts.max()}")
  print(f"Breed with the least images: {counts.idxmin()} with {counts.min()}")
  print(f"Ratio: {counts.max() / counts.min()}")






Statisctics for Training set
Nº of entries: 2148
Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 26
Breed with the least images: brabancon_griffon with 14
Ratio: 1.8571428571428572



Statisctics for Validation set
Nº of entries: 461
Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 6
Breed with the least images: vizsla with 3
Ratio: 2.0



Statisctics for Test set
Nº of entries: 461
Nº of detected breeds: 120
Breed with the most images: scottish_deerhound with 6
Breed with the least images: redbone with 3
Ratio: 2.0


In [37]:
IMG_SIZE = 224
AUTOTUNE = tf.data.AUTOTUNE

def make_dataset(paths, labels, batch_size, training=False):
    ds = tf.data.Dataset.from_tensor_slices((paths, labels))
    if training:
        ds = ds.shuffle(buffer_size=len(paths), seed=42)
    ds = ds.map(
        lambda p,y: (
            tf.keras.applications.efficientnet.preprocess_input(
                tf.image.resize(
                    tf.image.decode_jpeg(tf.io.read_file(p), channels=3),
                    (IMG_SIZE, IMG_SIZE)
                )
            ), y
        ),
        num_parallel_calls=AUTOTUNE
    )
    if training:
        aug = tf.keras.Sequential([
            tf.keras.layers.RandomFlip("horizontal"),
            tf.keras.layers.RandomRotation(0.08),
            tf.keras.layers.RandomZoom(0.1),
            tf.keras.layers.RandomContrast(0.1),
        ])
        ds = ds.map(lambda x,y: (aug(x, training=True), y),
                    num_parallel_calls=AUTOTUNE)
    return ds.batch(batch_size).prefetch(AUTOTUNE)

def build_model(lr):
    base = tf.keras.applications.EfficientNetB0(
        include_top=False, weights="imagenet",
        input_shape=(IMG_SIZE, IMG_SIZE, 3), pooling="avg"
    )
    base.trainable = False
    inputs  = tf.keras.Input((IMG_SIZE, IMG_SIZE, 3))
    x       = base(inputs, training=False)
    x       = tf.keras.layers.Dropout(0.4)(x)
    outputs = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)
    m = tf.keras.Model(inputs, outputs)
    m.compile(
        optimizer=tf.keras.optimizers.Adam(lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return m

# Configuraciones a probar
configs = [
    (0.01,   10, 32),
    (0.01,   10, 64),
    (0.001,  20, 32),
    (0.001,  20, 64),
    (0.0001, 30, 32),
    (0.0001, 30, 64),
]

# Preparamos test set (etiquetas e imágenes)
y_test_idx = test_df['breed'].map(label2idx).values
test_paths = test_df['path'].values

results = []
for lr, epochs, batch_size in configs:
    # Creamos los datasets
    train_ds = make_dataset(
        train_df['path'].values,
        train_df['breed'].map(label2idx).values,
        batch_size, training=True
    )
    val_ds = make_dataset(
        val_df['path'].values,
        val_df['breed'].map(label2idx).values,
        batch_size, training=False
    )
    test_ds = make_dataset(test_paths, y_test_idx, batch_size, training=False)

    # Construimos y compilamos el modelo
    model = build_model(lr)

    # Callback de EarlyStopping
    early = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=3,
        restore_best_weights=True
    )

    # Entrenamos con EarlyStopping
    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=[early],
        verbose=1
    )

    # Evaluación final en test
    loss, acc = model.evaluate(test_ds, verbose=0)
    preds     = np.argmax(model.predict(test_ds, verbose=0), axis=1)
    f1w       = f1_score(y_test_idx, preds, average='weighted')

    results.append({
        "lr": lr,
        "epochs": epochs,
        "batch_size": batch_size,
        "test_loss": loss,
        "test_acc": acc,
        "test_f1_weighted": f1w
    })

# Mostrar resultados finales
df_results = pd.DataFrame(results)
print(df_results.to_string(index=False))

Epoch 1/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 3s/step - accuracy: 0.2905 - loss: 3.4468 - val_accuracy: 0.7093 - val_loss: 0.9459
Epoch 2/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 3s/step - accuracy: 0.8045 - loss: 0.6066 - val_accuracy: 0.7505 - val_loss: 0.8616
Epoch 3/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 3s/step - accuracy: 0.8957 - loss: 0.2992 - val_accuracy: 0.7397 - val_loss: 0.9318
Epoch 4/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 3s/step - accuracy: 0.9007 - loss: 0.3036 - val_accuracy: 0.7505 - val_loss: 0.9025
Epoch 5/10
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 3s/step - accuracy: 0.9206 - loss: 0.2371 - val_accuracy: 0.7636 - val_loss: 0.9308
Epoch 1/10
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 7s/step - accuracy: 0.2561 - loss: 3.4509 - val_accuracy: 0.7223 - val_loss: 0.8867
Epoch 2/10
[1m34/34[0m [32m━━━━



Epoch 1/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 7s/step - accuracy: 0.0715 - loss: 4.5723 - val_accuracy: 0.5249 - val_loss: 3.1734
Epoch 2/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 7s/step - accuracy: 0.5191 - loss: 2.8815 - val_accuracy: 0.6898 - val_loss: 2.0731
Epoch 3/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 7s/step - accuracy: 0.7042 - loss: 1.8799 - val_accuracy: 0.7180 - val_loss: 1.5045
Epoch 4/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 7s/step - accuracy: 0.7827 - loss: 1.3558 - val_accuracy: 0.7419 - val_loss: 1.2150
Epoch 5/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 7s/step - accuracy: 0.8272 - loss: 1.0682 - val_accuracy: 0.7549 - val_loss: 1.0461
Epoch 6/20
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 7s/step - accuracy: 0.8475 - loss: 0.8919 - val_accuracy: 0.7722 - val_loss: 0.9396
Epoch 7/20
[1m34/34[0m [32m━━━━



Epoch 1/30
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 7s/step - accuracy: 0.0093 - loss: 4.9025 - val_accuracy: 0.0347 - val_loss: 4.6669
Epoch 2/30
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 7s/step - accuracy: 0.0283 - loss: 4.6871 - val_accuracy: 0.0694 - val_loss: 4.4680
Epoch 3/30
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 7s/step - accuracy: 0.0470 - loss: 4.4879 - val_accuracy: 0.1323 - val_loss: 4.2780
Epoch 4/30
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 7s/step - accuracy: 0.0874 - loss: 4.2898 - val_accuracy: 0.2386 - val_loss: 4.0977
Epoch 5/30
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 7s/step - accuracy: 0.1519 - loss: 4.1335 - val_accuracy: 0.2993 - val_loss: 3.9241
Epoch 6/30
[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 7s/step - accuracy: 0.1917 - loss: 3.9475 - val_accuracy: 0.3796 - val_loss: 3.7583
Epoch 7/30
[1m34/34[0m [32m━━━━