In [1]:
import pandas as pd

# Load family tree data
family_tree_df = pd.read_excel('Data/Input/name_to_ancestors.xlsx')

# Ensure the structure by splitting ancestors into different levels
family_tree_df['ancestors'] = family_tree_df['ancestors'].apply(eval)  # Convert string representation of list to actual list
max_levels = max(family_tree_df['ancestors'].apply(len))
for i in range(max_levels):
    family_tree_df[f'level_{i}'] = family_tree_df['ancestors'].apply(lambda x: x[i] if i < len(x) else None)

# Display the first few rows to check the structure
print(family_tree_df.head())


          name                         ancestors      level_0   level_1  \
0     Animalia                        [Animalia]     Animalia      None   
1      Insecta               [Insecta, Animalia]      Insecta  Animalia   
2  Hymenoptera  [Hymenoptera, Insecta, Animalia]  Hymenoptera   Insecta   
3  Lepidoptera  [Lepidoptera, Insecta, Animalia]  Lepidoptera   Insecta   
4      Diptera      [Diptera, Insecta, Animalia]      Diptera   Insecta   

    level_2 level_3 level_4 level_5  
0      None    None    None    None  
1      None    None    None    None  
2  Animalia    None    None    None  
3  Animalia    None    None    None  
4  Animalia    None    None    None  


In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm.notebook import tqdm
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import VGG16
from keras.layers import Dense, Flatten, GlobalAveragePooling2D
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# Custom callback for tqdm progress bar during training
class TQDMNotebookCallback(tf.keras.callbacks.Callback):
    def __init__(self, epochs):
        self.epochs = epochs
        self.pbar = tqdm(total=epochs, desc="Training Progress", unit="epoch")

    def on_epoch_end(self, epoch, logs=None):
        self.pbar.update(1)
        self.pbar.set_postfix(loss=logs.get('loss'), accuracy=logs.get('accuracy'),
                              val_loss=logs.get('val_loss'), val_accuracy=logs.get('val_accuracy'))

# Load classification labels
labels_df = pd.read_csv('Data/Input/classification_labels.csv')
labels_df['basename'] = labels_df['basename'].apply(lambda x: f"{x}.jpg")  # Ensure filenames have .jpg extension
labels_df = labels_df.rename(columns={"basename": "filename", "deepest_name": "label"})

# Define paths
image_dir = 'Data/Input/images_resized'




In [3]:
# Create an ImageDataGenerator for data augmentation
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

# Initialize dictionary to hold models for each level
models = {}

# Get the unique levels from the family tree data
levels = [f'level_{i}' for i in range(max_levels)]

# Train a model for each level
for level in levels:
    print(f"Training model for {level}")
    level_labels_df = labels_df.copy()
    level_labels_df['label'] = level_labels_df['label'].apply(
        lambda x: family_tree_df[family_tree_df['name'] == x][level].values[0] if not family_tree_df[family_tree_df['name'] == x][level].isna().values[0] else x)
    
    train_generator = datagen.flow_from_dataframe(
        dataframe=level_labels_df,
        directory=image_dir,
        x_col='filename',
        y_col='label',
        subset='training',
        batch_size=32,
        seed=42,
        shuffle=True,
        class_mode='categorical',
        target_size=(128, 128))

    validation_generator = datagen.flow_from_dataframe(
        dataframe=level_labels_df,
        directory=image_dir,
        x_col='filename',
        y_col='label',
        subset='validation',
        batch_size=32,
        seed=42,
        shuffle=True,
        class_mode='categorical',
        target_size=(128, 128))

    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(128, 128, 3))
    base_model.trainable = False

    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    predictions = Dense(len(train_generator.class_indices), activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    
    es = EarlyStopping(monitor='val_loss', patience=3)
    
    model.fit(
        train_generator,
        validation_data=validation_generator,
        epochs=10,
        callbacks=[TQDMNotebookCallback(10), es])
    
    models[level] = model
    model.save(f'vgg16_{level}_classification.h5')

Training model for level_0
Found 31556 validated image filenames belonging to 84 classes.
Found 7889 validated image filenames belonging to 84 classes.




Training Progress:   0%|          | 0/10 [00:00<?, ?epoch/s]

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model for level_1


  saving_api.save_model(


Found 31556 validated image filenames belonging to 25 classes.
Found 7889 validated image filenames belonging to 25 classes.


Training Progress:   0%|          | 0/10 [00:00<?, ?epoch/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Training model for level_2


  saving_api.save_model(


Found 31556 validated image filenames belonging to 12 classes.
Found 7889 validated image filenames belonging to 12 classes.


Training Progress:   0%|          | 0/10 [00:00<?, ?epoch/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model for level_3


  saving_api.save_model(


Found 31556 validated image filenames belonging to 19 classes.
Found 7889 validated image filenames belonging to 19 classes.


Training Progress:   0%|          | 0/10 [00:00<?, ?epoch/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model for level_4


  saving_api.save_model(


Found 31556 validated image filenames belonging to 70 classes.
Found 7889 validated image filenames belonging to 70 classes.


Training Progress:   0%|          | 0/10 [00:00<?, ?epoch/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model for level_5


  saving_api.save_model(


Found 31556 validated image filenames belonging to 78 classes.
Found 7889 validated image filenames belonging to 78 classes.


Training Progress:   0%|          | 0/10 [00:00<?, ?epoch/s]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


In [4]:
# Function to make hierarchical predictions
def hierarchical_predictions(image):
    preds = {}
    img_array = np.expand_dims(image, axis=0)
    img_array = datagen.standardize(img_array)
    
    for level in levels:
        model = models[level]
        pred = model.predict(img_array)
        class_idx = np.argmax(pred)
        class_label = list(train_generator.class_indices.keys())[class_idx]
        confidence = pred[0, class_idx]
        preds[level] = {'class': class_label, 'confidence': confidence}
        
        # Stop if prediction confidence is too low to proceed
        if confidence < 0.5:
            break

    return preds

# Make predictions on the validation set
validation_generator.reset()
hierarchical_results = []

for i in tqdm(range(len(validation_generator)), desc="Predicting", unit="batch"):
    batch = validation_generator[i]
    images = batch[0]
    filenames = validation_generator.filenames[i*validation_generator.batch_size:(i+1)*validation_generator.batch_size]
    
    for j, image in enumerate(images):
        preds = hierarchical_predictions(image)
        result = {'Filename': filenames[j]}
        for level in levels:
            if level in preds:
                result[f'Prediction_{level}'] = preds[level]['class']
                result[f'Confidence_{level}'] = preds[level]['confidence']
            else:
                result[f'Prediction_{level}'] = None
                result[f'Confidence_{level}'] = None
        
        hierarchical_results.append(result)

# Convert results to DataFrame and save
results_df = pd.DataFrame(hierarchical_results)
results_df.to_csv('Data/Output/hierarchical_predictions.csv', index=False)

# Display the first few rows of the output to check
print(results_df.head())

Predicting:   0%|          | 0/247 [00:00<?, ?batch/s]

                        Filename Prediction_level_0  Confidence_level_0  \
0  213_20210905050256_2415_t.jpg       Leptoceridae            0.460755   
1   192_20200724194326_17604.jpg       Leptoceridae            0.461662   
2   294_20210814023615_30831.jpg       Leptoceridae            0.460861   
3   146_20200616021921_26547.jpg       Leptoceridae            0.460832   
4   118_20200827021322_36091.jpg       Leptoceridae            0.461361   

  Prediction_level_1 Confidence_level_1 Prediction_level_2 Confidence_level_2  \
0               None               None               None               None   
1               None               None               None               None   
2               None               None               None               None   
3               None               None               None               None   
4               None               None               None               None   

  Prediction_level_3 Confidence_level_3 Prediction_level_4 Con