<a href="https://colab.research.google.com/github/MarianoChic09/MSc-ORT-Deep-Learning/blob/main/Clase%207/5_Bees_Template_Transfer_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:

%cd /content/drive/MyDrive/Colab Notebooks/Datasets/bees_dataset

## 1.1 Imports

In [None]:
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout
import tensorflow as tf

import utils

## 1.2 Set random seeds

In [None]:
np.random.seed(117)
tf.random.set_seed(117)

## 1.3 Global variables

In [None]:
img_width = 100
img_height = 100
img_channels = 3

# 2. Carga de datos

In [None]:
bees, bees_test_for_evaluation = utils.read_data()

In [None]:
bees.head()

In [None]:
bees_test_for_evaluation.head()

# 3. Análisis exploratorio de datos

## 3.1 Análisis descriptivo: Distribuciones, Scatterplots, Barplots...

In [None]:
utils.value_counts(bees, 'subspecies')

In [None]:
utils.value_counts(bees, 'location')

In [None]:
utils.value_counts(bees, 'zip code')

Voy a usar `zip code` en lugar de `location` porque Athens, Georgia aparece repetido en `location` y, además, el código postal ya es numérico.

In [None]:
utils.value_counts(bees, 'caste')

También voy a descartar `caste` porque es constante.

In [None]:
utils.value_counts(bees, 'pollen_carrying')

In [None]:
bees.dtypes

Voy a descartar `health` porque no está en el conjunto de test.

### Analizo si tengo Nans

In [None]:
bees.isna().sum()

## 3.2 Ver imágenes

In [None]:
utils.plot_images(bees, 'location', [0, 18, 24, 38, 45])

# 4. Clasificación

## 4.1. Data preprocessing
### 4.1.1 Particionamiento

In [None]:
train_bees, val_bees, test_bees = utils.split(bees)

In [None]:
print(val_bees.isna().sum())
print(test_bees.isna().sum())

In [None]:
train_set_zip_code = set(train_bees['zip code'].unique())
val_set_zip_code = set(val_bees['zip code'].unique())
test_set_zip_code = set(test_bees['zip code'].unique())

print(train_set_zip_code == val_set_zip_code)
print(train_set_zip_code == test_set_zip_code)


### 4.1.2 Carga de imágenes

In [None]:
import os

os.chdir("/content/drive/MyDrive/Colab Notebooks/Datasets/bees_dataset")


In [None]:
print(os.getcwd())


In [None]:
print(os.listdir())
os.chdir("./data/imgs")
print(os.listdir())
os.chdir("../../")

In [None]:
# os.chdir("../../")
# print(os.listdir())


In [None]:
train_X, val_X, test_X, train_y, val_y, test_y = utils.load_images_and_target(train_bees,
                                                                              val_bees,
                                                                              test_bees,
                                                                              'subspecies',
                                                                              img_width,
                                                                              img_height,
                                                                              img_channels)

In [None]:
optimizer = 'sgd'
loss = 'categorical_crossentropy'

In [None]:
model1 = Sequential()
model1.add(Flatten(input_shape =(img_height, img_width, img_channels)))
model1.add(Dense(train_y.columns.size, activation = 'softmax'))
model1.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

## 4.2 Entrenamiento

### 4.2.2 Parámetros de transformación de imágenes (data augmentation)

In [None]:
rotation_range = 15      # random rotation, in degrees, between 0 and rotation_range
zoom_range = 0.1         # random zoom
width_shift_range = 0.1  # random horizontal shift (fraction of the total)
height_shift_range = 0.1 # random vertical shift (fraction of the total)
horizontal_flip = True   # random horizontal flip
vertical_flip = True     # random vertical flip

In [None]:
batch_size = 10
epochs = 5
steps_per_epoch = 10
patience = 10
class_weights = utils.class_weights(bees, 'subspecies')

In [None]:
class_weights

In [None]:
training1, model1 = utils.train(model1,
                train_X,
                train_y,
                batch_size = batch_size,
                epochs = epochs,
                validation_data_X = val_X,
                validation_data_y = val_y,
                steps_per_epoch = steps_per_epoch,
                rotation_range = rotation_range,
                zoom_range = zoom_range,
                width_shift_range = width_shift_range,
                height_shift_range = height_shift_range,
                horizontal_flip = horizontal_flip,
                vertical_flip = vertical_flip,
                patience = patience,
                class_weights = class_weights
                               )

## 4.3 Evaluación del modelo

In [None]:
utils.eval_model(training1, model1, test_X, test_y, 'subspecies')

## 4.4 Evaluación y generación de archivo para competencia Kaggle

In [None]:
 df_subspecies = utils.load_test_and_generate_prediction_file(model1, img_width, img_height, img_channels)

In [None]:
df_subspecies

# Transfer Learning

## Qué es Transfer Learning?

Transfer learning o aprendizaje por transferencia es un problema de investigación en el aprendizaje automático que se centra en almacenar el conocimiento adquirido mientras se resuelve un problema y se aplica a un problema diferente pero relacionado.

[Keras Models](https://keras.io/api/applications/)

In [None]:
# example of loading the vgg16 model
from tensorflow.keras.applications.vgg16 import VGG16
# load model

model = VGG16(input_shape=(img_height, img_width, img_channels), include_top=False)
# model = VGG16(input_shape=(224, 224, 3), include_top=True)

# summarize the model
model.summary()

In [None]:
for layer in model.layers[:-3]:
  layer.trainable = False
model.summary()

In [None]:
from tensorflow.keras.models import Model

flat1 = Flatten()(model.layers[-1].output)
class1 = Dense(1024, activation='relu')(flat1)
output = Dense(7, activation='softmax')(class1)

# define new model
model = Model(inputs=model.inputs, outputs=output)

optimizer = 'sgd'
loss = 'categorical_crossentropy'

# compile the model
model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

In [None]:
model.summary()

## Entrenamiento

Parámetros de transformación de imágenes (data augmentation)

In [None]:
rotation_range = 15      # random rotation, in degrees, between 0 and rotation_range
zoom_range = 0.1         # random zoom
width_shift_range = 0.1  # random horizontal shift (fraction of the total)
height_shift_range = 0.1 # random vertical shift (fraction of the total)
horizontal_flip = True   # random horizontal flip
vertical_flip = True     # random vertical flip

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

def computing_class_weights(df, class_name):
    """Return balanced class weights keyed by integer class index.

    The distinct values of ``df[class_name]`` are numbered in the order
    returned by ``np.unique``. Every row gets a 'balanced' sample weight from
    scikit-learn, and the weight reported for a class is the mean of the
    sample weights of its rows.

    Args:
        df: table holding the label column.
        class_name: name of the label column.

    Returns:
        dict mapping class index (int) to its weight.
    """
    labels = df[class_name]
    index_by_name = {name: position for position, name in enumerate(np.unique(labels))}
    encoded = np.array(list(map(index_by_name.__getitem__, labels)))
    per_sample = compute_sample_weight(class_weight='balanced', y=encoded)

    return {
        position: per_sample[encoded == position].mean()
        for position in index_by_name.values()
    }

import tensorflow as tf
from tensorflow.keras import backend as K

def weighted_accuracy(weight_dict):
    """Build a metric function that computes class-weighted accuracy.

    Args:
        weight_dict: mapping ``{class_index: weight}`` whose keys are the
            contiguous integers ``0 .. len(weight_dict) - 1`` (the format
            returned by ``computing_class_weights``).

    Returns:
        A function ``calc_weighted_accuracy(y_true, y_pred)`` that takes
        one-hot / probability tensors with classes on axis 1 and returns
        ``sum(weight * correct) / sum(weight)`` over the batch.
    """
    # Force float32: the weights are NumPy float64 scalars, and a float64
    # tensor cannot be multiplied with the float32 correctness mask below.
    class_weights = tf.constant(
        [weight_dict[i] for i in range(len(weight_dict))], dtype=tf.float32)

    def calc_weighted_accuracy(y_true, y_pred):
        # Collapse one-hot targets / probabilities to class indices.
        y_true_labels = K.argmax(y_true, axis=1)
        y_pred_labels = K.argmax(y_pred, axis=1)

        correct_predictions = K.cast(K.equal(y_true_labels, y_pred_labels), dtype='float32')
        # Each sample is weighted by the weight of its true class.
        weights = K.gather(class_weights, y_true_labels)
        weighted_correct_predictions = correct_predictions * weights

        accuracy = K.sum(weighted_correct_predictions) / K.sum(weights)
        return accuracy

    return calc_weighted_accuracy


In [None]:
batch_size = 10
epochs = 500
steps_per_epoch = 10
patience = 100

class_weights = computing_class_weights(bees, 'subspecies')


In [None]:
class_weights

In [None]:
training_vgg16, model = utils.train(model,
                train_X,
                train_y,
                batch_size = batch_size,
                epochs = epochs,
                validation_data_X = val_X,
                validation_data_y = val_y,
                steps_per_epoch = steps_per_epoch,
                rotation_range = rotation_range,
                zoom_range = zoom_range,
                width_shift_range = width_shift_range,
                height_shift_range = height_shift_range,
                horizontal_flip = horizontal_flip,
                vertical_flip = vertical_flip,
                patience = patience,
                class_weights = class_weights
                               )

In [None]:
trainable_params = ((3*3*512)*1024+1024)+(1024*7+7)
trainable_params

## Evaluación del modelo

In [None]:
utils.eval_model(training_vgg16, model, test_X, test_y, 'subspecies')

# Imbalanced Learning


# Archivo para Kaggle

In [None]:
# Export the Kaggle submission with the VGG16-based `model` trained in this
# section (the baseline `model1` was already exported in section 4.4).
df_subspecies = utils.load_test_and_generate_prediction_file(model, img_width, img_height, img_channels)
df_subspecies

In [None]:
# `minority_images` was referenced without ever being defined. Build it from
# the training split: pick the one-hot label column with the fewest samples
# and keep the images whose label is that class.
# NOTE(review): assumes train_y is a one-hot DataFrame aligned row-by-row with train_X - confirm.
minority_class = train_y.sum(axis=0).idxmin()
minority_images = train_X[train_y[minority_class].to_numpy() == 1]
minority_images.shape


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation pipeline used to synthesise extra samples: random rotation,
# shifts, shear, zoom and horizontal flips; 'nearest' fills the pixels
# uncovered by a transform.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Assumes `minority_images` is an array of minority-class images, one image
# per entry along the first axis - it must be defined before this cell runs.
augmented_images = []
num_augmentations_per_image = 10  # number of augmented copies generated per original image

for img in minority_images:
    img = img.reshape((1,) + img.shape)  # add a leading batch axis of size 1
    i = 0
    for batch in datagen.flow(img, batch_size=1):
        augmented_images.append(batch[0])
        i += 1
        if i >= num_augmentations_per_image:
            break  # stop explicitly after the requested number of copies

# Now, `augmented_images` will contain the augmented images, and you can add these images to your training dataset.

# VGG 19

In [None]:
# Load the VGG19 model without its classifier head (include_top=False),
# sized for this notebook's input images.
from tensorflow.keras.applications.vgg19 import VGG19

model = VGG19(input_shape=(img_height, img_width, img_channels), include_top=False)

# summarize the model
model.summary()

In [None]:
for layer in model.layers[:-3]:
  layer.trainable = False
model.summary()

In [None]:
from tensorflow.keras.models import Model

flat1 = Flatten()(model.layers[-1].output)
class1 = Dense(1024, activation='relu')(flat1)
output = Dense(7, activation='softmax')(class1)

# define new model
model = Model(inputs=model.inputs, outputs=output)

optimizer = 'sgd'
loss = 'categorical_crossentropy'

# compile the model
model.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])

In [None]:
model.summary()

In [None]:
rotation_range = 15      # random rotation, in degrees, between 0 and rotation_range
zoom_range = 0.1         # random zoom
width_shift_range = 0.1  # random horizontal shift (fraction of the total)
height_shift_range = 0.1 # random vertical shift (fraction of the total)
horizontal_flip = True   # random horizontal flip
vertical_flip = True     # random vertical flip

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

def computing_class_weights(df, class_name):
    """Return balanced class weights keyed by integer class index.

    The distinct values of ``df[class_name]`` are numbered in the order
    returned by ``np.unique``. Every row gets a 'balanced' sample weight from
    scikit-learn, and the weight reported for a class is the mean of the
    sample weights of its rows.

    Args:
        df: table holding the label column.
        class_name: name of the label column.

    Returns:
        dict mapping class index (int) to its weight.
    """
    labels = df[class_name]
    index_by_name = {name: position for position, name in enumerate(np.unique(labels))}
    encoded = np.array(list(map(index_by_name.__getitem__, labels)))
    per_sample = compute_sample_weight(class_weight='balanced', y=encoded)

    return {
        position: per_sample[encoded == position].mean()
        for position in index_by_name.values()
    }

In [None]:
batch_size = 10
epochs = 500
steps_per_epoch = 10
patience = 100

class_weights = computing_class_weights(bees, 'subspecies')


In [None]:
class_weights

In [None]:
training_vgg19, model = utils.train(model,
                train_X,
                train_y,
                batch_size = batch_size,
                epochs = epochs,
                validation_data_X = val_X,
                validation_data_y = val_y,
                steps_per_epoch = steps_per_epoch,
                rotation_range = rotation_range,
                zoom_range = zoom_range,
                width_shift_range = width_shift_range,
                height_shift_range = height_shift_range,
                horizontal_flip = horizontal_flip,
                vertical_flip = vertical_flip,
                patience = patience,
                class_weights = class_weights
                               )

In [None]:
utils.eval_model(training_vgg19, model, test_X, test_y, 'subspecies')

In [None]:
# Export the Kaggle submission with the VGG19-based `model` trained in this
# section rather than the baseline `model1`.
df_subspecies = utils.load_test_and_generate_prediction_file(model, img_width, img_height, img_channels)
df_subspecies

# ResNet (ResNet152V2)

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

def computing_class_weights(df, class_name):
    """Return balanced class weights keyed by integer class index.

    The distinct values of ``df[class_name]`` are numbered in the order
    returned by ``np.unique``. Every row gets a 'balanced' sample weight from
    scikit-learn, and the weight reported for a class is the mean of the
    sample weights of its rows.

    Args:
        df: table holding the label column.
        class_name: name of the label column.

    Returns:
        dict mapping class index (int) to its weight.
    """
    labels = df[class_name]
    index_by_name = {name: position for position, name in enumerate(np.unique(labels))}
    encoded = np.array(list(map(index_by_name.__getitem__, labels)))
    per_sample = compute_sample_weight(class_weight='balanced', y=encoded)

    return {
        position: per_sample[encoded == position].mean()
        for position in index_by_name.values()
    }

import tensorflow as tf
from tensorflow.keras import backend as K

def weighted_accuracy(weight_dict):
    """Build a metric function that computes class-weighted accuracy.

    Args:
        weight_dict: mapping ``{class_index: weight}`` whose keys are the
            contiguous integers ``0 .. len(weight_dict) - 1`` (the format
            returned by ``computing_class_weights``).

    Returns:
        A function ``calc_weighted_accuracy(y_true, y_pred)`` that takes
        one-hot / probability tensors with classes on axis 1 and returns
        ``sum(weight * correct) / sum(weight)`` over the batch.
    """
    # Force float32: the weights are NumPy float64 scalars, and a float64
    # tensor cannot be multiplied with the float32 correctness mask below.
    class_weights = tf.constant(
        [weight_dict[i] for i in range(len(weight_dict))], dtype=tf.float32)

    def calc_weighted_accuracy(y_true, y_pred):
        # Collapse one-hot targets / probabilities to class indices.
        y_true_labels = K.argmax(y_true, axis=1)
        y_pred_labels = K.argmax(y_pred, axis=1)

        correct_predictions = K.cast(K.equal(y_true_labels, y_pred_labels), dtype='float32')
        # Each sample is weighted by the weight of its true class.
        weights = K.gather(class_weights, y_true_labels)
        weighted_correct_predictions = correct_predictions * weights

        accuracy = K.sum(weighted_correct_predictions) / K.sum(weights)
        return accuracy

    return calc_weighted_accuracy

In [None]:
base_model = tf.keras.applications.ResNet50V2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))


In [None]:
# Load the ResNet152V2 model without its classifier head (include_top=False),
# sized for this notebook's input images.
from tensorflow.keras.applications.resnet_v2 import ResNet152V2

model_ResNet152V2 = ResNet152V2(input_shape=(img_height, img_width, img_channels), include_top=False)

# summarize the model
model_ResNet152V2.summary()

In [None]:
for layer in model_ResNet152V2.layers[:-6]:
  layer.trainable = False
model_ResNet152V2.summary()

In [None]:
from tensorflow.keras import Model, models, layers, optimizers

# flat1 = Flatten()(model.layers[-1].output)
# class1 = Dense(1024, activation='relu')(flat1)
# output = Dense(7, activation='softmax')(class1)

# # define new model
# model = Model(inputs=model.inputs, outputs=output)
# model.trainable = False

# Stack a new classification head on the partially frozen ResNet152V2 base.
# The base must be `model_ResNet152V2`: at this point `model` still refers to
# the VGG19 classifier from the previous section, and wrapping `model` in
# itself would also make this cell non-idempotent on re-run.
model = models.Sequential([
    model_ResNet152V2,
    layers.GlobalAveragePooling2D(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.5),  # Optional: for regularization
    layers.Dense(7, activation='softmax')  # Adjust for the number of classes in your dataset
])

optimizer = optimizers.Adam(learning_rate=1e-4)
loss = 'categorical_crossentropy'

# compile the model
model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
rotation_range = 180      # random rotation, in degrees, between 0 and rotation_range
zoom_range = 0.1         # random zoom
width_shift_range = 0.1  # random horizontal shift (fraction of the total)
height_shift_range = 0.1 # random vertical shift (fraction of the total)
horizontal_flip = True   # random horizontal flip
vertical_flip = True     # random vertical flip

In [None]:
from sklearn.utils.class_weight import compute_sample_weight

def computing_class_weights(df, class_name):
    """Return balanced class weights keyed by integer class index.

    The distinct values of ``df[class_name]`` are numbered in the order
    returned by ``np.unique``. Every row gets a 'balanced' sample weight from
    scikit-learn, and the weight reported for a class is the mean of the
    sample weights of its rows.

    Args:
        df: table holding the label column.
        class_name: name of the label column.

    Returns:
        dict mapping class index (int) to its weight.
    """
    labels = df[class_name]
    index_by_name = {name: position for position, name in enumerate(np.unique(labels))}
    encoded = np.array(list(map(index_by_name.__getitem__, labels)))
    per_sample = compute_sample_weight(class_weight='balanced', y=encoded)

    return {
        position: per_sample[encoded == position].mean()
        for position in index_by_name.values()
    }

In [None]:
batch_size = 10
epochs = 1500
steps_per_epoch = 10
patience = 100

class_weights = computing_class_weights(bees, 'subspecies')


In [None]:
class_weights

In [None]:
training_resnet152V2, model = utils.train(model,
                train_X,
                train_y,
                batch_size = batch_size,
                epochs = epochs,
                validation_data_X = val_X,
                validation_data_y = val_y,
                steps_per_epoch = steps_per_epoch,
                rotation_range = rotation_range,
                zoom_range = zoom_range,
                width_shift_range = width_shift_range,
                height_shift_range = height_shift_range,
                horizontal_flip = horizontal_flip,
                vertical_flip = vertical_flip,
                patience = patience,
                class_weights = class_weights
                               )

In [None]:
utils.eval_model(training_resnet152V2, model, test_X, test_y, 'subspecies')

In [None]:
df_subspecies = utils.load_test_and_generate_prediction_file(model, img_width, img_height, img_channels)
df_subspecies

# Combinando con la metadata
Combinando la información de las imágenes con la metadata disponible

In [None]:
train_bees, val_bees, test_bees = utils.split(bees)

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Fit the zip-code encoder on the training split only.
# handle_unknown='ignore' encodes a zip code that was not seen during fitting
# as an all-zero row instead of raising, so validation/test/Kaggle rows with a
# new zip code can still be transformed.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_bees[['zip code']])

def preprocess_metadata(df, encoder):
    """Build the numeric metadata table for a set of rows.

    Args:
        df: table with a 'zip code' column and a 'pollen_carrying' column.
        encoder: fitted encoder exposing ``transform`` and
            ``get_feature_names_out`` (used on the 'zip code' column).

    Returns:
        DataFrame with a fresh 0..n-1 index: the encoded zip-code columns
        followed by 'pollen_carrying' cast to int.
    """
    encoded_zip = encoder.transform(df[['zip code']])
    zip_columns = encoder.get_feature_names_out(['zip code'])
    one_hot_zip = pd.DataFrame(encoded_zip, columns=zip_columns).reset_index(drop=True)

    # Drop the original index so both pieces line up positionally on concat.
    pollen_flag = df['pollen_carrying'].reset_index(drop=True).astype(int)

    return pd.concat([one_hot_zip, pollen_flag], axis=1)

def load_images_and_target_with_metadata(train_bees, val_bees, test_bees, y_field_name, img_width, img_height, img_channels, encoder):
    """
    Load images, one-hot labels and metadata for the three splits.

    For each split (already partitioned by the caller) the images named in the
    'file' column are read from './data/imgs/' with ``utils.read_img`` and
    stacked into one NumPy array, the labels come from
    ``utils.onehot_encoding`` wrapped in a DataFrame, and the metadata comes
    from ``preprocess_metadata`` using ``encoder``.

    @return: train images, validation images, test images,
             train labels, validation labels, test labels,
             train metadata, validation metadata, test metadata
    """
    img_folder = './data/imgs/'

    def _images(split):
        # One array per file, stacked along a new leading axis.
        return np.stack(split['file'].apply(
            lambda name: utils.read_img(name, img_folder, img_width, img_height, img_channels)))

    def _labels(split):
        return pd.DataFrame(utils.onehot_encoding(split, y_field_name))

    # Train data
    train_X, train_y = _images(train_bees), _labels(train_bees)
    # Validation data
    val_X, val_y = _images(val_bees), _labels(val_bees)
    # Test data
    test_X, test_y = _images(test_bees), _labels(test_bees)

    train_meta, val_meta, test_meta = [
        preprocess_metadata(split, encoder)
        for split in (train_bees, val_bees, test_bees)
    ]

    return (train_X, val_X, test_X, train_y, val_y, test_y, train_meta, val_meta, test_meta)


In [None]:
# `encoder` (fitted above on the training zip codes) is a required argument of
# load_images_and_target_with_metadata; omitting it raises a TypeError.
train_X, val_X, test_X, train_y, val_y, test_y, train_meta, val_meta, test_meta = load_images_and_target_with_metadata(
    train_bees,
    val_bees,
    test_bees,
    'subspecies',
    img_width,
    img_height,
    img_channels,
    encoder)

In [None]:
print(len(train_bees))
print(len(val_bees))
print(len(test_bees))
print("-------------------------------------")
print(f"Size of metadata {train_meta.shape[0]}. Size of images: {train_X.shape[0]}")
print(f"Size of metadata {val_meta.shape[0]}. Size of images: {val_X.shape[0]}")
print(f"Size of metadata {test_meta.shape[0]}. Size of images: {test_X.shape[0]}")

print(test_meta.shape)

In [None]:
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import concatenate
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils.class_weight import compute_sample_weight


def create_combined_model(image_model, metadata_model,num_classes):
    """Join an image branch and a metadata branch into one classifier.

    The outputs of both models are concatenated and fed to a single softmax
    Dense layer with ``num_classes`` units.

    Returns:
        A Model taking ``[image_model.input, metadata_model.input]`` and
        producing the softmax output.
    """
    merged_features = concatenate([image_model.output, metadata_model.output])

    # More Dense layers could be inserted here before the output layer.
    class_probabilities = Dense(num_classes, activation='softmax')(merged_features)

    return Model(inputs=[image_model.input, metadata_model.input],
                 outputs=class_probabilities)

def combined_gen(image_gen, meta_data):
    """Pair every ``(x, y)`` item of ``image_gen`` with ``meta_data``.

    Yields ``([x, meta_data], y)``. ``meta_data`` is passed through whole and
    unchanged on every item - it is not sliced to match the batch.
    """
    yield from (([images, meta_data], targets) for images, targets in image_gen)

def combined_generator(img_data, meta_data, labels, batch_size):
    """Endlessly yield aligned ``([images, metadata], labels)`` batches.

    The three inputs are sliced with the same row window, walking the data in
    order in steps of ``batch_size`` (the last batch of a pass may be shorter)
    and starting over once the end is reached. The generator never terminates.
    """
    total = img_data.shape[0]
    batch_starts = range(0, total, batch_size)
    while True:
        for lo in batch_starts:
            hi = min(lo + batch_size, total)
            yield [img_data[lo:hi], meta_data[lo:hi]], labels[lo:hi]

def train_with_metadata(model,
                train_X,
                train_y,
                train_meta,
                val_meta,
                batch_size,
                epochs,
                validation_data_X,
                validation_data_y,
                steps_per_epoch,
                rotation_range,
                patience,
                class_weights=None,
                zoom_range=0.0,
                width_shift_range=0.0,
                height_shift_range=0.0,
                horizontal_flip=False,
                vertical_flip=False):
    """Train a two-input (image + metadata) model on augmented images.

    Previously the augmentation generator was built but never passed to
    ``fit``, the batch size was hard-coded to 32 regardless of ``batch_size``,
    and the augmentation settings other than ``rotation_range`` were read from
    notebook globals. All of these are now driven by the arguments.

    Args:
        model: compiled Keras model taking ``[images, metadata]`` as input.
        train_X: training images (array with samples on the first axis).
        train_y: training labels, row-aligned with ``train_X``.
        train_meta: training metadata, row-aligned with ``train_X``.
        val_meta: validation metadata, row-aligned with ``validation_data_X``.
        batch_size: number of samples per training batch.
        epochs: maximum number of epochs.
        validation_data_X: validation images.
        validation_data_y: validation labels.
        steps_per_epoch: number of batches drawn per epoch.
        rotation_range: random rotation range in degrees.
        patience: epochs without improvement of the training loss before
            early stopping.
        class_weights: optional ``{class_index: weight}`` dict forwarded to
            ``fit`` as ``class_weight``.
        zoom_range, width_shift_range, height_shift_range, horizontal_flip,
        vertical_flip: further ImageDataGenerator augmentation settings;
            the defaults disable them.

    Returns:
        ``(training, model)``: the object returned by ``model.fit`` and the
        model itself.
    """
    datagen = ImageDataGenerator(rotation_range=rotation_range,
                                 zoom_range=zoom_range,
                                 width_shift_range=width_shift_range,
                                 height_shift_range=height_shift_range,
                                 horizontal_flip=horizontal_flip,
                                 vertical_flip=vertical_flip)

    # flow() accepts a tuple (images, extra): the images are augmented and the
    # extra array is passed through unmodified with the same sample indices,
    # so every batch is ([augmented_images, metadata], labels).
    combined_train_gen = datagen.flow((train_X, np.asarray(train_meta)),
                                      np.asarray(train_y),
                                      batch_size=batch_size)

    # Stop when the training loss stalls and roll back to the best weights.
    earlystopper = EarlyStopping(monitor='loss', patience=patience, verbose=1, restore_best_weights=True)

    training = model.fit(
        combined_train_gen,
        validation_data=([validation_data_X, np.asarray(val_meta)], np.asarray(validation_data_y)),
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        callbacks=[earlystopper],
        class_weight=class_weights
    )
    return training, model


In [None]:
# Load the ResNet50V2 model without its classifier head (include_top=False),
# sized for this notebook's input images.
from tensorflow.keras.applications.resnet_v2 import ResNet50V2

model_ResNet50V2 = ResNet50V2(input_shape=(img_height, img_width, img_channels), include_top=False)

In [None]:
for layer in model_ResNet50V2.layers[:-3]:
  layer.trainable = False
# model_ResNet152V2.summary()

In [None]:
from tensorflow.keras import Model, models, layers, optimizers

# flat1 = Flatten()(model.layers[-1].output)
# class1 = Dense(1024, activation='relu')(flat1)
# output = Dense(7, activation='softmax')(class1)

# # define new model
# model = Model(inputs=model.inputs, outputs=output)
# model.trainable = False

image_model = models.Sequential([
    model_ResNet50V2,
    layers.GlobalAveragePooling2D(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.5),  # Optional: for regularization
    # layers.Dense(7, activation='softmax')  # Adjust for the number of classes in your dataset
])

optimizer = optimizers.Adam(learning_rate=1e-5)
loss = 'categorical_crossentropy'

# compile the model
image_model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy'])
image_model.summary()

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.regularizers import l2 # Agrego regularización porque al agregar la metadata tomaba demasiado en cuenta estos valores.

# Define the metadata branch. Its input width comes from the prepared
# metadata table instead of a hard-coded 8, so it stays correct when the
# number of encoded zip-code columns changes.
metadata_input = Input(shape=(train_meta.shape[1],))
metadata_layer = Dense(32, activation='relu', kernel_regularizer=l2(0.01))(metadata_input)
metadata_layer = Dropout(0.5)(metadata_layer)
metadata_model = Model(inputs=metadata_input, outputs=metadata_layer)

# The number of classes is the number of one-hot label columns.
combined_model = create_combined_model(image_model, metadata_model, num_classes=train_y.shape[1])
combined_model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

metadata_model.summary()
combined_model.summary()

print(train_X.shape)
print(train_meta.shape)

print(image_model.output_shape)
print(metadata_model.output_shape)

In [None]:
def print_combined_gen_shapes(train_X, train_meta, batch_size=32, num_batches=5):
    """Print the image/metadata shapes of the first ``num_batches`` batches.

    Batches are consecutive row windows of ``batch_size`` rows, clipped at the
    number of rows in ``train_X``; one line is printed per batch.
    """
    total = train_X.shape[0]
    for batch_no in range(num_batches):
        window = slice(batch_no * batch_size, min((batch_no + 1) * batch_size, total))
        print(train_X[window].shape, train_meta[window].shape)
# Print the shapes of the batches
print_combined_gen_shapes(train_X, train_meta)


In [None]:
import matplotlib.pyplot as plt
import warnings
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
from sklearn.utils.class_weight import compute_sample_weight

categories = {}
class_indices = {}

def computing_class_weights(df, class_name):
    """Return balanced class weights keyed by integer class index.

    The distinct values of ``df[class_name]`` are numbered in the order
    returned by ``np.unique``. Every row gets a 'balanced' sample weight from
    scikit-learn, and the weight reported for a class is the mean of the
    sample weights of its rows.

    Args:
        df: table holding the label column.
        class_name: name of the label column.

    Returns:
        dict mapping class index (int) to its weight.
    """
    labels = df[class_name]
    index_by_name = {name: position for position, name in enumerate(np.unique(labels))}
    encoded = np.array(list(map(index_by_name.__getitem__, labels)))
    per_sample = compute_sample_weight(class_weight='balanced', y=encoded)

    return {
        position: per_sample[encoded == position].mean()
        for position in index_by_name.values()
    }

def class_columns(df):
    """Return the 'subspecies' and 'health' columns as a 2-column array.

    Column 0 holds 'subspecies' and column 1 holds 'health', one row per row
    of ``df``.
    """
    label_arrays = [np.asarray(df[column]) for column in ('subspecies', 'health')]
    return np.stack(label_arrays, axis=1)

def setup_onehot(df):
    """Fit the module-level one-hot encoder and record its column layout.

    Fits ``enc`` on the subspecies/health columns of ``df``, then fills the
    module-level ``categories`` dict with the fitted category values and
    ``class_indices`` with the positions each label occupies in the encoded
    output: subspecies columns first, health columns right after them.
    """
    enc.fit(class_columns(df))

    subspecies_values, health_values = enc.categories_[0], enc.categories_[1]
    categories['subspecies'] = subspecies_values
    categories['health'] = health_values

    n_subspecies = len(subspecies_values)
    class_indices['subspecies'] = np.arange(n_subspecies)
    class_indices['health'] = n_subspecies + np.arange(len(health_values))

def onehot_encoding(df, class_name):
    """Return the one-hot columns of ``class_name`` for the rows of ``df``.

    Raises:
        ValueError: if ``setup_onehot`` has not populated ``categories`` yet.
    """
    if not categories:
        raise ValueError('Run setup_onehot first')

    encoded = enc.transform(class_columns(df)).toarray()
    # Keep only the columns that belong to the requested label.
    return encoded[:, class_indices[class_name]]

def read_data():
    """Read the train/test CSV files and prepare the label encoding.

    Reads './data/bees_train.csv' and './data/bees_test.csv', fits the one-hot
    encoder on the training table via ``setup_onehot`` and computes the
    class weights for 'subspecies'.

    Returns:
        ``(bees, bees_test_for_evaluation, class_weights)``.
    """
    train_dtypes = {'subspecies':'category', 'health':'category','caste':'category'}
    test_dtypes = {'caste':'category'}

    bees = pd.read_csv('./data/bees_train.csv', index_col=False, dtype=train_dtypes)
    bees_test_for_evaluation = pd.read_csv('./data/bees_test.csv', index_col=False, dtype=test_dtypes)

    setup_onehot(bees)

    return bees, bees_test_for_evaluation, computing_class_weights(bees, 'subspecies')



def eval_model(training, model, test_X, test_meta, test_y, field_name):
    """
    Evaluate a two-input (image + metadata) model: plots and classification report.

    Plots the training/validation loss and accuracy curves, plots a per-class
    accuracy bar chart on the test set, prints a classification report and
    prints the loss/accuracy returned by ``model.evaluate``.

    @param training: model training history (its ``history`` dict must hold
                     'loss', 'val_loss', 'accuracy' and 'val_accuracy')
    @param model: trained model; called with ``[test_X, test_meta]``
    @param test_X: test image features
    @param test_meta: test metadata features, row-aligned with test_X
    @param test_y: one-hot test labels (presumably a pandas DataFrame, since
                   ``.values`` and ``.sum()`` / ``.plot`` are used - confirm)
    @param field_name: label name shown on plots and used to look up the
                       class names in ``categories``
    """
    ## Training curves: loss on top, accuracy below
    f, ax = plt.subplots(2,1, figsize=(5,5))
    ax[0].plot(training.history['loss'], label="Loss")
    ax[0].plot(training.history['val_loss'], label="Validation loss")
    ax[0].set_title('%s: loss' % field_name)
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('Loss')
    ax[0].legend()

    # Accuracy
    ax[1].plot(training.history['accuracy'], label="Accuracy")
    ax[1].plot(training.history['val_accuracy'], label="Validation accuracy")
    ax[1].set_title('%s: accuracy' % field_name)
    ax[1].set_xlabel('Epoch')
    ax[1].set_ylabel('Accuracy')
    ax[1].legend()
    plt.tight_layout()
    plt.show()

    # Accuracy by category: the model needs both inputs to predict
    test_pred = model.predict([test_X, test_meta])

    # Per class: count of samples predicted with probability > 0.5 that truly
    # belong to the class, divided by the number of samples of that class.
    acc_by_category = np.logical_and((test_pred > 0.5), test_y).sum()/test_y.sum()
    acc_by_category.plot(kind='bar', title='Accuracy by %s' % field_name)
    plt.ylabel('Accuracy')
    plt.show()

    # Print metrics
    print("Classification report")
    # Collapse probabilities / one-hot labels to class indices
    test_pred = np.argmax(test_pred, axis=1)
    test_truth = np.argmax(test_y.values, axis=1)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        print(metrics.classification_report(test_truth, test_pred, target_names=categories[field_name]))

    # Overall loss and accuracy on the test set (both inputs are required)
    test_res = model.evaluate([test_X, test_meta], test_y.values, verbose=0)
    print('Loss function: %s, accuracy:' % test_res[0], test_res[1])

In [None]:
_, _, class_weights = read_data()
class_weights

In [None]:
training, trained_model = train_with_metadata(
    model=combined_model,
    train_X=train_X,
    train_y=train_y,
    train_meta=train_meta,
    val_meta=val_meta,
    batch_size=10,
    epochs=200,
    validation_data_X=val_X,
    validation_data_y=val_y,
    steps_per_epoch=len(train_X) // 32,
    rotation_range=90,
    patience=5,
    class_weights=class_weights
)


In [None]:
eval_model(training, trained_model, test_X, test_meta, test_y, 'subspecies')

In [None]:
### load_test
img_folder='./data/imgs/'
# Image processing
import imageio
import skimage
import skimage.io
import skimage.transform

def read_img(file, img_folder, img_width, img_height, img_channels):
    """
    Read an image, resize it and keep its first ``img_channels`` channels.

    @param file: file name without full path
    @param img_folder: folder prefix concatenated directly with ``file``
    @param img_width: target width in pixels
    @param img_height: target height in pixels
    @param img_channels: number of leading channels to keep
    @return: array of shape (img_height, img_width, <= img_channels)
    """
    with warnings.catch_warnings():
        # Silence warnings emitted while reading / resizing.
        warnings.simplefilter("ignore")
        img = skimage.io.imread(img_folder + file)
        # resize() takes output_shape as (rows, cols), i.e. (height, width);
        # the previous (width, height) order was only correct for square targets.
        img = skimage.transform.resize(img, (img_height, img_width), mode='reflect')
    return img[:,:,:img_channels]

def load_test(img_width, img_height, img_channels, encoder):
    """Load the competition test table with its images and metadata.

    Reads './data/bees_test.csv', prints its shape and columns, loads every
    image named in the 'file' column from the module-level ``img_folder`` and
    builds the metadata table with ``preprocess_metadata``.

    Returns:
        ``(X_test_partition, test_images, test_meta)``.
    """
    X_test_partition = pd.read_csv('./data/bees_test.csv', index_col=False, dtype={'caste':'category'})

    print("X_test_partition shape:", X_test_partition.shape)
    print("X_test_partition columns:", X_test_partition.columns)

    # One image array per row, stacked along a new leading axis.
    loaded = [
        read_img(name, img_folder, img_width, img_height, img_channels)
        for name in X_test_partition['file']
    ]
    test_images = np.stack(loaded)

    # preprocess metadata using the encoder
    test_meta = preprocess_metadata(X_test_partition, encoder)

    return X_test_partition, test_images, test_meta

def predict_test(model, test_images, test_meta):
    """Predict class indices with a two-input model.

    Calls ``model.predict([test_images, test_meta])`` and returns the index of
    the highest score along axis 1 as a column vector of shape (n, 1).
    """
    probabilities = model.predict([test_images, test_meta])
    winners = np.argmax(probabilities, axis=1)
    return np.reshape(winners, (-1, 1))

def gen_csv_file(test_ids, pred, class_name):
    """Write the predictions to 'test_<class_name>.csv' and return them.

    ``test_ids`` and ``pred`` are paired element-wise into an (n, 2) table
    with columns 'id' and 'expected'; the predicted indices are then mapped to
    the names stored in the module-level ``categories[class_name]``.

    Returns:
        The DataFrame that was written to disk.
    """
    id_pred_pairs = np.stack((test_ids, pred), axis=-1).reshape([-1, 2])

    df = pd.DataFrame(id_pred_pairs, columns=['id','expected'])

    # Series index = class index, value = class name.
    index_to_name = pd.Series(categories[class_name])
    df['expected'] = df['expected'].map(index_to_name)

    df.to_csv("test_" + str(class_name) + ".csv", index = False, index_label = False)
    return df

def load_test_and_generate_prediction_file(model, img_width, img_height, img_channels,encoder):
    """Predict 'subspecies' for the competition test set and write the CSV.

    Loads the test table, images and metadata with ``load_test``, predicts
    with ``predict_test`` and hands the ids and predictions to
    ``gen_csv_file``.

    Returns:
        The DataFrame returned by ``gen_csv_file``.
    """
    X_test_partition, test_images, test_meta = load_test(img_width, img_height, img_channels,encoder)
    pred = predict_test(model, test_images, test_meta)

    # Ids as a column vector, matching the shape of the predictions.
    id_column = np.array(X_test_partition['id']).reshape(-1,1)

    return gen_csv_file(id_column, pred, 'subspecies')


In [None]:
print(os.getcwd())


In [None]:
print(os.listdir())
os.chdir("./data/imgs")
print(os.listdir())
os.chdir("../../")

In [None]:
test_meta = test_meta.reshape(-1, 1) if test_meta.ndim == 1 else test_meta


In [None]:
missing_cols = set(train_meta.columns) - set(test_meta.columns)
print("Missing columns:", missing_cols)


In [None]:
df_subspecies = load_test_and_generate_prediction_file(trained_model, img_width, img_height, img_channels, encoder)
df_subspecies