In [2]:
#The purpose of the preprocess_input function is to preprocess the input images before feeding them to an EfficientNet model
from keras.applications.efficientnet import preprocess_input
#This line imports the `EfficientNetB0` model from Keras' applications module
from keras.applications import EfficientNetB0
#This imports the `Sequential` model from Keras. A Sequential model is a stack of layers.
#It's a basic form of neural network model in Keras where you can simply add layers to the model in sequence.
#This is a "linear" arrangement in the sense that the data flows through the layers in a single path, without branching or merging.
from keras.models import Sequential
# - This line imports specific layers that you can add to your neural network:
#     - `GlobalAveragePooling2D`: A layer that averages each feature map to a single number.
#     - `Dropout`: This layer randomly sets a fraction of input units to 0 at each update during training
from keras.layers import GlobalAveragePooling2D, Dropout, Dense
#Keras might not have every possible metric or loss function you need and although the RMSE, for instance,
#is a standard metric for regression problems, it is not directly available as a built-in function in Keras.
#Keras does not provide "low-level" operations such as tensor multiplication and convolution.
#Instead, it relies on the tensor library: `keras.backend` to handle these operations.
#We are using that library to define our custom loss function.
#By using `keras.backend`, your custom function remains compatible with whichever backend Keras is using.
#This means you can write your code once and it will work whether you're using TensorFlow, Theano, or any other backend supported by Keras.
# This allows for more flexible and portable code.
import keras.backend as backend
#ImageDataGenerator is a class in Keras used for real-time data augmentation.
#'img_to_array' is a utility function that converts a loaded image (in the form of a PIL image or a similar object) into a NumPy array.
#This conversion is necessary because deep learning models in Keras work with data in the form of NumPy arrays.
from keras.preprocessing.image import ImageDataGenerator, img_to_array
#The os module provides a way of using operating system-dependent functionality.
#It allows you to interact with the operating system in various ways, such as navigating the file system, reading, and writing files,
#querying and setting environment variables, and executing system commands.
import os
#cv2 provides tools that are essential for many tasks in the field of computer vision and image processing.
import cv2
#Import the ADAM optimizer
from keras.optimizers import Adam
#Remember that the Kaggle challenge includes a CSV file with the "labels" we are to predict
import pandas as pd

In [3]:
#The probability distributions for the classifications for each of the training images comes in a CSV file.
#The columns of the CSV file are: "GalxaxyID, Class1.1, Class1.2, ..."
classes = [
    'Class1.1', 'Class1.2', 'Class1.3', 'Class2.1', 'Class2.2', 'Class3.1',
    'Class3.2', 'Class4.1', 'Class4.2', 'Class5.1', 'Class5.2', 'Class5.3',
    'Class5.4', 'Class6.1', 'Class6.2', 'Class7.1', 'Class7.2', 'Class7.3',
    'Class8.1', 'Class8.2', 'Class8.3', 'Class8.4', 'Class8.5', 'Class8.6',
    'Class8.7', 'Class9.1', 'Class9.2', 'Class9.3', 'Class10.1', 'Class10.2',
    'Class10.3', 'Class11.1', 'Class11.2', 'Class11.3', 'Class11.4',
    'Class11.5', 'Class11.6'
]

In [4]:
#This line sets the variable DIR to the path of the CSV file provided by the Kaggle challenge, this
#CSV file contains the entries of the "classification" vectors for each galaxy in the training set
#DIR = "/content/drive/MyDrive/DL_project/training_solutions_rev1.csv"
DIR = 'C:/Users/usuario/Desktop/DL_project_Jupyter/training_solutions_rev1.csv'

#This line sets the variable train_path to the path of a directory that contains the training images
#train_path = "/content/drive/Othercomputers/Mi portátil/images_training_rev1"
train_path = 'C:/Users/usuario/Desktop/DL_project_Jupyter/images_training_rev1'

In [5]:
#This function takes a single argument fn (short for "filename") and returns the filename with ".jpg" appended to it
#This function we are going to use in order to read the images
def append_ext(fn):
    return fn + ".jpg"

In [6]:
#We read the dataset whose path is stored by the "DIR" variable
traindf = pd.read_csv(DIR)
#We take the 'GalaxyID' column of the DataFrame, convert it to string type (using .astype(str)),
#and then applies the append_ext function to each element in the column.
#This is done using the .apply() method, which applies a function along an axis of the DataFrame.
#The result is that each entry in the new 'id' column is the corresponding GalaxyID with ".jpg" appended to it.
#hence, "id" will contain the filename of each image
traindf["id"] = traindf['GalaxyID'].astype(str).apply(append_ext)

In [7]:
#The cropping we explained above
def random_input(img):
    #By using [:2], we ignore the number of channels and keep just the height and width.
    shape = img.shape[:2]
    #These lines calculate one-fourth of the height and width of the image, respectively.
    left = int(shape[0]/4)
    top = int(shape[1]/4)
    #This line crops the image to a central region.
    #It selects a square from the image that starts at (left, top) and extends to three times the value of left and top.
    img = img[left:left*3,top:top*3,:]
    #After cropping, the image is resized back to its original dimensions
    #interpolation=cv2.INTER_CUBIC argument specifies the interpolation method to be cubic, which is a method that generally provides good results.
    image = cv2.resize(img, shape, interpolation = cv2.INTER_CUBIC)
    #This line converts the resized image into a NumPy array using the img_to_array function.
    #This conversion is necessary because Keras models expect input in the form of NumPy arrays.
    image = img_to_array(image)

    #Apply EfficientNetB0 preprocess_input
    return preprocess_input(image)

In [8]:
datagen = ImageDataGenerator(
    #This parameter specifies that each image will be randomly rotated by a degree between -90 and +90.
    rotation_range=90,
    #These parameters allow for random horizontal and vertical shifts of the image.
    #`width_shift_range=0.1` means the image will be shifted horizontally by up to 10% of its width.
    #Similarly, `height_shift_range=0.1` allows for up to 10% vertical shift.
    width_shift_range=0.1,
    height_shift_range=0.1,
    #This parameter randomly changes the brightness of the image. The brightness will be adjusted by a factor chosen from the range [0.9, 1.2]
    #Since pixel values must be within the range 0 to 255, if the multiplication results in a value outside this range, it will be clipped to fit within it.
    brightness_range = (0.9, 1.2),
    #These parameters enable random flipping of the images horizontally and vertically.
    horizontal_flip=True,
    vertical_flip=True,
    #This parameter is used to reserve a portion of the images for validation. In this case, 15% (`0.15`) of the images will be used for validation.
    validation_split = 0.15,
    # Here, you can specify a custom preprocessing function to apply to each image after the augmentation and before feeding it to the model.
    preprocessing_function = random_input,
)

#Setting the validation_split parameter in both ImageDataGenerator instances
#(datagen for training data and valid_datagen for validation data) is a common practice
#when you have a single dataset and you want to split it into training and validation sets automatically.
#You will understand this once you review the "flow_from_dataframe" class, there you will see that
#each ImageDataGenerator takes care of generating either the training set or the validation set
#I need to specify the percentages for both given that there is no a priori communcation between them

#We an instance of the ImageDataGenerator class from Keras, which is specifically configured for generating validation data.
valid_datagen=ImageDataGenerator(validation_split=0.15, preprocessing_function = random_input)

In [9]:
traindf

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6,id
0,100008,0.383147,0.616853,0.000000,0.000000,0.616853,0.038452,0.578401,0.418398,0.198455,...,0.279952,0.138445,0.000000,0.000000,0.092886,0.000000,0.000000,0.0,0.325512,100008.jpg
1,100023,0.327001,0.663777,0.009222,0.031178,0.632599,0.467370,0.165229,0.591328,0.041271,...,0.000000,0.131378,0.459950,0.000000,0.591328,0.000000,0.000000,0.0,0.000000,100023.jpg
2,100053,0.765717,0.177352,0.056931,0.000000,0.177352,0.000000,0.177352,0.000000,0.177352,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,100053.jpg
3,100078,0.693377,0.238564,0.068059,0.000000,0.238564,0.109493,0.129071,0.189098,0.049466,...,0.094549,0.000000,0.094549,0.189098,0.000000,0.000000,0.000000,0.0,0.000000,100078.jpg
4,100090,0.933839,0.000000,0.066161,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,100090.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61573,999948,0.510379,0.489621,0.000000,0.059207,0.430414,0.000000,0.430414,0.226257,0.204157,...,0.226257,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.226257,999948.jpg
61574,999950,0.901216,0.098784,0.000000,0.000000,0.098784,0.000000,0.098784,0.000000,0.098784,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,999950.jpg
61575,999958,0.202841,0.777376,0.019783,0.116962,0.660414,0.067245,0.593168,0.140022,0.520391,...,0.000000,0.090673,0.049349,0.000000,0.067726,0.000000,0.000000,0.0,0.072296,999958.jpg
61576,999964,0.091000,0.909000,0.000000,0.045450,0.863550,0.022452,0.841098,0.795330,0.068220,...,0.068398,0.318132,0.408799,0.227464,0.408799,0.090668,0.023065,0.0,0.045334,999964.jpg


In [10]:
#We create an instance of a data generator for training data using the flow_from_dataframe method of the previously defined ImageDataGenerator instance (datagen).

#The dataframe parameter specifies the pandas DataFrame (traindf) that contains our training data.
#This DataFrame should have two columns: one for the image identifiers and one for the labels.

#directory specifies the path to the directory where the images are stored (train_path).
#The generator will look for the images mentioned in the traindf DataFrame in this directory.

#x_col specifies the name of the column in traindf that contains the image file names

#y_col specifies the column or columns that contain the labels. Since we are working on a multi-label classification problem
#this parameter should be set to a list of column names in traindf with the different class names.

#subset is used to specify which part of the data to use. By setting it to "training", you indicate that
#the generator should use the part of the data designated for training (based on the validation_split defined in the ImageDataGenerator).

#batch_size=70: This sets the size of the batches of data to be generated. Each batch will contain 70 images (and their corresponding labels).

#The seed parameter sets the random seed for shuffling and transformations, ensuring reproducibility of your batches.

#No shuffle is applied, if you think about it, it is not necessary for us.

#The class_mode parameter specifies how the labels are represented. Setting it to "raw" means that the labels will be provided as they are in the DataFrame.
#For instance, if class_mode="categorical", and you have images classified into 'cat', 'dog', and 'bird',
#the labels will be converted into one-hot encoded format, like [1, 0, 0] for 'cat', [0, 1, 0] for 'dog', and [0, 0, 1] for 'bird'.

#target_size sets the dimensions to which all images found will be resized. In this case, images will be resized to 224x224 pixels.

train_generator = datagen.flow_from_dataframe(
    dataframe=traindf,
    directory=train_path,
    x_col="id",
    y_col=classes,
    subset="training",
    batch_size=70,
    seed=123,
    shuffle=False,
    class_mode="raw",
    target_size=(224,224))


#We do something analogous for the valid_generator class
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=traindf,
    directory=train_path,
    x_col="id",
    y_col=classes,
    subset="validation",
    batch_size=70,
    seed=123,
    shuffle=False,
    class_mode="raw",
    target_size=(224, 224))

Found 52342 validated image filenames.
Found 9236 validated image filenames.


In [11]:
#The following line calculates the number of steps (or batches) per epoch during the training process.
#The .n attribute of a Keras data generator gives you the total number of images (or samples) in the dataset that the generator is drawing from.
#train_generator.batch_size is the number of samples that will be processed in each batch.
#The // operator in Python performs integer (or floor) division
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size

#And we do something analogous for the validation set
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

In [12]:
#The Kaggle Galaxy Zoo challenge asked participants to use the Root Mean Square Error (RMSE) of their predictions to evaluate the performance of their models:
def rmse(y_true, y_pred):
        return backend.sqrt(backend.mean(backend.square(y_pred - y_true)))

In [13]:
#Here we construct a neural network model using Keras utilizing the EfficientNetB0 architecture as the base model.
def build_model():
    #This line initializes an EfficientNetB0 model pre-trained on the ImageNet dataset.
    #weights='imagenet' indicates that the model should be loaded with weights trained on the ImageNet dataset.
    #include_top=False means that the top layer of the model (a fully connected layer for classification) is not included. This allows for custom layers to be added for specific tasks.
    #input_shape=(224, 224, 3) sets the shape of the input images to 224x224 pixels with 3 color channels (RGB).
    eff1 = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    #This "freezes" these layers, meaning their weights will not be updated during training.
    #This is a common practice when using a pre-trained model as a feature extractor,
    #as it allows the model to maintain the knowledge it has gained from the original training dataset (ImageNet in this case).
    for layer in eff1.layers:
        layer.trainable = False

    #model = Sequential(): This line initializes a new Sequential model.
    model = Sequential()
    #model.add(eff1): Adds the EfficientNetB0 model as the base of the new model.
    model.add(eff1)
    #model.add(GlobalAveragePooling2D()): This layer applies global average pooling to the output of EfficientNetB0.
    model.add(GlobalAveragePooling2D())
    #model.add(Dropout(0.5)): This layer randomly sets input units to 0 with a frequency of 0.5 at each instance (image) during training.
    model.add(Dropout(0.5))
    #model.add(Dense(64, activation='relu')): Adds a densely-connected Neural Network layer with 64 units and ReLU (Rectified Linear Unit) activation.
    model.add(Dense(64, activation='relu'))
    #model.add(Dense(37, activation='sigmoid')): Finally, adds a Dense layer with 37 units and a sigmoid activation function.
    model.add(Dense(37, activation='sigmoid'))

    return model

In [14]:
#The line model = build_model() is calling the build_model function that you defined earlier and storing the returned model in the variable model
model = build_model()

In [15]:
from keras.models import load_model

# Load the weights from the last checkpoint
model.load_weights('C:/Users/usuario/Desktop/DL_project_Jupyter/weights_efficientnetB0_justClassifiers.hdf5')

#When you load the weights with model.load_weights(), it only affects the weights of the layers, 
#not their trainable status. So the pre-trained EfficientNetB0 base will remain frozen, 
#and only the weights of the classifier layers at the end (which were trained by you) will be loaded.

In [16]:
# Unfreeze the EfficientNetB0 layers for full model training

#model.layers[0]: This retrieves the first layer of the Sequential model. 
#Since the first thing you added to your Sequential model was the EfficientNetB0 model, 
#model.layers[0] is the EfficientNetB0 model itself.

for layer in model.layers[0].layers:
    layer.trainable = True
    
#You have to do it this way given that so far we have just saved the "best weights" for the classifier

In [17]:
#This function configures the model with a loss function, an optimizer, and one or more metrics for evaluation.
#In our case, the rmse function defined above is used as the loss function
#We are using the Adam optimizer, lr=1.5e-4 sets the learning rate of the Adam optimizer to 0.00015.
#Metrics are used to evaluate the performance of your model. They are similar to the loss function but are not used for training the model, only for evaluation.
model.compile(loss=rmse,
                  optimizer=Adam(learning_rate=1.5e-4),
                  metrics=[rmse])

In [18]:
#A callback is a function that is passed into another function or class as an argument and is expected to be executed when certain event happens.
#Callback classes are an extension of this concept.
#They are classes that define a set of methods which are called at specific points during the execution of a program or in response to certain events.
#In deep learning, callback classes are used extensively to monitor and influence the training process of models.
from keras.callbacks import Callback
#This line imports various callback classes from Keras.
#Callbacks like ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, and CSVLogger provide functionalities to
#save the model at certain points, stop training early under certain conditions, reduce the learning rate when a metric has stopped improving,
#and log the training process to a CSV file, respectively.
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger

#The LossHistory class is an example of a custom callback class.
#This class is designed to track and record the loss, validation loss, and RMSE after each batch during the training of a neural network.
#This class extends Keras's Callback base class, by inheriting from it
class LossHistory(Callback):
    #This method is automatically called at the beginning of the training process (that is built in the Keras callback class).
    #Inside this method, three empty lists (self.losses, self.val_losses, and self.rmse) are initialized.
    #These lists will be used to store the values of the training loss, validation loss, and RMSE, respectively, as the training progresses.
    #It is not clear to me what the purpose of receiving the "logs" dictionary here
    #the logs dictionary will contain the current training metrics.
    def on_train_begin(self, logs={}):
        self.losses = []
        self.val_losses = []
        self.rmse = []

    #This method is executed at the end of each batch during the training process.
    def on_batch_end(self, batch, logs={}):
        #The method extracts the current batch's loss, val_loss, and rmse values from the
        #logs dictionary and appends them to the corresponding lists (self.losses, self.val_losses, and self.rmse).
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))
        self.rmse.append(logs.get('rmse'))

In [19]:
#This line creates an instance of the EarlyStopping callback.
#The purpose of this callback is to stop the training process early if there is no improvement in a specified metric after a certain number of epochs (patience).

#monitor='val_loss' specifies that the callback should monitor the validation loss.

#patience=10 means that the training will be stopped if there is no improvement in the monitored metric for 10 consecutive epochs.
#"No improvement" is defined based on the mode parameter.

#verbose=1 enables verbose output. This means the callback will print a message when the training is stopped early.

#mode='auto' lets the callback decide the appropriate mode for monitoring (min or max) based on the monitored metric.
#For val_loss, it will automatically set the mode to min, meaning training will stop when the metric stops decreasing.
early_stopping = EarlyStopping(
    monitor='val_loss', patience=10, verbose=1, mode='auto')

#This line creates an instance of the LossHistory callback, which you defined earlier.
history = LossHistory()

In [20]:
#The ModelCheckpoint callback automatically saves the model or model weights at specified intervals during training.

#filepath specifies the location to save the model file.

#verbose=2 enables verbose output. It will print a message each time the model is saved, but the message is shorter than for verbose=1

#save_best_only=True means the model will only be saved when the monitored metric (which is val_loss) has improved.

checkpointer = ModelCheckpoint(
    filepath='C:/Users/usuario/Desktop/DL_project_Jupyter/weights_efficientnetB0.hdf5', verbose=2, save_best_only=True)



#This callback reduces the learning rate when a metric has stopped improving.
#Reducing the learning rate can help the model to converge to a minimum.

#factor=0.2 means that the new learning rate will be 20% of the current learning rate when a reduction is made.

#patience=4 means that the learning rate will be reduced if there is no improvement in the monitored metric for 4 consecutive epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                              patience=4,)

#The CSVLogger callback streams the results of each epoch into a CSV file
csv_logger = CSVLogger('C:/Users/usuario/Desktop/DL_project_Jupyter/weights_efficientnetB0.csv')

In [None]:
#This is where the training of a deep learning model takes place using Keras

#This .fit_generator method is used to train the model on data generated batch-by-batch by a Python generator (train_generator).
#The fit_generator method is often used when the dataset is too large to fit into memory, requiring real-time data loading and augmentation.

#The first argument is the training data generator (train_generator) that yields batches of training data and labels.

#steps_per_epoch specifies the number of batch steps before declaring one epoch finished and starting the next epoch.
#It is set to STEP_SIZE_TRAIN, which is the total number of training samples divided by the batch size.
#This ensures that the model sees all training samples in each epoch.

#The validation_data parameter is set to valid_generator, which is a similar generator but for validation data.
#It provides batches of validation data and labels for evaluating the model's performance on data it has not trained on

#validation_steps determines how many batches of validation data will be used in evaluating the model’s performance at the end of each epoch.
#It is typically set (and this is our case) to the number of validation samples divided by the validation batch size.

#epochs: this sets the number of epochs for which the training will run. The model will go through the training dataset 50 times.

#callbacks are a set of functions applied at given stages of the training procedure.
#You can use callbacks to get a view of some internal states and statistics of the model during training

hist = model.fit_generator(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=50,
    callbacks=[history, checkpointer, reduce_lr, early_stopping, csv_logger])

  hist = model.fit_generator(


Epoch 1/50
