In [2]:
#The purpose of the preprocess_input function is to preprocess the input images before feeding them to an EfficientNet model
from keras.applications.efficientnet import preprocess_input
#This line imports the `EfficientNetB0` model from Keras' applications module
from keras.applications import EfficientNetB0
#This imports the `Sequential` model from Keras. A Sequential model is a stack of layers.
#It's a basic form of neural network model in Keras where you can simply add layers to the model in sequence.
#This is a "linear" arrangement in the sense that the data flows through the layers in a single path, without branching or merging.
from keras.models import Sequential
# - This line imports the specific layers that we add on top of the base network:
#     - `GlobalAveragePooling2D`: A layer that averages each feature map to a single number.
#     - `Dropout`: This layer randomly sets a fraction of input units to 0 at each update during training.
#     - `Dense`: A fully connected layer, used here for the last two layers of the model.
from keras.layers import GlobalAveragePooling2D, Dropout, Dense
#Keras might not have every possible metric or loss function you need and although the RMSE, for instance,
#is a standard metric for regression problems, it is not directly available as a built-in function in Keras.
#Keras does not provide "low-level" operations such as tensor multiplication and convolution.
#Instead, it relies on the tensor library: `keras.backend` to handle these operations.
#We are using that library to define our custom loss function.
#By using `keras.backend`, your custom function remains compatible with whichever backend Keras is using.
#This means you can write your code once and it will work whether you're using TensorFlow, Theano, or any other backend supported by Keras.
# This allows for more flexible and portable code.
import keras.backend as backend
#ImageDataGenerator is a class in Keras used for real-time data augmentation.
#'img_to_array' is a utility function that converts a loaded image (in the form of a PIL image or a similar object) into a NumPy array.
#This conversion is necessary because deep learning models in Keras work with data in the form of NumPy arrays.
from keras.preprocessing.image import ImageDataGenerator, img_to_array
#The os module provides a way of using operating system-dependent functionality.
#It allows you to interact with the operating system in various ways, such as navigating the file system, reading, and writing files,
#querying and setting environment variables, and executing system commands.
import os
#cv2 provides tools that are essential for many tasks in the field of computer vision and image processing.
import cv2
#Import the ADAM optimizer
from keras.optimizers import Adam
#Remember that the Kaggle challenge includes a CSV file with the "labels" we are to predict
import pandas as pd
import numpy as np

In [3]:
#The probability distributions for the classifications for each of the training images comes in a CSV file.
#The columns of the CSV file are: "GalxaxyID, Class1.1, Class1.2, ..."
classes = [
    'Class1.1', 'Class1.2', 'Class1.3', 'Class2.1', 'Class2.2', 'Class3.1',
    'Class3.2', 'Class4.1', 'Class4.2', 'Class5.1', 'Class5.2', 'Class5.3',
    'Class5.4', 'Class6.1', 'Class6.2', 'Class7.1', 'Class7.2', 'Class7.3',
    'Class8.1', 'Class8.2', 'Class8.3', 'Class8.4', 'Class8.5', 'Class8.6',
    'Class8.7', 'Class9.1', 'Class9.2', 'Class9.3', 'Class10.1', 'Class10.2',
    'Class10.3', 'Class11.1', 'Class11.2', 'Class11.3', 'Class11.4',
    'Class11.5', 'Class11.6'
]

In [4]:
#The cropping we explained above
def random_input(img):
    #By using [:2], we ignore the number of channels and keep just the height and width.
    shape = img.shape[:2]
    #These lines calculate one-fourth of the height and width of the image, respectively.
    left = int(shape[0]/4)
    top = int(shape[1]/4)
    #This line crops the image to a central region.
    #It selects a square from the image that starts at (left, top) and extends to three times the value of left and top.
    img = img[left:left*3,top:top*3,:]
    #After cropping, the image is resized back to its original dimensions
    #interpolation=cv2.INTER_CUBIC argument specifies the interpolation method to be cubic, which is a method that generally provides good results.
    image = cv2.resize(img, shape, interpolation = cv2.INTER_CUBIC)
    #This line converts the resized image into a NumPy array using the img_to_array function.
    #This conversion is necessary because Keras models expect input in the form of NumPy arrays.
    image = img_to_array(image)

    #Apply EfficientNetB0 preprocess_input
    return preprocess_input(image)

In [5]:
#Here we construct a neural network model using Keras utilizing the EfficientNetB0 architecture as the base model.
def build_model():
    #This line initializes an EfficientNetB0 model pre-trained on the ImageNet dataset.
    #weights='imagenet' indicates that the model should be loaded with weights trained on the ImageNet dataset.
    #include_top=False means that the top layer of the model (a fully connected layer for classification) is not included. This allows for custom layers to be added for specific tasks.
    #input_shape=(224, 224, 3) sets the shape of the input images to 224x224 pixels with 3 color channels (RGB).
    eff1 = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    #This "freezes" these layers, meaning their weights will not be updated during training.
    #This is a common practice when using a pre-trained model as a feature extractor,
    #as it allows the model to maintain the knowledge it has gained from the original training dataset (ImageNet in this case).
    #for layer in eff1.layers:
    #    layer.trainable = False

    #model = Sequential(): This line initializes a new Sequential model.
    model = Sequential()
    #model.add(eff1): Adds the EfficientNetB0 model as the base of the new model.
    model.add(eff1)
    #model.add(GlobalAveragePooling2D()): This layer applies global average pooling to the output of EfficientNetB0.
    model.add(GlobalAveragePooling2D())
    #model.add(Dropout(0.5)): This layer randomly sets input units to 0 with a frequency of 0.5 at each instance (image) during training.
    model.add(Dropout(0.5))
    #model.add(Dense(64, activation='relu')): Adds a densely-connected Neural Network layer with 64 units and ReLU (Rectified Linear Unit) activation.
    model.add(Dense(64, activation='relu'))
    #model.add(Dense(37, activation='sigmoid')): Finally, adds a Dense layer with 37 units and a sigmoid activation function.
    model.add(Dense(37, activation='sigmoid'))

    return model

In [6]:
# Load the trained model
model = build_model()  # Make sure you have a function that builds your model architecture
model.load_weights('C:/Users/usuario/Desktop/DL_project_Jupyter/weights_efficientnetB0.hdf5')  # Path to your optimal weigh

In [7]:
#This line sets the variable train_path to the path of a directory that contains the training images
#train_path = "/content/drive/Othercomputers/Mi portátil/images_training_rev1"
test_path = 'C:/Users/usuario/Desktop/DL_project_Jupyter/images_test_rev1'

# Prepare the test data
test_datagen = ImageDataGenerator(preprocessing_function=random_input)

#Use 'None' because we don't have labels for the test set

#The flow_from_directory method expects the directory to contain one subdirectory per class, 
#and each subdirectory should contain the images that belong to that class. 
#Since you're working with test images that don't have labels, 
#you need to ensure that all test images are in a single subdirectory within the test_path directory.
test_generator = test_datagen.flow_from_directory(
    test_path,
    target_size=(224, 224),
    batch_size=70,
    class_mode=None,  
    shuffle=False
)

Found 79975 images belonging to 1 classes.


In [8]:
# Predict the test set
#Generating the predictions for the input samples from a data generator.
#`model.predict(...)`: This tells Keras to make predictions on the input data provided.

#`test_generator`: This is the data generator that we previously defined. 
#It yields batches of test data that the model will predict on.

#`steps=np.ceil(test_generator.samples/test_generator.batch_size)`: 
#The `steps` argument specifies how many batches of samples to draw from the generator before declaring one epoch finished 
#and stopping prediction. Since the generator is producing batches of data, you need to tell the `predict` method 
#how many steps it should take to process the entire test set. 
#This is calculated by dividing the total number of samples (`test_generator.samples`) by 
#the batch size (`test_generator.batch_size`) to get the number of steps per epoch. 
#Since this can result in a float value, `np.ceil` is used to round up to the nearest whole number, 
#ensuring that you don't miss any samples.

#`verbose=1`: When set to 1, it will show a progress bar during prediction, 
#which can be helpful to visualize the prediction process, especially if it takes a long time to run.

test_predictions = model.predict(test_generator, steps=np.ceil(test_generator.samples/test_generator.batch_size), verbose=1)



In [9]:
#Get the list of file paths from the test generator
file_paths = test_generator.filepaths

#Extract GalaxyID from each file path
#- `file_paths`: This variable is a list of strings, where each string is a full path to an image file.
#- `os.path.basename(path)`: The `os.path.basename` function takes a file path and returns the base name of the file. 
#For example, if the path is `/kaggle/input/galaxy-zoo-the-galaxy-challenge/images_training_rev1/100008.jpg`, 
#`os.path.basename(path)` would return `100008.jpg`.
#- `.split('.')[0]`: This part takes the base file name (e.g., `100008.jpg`) and splits it at the period character, 
#The `split` function returns a list, and `[0]` accesses the first element of this list. 
#In the case of `100008.jpg`, the `split('.')[0]` would result in `100008`
galaxy_ids = [os.path.basename(path).split('.')[0] for path in file_paths]

#Create a DataFrame with GalaxyID as the first column
results_df = pd.DataFrame({'GalaxyID': galaxy_ids})

In [10]:
#A loop that iterates over the list of class names and assigns the corresponding predictions 
#to a new column in the `results_df` DataFrame for each class.

#`enumerate(classes)`: The `enumerate` function is used to loop over something and have an automatic counter. 
#Here it's used on the list `classes`, which contains the class names as strings (like 'Class1.1', 'Class1.2', etc.). 
#The function provides two values on each iteration: `i` (the index or counter) and `class_name` (the value from the list).

#`for i, class_name in enumerate(classes)`: 
#This sets up a loop that will go through the `classes` list, with `i` being the index (starting from 0) and 
#`class_name` being the string value of each class.
for i, class_name in enumerate(classes):
    #`results_df[class_name]` creates a new column in the `results_df` DataFrame with the name of 
    #the current class (e.g., 'Class1.1').
    #`test_predictions[:, i]` selects all the predictions for that class from the `test_predictions` array. 
    #The `:` means "select all rows" in the array, and `i` is the index for the column corresponding to the current class.
    results_df[class_name] = test_predictions[:, i]

# Now results_df has GalaxyID and all the class columns with the predictions

In [11]:
#Save the DataFrame to a CSV file
results_df.to_csv('C:/Users/usuario/Desktop/DL_project_Jupyter/galaxy_zoo_predictions.csv', index=False)