In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/the-nature-conservancy-fisheries-monitoring/train.zip
/kaggle/input/the-nature-conservancy-fisheries-monitoring/test_stg2.7z
/kaggle/input/the-nature-conservancy-fisheries-monitoring/test_stg1.zip
/kaggle/input/the-nature-conservancy-fisheries-monitoring/sample_submission_stg2.csv.zip
/kaggle/input/the-nature-conservancy-fisheries-monitoring/sample_submission_stg1.csv.zip


In [2]:
import random
from glob import glob
from sklearn.model_selection import train_test_split

In [3]:
!unzip ../input/the-nature-conservancy-fisheries-monitoring/train.zip
!unzip ../input/the-nature-conservancy-fisheries-monitoring/test_stg1.zip
!unzip ../input/the-nature-conservancy-fisheries-monitoring/sample_submission_stg1.csv
!unzip ../input/the-nature-conservancy-fisheries-monitoring/sample_submission_stg2.csv

Archive:  ../input/the-nature-conservancy-fisheries-monitoring/train.zip
   creating: train/
  inflating: train/.DS_Store         
   creating: __MACOSX/
   creating: __MACOSX/train/
  inflating: __MACOSX/train/._.DS_Store  
   creating: train/ALB/
  inflating: train/ALB/img_00003.jpg  
  inflating: train/ALB/img_00010.jpg  
  inflating: train/ALB/img_00012.jpg  
  inflating: train/ALB/img_00015.jpg  
  inflating: train/ALB/img_00019.jpg  
  inflating: train/ALB/img_00020.jpg  
  inflating: train/ALB/img_00029.jpg  
  inflating: train/ALB/img_00032.jpg  
  inflating: train/ALB/img_00037.jpg  
  inflating: train/ALB/img_00038.jpg  
  inflating: train/ALB/img_00039.jpg  
  inflating: train/ALB/img_00041.jpg  
  inflating: train/ALB/img_00043.jpg  
  inflating: train/ALB/img_00045.jpg  
  inflating: train/ALB/img_00055.jpg  
  inflating: train/ALB/img_00057.jpg  
  inflating: train/ALB/img_00074.jpg  
  inflating: train/ALB/img_00085.jpg  
  inflating: train/ALB/i

In [4]:
"""
The dataset should be unzipped into the folder named 'data' This folder should be in the same directory as this file.
Extract the train.zip file.
"""
print(os.listdir("../input/the-nature-conservancy-fisheries-monitoring/")) 
# The Most important folder is going to be the train folder. 
# We will create our validation set by splitting the train folder into train and valid folders.
if not os.path.exists('valid/'):
    os.mkdir('valid/')

# Now we will create validation folders for each type of fish 
fish_types = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'] 
for fish in fish_types:
    # Creates a validation folder for each type of feature (fish)
    if not os.path.exists(f'valid/{fish}'):
        os.mkdir(f'valid/{fish}')

    # if valid folder already contains validation set, it breaks out of the loop
    if len(os.listdir(f'valid/{fish}')) > 0:
        print(f"Validation folder already contains files for {fish}")
        continue

    train_dir = f'train/{fish}/'
    valid_dir = f'valid/{fish}/'

    # List all files in the source directory
    file_paths = [os.path.join(train_dir, filename) for filename in os.listdir(train_dir)]

    # Use 20% of the files for validation, using the train_test_split function
    validation_ratio = 0.2
    train_files, valid_files = train_test_split(file_paths, test_size=validation_ratio, random_state=42)

    # Move the selected validation files to the validation directory
    for file_path in valid_files:
        filename = os.path.basename(file_path)
        destination = os.path.join(valid_dir, filename)
        os.rename(file_path, destination)

    print(f"{len(valid_files)} files from test/{fish} moved to validation set.")


# TODO
# Create a function to reset the validation, moving all validation images back into the test folder

['train.zip', 'test_stg2.7z', 'test_stg1.zip', 'sample_submission_stg2.csv.zip', 'sample_submission_stg1.csv.zip']
344 files from test/ALB moved to validation set.
40 files from test/BET moved to validation set.
24 files from test/DOL moved to validation set.
14 files from test/LAG moved to validation set.
93 files from test/NoF moved to validation set.
60 files from test/OTHER moved to validation set.
36 files from test/SHARK moved to validation set.
147 files from test/YFT moved to validation set.


In [5]:
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

In [6]:
""" 
This part of the code sets up an Image generator that will be used for the CNN
It doesn't actually compute any of the resizing or transformations yet, it just sets up the generator
The generator will be called when we train the model 
"""
# Create an ImageDataGenerator with data augmentation and normalization
datagen = ImageDataGenerator(
    rescale=1.0/255,    # Normalize pixel values to the range [0, 1]
    shear_range=0.2,    # Random shear transformations
    zoom_range=0.2,     # Random zooming
    horizontal_flip=True,  # Random horizontal flipping
)

# Load and preprocess the dataset (in this example, assume you have a directory of images)
train_generator = datagen.flow_from_directory(
    'train/',
    target_size=(224, 224),   # Resize images to a consistent size
    batch_size=32,            # Batch size for training
    class_mode='categorical'  # The type of labels (categorical for classification)
)

# Load and preprocess the validation set
validation_generator = datagen.flow_from_directory(
    'valid/',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

print(train_generator.class_indices)

Found 3019 images belonging to 8 classes.
Found 758 images belonging to 8 classes.
{'ALB': 0, 'BET': 1, 'DOL': 2, 'LAG': 3, 'NoF': 4, 'OTHER': 5, 'SHARK': 6, 'YFT': 7}


In [7]:
"""
This part of the code sets up the CNN model
"""
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=(224, 224, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation='softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Run the model
model.fit(
    train_generator,
    steps_per_epoch=10,
    epochs=1,
    validation_data=validation_generator,
    validation_steps=1
)

# Save the model
version = 1
model.save_weights(f'model{version}.h5')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 222, 222, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 111, 111, 32)      0         
_________________________________________________________________
flatten (Flatten)            (None, 394272)            0         
_________________________________________________________________
dense (Dense)                (None, 128)               50466944  
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 1032      
Total params: 50,468,872
Trainable params: 50,468,872
Non-trainable params: 0
____________________________________________

In [8]:
"""
Now run our trained model on the test set and create a submission csv file
"""
import keras
import sys

# Load the model 
model.load_weights(f'model{version}.h5')

# Load the first test set, test_stg1
# Submission must be a combination of test_stg1 (1000) and test_stg2 (12153) = 13153
test_set1 = glob('test_stg1/*.jpg')
test_set2 = glob('test_stg2/*.jpg')

# Each image name must be called 'test_stg1/image_000001.jpg', for example
for i, image in enumerate(test_set1):
    test_set1[i] = 'test_stg1/' + os.path.basename(image)
for i, image in enumerate(test_set2):
    test_set2[i] = 'test_stg2/' + os.path.basename(image)
test_set = test_set1 + test_set2
print(test_set[0:10])

# Create a dataframe to hold the predictions
submission = pd.DataFrame(columns=['image', 'ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'])

# For each image in the test set, run the models prediction and save it to the submission df
# It takes about a min to run on test_stg1 for me (Karl). 
# It takes about 15 min to run the combined test_stg1 and test_stg2
# for each iteration, it takes 20 to 40ms to run. 
for i, image in enumerate(test_set):
    # Preprocess the test images
    test_image = test_set[i]
    test_image = tf.keras.preprocessing.image.load_img(test_image, target_size=(224, 224))
    test_image = tf.keras.preprocessing.image.img_to_array(test_image)
    test_image = np.expand_dims(test_image, axis=0)
    test_image = keras.applications.mobilenet.preprocess_input(test_image)

    # Run the prediction 
    prediction = model.predict(test_image)

    # round the prediction to the nearest tenth
    prediction = np.round(prediction, decimals=1)

    # print(f'prediction array: {prediction[0]}')

    # Get the predicted class
    # The prediction array is a list of probabilities for each class
    # For each prediction, map the prediction to the name of the class
    # train_generator.class_indices is a dictionary of the classes of fish and their indices
    keys = list(train_generator.class_indices.keys())

    # Create a dictionary of the predictions
    # Looks like this: 
    #  {'ALB': 1.3623509e-07, 'BET': 9.1926294e-20,
    #   'DOL': 2.9418256e-16, 'LAG': 1.9241103e-13, 'NoF': 0.0019608391, 
    #   'OTHER': 1.6155835e-11, 'SHARK': 2.625621e-13, 'YFT': 0.99803907}
    prediction_dict = dict(zip(keys, prediction[0]))
    # print(prediction_dict)

    # Add the prediction to the dataframe
    submission.loc[i, 'image'] = os.path.basename(test_set[i])
    for key in keys:
        submission.loc[i, key] = prediction_dict[key]

    # Uncomment the break if you want to test 
    break


# Print the head, does it look okay?
print(submission.columns)
print(submission.head())

# Save the dataframe to a csv file :)
submission.to_csv('submission.csv', index=False)

['test_stg1/img_02347.jpg', 'test_stg1/img_06506.jpg', 'test_stg1/img_01224.jpg', 'test_stg1/img_05568.jpg', 'test_stg1/img_04061.jpg', 'test_stg1/img_01847.jpg', 'test_stg1/img_06564.jpg', 'test_stg1/img_04422.jpg', 'test_stg1/img_05026.jpg', 'test_stg1/img_02865.jpg']
Index(['image', 'ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT'], dtype='object')
           image  ALB  BET  DOL  LAG  NoF OTHER SHARK  YFT
0  img_02347.jpg  0.0  0.0  0.0  0.0  0.0   0.0   0.0  1.0
