In [2]:
import numpy as np
import pandas as pd
import os
import random
from glob import glob
from sklearn.model_selection import train_test_split

In [3]:
"""
The dataset should be unzipped into the folder named 'data'. This folder should be in the same directory as this file.
Extract the train.zip file.

This cell carves a validation set out of data/train by moving 20% of each
class's files into data/valid/<class>/. It is safe to re-run: a class whose
validation folder already contains files is skipped.
"""
print(os.listdir("data/")) # should list train.zip, sample_submission... etc
# The most important folder is going to be the train folder.
# We will create our validation set by splitting the train folder into train and valid folders.
os.makedirs('data/valid/', exist_ok=True)

# One sub-folder per class (type of fish), mirroring the layout of data/train
fish_types = ['ALB', 'BET', 'DOL', 'LAG', 'NoF', 'OTHER', 'SHARK', 'YFT']

# Use 20% of the files of every class for validation
validation_ratio = 0.2

for fish in fish_types:
    source_dir = f'data/train/{fish}/'
    valid_dir = f'data/valid/{fish}/'

    # Creates a validation folder for this class if it does not exist yet
    os.makedirs(valid_dir, exist_ok=True)

    # If the folder already holds a validation set, skip this class
    # (moving files again would shrink the training set a second time)
    if len(os.listdir(valid_dir)) > 0:
        print(f"Validation folder already contains files for {fish}")
        continue

    # List all files in the source directory. os.listdir returns entries in
    # arbitrary order, so sort them: otherwise random_state alone does not
    # make the split reproducible across machines / file systems.
    file_paths = [os.path.join(source_dir, filename) for filename in sorted(os.listdir(source_dir))]

    # Pick the validation files with train_test_split (fixed seed for reproducibility)
    train_files, valid_files = train_test_split(file_paths, test_size=validation_ratio, random_state=42)

    # Move the selected validation files to the validation directory
    for file_path in valid_files:
        filename = os.path.basename(file_path)
        destination = os.path.join(valid_dir, filename)
        os.rename(file_path, destination)

    # The files come from the train folder (the message previously said "test")
    print(f"{len(valid_files)} files from train/{fish} moved to validation set.")


['train.zip', '__MACOSX', 'valid', 'sample_submission_stg1.csv.zip', 'sample_submission_stg2.csv.zip', 'test_stg2.7z', 'test_stg1.zip', 'train', 'test_stg1']
Validation folder already contains files for ALB
Validation folder already contains files for BET
Validation folder already contains files for DOL
Validation folder already contains files for LAG
Validation folder already contains files for NoF
Validation folder already contains files for OTHER
Validation folder already contains files for SHARK
Validation folder already contains files for YFT


In [4]:
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

2023-10-11 21:01:20.699777: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-11 21:01:20.733030: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-11 21:01:20.733720: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
"""
This part of the code sets up the image generators that will be used for the CNN
It doesn't actually compute any of the resizing or transformations yet, it just sets up the generators
The generators will be called when we train the model
"""
# Training generator: data augmentation and normalization
datagen = ImageDataGenerator(
    rescale=1.0/255,    # Normalize pixel values to the range [0, 1]
    shear_range=0.2,    # Random shear transformations
    zoom_range=0.2,     # Random zooming
    horizontal_flip=True,  # Random horizontal flipping
)

# Validation generator: normalization only. Random shear/zoom/flip must not be
# applied to validation images, otherwise the validation metrics are measured
# on distorted inputs and change from run to run.
valid_datagen = ImageDataGenerator(
    rescale=1.0/255,    # Same normalization as the training data
)

# Load and preprocess the training set (one sub-directory per class)
train_generator = datagen.flow_from_directory(
    'data/train/',
    target_size=(224, 224),   # Resize images to a consistent size
    batch_size=32,            # Batch size for training
    class_mode='categorical'  # The type of labels (categorical for classification)
)

# Load and preprocess the validation set (no augmentation)
validation_generator = valid_datagen.flow_from_directory(
    'data/valid/',
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical'
)

# Mapping from class (folder) name to label index
print(train_generator.class_indices)


Found 3019 images belonging to 8 classes.
Found 758 images belonging to 8 classes.
{'ALB': 0, 'BET': 1, 'DOL': 2, 'LAG': 3, 'NoF': 4, 'OTHER': 5, 'SHARK': 6, 'YFT': 7}


In [9]:
"""
This part of the code sets up the CNN model, compiles it and runs a short training pass
"""
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

# A single conv/pool stage followed by a dense classifier head
model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=(224, 224, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(8, activation='softmax'))  # one output per fish class

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output
model.summary()

# Run the model
# NOTE(review): with the generators' batch size of 32, these step counts cover
# only 320 training images and a single validation batch per epoch -- presumably
# a quick smoke run; raise them (or drop the arguments) for a full pass. TODO confirm
steps_per_epoch = 10
validation_steps = 1

# Keep the History object so the loss/accuracy curves can be inspected later
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=1,
    validation_data=validation_generator,
    validation_steps=validation_steps
)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 222, 222, 32)      896       
                                                                 
 max_pooling2d_3 (MaxPoolin  (None, 111, 111, 32)      0         
 g2D)                                                            
                                                                 
 flatten_3 (Flatten)         (None, 394272)            0         
                                                                 
 dense_6 (Dense)             (None, 128)               50466944  
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 8)                 1032      
                                                      

<keras.src.callbacks.History at 0x7f2b644c88d0>