In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/UBC-OCEAN/updated_image_ids.json
/kaggle/input/UBC-OCEAN/sample_submission.csv
/kaggle/input/UBC-OCEAN/train.csv
/kaggle/input/UBC-OCEAN/test.csv
/kaggle/input/UBC-OCEAN/test_thumbnails/41_thumbnail.png
/kaggle/input/UBC-OCEAN/train_images/14127.png
/kaggle/input/UBC-OCEAN/train_images/34649.png
/kaggle/input/UBC-OCEAN/train_images/15221.png
/kaggle/input/UBC-OCEAN/train_images/52375.png
/kaggle/input/UBC-OCEAN/train_images/17487.png
/kaggle/input/UBC-OCEAN/train_images/32112.png
/kaggle/input/UBC-OCEAN/train_images/22290.png
/kaggle/input/UBC-OCEAN/train_images/48734.png
/kaggle/input/UBC-OCEAN/train_images/39146.png
/kaggle/input/UBC-OCEAN/train_images/64950.png
/kaggle/input/UBC-OCEAN/train_images/32042.png
/kaggle/input/UBC-OCEAN/train_images/63429.png
/kaggle/input/UBC-OCEAN/train_images/13526.png
/kaggle/input/UBC-OCEAN/train_images/53859.png
/kaggle/input/UBC-OCEAN/train_images/63836.png
/kaggle/input/UBC-OCEAN/train_images/56117.png
/kaggle/input/UBC-OCEAN/train_i

# Data Wrangling

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import balanced_accuracy_score

import tensorflow as tf
import keras
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dropout, Dense
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.metrics import Metric



In [3]:
df = pd.read_csv('/kaggle/input/UBC-OCEAN/train.csv')

## Handling TMA Images

In [4]:
df_no_tma = df[df['is_tma'] == False]
df_tma = df[df['is_tma'] == True]

In [5]:
df_no_tma['image_id_path'] = [f"{i}_thumbnail.png" for i in df_no_tma['image_id']]
df_tma['image_id_path'] = [f"{i}.png" for i in df_tma['image_id']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_tma['image_id_path'] = [f"{i}_thumbnail.png" for i in df_no_tma['image_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tma['image_id_path'] = [f"{i}.png" for i in df_tma['image_id']]


In [6]:
images = []
labels = []
img_size = 224

In [7]:
for img , label in zip(df_no_tma['image_id_path'],df_no_tma['label']):
    image = Image.open("/kaggle/input/UBC-OCEAN/train_thumbnails/"+img)
    image = image.resize((img_size,img_size))
    image = image.convert("RGB")
    image = np.array(image)
    images.append(image)
    labels.append(label)

In [8]:
for img , label in zip(df_tma['image_id_path'],df_tma['label']):
    image = Image.open("/kaggle/input/UBC-OCEAN/train_images/"+img)
    image = image.resize((img_size,img_size))
    image = image.convert("RGB")
    image = np.array(image)
    images.append(image)
    labels.append(label)

In [9]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [10]:
images = [np.array(image) for image in images]

X = np.array(images)
Y = np.array(encoded_labels)

## Data Augmentation

In [11]:
dataGenerator = ImageDataGenerator(
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        horizontal_flip=True,
        zoom_range=0.2)

In [12]:
dataGenerator.fit(X)

augmentedImages = []
augmentedLabels = []
for i in range(5):
    for x_batch, y_batch in dataGenerator.flow(X, Y, batch_size=len(X)):
        augmentedImages.extend(x_batch)
        augmentedLabels.extend(y_batch)
        break

In [13]:
augmentedImages = np.array(augmentedImages)
augmentedLabels = np.array(augmentedLabels)

X = np.concatenate((X, augmentedImages))
y = np.concatenate((Y, augmentedLabels))

## Data Pre-processing

In [14]:
X, y = shuffle(X, y, random_state=101)
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2582, 224, 224, 3)
(2582,)
(646, 224, 224, 3)
(646,)


In [16]:
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

# Model

In [17]:
effnet = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(img_size, img_size, 3))

num_class = 5

model = Sequential([
    effnet,
    tf.keras.layers.GlobalAveragePooling2D(),
    Dense(units=4096, activation="relu"),
    Dropout(0.25),
    Dense(units=4096, activation="relu"),
    Dropout(0.25),
    Dense(units=4096, activation="relu"),
    Dropout(0.25),
    Dense(units=2048, activation="relu"),
    Dropout(0.25),
    Dense(units=2048, activation="relu"),
    Dropout(0.25),
    Dense(units=1024, activation="relu"),
    Dropout(0.25),
    Dense(units=256, activation="relu"),
    Dropout(0.25),
    Dense(units=num_class, activation="softmax")
])

model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional  (None, 7, 7, 1280)        4049571   
 )                                                               
                                                                 
 global_average_pooling2d (  (None, 1280)              0         
 GlobalAveragePooling2D)                                         
                                                                 
 dense (Dense)               (None, 4096)              5246976   
                                                                 
 dropout (Dropout)           (None, 4096)              0         
                                                                 
 dense_1 (Dense)             (None, 4096)              16781312  
                          

### Balanced Accuracy Metric

In [18]:
class BalancedAccuracy(Metric):
    def __init__(self, name='balanced_accuracy', **kwargs):
        super(BalancedAccuracy, self).__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')
        self.false_positives = self.add_weight(name='fp', initializer='zeros')
        self.true_negatives = self.add_weight(name='tn', initializer='zeros')
        self.false_negatives = self.add_weight(name='fn', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_true = tf.cast(y_true, tf.bool)
        y_pred = tf.cast(tf.round(y_pred), tf.bool)

        true_positives = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, True))
        false_positives = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, True))
        true_negatives = tf.logical_and(tf.equal(y_true, False), tf.equal(y_pred, False))
        false_negatives = tf.logical_and(tf.equal(y_true, True), tf.equal(y_pred, False))

        self.true_positives.assign_add(tf.reduce_sum(tf.cast(true_positives, self.dtype)))
        self.false_positives.assign_add(tf.reduce_sum(tf.cast(false_positives, self.dtype)))
        self.true_negatives.assign_add(tf.reduce_sum(tf.cast(true_negatives, self.dtype)))
        self.false_negatives.assign_add(tf.reduce_sum(tf.cast(false_negatives, self.dtype)))

    def result(self):
        sensitivity = self.true_positives / (self.true_positives + self.false_negatives)
        specificity = self.true_negatives / (self.true_negatives + self.false_positives)

        return (sensitivity + specificity) / 2

    def reset_states(self):
        for s in self.variables:
            s.assign(tf.zeros_like(s))

In [19]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    loss='categorical_crossentropy', 
    optimizer=optimizer, 
    metrics=['accuracy', BalancedAccuracy()]
)

## Model Training

In [20]:
import wandb
from wandb.keras import WandbCallback
from tensorflow.keras.callbacks import ModelCheckpoint

wandb.init(project="UBC-OCEAN")

config = wandb.config
config.learning_rate = 0.0001
config.epochs = 30
config.batch_size = 16
config.validation_split = 0.2

model_checkpoint = ModelCheckpoint(
    'best_weights.h5', 
    save_best_only=True, 
    monitor='val_accuracy', 
    mode='max', 
    verbose=1
)

history = model.fit(
    X_train, 
    y_train, 
    validation_split=config.validation_split, 
    epochs=config.epochs, 
    verbose=1, 
    batch_size=config.batch_size,
    callbacks=[model_checkpoint, WandbCallback()]
)

wandb.finish()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch 1/30


2024-01-06 00:13:40.955147: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] layout failed: INVALID_ARGUMENT: Size of values 0 does not match size of permutation 4 @ fanin shape insequential/efficientnetb0/block2b_drop/dropout/SelectV2-2-TransposeNHWCToNCHW-LayoutOptimizer




  m.reset_state()



Epoch 1: val_accuracy improved from -inf to 0.40426, saving model to best_weights.h5


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.8s


Epoch 2/30


  m.reset_state()


Epoch 2: val_accuracy improved from 0.40426 to 0.57447, saving model to best_weights.h5


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.6s


Epoch 3/30


  m.reset_state()


Epoch 3: val_accuracy improved from 0.57447 to 0.73114, saving model to best_weights.h5


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.7s


Epoch 4/30


  m.reset_state()


Epoch 4: val_accuracy improved from 0.73114 to 0.86073, saving model to best_weights.h5


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 3.1s


Epoch 5/30
  1/130 [..............................] - ETA: 24s - loss: 0.3019 - accuracy: 0.8750 - balanced_accuracy: 0.9219

  m.reset_state()


Epoch 5: val_accuracy did not improve from 0.86073


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.9s


Epoch 6/30
  1/130 [..............................] - ETA: 23s - loss: 0.2486 - accuracy: 0.8750 - balanced_accuracy: 0.9297

  m.reset_state()


Epoch 6: val_accuracy improved from 0.86073 to 0.91489, saving model to best_weights.h5


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 3.0s


Epoch 7/30
  1/130 [..............................] - ETA: 24s - loss: 0.1151 - accuracy: 1.0000 - balanced_accuracy: 0.9688

  m.reset_state()


Epoch 7: val_accuracy improved from 0.91489 to 0.92843, saving model to best_weights.h5


  saving_api.save_model(


Epoch 8/30
Epoch 8: val_accuracy improved from 0.92843 to 0.93424, saving model to best_weights.h5
Epoch 9/30
Epoch 9: val_accuracy did not improve from 0.93424
Epoch 10/30
Epoch 10: val_accuracy did not improve from 0.93424
Epoch 11/30
Epoch 11: val_accuracy did not improve from 0.93424
Epoch 12/30
Epoch 12: val_accuracy did not improve from 0.93424
Epoch 13/30
Epoch 13: val_accuracy did not improve from 0.93424
Epoch 14/30
Epoch 14: val_accuracy improved from 0.93424 to 0.94584, saving model to best_weights.h5


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.9s


Epoch 15/30
  1/130 [..............................] - ETA: 22s - loss: 0.0014 - accuracy: 1.0000 - balanced_accuracy: 1.0000

  m.reset_state()


Epoch 15: val_accuracy improved from 0.94584 to 0.95358, saving model to best_weights.h5


  saving_api.save_model(


Epoch 16/30
Epoch 16: val_accuracy improved from 0.95358 to 0.95745, saving model to best_weights.h5


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.9s


Epoch 17/30


  m.reset_state()


Epoch 17: val_accuracy did not improve from 0.95745
Epoch 18/30
Epoch 18: val_accuracy did not improve from 0.95745
Epoch 19/30
Epoch 19: val_accuracy did not improve from 0.95745
Epoch 20/30
Epoch 20: val_accuracy did not improve from 0.95745
Epoch 21/30
Epoch 21: val_accuracy did not improve from 0.95745
Epoch 22/30
Epoch 22: val_accuracy did not improve from 0.95745
Epoch 23/30
Epoch 23: val_accuracy improved from 0.95745 to 0.97679, saving model to best_weights.h5


  saving_api.save_model(
[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/wandb/run-20240106_001225-pjycwuns/files/model-best)... Done. 2.9s


Epoch 24/30
  1/130 [..............................] - ETA: 22s - loss: 9.5118e-04 - accuracy: 1.0000 - balanced_accuracy: 1.0000

  m.reset_state()


Epoch 24: val_accuracy did not improve from 0.97679
Epoch 25/30
Epoch 25: val_accuracy did not improve from 0.97679
Epoch 26/30
Epoch 26: val_accuracy did not improve from 0.97679
Epoch 27/30
Epoch 27: val_accuracy did not improve from 0.97679
Epoch 28/30
Epoch 28: val_accuracy did not improve from 0.97679
Epoch 29/30
Epoch 29: val_accuracy did not improve from 0.97679
Epoch 30/30
Epoch 30: val_accuracy did not improve from 0.97679


VBox(children=(Label(value='6228.372 MB of 6228.372 MB uploaded (6.121 MB deduped)\r'), FloatProgress(value=1.…

0,1
accuracy,▁▂▄▆▇▇▇███████████████████████
balanced_accuracy,▁▂▄▆▇▇▇███████████████████████
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
loss,█▇▅▄▃▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▃▅▇▇▇▇▇▇▇▇▇▇█████▇███████████
val_balanced_accuracy,▁▁▅▆▇▇▇█▇▇█▇▇█████▇███████████
val_loss,█▇▅▃▃▂▂▂▃▃▂▂▂▂▂▁▁▂▂▁▂▂▁▂▂▁▁▁▁▂

0,1
accuracy,0.99031
balanced_accuracy,0.99364
best_epoch,22.0
best_val_loss,0.10174
epoch,29.0
loss,0.04188
val_accuracy,0.95745
val_balanced_accuracy,0.96881
val_loss,0.21296


In [21]:
predictions = np.argmax(model.predict(X_test),axis=1)
y_test = np.argmax(y_test, axis=1)
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93       122
           1       0.96      0.91      0.93       161
           2       0.92      0.98      0.95       259
           3       0.95      0.95      0.95        55
           4       0.96      0.94      0.95        49

    accuracy                           0.94       646
   macro avg       0.95      0.93      0.94       646
weighted avg       0.94      0.94      0.94       646



# Saving Model, Encoder

In [22]:
model.save("UCC-OCEAN.h5")

  saving_api.save_model(


In [23]:
with open('label_encoder.pkl', 'wb') as file:
    pickle.dump(label_encoder, file)