# Basic Pipeline

Things I want to learn/try:
- Data flow from files (for bigger data/projects)
- Loading and tuning a pre-trained model
- Layer visualisation methods
- Picking out the misclassified images from the validation set and displaying them
- Non-sequential models (No excuse yet)

In [None]:
# The very first time you import keras and seaborn, there's a long delay as
# setup stuff happens
import json

import pandas as pd
import seaborn as sns
from keras import layers, models, optimizers
from keras.applications import VGG16
from keras.preprocessing.image import ImageDataGenerator

# This contains a few useful functions for reshaping, plotting etc
import src.helpers as h

In [None]:
# Load the whole dataset (a single JSON document) into memory
with open("data/shipsnet.json", "r") as f:
    data = json.load(f)

# Data structure: list the top-level keys
print(list(data))

# Check labels: peek at the first and last ten
print(data["labels"][:10], data["labels"][-10:])

# Check labels split: count how many examples fall in each class
print("True: ", sum(label == 1 for label in data["labels"]))
print("False: ", sum(label == 0 for label in data["labels"]))

# Plot an example to check the image data looks sensible
h.quick_plot_img(data["data"][5])

In [None]:
# Split the data into train / validation / test sets. The helper hands back
# an explicit validation set here, so the generators further down do not need
# to carve one out themselves.
(
    train_X,
    train_y,
    val_X,
    val_y,
    test_X,
    test_y,
) = h.train_test_validation_split(data["data"], data["labels"])

# Reformat each split's features into an (N, 80, 80, 3) shape array
train_X, val_X, test_X = map(h.format_imgs, (train_X, val_X, test_X))

# 2. Make data generators (implements data augmentation steps)

In [None]:
# Configure data generators
# Training images are scaled by 1/255 and randomly augmented on the fly
# (rotation, shifts, shear, zoom, flips) each time a batch is drawn.
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode="nearest",
)

# For validation and test, obviously no augmentation - only the same rescale,
# so the reported metrics reflect the real (unaltered) images
test_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Feed the generators the source data
# NOTE: 'class_mode' is an argument of flow_from_directory; flow() is given
# the label array directly, so there is nothing to specify here.
# NOTE: Shuffling can be handled by the generator by passing flow()
# 'shuffle=True'. A validation set can also be carved out by the generator:
# pass 'validation_split=...' to ImageDataGenerator and then
# 'subset="training"' / 'subset="validation"' to flow(). Not needed here as
# the data has already been split into train / validation / test arrays.
# NOTE(review): shuffle=False on the training generator means every epoch sees
# the batches in the same order - presumably the split helper has already
# shuffled the data; confirm, otherwise use shuffle=True for training.
train_generator = train_datagen.flow(
    train_X, train_y, batch_size=20, shuffle=False
)

# Validation data must NOT be augmented, so it goes through test_datagen
# (rescale only) rather than train_datagen
validation_generator = test_datagen.flow(
    val_X, val_y, batch_size=20, shuffle=False
)

test_generator = test_datagen.flow(
    test_X, test_y, batch_size=20, shuffle=False
)

In [None]:
# Sanity-check the augmentation settings on a single training image
# (presumably the helper plots augmented variants - see src/helpers)
h.quick_plot_imggen(
    train_X[1],
    train_datagen,
)

# 3. Define the model

In [None]:
# Set up the pre-trained vision NN: VGG16 with ImageNet weights, without its
# classification layers, and with the input shape set to our own data
conv_base = VGG16(weights="imagenet", include_top=False, input_shape=(80, 80, 3))

# Let's make sure we don't try to train these 14 mil parameters
conv_base.trainable = False

# The frozen conv base provides the deep feature-extractor layers; the layers
# after it are a much simpler classifier sat on top (note the use of Dropout),
# ending in a single sigmoid unit for the binary ship / no-ship decision
model = models.Sequential(
    [
        conv_base,
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(1024, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

# Compilation attaches the loss, optimizer and metrics used during training
# DEPRECATION since book:  arg 'lr' replaced with 'learning_rate' for optimizer
# On a very small model 'steps_per_execution' can be raised,
# reducing python overhead
rmsprop = optimizers.RMSprop(learning_rate=1e-4)
model.compile(optimizer=rmsprop, loss="binary_crossentropy", metrics=["acc"])

# View a summary - useful for improvising a build
model.summary()

# 4. Fit the model

In [None]:
# Training loop
# DEPRECATION since book:  method 'fit_generator', now just use 'fit'
# steps_per_epoch / validation_steps don't need to be specified when the
# generators wrap an in-memory dataset with a fixed batch size; they might be
# needed if you're streaming data from a directory.
history = model.fit(
    x=train_generator,
    validation_data=validation_generator,
    epochs=50,
)

# Persist the trained model (HDF5 format) for later reuse
model.save("models/ship_spotting_v0.1.h5")

# 5. Review Performance

In [None]:
# Collect the per-epoch training history into a DataFrame so it can be
# plotted with pandas / seaborn
results = pd.DataFrame(history.history)

# Shift the index by one so epochs are numbered from 1 rather than 0
# (pure aesthetics)
results.index += 1

# Quick look at final performance: the last five epochs
print(results.tail(5))

In [None]:
# Training vs validation accuracy per epoch
sns.lineplot(data=results.loc[:, ["acc", "val_acc"]])

In [None]:
# Training vs validation loss per epoch
sns.lineplot(data=results.loc[:, ["loss", "val_loss"]])

In [None]:
# Final check on the held-out test set.
# NOTE(review): despite the name, 'prob' presumably holds the evaluation
# metrics (loss and 'acc' as compiled), not class probabilities - confirm.
prob = model.evaluate(x=test_generator)