<a href="https://colab.research.google.com/github/IverMartinsen/MastersThesis/blob/main/Notebooks/greenland_halibut_sex_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import modules

In [None]:
import sys

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.optimize import minimize

# Make the project-local modules on Google Drive importable
sys.path.append('/content/drive/Othercomputers/Min bærbare datamaskin/UiT/src/Python/modules')

from stratified_idxs import stratified_idxs

Load features, i.e. age, sex and length

In [None]:
# Read the per-sample feature table from Google Drive
features_csv = '/content/drive/Othercomputers/Min bærbare datamaskin/UiT/Data/Grønlandskveiteotolitter/dataframe.csv'
df = pd.read_csv(features_csv)

# Boolean mask over the rows of the raw table: True where no column is missing.
# It is kept as a NumPy array so it can later index the image stack as well.
notna = df.notna().all(axis=1).to_numpy()

# Keep only the rows with a complete set of features
df = df.dropna()

Load images

In [None]:
# Load the image stack and keep only the images whose feature row is complete
images_path = '/content/drive/MyDrive/images128.npy'
images = np.load(images_path)[notna]

# Size of a single image along axes 1 and 2 of the stack
image_size = images.shape[1:3]

Create subsets for training, validation and testing

In [None]:
# Positional indices for a 60/20/20 train/validation/test split, stratified on age
train_idx, valid_idx, test_idx = stratified_idxs(df['age'], (0.6, 0.2, 0.2), seed=123)


def set_from_idx(idx):
    """Return a ``(sex labels, images)`` pair for the samples at positions ``idx``.

    Note the order: the label tensor comes first and the images second.
    """
    return (tf.convert_to_tensor(df['sex'].iloc[idx]), images[idx])


# Binary targets: 1 where the recorded sex is 'male', 0 otherwise
is_male = df['sex'] == 'male'
y_tr = is_male.iloc[train_idx].astype('int64')
y_va = is_male.iloc[valid_idx].astype('int64')
y_te = is_male.iloc[test_idx].astype('int64')

# Batched tf.data pipelines; only the training set is shuffled
batch_size = 32
ds_train = (
    tf.data.Dataset.from_tensor_slices((images[train_idx], y_tr))
    .shuffle(len(train_idx))
    .batch(batch_size)
)
ds_valid = (
    tf.data.Dataset.from_tensor_slices((images[valid_idx], y_va))
    .batch(batch_size)
)
ds_test = (
    tf.data.Dataset.from_tensor_slices((images[test_idx], y_te))
    .batch(batch_size)
)

# Filenames belonging to each subset
f_tr, f_va, f_te = (
    df['filename'].iloc[split] for split in (train_idx, valid_idx, test_idx)
)

Define model

In [None]:
# Network input shape: the image size plus three channels
input_shape = image_size + (3, )

# Pretrained Xception backbone without its classification head. Global average
# pooling is applied to the backbone output.
base_model = tf.keras.applications.Xception(
    include_top=False,
    input_shape=input_shape,
    pooling='avg')

# Freeze every backbone layer so that only the new head is trainable
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

inputs = tf.keras.layers.Input(input_shape)

# Xception preprocessing followed by random translation and rotation
preprocessed = tf.keras.applications.xception.preprocess_input(inputs)
translated = tf.keras.layers.RandomTranslation(height_factor=0, width_factor=0.1)(preprocessed)
rotated = tf.keras.layers.RandomRotation(factor=0.1, fill_mode='constant')(translated)

# training=False makes the backbone always run in inference mode
features = base_model(rotated, training=False)

# Single sigmoid unit as the binary output (a Dropout layer before it is currently not used)
outputs = tf.keras.layers.Dense(units=1, activation='sigmoid')(features)

model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

Compile and fit model

In [None]:
# Compile with Adam (learning rate 1e-3) and Keras' built-in binary
# cross-entropy loss, matching the single sigmoid output of the model.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.BinaryCrossentropy())

# Early stopping: halt after 20 epochs without improvement of the monitored
# quantity and restore the weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)

# `Model.fit` documents `callbacks` as a list of callback instances, so the
# callback is wrapped in a list instead of being passed bare.
callbacks = [early_stopping]

# Fit for at most 100 epochs, validating on the held-out validation set
model.fit(
    ds_train,
    epochs=100,
    validation_data=ds_valid,
    callbacks=callbacks
    )

Evaluate on test set

In [None]:
# Loss of the compiled model on the test set
test_loss = model.evaluate(ds_test)

# Model outputs on the test set, flattened and rounded to hard 0/1 labels
test_labels_pred = model.predict(ds_test).flatten().round()

# Agreement between the rounded predictions and the true test labels
tf.keras.metrics.binary_accuracy(np.array(y_te), test_labels_pred)

In [None]:
# Rounded (0/1) model outputs for the test set, as a flat array
np.round(model.predict(ds_test).ravel())

In [None]:
# Rebind the target variables to age for each subset. Any cell that uses
# y_tr / y_va / y_te now sees ages, whatever those names held before.
y_tr, y_va, y_te = (
    df['age'].iloc[split] for split in (train_idx, valid_idx, test_idx)
)

Predict age by length

In [None]:
def loss_by_length(params):
    """Mean squared error of the length-based age model on the training set.

    The model is ``age = params[0] - log(1 - length / params[1]) / params[2]``.
    NOTE(review): this has the form of an inverted von Bertalanffy growth
    curve - confirm that this is the intended model.

    Reads the notebook-level ``y_tr`` (training targets), ``df`` and
    ``train_idx``.

    Parameters
    ----------
    params : sequence of three floats
        Offset, length scale and rate of the model, in that order.

    Returns
    -------
    float
        Mean squared residual. NaN if any length is >= ``params[1]``.
    """
    length = df['length'].iloc[train_idx]
    # Lengths at or above params[1] give log(<= 0); let that surface as
    # NaN/inf in the loss without flooding the output with warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        residual = y_tr - params[0] + np.log(1 - length / params[1]) / params[2]
    return float(np.mean(residual ** 2))


def age_by_length(params):
    """Return a function mapping length to predicted age for fixed ``params``.

    The returned function evaluates
    ``params[0] - log(1 - length / params[1]) / params[2]`` element-wise and
    returns the result as a float32 NumPy array.
    """
    offset, length_scale, rate = params

    def predict(length):
        length = np.asarray(length, dtype=np.float64)
        with np.errstate(divide='ignore', invalid='ignore'):
            age = offset - np.log(1 - length / length_scale) / rate
        return age.astype(np.float32)

    return predict


# Find the parameters minimising the training loss, starting from (1, 10000, 1).
# `minimize` is scipy.optimize.minimize.
params = minimize(loss_by_length, (1, 10000, 1)).x

# Predict age by length of test set
y2 = age_by_length(params)(df['length'].iloc[test_idx])

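Predict age by a weighted sum of y1 (the deep-learning prediction) and y2 (the length-based prediction), with weights fitted by least squares on the training set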

In [None]:
# Column of ones with length equal to number of training samples
z0 = tf.constant(1, shape=train_idx.shape, dtype=tf.float32)
# Output of the neural network for the training images. The images are passed
# directly: `set_from_idx` returns a (labels, images) tuple, which is not a
# valid input for this single-input model.
# NOTE(review): the model defined in this notebook ends in a single sigmoid
# unit trained on 'male' vs. not, so this feature is the network's sex
# output, not an age estimate - confirm this is the intended regressor.
z1 = model.predict(images[train_idx])[:, 0]
# Age of training samples predicted by length
z2 = age_by_length(params)(df['length'].iloc[train_idx])
# Design matrix based on training samples, one column per regressor
z = tf.stack([z0, z1, z2], axis=1)

# Ordinary least-squares weights from the normal equations:
# w = (Z^T Z)^-1 Z^T y, with y the training targets as a float32 column
z_t = tf.transpose(z)
targets = tf.cast(tf.reshape(y_tr, (-1, 1)), tf.float32)
w = tf.matmul(tf.matmul(tf.linalg.inv(tf.matmul(z_t, z)), z_t), targets)

# Output of the neural network for the test images. `y1` was previously never
# assigned in this notebook; it is computed the same way as `z1`.
y1 = model.predict(images[test_idx])[:, 0]
# Column of ones with length equal to the number of test samples
y0 = tf.constant(1, shape=y1.shape, dtype=tf.float32)
# Predictions of the linear model for the test set, as a flat NumPy array
y3 = tf.matmul(tf.stack((y0, y1, y2), axis=1), w).numpy().reshape(-1)

Write loss and n-off accuracy on the test set, overall and for each sex, to a text file and print its contents

In [None]:
names = ('Deep Learning', 'Length', 'Both')

# Filename for summary statistics
filename = '/content/drive/Othercomputers/Min bærbare datamaskin/UiT/output.txt'

# Horizontal rule written between the sections of the report
separator = '----------------------------------------------'


def _write_n_off_accuracy(rounded, truth, out):
    """Write the percentage of rounded predictions within 0, 1 and 2 of ``truth``."""
    abs_error = np.abs(rounded - truth)
    for tolerance in range(3):
        share = np.sum(abs_error <= tolerance)*100 / len(truth)
        print(f'{tolerance}-off accuracy: {share:.2f} %', file=out)


# Write summary statistics to file: one section per set of predictions, each
# with overall figures followed by the same figures for each sex.
with open(filename, 'w') as report:
    # Sex of every test sample, in the same order as the predictions
    test_sex = df.iloc[test_idx]['sex']

    for name, predictions in zip(names, (y1, y2, y3)):
        rounded = predictions.round()

        print(name + '\n', file=report)

        print(f'Number of samples: {len(y_te)}', file=report)
        print(f'Loss: {tf.keras.losses.mean_squared_error(y_te, predictions).numpy():.4f}', file=report)
        _write_n_off_accuracy(rounded, y_te, report)
        print(separator, file=report)

        for sex in ['male', 'female']:
            # Positions (within the test set) of the samples of this sex
            idx = np.where(test_sex == sex)[0]
            truth = y_te.iloc[idx]

            print(f'Number of {sex} samples: {len(idx)}', file=report)
            print(f'{sex} loss: {tf.keras.losses.mean_squared_error(truth, predictions[idx]).numpy():.4f}', file=report)
            _write_n_off_accuracy(rounded[idx], truth, report)
            print(separator, file=report)
        print('\n', file=report)

# Print content of file
with open(filename, 'r') as report:
    print(report.read())

Save the design matrix and the test-set predictions to CSV files

In [None]:
# Training targets and the two regressors, saved for linear regression analysis
design_matrix = pd.DataFrame({'y': y_tr, 'x1': z1, 'x2': z2})
design_matrix.to_csv('design_matrix.csv', index=False)

# Test-set predictions of all three approaches, rounded to whole numbers,
# alongside the filename, true target and sex of each sample
predictions_path = '/content/drive/Othercomputers/Min bærbare datamaskin/UiT/predictions.csv'
test_predictions = pd.DataFrame({
    'filenames': f_te,
    'true_age': y_te,
    'pred_age_deep': y1.round().astype(int),
    'pred_age_length': y2.round().astype(int),
    'pred_age_both': y3.round().astype(int),
    'sex': df.iloc[test_idx]['sex'],
})
test_predictions.to_csv(predictions_path, index=False)