In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import zipfile
import os
import random
import datetime
import h5py
import cv2
import sklearn.metrics 
from functions import *

# Data exploration

In [None]:
# Unpack the training archive into the Data folder.
# The context manager closes the archive handle even if extraction raises
# part-way through (the previous open/extract/close sequence leaked it then).
with zipfile.ZipFile('Data/TRAIN_2.zip', 'r') as zip_ref:  # read mode
    zip_ref.extractall('Data')

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code while deserializing;
    # only point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Images, their targets, per-image annotation coordinates and genera.
train_images = _load_pickle('Data/TRAIN_Images_2.pkl')
train_labels = _load_pickle('Data/TRAIN_Labels_2.pkl')
train_cor = _load_pickle('Data/TRAIN_Coordinates_2.pkl')
train_gen = _load_pickle('Data/TRAIN_Genera_2.pkl')

In [None]:
# Show one randomly chosen image (with its annotation points overlaid) next to
# its target. No seed is set, so a different sample is drawn on every run.
i = random.choice(range(len(train_images)))

fig, (ax_image, ax_target) = plt.subplots(1, 2)

ax_image.imshow(train_images[i], cmap="gray")
ax_image.axis("off")
ax_image.set_title("Annotated image")
# With no annotation points, zip(*coords) yields nothing and scatter() would be
# called without x/y, raising TypeError -- so only overlay when points exist.
if len(train_cor[i]) > 0:
    ax_image.scatter(*zip(*train_cor[i]), s=1, c="orange")

ax_target.imshow(train_labels[i], cmap="inferno")
ax_target.axis("off")
ax_target.set_title("Target")

## Dimensions

In [None]:
# Per label map, count the pixels above 0.1 and express them
#   (a) relative to the number of pixels that are exactly 0, and
#   (b) relative to all pixels of that label map,
# then average over all label maps and convert to a percentage.
# The total pixel count is taken from each label map's own size instead of a
# hard-coded 2688*2048, so the result stays correct for any label resolution.
# NOTE(review): (a) divides by zero for a label map without any exactly-zero pixel.
p0_p = np.mean([
    np.sum(label > 0.1) / np.sum(label == 0)
    for label in train_labels
]) * 100
p0_ptotal = np.mean([
    np.sum(label > 0.1) / label.size
    for label in train_labels
]) * 100
print("The % of pixels with a value > 0.1 to pixels with a value = 0 is:",np.round(p0_p,2))
print(np.round(p0_ptotal,2), "% of the pixels has a value bigger than 0.1.")

In [None]:
# Element types of the image stack and the target stack, one per line.
print(train_images.dtype, train_labels.dtype, sep="\n")

In [None]:
# Sanity check: all four collections should describe the same number of samples.
# print() stringifies its arguments itself, so no explicit str() is needed.
print("The train set contains", len(train_images), "images.")
print("The train set contains", len(train_labels), "labels.")
print("The list contains the coordinates of the annotations of", len(train_cor), "images.")
print("The list of genera contains", len(train_gen), "elements.")

In [None]:
# Axis 1 is reported as the height and axis 2 as the width of each sample.
image_shape = train_images.shape
label_shape = train_labels.shape
print("The images in the train set have a width of", image_shape[2], "and a height of", image_shape[1])
print("The labels in the train set have a width of", label_shape[2], "and a height of", label_shape[1])

In [None]:
# Global value range over every pixel of the images and of the labels.
print("The maximum pixel value of the images is:", train_images.max())
print("The minumum pixel value of the images is:", train_images.min())
print("The maximum pixel value of the labels is:", train_labels.max())
print("The minumum pixel value of the labels is:", train_labels.min())

## Replicates

In [None]:
# Compare every image against the whole stack; an image always matches itself,
# so more than one full match means an exact pixel-for-pixel copy exists.
# The message is printed once for every image that has at least one copy.
# NOTE(review): each iteration builds a boolean array as large as the whole
# image stack, so this check is quadratic in the number of images.
for train_image in train_images:
    identical = np.all(train_images == train_image, axis=(1, 2))
    if identical.sum() > 1:
        print("There is a replicate.")

# Validation set and train set

In [None]:
# Hold out the first 30% of the samples for validation and keep the remainder
# for training. The split is purely positional; nothing is shuffled here.
n = int(0.3 * len(train_images))

# NOTE(review): `train_gen` is the genera list at this point; a later cell
# rebinds that name to the training data generator, so this cell must run first.
val_features, train_features = train_images[:n], train_images[n:]
val_targets, train_targets = train_labels[:n], train_labels[n:]
val_coordinates, train_coordinates = train_cor[:n], train_cor[n:]
val_genera, train_genera = train_gen[:n], train_gen[n:]

In [None]:
# Persist the per-split annotation coordinates and genera as pickle files.
for pickle_path, payload in (
    ("Data/train_coordinates.pkl", train_coordinates),
    ("Data/train_genera.pkl", train_genera),
    ("Data/val_coordinates.pkl", val_coordinates),
    ("Data/val_genera.pkl", val_genera),
):
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

In [None]:
# Report the sizes of both splits; within a split all four counts should agree.
# print() stringifies its arguments itself, so no explicit str() is needed.
print("The train set contains", len(train_features), "images.")
print("The train set contains", len(train_targets), "labels.")
print("The train set contains the coordinates of the annotations of", len(train_coordinates), "images.")
print("The list of genera of the train set contains", len(train_genera), "elements.")

print("The validation set contains", len(val_features), "images.")
print("The validation set contains", len(val_targets), "labels.")
print("The validation set contains the coordinates of the annotations of", len(val_coordinates), "images.")
print("The list of genera of the validation set contains", len(val_genera), "elements.")

In [None]:
# Spot-check the training split: one random image with its annotation points
# overlaid, next to its target. Unseeded, so the sample differs between runs.
i = random.choice(range(len(train_features)))

fig, (ax_image, ax_target) = plt.subplots(1, 2)

ax_image.imshow(train_features[i], cmap="gray")
# With no annotation points, zip(*coords) yields nothing and scatter() would be
# called without x/y, raising TypeError -- so only overlay when points exist.
if len(train_coordinates[i]) > 0:
    ax_image.scatter(*zip(*train_coordinates[i]), s=1, c="orange")
ax_image.axis("off")
ax_image.set_title("Annotated image")

ax_target.imshow(train_targets[i], cmap="inferno")
ax_target.axis("off")
ax_target.set_title("Target")

In [None]:
# Spot-check the validation split: one random image with its annotation points
# overlaid, next to its target. Unseeded, so the sample differs between runs.
i = random.choice(range(len(val_features)))

fig, (ax_image, ax_target) = plt.subplots(1, 2)

ax_image.imshow(val_features[i], cmap="gray")
# With no annotation points, zip(*coords) yields nothing and scatter() would be
# called without x/y, raising TypeError -- so only overlay when points exist.
if len(val_coordinates[i]) > 0:
    ax_image.scatter(*zip(*val_coordinates[i]), s=1, c="orange")
ax_image.axis("off")
ax_image.set_title("Annotated image")

ax_target.imshow(val_targets[i], cmap="inferno")
ax_target.axis("off")
ax_target.set_title("Target")

# Data generator

In [None]:
# Write each split to its own HDF5 file, with one 'features' dataset and one
# 'targets' dataset per file. Mode 'w' replaces any file already at that path.
fileHDFTrain = 'Data/train.hdf5'
fileHDFVal = 'Data/val.hdf5'

for hdf_path, split_features, split_targets in (
    (fileHDFTrain, train_features, train_targets),  # train set
    (fileHDFVal, val_features, val_targets),        # validation set
):
    with h5py.File(hdf_path, 'w') as hdf:
        hdf.create_dataset('features', data=split_features)
        hdf.create_dataset('targets',  data=split_targets)

In [None]:
# Locations of the HDF5 files holding the train and validation splits.
fileHDFTrain = 'Data/train.hdf5'
fileHDFVal = 'Data/val.hdf5'

# Number of samples requested from a generator per step.
batch_size = 1

# Augmentation settings handed to create_hdf5_generator for the train split.
# NOTE(review): units and semantics of the values (e.g. rotation in degrees,
# translation in pixels, None meaning "enabled without parameter") are decided
# by the generator implementation -- confirm there.
augmentations = {
    'random_vertical_flip': None,
    'random_horizontal_flip': None,
    'rotation': 45,
    'translation': 500,
    'brightness': 0.2,
}

In [None]:
# Batch generators over the two HDF5 files: the train generator receives the
# augmentation settings, the validation generator an empty settings dict.
# NOTE(review): this rebinds `train_gen`, which earlier held the genera list
# loaded from TRAIN_Genera_2.pkl; the list is not reachable by that name afterwards.
train_gen = create_hdf5_generator(fileHDFTrain, batch_size, augmentations=augmentations, keys=['features', 'targets'])

val_gen = create_hdf5_generator(fileHDFVal, batch_size, augmentations={}, keys=['features', 'targets'])

In [None]:
# Pull one batch from the training generator and draw each sample as an
# image / target pair, one figure per sample.
features, targets = next(train_gen)

for i in range(batch_size):
    fig, (ax_image, ax_target) = plt.subplots(1, 2)

    ax_image.imshow(features[i], cmap="gray")
    ax_image.axis("off")
    ax_image.set_title('Image')

    ax_target.imshow(targets[i], cmap="inferno")
    ax_target.set_title('Target')
    ax_target.axis("off")

In [None]:
# Pull one batch from the validation generator and draw each sample as an
# image / target pair, one figure per sample.
features, targets = next(val_gen)

for i in range(batch_size):
    fig, (ax_image, ax_target) = plt.subplots(1, 2)

    ax_image.imshow(features[i], cmap="gray")
    ax_image.set_title('Image')
    ax_image.axis("off")

    ax_target.imshow(targets[i], cmap="inferno")
    ax_target.set_title('Target')
    ax_target.axis("off")