# Imports

Removed the check for whether the notebook is running in Colab.

In [None]:
#imports
from platform import python_version

#basic python stuff
import os
import json
from pathlib import Path

#basics from the SciPy Stack
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#colab stuff
from google.colab import drive

#data managing
from sklearn.model_selection import train_test_split
from skimage import io #read in images

#model
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras import layers, callbacks
from keras.optimizers import Adam

#progress bar
from tqdm.notebook import tqdm

In [None]:
# settings

# Register tqdm with pandas so that Series/DataFrame objects gain a
# `progress_apply` method; the progress bar is labelled 'Pandas_Progress'.
tqdm.pandas(desc='Pandas_Progress')

In [None]:
# Print the TensorFlow and Python versions of the current session.
print("Tensorflow version", tf.__version__)
print("Python version =",python_version())

In [None]:
# get access to google drive
# The Drive is mounted at /content/drive; the dataset archive is read from there.
drive.mount('/content/drive')

# Hyperparameters

In [None]:
random_state = 42  # seed shared by the CSV sampling and every train/val/test split
sample_size = 100  # number of rows drawn from samples.csv
test_size = 0.3  # fraction of all samples held out as the test set
val_size = 0.3  # fraction of the remaining (train+val) samples used for validation

# image preprocessing
# NOTE(review): the four settings below are not referenced in the visible part
# of the notebook - presumably consumed by preprocessing/model cells; confirm.
img_size = 128
anti_aliasing = True
resize_method = tf.image.ResizeMethod.BILINEAR
mask_threshold = 0.5

# Retrieve the Dataset

In [None]:
# unzip the dataset
# -n: never overwrite files that already exist (re-running the cell skips them),
# -q: quiet output, -d: extract into /content/
!unzip -n -q /content/drive/MyDrive/ML_Project_Satellite_Images/data/current_dataset.zip -d /content/

In [None]:
# Load samples.csv (information about the images), keep a reproducible random
# subset of `sample_size` rows and index the result by the 'id' column.
samples_df = (
    pd.read_csv('/content/dataset/samples.csv')
    .sample(sample_size, random_state=random_state)
    .set_index('id')
)
samples_df

In [None]:
# paths to the sat/mask folder
# Both keep their trailing slash because file names are appended by plain
# string concatenation.
path_sat_folder = '/content/dataset/images/satellite/'
path_mask_folder = '/content/dataset/images/mask/'

In [None]:
# Add one column per image type holding the absolute file path
# (folder prefix + file name taken from the CSV).
samples_df['abs_satellite_path'] = [
    path_sat_folder + file_name for file_name in samples_df['satellite_file']
]
samples_df['abs_mask_path'] = [
    path_mask_folder + file_name for file_name in samples_df['mask_file']
]

# Prepare the Dataset


In [None]:
# function for image reading
def read_satellite_img(filepath):
  """Load the satellite image stored at `filepath` and return it unchanged.

  The file is read with `io.imread`; no channel selection or resizing is applied.
  """
  return io.imread(filepath)

def read_mask_img(filepath):
  """Load the mask image stored at `filepath`.

  A mask whose shape is exactly (256, 256, 4) is reduced to its channel at
  index 2, giving a 2-D array; any other shape is returned as read.
  """
  mask = io.imread(filepath)
  is_four_channel_tile = mask.shape == (256, 256, 4)
  return mask[..., 2] if is_four_channel_tile else mask

In [None]:
# append images to dataframe
# Each cell of the new columns holds the array returned by the reader for that
# row's path; progress_apply is the tqdm-enabled variant of apply.
samples_df['satellite_img'] = samples_df['abs_satellite_path'].progress_apply(read_satellite_img)
samples_df['mask_img'] = samples_df['abs_mask_path'].progress_apply(read_mask_img)

In [None]:
# Collapse the per-row image arrays into one batch array each
# (np.stack adds a new leading sample axis).
X = np.stack(samples_df['satellite_img'].to_numpy())
Y = np.stack(samples_df['mask_img'].to_numpy())

# Split Training, Validation and Test Data

In [None]:
# split in (train+val) and test
# X and Y are passed to the same call so every satellite image is guaranteed to
# stay paired with its mask (and sklearn checks that both have the same number
# of samples) instead of relying on two separate calls sharing a seed.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=random_state
)

# split (train+val); val_size is a fraction of the train+val portion
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train_val, Y_train_val, test_size=val_size, random_state=random_state
)

In [None]:
# Sanity check: the image and mask arrays of each split should have the same
# number of samples.
print('Training Shape: ', X_train.shape, Y_train.shape)
print('Validation Shape: ', X_val.shape, Y_val.shape)
print('Test Shape: ', X_test.shape, Y_test.shape)

In [None]:
# Distinct values occurring in the training images and in the training masks
# (the mask values show which labels are present).
print(np.unique(X_train))
print()
print(np.unique(Y_train))

In [None]:
num = 10

# Show the first `num` training pairs side by side (satellite left, mask right).
# Iterating over slices instead of indexing with range(num) avoids an
# IndexError when the training set holds fewer than `num` samples.
for sat_img, mask_img in zip(X_train[:num], Y_train[:num]):
  fig, (ax_sat, ax_mask) = plt.subplots(1, 2)
  # the title lists the distinct values present in this sample's mask
  ax_sat.set_title(np.unique(mask_img))
  ax_sat.imshow(sat_img)
  ax_mask.imshow(mask_img)
  fig.tight_layout()
  fig.show()

# What colors are the waters?
Channels are only considered individually here, so this is probably useless.


In [None]:
# Boolean mask over the training masks: True where the mask value is exactly 255
# (presumably the "water" label - confirm against the dataset description).
water_mask = (Y_train == 255)
# Boolean indexing collects the selected pixels into rows of channel values.
# assumes Y_train is (N, H, W) and X_train is (N, H, W, C) - TODO confirm
water_sat = X_train[water_mask]

In [None]:
# Number of selected pixels and the size of their remaining axes.
water_sat.shape

In [None]:
# For each of the first three channels of the selected pixels, collect the
# distinct values and how often each one occurs.
per_channel = [
    np.unique(water_sat[:, channel], return_counts=True) for channel in range(3)
]
(ch1_unique, ch1_counts), (ch2_unique, ch2_counts), (ch3_unique, ch3_counts) = per_channel

ch1_unique.shape, ch1_counts.shape, ch2_unique.shape, ch2_counts.shape, ch3_unique.shape, ch3_counts.shape

In [None]:
# Build one two-column table per channel: column 0 = value, column 1 = count.
ch1 = np.concatenate((ch1_unique[:, None], ch1_counts[:, None]), axis=1)
ch2 = np.concatenate((ch2_unique[:, None], ch2_counts[:, None]), axis=1)
ch3 = np.concatenate((ch3_unique[:, None], ch3_counts[:, None]), axis=1)

ch1.shape, ch2.shape, ch3.shape

In [None]:
# Sort the rows of each (value, count) table by count, most frequent first.
# np.sort(..., axis=0) must not be used here: it sorts every column on its own,
# which breaks the value<->count pairing and orders the values by magnitude
# instead of by frequency. Reordering whole rows via argsort keeps each value
# attached to its count.
ch1 = ch1[np.argsort(ch1[:, 1])[::-1]]
ch2 = ch2[np.argsort(ch2[:, 1])[::-1]]
ch3 = ch3[np.argsort(ch3[:, 1])[::-1]]

print('top 10 colors of ch1: \n', ch1[0:10])
print('top 10 colors of ch2: \n', ch2[0:10])
print('top 10 colors of ch3: \n', ch3[0:10])

In [None]:
#choose one image
image_num = 5
# Flatten the image to one row per pixel with three channel columns;
# the reshape requires the image to hold exactly 256*256*3 values.
x = X_train[image_num].reshape(256*256, 3)
x.shape, np.unique(x)

In [None]:
# Flag every pixel of the chosen image whose value in ANY channel is among the
# first `top_k` values listed for that channel (a per-channel OR, not a match
# on the full colour triple). Vectorised with np.isin instead of a Python loop
# over all pixels and candidate colours.
top_k = 150
# Use the same number of candidate values for every channel, bounded by the
# shortest of the three tables.
n_colors = min(top_k, len(ch1), len(ch2), len(ch3))

channel_hits = (
    np.isin(x[:, 0], ch1[:n_colors, 0])
    | np.isin(x[:, 1], ch2[:n_colors, 0])
    | np.isin(x[:, 2], ch3[:n_colors, 0])
)
# 1.0 where the pixel matched, 0.0 otherwise (one entry per pixel row of x)
result_mask = channel_hits.astype(float)
print(np.unique(result_mask))

In [None]:
# Compare, left to right: the colour-based prediction, the reference mask and
# the satellite image of the chosen sample.
fig, (ax_pred, ax_mask, ax_sat) = plt.subplots(1, 3, figsize=(10, 5))
# result_mask is flat (one entry per pixel), so restore the 256x256 layout
ax_pred.imshow(result_mask.reshape(256,256))
ax_mask.imshow(Y_train[image_num])
ax_sat.imshow(X_train[image_num])
plt.tight_layout()
plt.show()

In [None]:
# Flush pending writes to Google Drive and unmount it at the end of the session.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')