In [1]:
#!pip install opencv-contrib-python
!pip install imutils
#!pip install scikit-learn
#!pip install tqdm
!pip install shortuuid

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [51]:
import os
import seaborn as sns
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from imutils import paths
from tqdm import tqdm
import time
import numpy as np
import random
from os.path import exists
from datetime import datetime
from pathlib import Path
import sys
import tqdm
from sklearn.utils.class_weight import compute_class_weight

## Function Definitions

In [65]:
def slice_coords(path):
    """Return the two trailing integer fields of a file name as a tuple.

    The extension is stripped, the base name is split on ``'_'`` and the last
    two pieces are converted with ``int``.

    Args:
        path: File path whose base name ends in ``..._<int>_<int>[.ext]``.

    Returns:
        Tuple of the trailing integer fields (two for a well-formed name).

    Raises:
        ValueError: If one of the trailing pieces is not an integer literal.
    """
    stem = os.path.basename(os.path.splitext(path)[0])
    trailing_fields = stem.split('_')[-2:]
    return tuple(map(int, trailing_fields))


def filter_data_present_in_other_set(Imgs, masks, slices_to_filter_out):
    """Drop every path whose slice coordinates appear in ``slices_to_filter_out``.

    Args:
        Imgs: Mapping of key -> list of image paths. Each list is replaced
            in place by its filtered version (the same mapping is returned).
        masks: List of mask paths; a new filtered list is returned.
        slices_to_filter_out: Paths whose ``slice_coords`` identify the
            slices to remove.

    Returns:
        ``(Imgs, masks)`` with the excluded slices removed.
    """
    excluded_coords = {slice_coords(p) for p in slices_to_filter_out}

    def _is_kept(p):
        return slice_coords(p) not in excluded_coords

    for channel in Imgs:
        Imgs[channel] = list(filter(_is_kept, Imgs[channel]))

    masks = [p for p in masks if _is_kept(p)]

    return Imgs, masks
    
def splitting_forest_dataset(image0Paths,
                             image1Paths,
                             image2Paths,
                             image3Paths,
                             image4Paths,
                             image5Paths,
                             image6Paths,
                             image7Paths,
                             maskPaths,
                             TEST_SPLIT,
                             RANDOM_STATE):
    """Split the eight per-channel path lists and the mask list in one call.

    All nine lists are passed to a single ``train_test_split`` call, together
    with ``test_size=TEST_SPLIT`` and ``random_state=RANDOM_STATE``.

    Args:
        image0Paths ... image7Paths: Path lists, one per channel 0..7.
        maskPaths: Mask path list.
        TEST_SPLIT: Forwarded as ``test_size``.
        RANDOM_STATE: Forwarded as ``random_state``.

    Returns:
        ``(trainImages, testImages, trainMasks, testMasks)`` where the two
        image results are dicts keyed by channel index 0..7.
    """
    channel_paths = (image0Paths, image1Paths, image2Paths, image3Paths,
                     image4Paths, image5Paths, image6Paths, image7Paths)

    split = train_test_split(*channel_paths,
                             maskPaths,
                             test_size=TEST_SPLIT,
                             random_state=RANDOM_STATE)

    # The result is consumed as consecutive (train, test) pairs, one pair per
    # input list in the order given, with the mask pair last.
    trainImages = {}
    testImages = {}
    for channel in range(len(channel_paths)):
        trainImages[channel], testImages[channel] = split[2 * channel:2 * channel + 2]
    trainMasks, testMasks = split[2 * len(channel_paths):]

    return trainImages, testImages, trainMasks, testMasks

## NEW WAY
def load_img_and_mask_paths(mask_version='nks', single_set=True, TEST_SPLIT=None, RANDOM_STATE=None,
                            data_dir='/data/ForestDataset8C'):
    """Load the per-channel image paths and mask paths of one dataset version.

    Reads ``Dataset-wv2_<channel>-<mask_version>.csv`` for channels 0..7 from
    ``data_dir``. Image paths come from each file's ``imgPath`` column and the
    mask paths from the ``maskPath`` column of the channel-7 file. Every list
    is sorted, then checked to have equal length and matching
    ``slice_coords`` at each position.

    Args:
        mask_version: Dataset version tag used in the CSV file names.
        single_set: If True return the whole dataset; otherwise return a
            train/test split produced by ``splitting_forest_dataset``.
        TEST_SPLIT: Proportional size of the returned test set; forwarded
            unchanged to ``splitting_forest_dataset`` (unused if ``single_set``).
        RANDOM_STATE: Seed forwarded unchanged to ``splitting_forest_dataset``
            (unused if ``single_set``).
        data_dir: Directory containing the CSV files.

    Returns:
        ``(Imgs, maskPaths)`` when ``single_set`` is True, where ``Imgs`` maps
        channel index -> sorted path list; otherwise
        ``(trainImages, testImages, trainMasks, testMasks)``.

    Raises:
        AssertionError: If the lists differ in length or slice order.
    """
    n_channels = 8
    tables = [
        pd.read_csv(os.path.join(data_dir, f'Dataset-wv2_{channel}-{mask_version}.csv'))
        for channel in range(n_channels)
    ]
    imagePaths = [sorted(table.imgPath) for table in tables]
    # Masks are taken from the last channel's CSV only.
    maskPaths = sorted(tables[-1].maskPath)

    all_lists = imagePaths + [maskPaths]
    for dataset in all_lists:
        assert len(all_lists[0]) == len(dataset), "List of paths of diffrenet length for differen channels/mask"

    # Compute the reference coordinates once instead of once per comparison.
    reference_coords = [slice_coords(path) for path in imagePaths[0]]
    for dataset in all_lists:
        assert [slice_coords(path) for path in dataset] == reference_coords, "Slices out of order for different channels/mask of dataset"

    if single_set:
        Imgs = dict(enumerate(imagePaths))
        return Imgs, maskPaths
    return splitting_forest_dataset(*imagePaths,
                                    maskPaths,
                                    TEST_SPLIT,
                                    RANDOM_STATE)

In [3]:
def get_class_counts(paths, labels):
    """Count how many pixels of each label occur across a set of mask images.

    Args:
        paths: Iterable of mask image paths, read with ``cv2.imread`` using
            ``cv2.IMREAD_UNCHANGED``.
        labels: Iterable of label values; each becomes a key of the result,
            starting at zero.

    Returns:
        Dict mapping label -> total pixel count over all masks.

    Raises:
        FileNotFoundError: If a mask cannot be read (``cv2.imread`` gave ``None``).
        KeyError: If a mask contains a value that is not in ``labels``.
    """
    counts = {l: 0 for l in labels}
    for path in paths:
        mask = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if mask is None:
            # imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read mask image: {path}")
        mask[mask < 0] = 0 # Background class should be zero not -3.4e+38
        values, pixel_counts = np.unique(mask, return_counts=True)
        for value, pixel_count in zip(values, pixel_counts):
            counts[value] += pixel_count
    return counts

In [4]:
def get_proportions(counts):
    """Convert a label -> count mapping into percentage shares.

    Args:
        counts: Dict mapping label -> count.

    Returns:
        One-row DataFrame (one column per label) whose values are each
        count divided by one percent of the row total.
    """
    counts_row = pd.DataFrame([counts])
    one_percent_of_total = counts_row.sum(axis=1) * 0.01
    return counts_row.div(one_percent_of_total, axis=0)

## Sanity check

In [5]:
# Load the complete 'train_polygons' dataset (no split) as the reference set.
Imgs, maskPaths = load_img_and_mask_paths('train_polygons')

# Per-label pixel counts over every mask, and the same expressed in percent.
counts_global = get_class_counts(maskPaths, range(13))
proportions_global = get_proportions(counts=counts_global)

In [6]:
# Sanity check that it works

In [7]:
# Show the per-label pixel totals accumulated over all masks of the dataset.
counts_global

{0: 3496450,
 1: 2941809,
 2: 2945983,
 3: 878024,
 4: 4411736,
 5: 1106848,
 6: 1325334,
 7: 7418729,
 8: 2462666,
 9: 2201673,
 10: 978627,
 11: 1051232,
 12: 408185}

In [8]:
# Total number of counted pixels across all labels
s = sum(counts_global.values())
s

31627296

In [9]:
# Per-label pixel counts of the whole train_polygons dataset as a one-row
# DataFrame (one column per label).
df = pd.DataFrame([counts_global])
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3496450,2941809,2945983,878024,4411736,1106848,1325334,7418729,2462666,2201673,978627,1051232,408185


In [10]:
# Row total of the counts table; should equal the sum computed by hand above.
df.sum(axis=1)

0    31627296
dtype: int64

In [11]:
# Percentage share of each class in the whole train_polygons dataset
# (each count divided by 1% of the row total).
df.div(df.sum(axis=1)*0.01, axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,11.055166,9.301488,9.314685,2.776159,13.949141,3.499661,4.190475,23.456729,7.786521,6.961306,3.094248,3.323812,1.29061


In [12]:
# Sanity check of the above: the percentage shares must add up to 100.
df.div(df.sum(axis=1)*0.01, axis=0).sum(axis=1)

0    100.0
dtype: float64

In [47]:
# Shares computed by get_proportions; expected to match the manual computation above.
proportions_global

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,11.055166,9.301488,9.314685,2.776159,13.949141,3.499661,4.190475,23.456729,7.786521,6.961306,3.094248,3.323812,1.29061


## Finding best split for train test split

In [26]:
# Search over random seeds for the train/test split whose per-class pixel
# shares (in both parts) deviate least from the shares of the full dataset.
TEST_SPLIT=0.25

# The path lists do not depend on the seed, so read the CSVs once instead of
# re-reading and re-validating them on every iteration.
all_images, all_masks = load_img_and_mask_paths(mask_version='train_polygons')

best_rand_state = None
best_sum_of_diff = None
sum_of_diffs = []
for rand_state in tqdm.tqdm(range(150)): # same with 400
    trainImages, testImages, trainMasks, testMasks = splitting_forest_dataset(
        *(all_images[channel] for channel in range(8)),
        all_masks,
        TEST_SPLIT,
        rand_state)

    # Sum of absolute share deviations (percentage points) of the test part
    # and the train part; [1:] leaves out the first column (label 0).
    sum_of_diff = 0.0
    for subset_masks in (testMasks, trainMasks):
        counts = get_class_counts(paths=subset_masks, labels=range(13))
        proportions = get_proportions(counts)
        sum_of_diff += np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()

    # Track the best score explicitly rather than indexing the score list by
    # seed value, which is only correct while seeds start at 0.
    if best_sum_of_diff is None or sum_of_diff < best_sum_of_diff:
        best_rand_state = rand_state
        best_sum_of_diff = sum_of_diff
    sum_of_diffs.append(sum_of_diff)
print(f"Best Random state was: {best_rand_state} with sum of diffs: {best_sum_of_diff} with test_split={TEST_SPLIT}")
    

100%|██████████| 150/150 [00:54<00:00,  2.73it/s]

Best Random state was: 74 with sum of diffs: 9.659318752173228 with test_split=0.25





In [32]:
def save_train_polygons_dataset(images, masks, name='train_polygons_TRAINING',
                                data_dir='/data/ForestDataset8C'):
    """Write one ``Dataset-wv2_<channel>-<name>.csv`` file per image channel.

    Each file has the columns ``imgPath`` (sorted paths of that channel) and
    ``maskPath`` (sorted mask paths), written without the index column.

    Args:
        images: Mapping of channel -> list of image paths.
        masks: List of mask paths, paired with the images by sorted position.
        name: Dataset version tag used in the output file names.
        data_dir: Directory the CSV files are written to.

    Raises:
        ValueError: Raised by pandas if a channel list and ``masks`` differ
            in length.
    """
    # The mask column is the same for every channel; sort it once.
    sorted_masks = sorted(masks)
    for channel in images:
        df = pd.DataFrame({'imgPath': sorted(images[channel]), 'maskPath': sorted_masks})
        df.to_csv(os.path.join(data_dir, f'Dataset-wv2_{channel}-{name}.csv'), index=False)

In [28]:
# Re-create the train/test split for the selected seed and re-measure how far
# its class shares are from those of the full dataset.
trainImages, testImages, trainMasks, testMasks = load_img_and_mask_paths(
    mask_version='train_polygons',
    single_set=False,
    TEST_SPLIT=0.25,
    RANDOM_STATE=74)

# Test part first, then train part; [1:] leaves out the first column (label 0).
sum_of_diff = 0.0
for subset_masks in (testMasks, trainMasks):
    counts = get_class_counts(paths=subset_masks, labels=range(13))
    proportions = get_proportions(counts)
    sum_of_diff += np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()

print(f"total sum of share (%) deviations: {sum_of_diff}")



total sum of share (%) deviations: 9.659318752173228


In [50]:
# Seed selected by the search loop above.
best_rand_state

74

In [48]:
# Number of masks in the train part of the split.
len(trainMasks)

72

In [49]:
# Number of masks in the test part of the split.
len(testMasks)

24

In [33]:
# Persist the held-out test part as one CSV per channel.
save_train_polygons_dataset(testImages, testMasks, name='train_polygons_test_0.25_TESTING')

## Finding best split into real training and validation sets from training set

In [75]:
# Search over random seeds for the split of the training part into real-train
# and validation sets whose class shares deviate least from the full dataset.
best_rand_state = None
best_sum_of_diff = None
sum_of_diffs = []
TEST_SPLIT=0.25
for rand_state in tqdm.tqdm(range(400)):
    realTrainImages, valImages, realTrainMasks, valMasks = splitting_forest_dataset(
        *(trainImages[channel] for channel in range(8)),
        trainMasks,
        TEST_SPLIT,
        rand_state)

    # Sum of absolute share deviations (percentage points) of the validation
    # part and the real-train part; [1:] leaves out the first column (label 0).
    sum_of_diff = 0.0
    for subset_masks in (valMasks, realTrainMasks):
        counts = get_class_counts(paths=subset_masks, labels=range(13))
        proportions = get_proportions(counts)
        sum_of_diff += np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()

    # Track the best score explicitly rather than indexing the score list by
    # seed value, which is only correct while seeds start at 0.
    if best_sum_of_diff is None or sum_of_diff < best_sum_of_diff:
        best_rand_state = rand_state
        best_sum_of_diff = sum_of_diff
    sum_of_diffs.append(sum_of_diff)
print(f"Best Random state was: {best_rand_state} with sum of diffs: {best_sum_of_diff} with test_split={TEST_SPLIT}")
    

100%|██████████| 400/400 [01:46<00:00,  3.76it/s]

Best Random state was: 374 with sum of diffs: 14.837712757149188 with test_split=0.25





In [77]:
# Re-create the real-train/validation split for the selected seed and
# re-measure its class-share deviation from the full dataset.
realTrainImages, valImages, realTrainMasks, valMasks = splitting_forest_dataset(
    *(trainImages[channel] for channel in range(8)),
    trainMasks,
    TEST_SPLIT,
    374)

# Validation part first, then real-train part; [1:] leaves out the first column (label 0).
sum_of_diff = 0.0
for subset_masks in (valMasks, realTrainMasks):
    counts = get_class_counts(paths=subset_masks, labels=range(13))
    proportions = get_proportions(counts)
    sum_of_diff += np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()
print(f"total sum of share (%) deviations: {sum_of_diff}")

total sum of share (%) deviations: 14.837712757149188


In [78]:
# Number of masks in the validation set.
len(valMasks)

18

In [79]:
# Number of masks in the real training set.
len(realTrainMasks)

54

In [80]:
# Number of channel-0 images in the real training set; should match the mask count.
len(realTrainImages[0])

54

In [81]:
# Number of channel-0 images in the validation set; should match the mask count.
len(valImages[0])

18

In [82]:
# Persist the real-train and validation sets as one CSV per channel each.
save_train_polygons_dataset(realTrainImages, realTrainMasks, name='train_polygons_test_0.75x0.75_TRAINING')
save_train_polygons_dataset(valImages, valMasks, name='train_polygons_test_0.75x0.25_VALIDATION')

## Finding class weights for train dataset

In [83]:
# Build one flat vector of all training-mask pixels. For every mask, each label
# present in it gets extra pseudo-pixels worth `smoothing_factor` of the mask's
# pixel count.
smoothing_factor = 0.03
raveled_parts = []
for tmask_path in realTrainMasks:
    img = cv2.imread(tmask_path, -1) #  cv2.IMREAD_UNCHANGED
    if img is None:
        # imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read mask image: {tmask_path}")
    img[img < 0] = 0
    values, counts = np.unique(img, return_counts=True)
    n_extra_per_label = int(np.sum(counts)*smoothing_factor)
    raveled_parts.append(img.ravel())
    # Same order as appending each label's block one after another.
    raveled_parts.append(np.repeat(values, n_extra_per_label))

# Concatenate once at the end; repeated np.append copies the whole accumulated
# array on every call.
raveled_total_trained_set = np.concatenate(raveled_parts) if raveled_parts else None

In [84]:
# Distinct labels and their (smoothed) pixel counts over the whole training vector.
values, counts = np.unique(raveled_total_trained_set, return_counts=True)

In [85]:
# Labels found in the training vector.
values

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.],
      dtype=float32)

In [86]:
# 'balanced' class weights computed from the smoothed training label vector.
print(compute_class_weight(class_weight='balanced', classes=values, y=raveled_total_trained_set))

[0.68027677 0.86080587 0.7664746  2.73974581 0.55010219 2.27545507
 1.52235798 0.35160898 1.08095651 1.10757723 2.21847914 2.06521764
 5.6361113 ]


In [87]:
# Same weights as above, printed one label per line.
class_w = compute_class_weight(class_weight='balanced', classes=values, y=raveled_total_trained_set)
for v, weight in zip(values, class_w):
    print(f"class weight of {v} is {weight}")

class weight of 0.0 is 0.6802767700064816
class weight of 1.0 is 0.860805874461505
class weight of 2.0 is 0.766474596412992
class weight of 3.0 is 2.739745810690325
class weight of 4.0 is 0.5501021875452016
class weight of 5.0 is 2.2754550701786496
class weight of 6.0 is 1.522357976272553
class weight of 7.0 is 0.35160898476724495
class weight of 8.0 is 1.080956508247329
class weight of 9.0 is 1.1075772287750636
class weight of 10.0 is 2.218479136613627
class weight of 11.0 is 2.0652176412940286
class weight of 12.0 is 5.636111304832973


In [88]:
# Unsmoothed percentage share of each label in the real training set, for
# comparison with the class weights above.
counts = get_class_counts(paths=realTrainMasks, labels=range(13))
get_proportions(counts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,11.85552,8.943498,9.964698,2.871725,13.738203,3.048388,4.523609,22.839647,7.516118,6.869683,3.039827,4.010376,0.778709
