In [1]:
# Install the third-party packages this notebook pulls in via pip; the commented-out lines are disabled installs.
# NOTE(review): `!pip` runs pip in a subshell, which may not be the interpreter backing this kernel.
# IPython's `%pip` magic targets the kernel's environment -- consider switching to it and pinning versions.
#!pip install opencv-contrib-python
!pip install imutils
#!pip install scikit-learn
#!pip install tqdm
!pip install shortuuid

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import os
import seaborn as sns
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from imutils import paths
from tqdm import tqdm
import time
import numpy as np
import random
from os.path import exists
from datetime import datetime
from pathlib import Path
import sys
import tqdm

## Function Definitions

In [2]:
def slice_coords(path):
    """Return the trailing underscore-separated integer fields of a file name.

    The extension is stripped from ``path``, the base name is split on ``_`` and
    the last two fields are converted with ``int``,
    e.g. ``'/d/tile_3_14.tif'`` -> ``(3, 14)``.

    Raises:
        ValueError: if one of those fields is not an integer literal.
    """
    stem, _extension = os.path.splitext(path)
    trailing_fields = os.path.basename(stem).split('_')[-2:]
    return tuple(int(field) for field in trailing_fields)


def filter_data_present_in_other_set(Imgs, masks, slices_to_filter_out):
    """Remove paths whose slice coordinates also occur in ``slices_to_filter_out``.

    Args:
        Imgs: Dict mapping a channel key to a list of image paths. It is updated
            in place: every value is replaced by its filtered list.
        masks: List of mask paths; a new filtered list is returned, the input
            list itself is left untouched.
        slices_to_filter_out: Paths whose slice coordinates (as computed by
            ``slice_coords``) must not appear in the result.

    Returns:
        ``(Imgs, masks)`` -- the same ``Imgs`` object and the filtered mask list.
    """
    excluded_coords = {slice_coords(other_path) for other_path in slices_to_filter_out}

    def _is_kept(candidate):
        # A path survives when its coordinates are not claimed by the other set.
        return slice_coords(candidate) not in excluded_coords

    for channel in Imgs:
        Imgs[channel] = list(filter(_is_kept, Imgs[channel]))

    kept_masks = [mask_path for mask_path in masks if _is_kept(mask_path)]
    return Imgs, kept_masks

## NEW WAY
def load_img_and_mask_paths(mask_version='nks', single_set=True, TEST_SPLIT=None, RANDOM_STATE=None,
                            data_dir='/data/ForestDataset8C', n_channels=8):
    """Load per-channel image paths and mask paths from the dataset's CSV index files.

    One CSV per channel is read from
    ``{data_dir}/Dataset-wv2_{channel}-{mask_version}.csv``. The ``imgPath`` column
    of every file is sorted; the mask list is the sorted ``maskPath`` column of the
    last channel's CSV. Before anything is returned the function asserts that all
    lists have the same length and that entry ``i`` of every list carries the same
    slice coordinates (as computed by ``slice_coords``).

    Args:
        mask_version: Suffix of the CSV file names selecting the mask variant.
        single_set: If True, return the whole set; otherwise split it in two
            with ``train_test_split``.
        TEST_SPLIT: Forwarded to ``train_test_split`` as ``test_size``.
        RANDOM_STATE: Forwarded to ``train_test_split`` as ``random_state``.
        data_dir: Directory holding the CSV index files. Defaults to the
            location that used to be hard-coded.
        n_channels: Number of channel CSVs to read (channels ``0 .. n_channels-1``).

    Returns:
        If ``single_set`` is True: ``(Imgs, maskPaths)`` where ``Imgs`` maps each
        channel index to its sorted list of image paths.
        Otherwise: ``(trainImages, testImages, trainMasks, testMasks)`` where the
        two image values are dicts keyed by channel index.

    Raises:
        ValueError: if ``n_channels`` is smaller than 1.
        AssertionError: if the path lists differ in length or are not aligned
            slice by slice.
    """
    if n_channels < 1:
        raise ValueError("n_channels must be at least 1")

    frames = [
        pd.read_csv(f'{data_dir}/Dataset-wv2_{channel}-{mask_version}.csv')
        for channel in range(n_channels)
    ]
    image_paths = [sorted(frame.imgPath) for frame in frames]
    # Masks are taken from the last channel's CSV only, as before.
    mask_paths = sorted(frames[-1].maskPath)

    path_lists = image_paths + [mask_paths]
    for path_list in path_lists:
        assert len(path_lists[0]) == len(path_list), "List of paths of diffrenet length for differen channels/mask"

    # Every list was sorted independently, so verify that position i refers to
    # the same slice everywhere. Each coordinate is computed exactly once.
    reference_coords = [slice_coords(path) for path in image_paths[0]]
    for path_list in path_lists:
        assert [slice_coords(path) for path in path_list] == reference_coords, "Slices out of order for different channels/mask of dataset"

    if single_set:
        return dict(enumerate(image_paths)), mask_paths

    split = train_test_split(*path_lists,
                             test_size=TEST_SPLIT,
                             random_state=RANDOM_STATE)

    # train_test_split interleaves its output: train_0, test_0, train_1, test_1, ...
    # The mask lists were passed last, so they are the last element of each half.
    train_parts = split[0::2]
    test_parts = split[1::2]

    trainImages = dict(enumerate(train_parts[:-1]))
    testImages = dict(enumerate(test_parts[:-1]))

    return trainImages, testImages, train_parts[-1], test_parts[-1]

In [3]:
def get_class_counts(paths, labels):
    """Count how many pixels of each class label occur across a set of mask files.

    Every mask is read with ``cv2.imread(..., cv2.IMREAD_UNCHANGED)``, negative
    pixel values are set to 0, and the occurrences of each distinct value are
    added to that value's running total.

    Args:
        paths: Iterable of mask file paths.
        labels: Iterable of class labels; each becomes a key of the result,
            initialised to 0.

    Returns:
        Dict mapping each label to its accumulated pixel count.

    Raises:
        OSError: if a mask file cannot be read (``cv2.imread`` returned ``None``).
        KeyError: if a mask contains a value that is not in ``labels``.
    """
    counts = {label: 0 for label in labels}
    for path in paths:
        mask = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if mask is None:
            # cv2.imread does not raise on a missing/unreadable file, it returns
            # None; fail with the offending path instead of an obscure TypeError.
            raise OSError(f"cv2.imread could not read mask file: {path}")
        mask[mask < 0] = 0 # Background class should be zero not -3.4e+38
        values, frequencies = np.unique(mask, return_counts=True)
        for value, frequency in zip(values, frequencies):
            counts[value] += frequency
    return counts

In [4]:
def get_proportions(counts):
    """Turn a mapping of class counts into a one-row DataFrame of percentages.

    The counts become a single-row frame (one column per key) and every entry is
    divided by one percent of the row total, so the row sums to 100.
    """
    counts_row = pd.DataFrame([counts])
    one_percent_of_total = counts_row.sum(axis=1) * 0.01
    return counts_row.div(one_percent_of_total, axis=0)

In [5]:
# Load the complete 'train_polygons' set (single_set defaults to True, so no split happens here).
Imgs, maskPaths = load_img_and_mask_paths(mask_version='train_polygons')

# Reference statistics over all masks: per-class pixel counts for labels 0..12 and the same as percentages.
counts_global = get_class_counts(paths=maskPaths, labels=range(13))
proportions_global = get_proportions(counts_global)

In [6]:
# Sanity check that it works

In [7]:
counts_global

{0: 3496450,
 1: 2941809,
 2: 2945983,
 3: 878024,
 4: 4411736,
 5: 1106848,
 6: 1325334,
 7: 7418729,
 8: 2462666,
 9: 2201673,
 10: 978627,
 11: 1051232,
 12: 408185}

In [8]:
# Sum of all counts
# Add up the per-class pixel counts into one grand total.
s=0
for k in counts_global: s+= counts_global[k]
# Bare expression so the notebook displays the total.
s

31627296

In [9]:
# Class counts in total train_polygons
df = pd.DataFrame([counts_global])
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3496450,2941809,2945983,878024,4411736,1106848,1325334,7418729,2462666,2201673,978627,1051232,408185


In [10]:
df.sum(axis=1)

0    31627296
dtype: int64

In [11]:
# Percentage of each class in the whole train_polygons dataset
df.div(df.sum(axis=1)*0.01, axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,11.055166,9.301488,9.314685,2.776159,13.949141,3.499661,4.190475,23.456729,7.786521,6.961306,3.094248,3.323812,1.29061


In [12]:
# sanity check of above
df.div(df.sum(axis=1)*0.01, axis=0).sum(axis=1)

0    100.0
dtype: float64

In [13]:
proportions_global

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,11.055166,9.301488,9.314685,2.776159,13.949141,3.499661,4.190475,23.456729,7.786521,6.961306,3.094248,3.323812,1.29061


In [14]:
# Testing train_polygons split
# The loader returns (train images, test images, train masks, test masks); the train side is
# bound to the val* names here. TEST_SPLIT=0.3 is forwarded as train_test_split's test_size.
valImages, testImages, valMasks, testMasks = load_img_and_mask_paths(mask_version='train_polygons',
                                          single_set=False,
                                          TEST_SPLIT=0.3,
                                          RANDOM_STATE=42)

# Class shares (%) of the test part, to be compared against proportions_global.
counts = get_class_counts(paths=testMasks, labels=range(13))
proportions = get_proportions(counts)
proportions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,11.103111,11.008366,7.439357,1.537835,7.337934,3.443754,3.332828,29.102721,7.755441,10.182373,2.298756,4.743848,0.713674


In [25]:
np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()

24.035893464979715

In [14]:
# Search for the train_test_split seed whose test part has class shares closest
# to those of the whole set (proportions_global).
best_rand_state = None
best_sum_of_diff = None
sum_of_diffs = []  # one score per tried seed, in the order the seeds were tried
TEST_SPLIT=0.5
N_RAND_STATES = 150  # seeds 0 .. N_RAND_STATES-1 are tried
for rand_state in tqdm.tqdm(range(N_RAND_STATES)):
    valImages, testImages, valMasks, testMasks = load_img_and_mask_paths(
        mask_version='train_polygons',
        single_set=False,
        TEST_SPLIT=TEST_SPLIT,
        RANDOM_STATE=rand_state)

    counts = get_class_counts(paths=testMasks, labels=range(13))
    proportions = get_proportions(counts)
    # Score: summed absolute deviation of the class shares (%); [1:] leaves out column 0.
    sum_of_diff = np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()

    # NOTE: adding the same deviation computed on valMasks to the score was also
    # tried; according to the original note it gave the same result.

    sum_of_diffs.append(sum_of_diff)
    # Track the best score explicitly instead of re-scanning the list each iteration
    # and later indexing sum_of_diffs by the seed value. A strict '<' keeps the
    # first seed on ties.
    if best_sum_of_diff is None or sum_of_diff < best_sum_of_diff:
        best_rand_state = rand_state
        best_sum_of_diff = sum_of_diff
print(f"Best Random state was: {best_rand_state} with sum of diffs: {best_sum_of_diff} with test_split={TEST_SPLIT}")
    

100%|██████████| 150/150 [00:28<00:00,  5.33it/s]

Best Random state was: 39 with sum of diffs: 6.726993037912568 with test_split=0.5





In [20]:
def save_train_polygons_dataset(images, masks, name='train_polygons_validation', data_dir='/data/ForestDataset8C'):
    """Write one CSV index file per channel for a set of image and mask paths.

    For every channel key in ``images`` a file
    ``{data_dir}/Dataset-wv2_{channel}-{name}.csv`` is written with the columns
    ``imgPath`` (that channel's paths, sorted) and ``maskPath`` (``masks``, sorted).

    The two columns are sorted independently, so a row pairs an image with its
    mask only if both lists sort into the same slice order.

    Args:
        images: Dict mapping a channel key to a list of image paths.
        masks: List of mask paths; must have as many entries as each image list.
        name: Suffix used in the output file names.
        data_dir: Output directory. Defaults to the location that used to be
            hard-coded.
    """
    # The mask column is identical for every channel: sort it once.
    sorted_masks = sorted(masks)
    for channel in images:
        df = pd.DataFrame({'imgPath': sorted(images[channel]), 'maskPath': sorted_masks})
        df.to_csv(f'{data_dir}/Dataset-wv2_{channel}-{name}.csv', index=False)

In [21]:
# Re-create the split with the settings reported by the seed search above (test_split=0.5, random state 39).
valImages, testImages, valMasks, testMasks = load_img_and_mask_paths(mask_version='train_polygons',
                                          single_set=False,
                                          TEST_SPLIT=0.5,
                                          RANDOM_STATE=39)

# Recompute the test part's class shares and print their summed absolute deviation
# from proportions_global; [1:] leaves out column 0.
counts = get_class_counts(paths=testMasks, labels=range(13))
proportions = get_proportions(counts)
print(f"total sum of share (%) deviations: {np.abs((proportions_global - proportions).values.squeeze()[1:]).sum()}")

# Persist both halves as per-channel CSV index files under the two dataset names.
save_train_polygons_dataset(valImages, valMasks, name='train_polygons_validation')
save_train_polygons_dataset(testImages, testMasks, name='train_polygons_test')

total sum of share (%) deviations: 6.726993037912568


In [16]:
proportions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,9.283854,11.969819,8.571767,2.407079,14.664902,2.508137,2.449144,24.021706,9.478237,6.561098,4.490444,2.75524,0.838573


In [17]:
proportions_global - proportions

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.771312,-2.668331,0.742918,0.36908,-0.715761,0.991523,1.741331,-0.564977,-1.691716,0.400208,-1.396196,0.568572,0.452037


In [19]:
sum_of_diffs[39]

6.726993037912568

In [18]:
sum_of_diffs

[12.923128173840723,
 15.591336040867986,
 12.702356850234683,
 9.724018139268054,
 11.680340930821274,
 15.319880649929729,
 21.337037475476876,
 18.720506489078296,
 11.83197577181432,
 15.477440752443714,
 17.530654533349924,
 8.08941112133013,
 9.691432362728696,
 12.44966373350412,
 14.571451192033615,
 13.021713901814433,
 9.638421191618782,
 12.209896160582302,
 11.753739554592336,
 8.865576115011537,
 15.712788092918222,
 8.39927637190356,
 7.689370599370876,
 10.248135028679023,
 19.5409876329611,
 14.320086042132719,
 12.391163632831589,
 14.062890485484438,
 8.42543731844796,
 17.237565930391266,
 7.353831323423915,
 10.095387225009684,
 13.008124374590858,
 10.961563075136107,
 12.942314132703594,
 18.332797087680213,
 14.21818672073642,
 11.157621568407238,
 12.578691520135012,
 6.726993037912568,
 19.18532017406736,
 11.700020134506595,
 13.745449500330345,
 17.26817872764083,
 17.220700751654523,
 9.440237951420189,
 12.7023947921441,
 20.756981564279158,
 10.29166704608