In [1]:
#!pip install opencv-contrib-python
!pip install imutils
#!pip install scikit-learn
#!pip install tqdm
!pip install shortuuid

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [1]:
import os
import seaborn as sns
import pandas as pd
import cv2
from sklearn.model_selection import train_test_split
from imutils import paths
from tqdm import tqdm
import time
import numpy as np
import random
from os.path import exists
from datetime import datetime
from pathlib import Path
import sys
import tqdm
from sklearn.utils.class_weight import compute_class_weight

## Function Definitions

In [2]:
def slice_coords(path):
    """Return the integer coordinates encoded at the end of a file name.

    The extension is stripped, the remaining base name is split on ``'_'``
    and its last two fields are converted to ``int``.

    Args:
        path: File path whose base name ends in ``..._<a>_<b>.<ext>``.

    Returns:
        Tuple of ints built from the last two underscore-separated fields
        (a one-element tuple when the base name contains no underscore).

    Raises:
        ValueError: If one of those fields is not an integer literal.
    """
    stem, _extension = os.path.splitext(path)
    fields = os.path.basename(stem).split('_')
    return tuple(int(field) for field in fields[-2:])


def filter_data_present_in_other_set(Imgs, masks, slices_to_filter_out):
    """Drop every image/mask path whose slice coordinates occur in another set.

    Args:
        Imgs: Dict mapping a key to a list of image paths. It is updated in
            place: each value is replaced by its filtered list.
        masks: Iterable of mask paths; the caller's object is not modified.
        slices_to_filter_out: Iterable of paths whose ``slice_coords`` identify
            the slices to remove.

    Returns:
        ``(Imgs, masks)``: the same (mutated) dict and a new list holding the
        mask paths that were kept.
    """
    excluded = {slice_coords(path) for path in slices_to_filter_out}

    def is_kept(path):
        # A path survives when its slice is not part of the other set.
        return slice_coords(path) not in excluded

    for key in Imgs:
        Imgs[key] = [path for path in Imgs[key] if is_kept(path)]

    kept_masks = [path for path in masks if is_kept(path)]
    return Imgs, kept_masks
    
def splitting_forest_dataset(image0Paths,
                             image1Paths,
                             image2Paths,
                             image3Paths,
                             image4Paths,
                             image5Paths,
                             image6Paths,
                             image7Paths,
                             maskPaths,
                             TEST_SPLIT,
                             RANDOM_STATE):
    """Split the eight image path lists and the mask path list into train/test.

    All nine sequences are passed to a single ``train_test_split`` call, so
    they are partitioned together.

    Args:
        image0Paths .. image7Paths: One sequence of image paths per channel.
        maskPaths: Sequence of mask paths.
        TEST_SPLIT: Forwarded to ``train_test_split`` as ``test_size``.
        RANDOM_STATE: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        ``(trainImages, testImages, trainMasks, testMasks)`` where the two
        image results are dicts keyed by channel number 0-7.
    """
    channel_paths = (image0Paths,
                     image1Paths,
                     image2Paths,
                     image3Paths,
                     image4Paths,
                     image5Paths,
                     image6Paths,
                     image7Paths)

    parts = train_test_split(*channel_paths,
                             maskPaths,
                             test_size=TEST_SPLIT,
                             random_state=RANDOM_STATE)

    # The result is laid out as [train_0, test_0, train_1, test_1, ...,
    # train_masks, test_masks]: even slots are train parts, odd slots test.
    n_channels = len(channel_paths)
    trainImages = {channel: parts[2 * channel] for channel in range(n_channels)}
    testImages = {channel: parts[2 * channel + 1] for channel in range(n_channels)}
    trainMasks, testMasks = parts[2 * n_channels:]

    return trainImages, testImages, trainMasks, testMasks

## NEW WAY
def load_img_and_mask_paths(mask_version='nks', single_set=True, TEST_SPLIT=None, RANDOM_STATE=None,
                            data_dir='/data/ForestDataset8C'):
    """Load the per-channel image paths and the mask paths listed in the dataset CSVs.

    One CSV per channel is read from
    ``<data_dir>/Dataset-wv2_<channel>-<mask_version>.csv`` for channels 0-7.
    Image paths come from each file's ``imgPath`` column; mask paths come from
    the ``maskPath`` column of the channel-7 file. Every list is sorted
    independently and then checked to have the same length and the same
    ``slice_coords`` at every index.

    Args:
        mask_version: Mask variant; it is the suffix of the CSV file names.
        single_set: When True, return the whole dataset unsplit. Otherwise
            split it with ``splitting_forest_dataset``.
        TEST_SPLIT: Proportional size of the test set returned; only used when
            ``single_set`` is False.
        RANDOM_STATE: Random state forwarded to the split; only used when
            ``single_set`` is False.
        data_dir: Directory holding the CSV files. Defaults to the location
            that used to be hard-coded.

    Returns:
        ``(Imgs, maskPaths)`` when ``single_set`` is True, where ``Imgs`` maps
        channel number 0-7 to its sorted path list; otherwise the
        ``(trainImages, testImages, trainMasks, testMasks)`` tuple produced by
        ``splitting_forest_dataset``.

    Raises:
        AssertionError: If the lists differ in length, or if the slice
            coordinates at some index differ between lists.
    """
    n_channels = 8  # splitting_forest_dataset takes exactly eight image lists

    frames = [
        pd.read_csv(os.path.join(data_dir, f'Dataset-wv2_{channel}-{mask_version}.csv'))
        for channel in range(n_channels)
    ]
    image_paths = [sorted(frame.imgPath) for frame in frames]
    maskPaths = sorted(frames[-1].maskPath)  # masks are taken from the last (channel-7) file

    # The lists were sorted independently, so verify they still line up.
    all_lists = image_paths + [maskPaths]
    for dataset in all_lists:
        assert len(all_lists[0]) == len(dataset), "List of paths of diffrenet length for differen channels/mask"
    for i, reference_path in enumerate(image_paths[0]):
        expected = slice_coords(reference_path)
        assert all(slice_coords(dataset[i]) == expected for dataset in all_lists), "Slices out of order for different channels/mask of dataset"

    if single_set:
        Imgs = dict(enumerate(image_paths))
        return Imgs, maskPaths
    return splitting_forest_dataset(*image_paths, maskPaths, TEST_SPLIT, RANDOM_STATE)

In [3]:
def get_class_counts(paths, labels):
    """Count how many pixels of each class occur across a set of mask images.

    Args:
        paths: Iterable of mask file paths, each read with
            ``cv2.imread(path, cv2.IMREAD_UNCHANGED)``.
        labels: Iterable of the class values expected in the masks; every
            label starts with a count of zero.

    Returns:
        Dict mapping each label to its total pixel count over all masks.

    Raises:
        OSError: If a mask cannot be read (``cv2.imread`` returned ``None``).
        KeyError: If a mask contains a value that is not in ``labels``.
    """
    counts = {label: 0 for label in labels}
    for path in paths:
        mask = cv2.imread(path, cv2.IMREAD_UNCHANGED)
        if mask is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than with an opaque TypeError on the comparison below.
            raise OSError(f"Could not read mask image: {path!r}")
        mask[mask < 0] = 0  # Background class should be zero not -3.4e+38
        values, pixel_counts = np.unique(mask, return_counts=True)
        for value, pixel_count in zip(values, pixel_counts):
            counts[value] += pixel_count
    return counts

In [4]:
def get_proportions(counts):
    """Express per-class counts as percentages of their total.

    Args:
        counts: Mapping of class label to count.

    Returns:
        Single-row DataFrame with one column per label, where each value is
        the count divided by one hundredth of the row total.
    """
    frame = pd.DataFrame([counts])
    one_percent_of_total = frame.sum(axis=1) * 0.01
    return frame.div(one_percent_of_total, axis=0)

## Sanity check

In [5]:
# Load the complete (unsplit) path lists for both mask versions.
Imgs_tp, maskPaths_tp = load_img_and_mask_paths('train_polygons')
Imgs_nks, maskPaths_nks = load_img_and_mask_paths('nks')

In [11]:
# Slice coordinates covered by the masks of each version.
tp = {slice_coords(mask_path) for mask_path in maskPaths_tp}
nks = {slice_coords(mask_path) for mask_path in maskPaths_nks}

In [12]:
# Is every train_polygons slice also present in nks?
tp <= nks

False

In [13]:
# Slices that appear in train_polygons but not in nks.
[coords for coords in tp if coords not in nks]

[(10, 8), (11, 8)]

Slices (10, 8) and (11, 8) are present in the `train_polygons` masks but missing from the `nks` masks, so `train_polygons` is not a subset of `nks`.