# Dataset EDA
## Creating lists of image names split into equally distributed datasets

In [None]:
# Root directory of the dataset; the cells below read masks from its
# `masks` sub-directory. Point this at the actual location before running.
dataset_path = "../input/350pics/dataset"

In [None]:
import cv2
from PIL import Image
import matplotlib.pyplot as plt

import os
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Seed both generators: the dataset splits in this notebook draw from
# np.random, which is not affected by seeding the stdlib `random` module.
random.seed(22)
np.random.seed(22)

In [None]:
# Load one sample mask, reorder OpenCV's BGR channels to RGB and report
# the resulting array dimensions.
im_path = '{}/masks/106.png'.format(dataset_path)
img = cv2.cvtColor(
    cv2.imread(im_path),
    cv2.COLOR_BGR2RGB,
)
print(img.shape)

The images do not contain only the extreme values (0 and 255) but the whole range of values in between.

In [None]:
# NOTE(review): `img` was converted with COLOR_BGR2RGB when it was loaded, so
# channel 0 is the red channel even though the variable is named `blueimg`;
# confirm which channel was meant to be inspected.
blueimg = img[:, :, 0]
fig, axs = plt.subplots(1, 2, figsize=(10, 10))

print('unique values:', np.unique(img))

print('r', np.unique(img[:, :, 0]))
print('g', np.unique(img[:, :, 1]))
print('b', np.unique(img[:, :, 2]))

# Boolean map of pixels whose value is neither 0 nor 255, computed once with
# array operations instead of a Python loop over every pixel.
intermediate = (blueimg != 0) & (blueimg != 255)

# Left: the raw channel. Right: intermediate-valued pixels shown in white.
axs[0].imshow(blueimg, cmap='gray')
axs[1].imshow(intermediate.astype(np.uint8) * 255, cmap='gray')
print('celkovo mimo 0 a 255: ', int(intermediate.sum()))

In [None]:
def convert_into_1d(old_tensor):
    """Collapse a channel-first mask into a 2-D map of class indices.

    A constant "background" layer, set to half of the tensor's maximum
    value, is appended after the input channels. For every pixel the index
    of the strongest layer is returned, so a pixel whose channels all stay
    below the threshold gets the background index (the number of input
    channels).

    Args:
        old_tensor: array of shape (channels, height, width).

    Returns:
        Integer array of shape (height, width) with values in
        ``0 .. channels``.
    """
    background_index = old_tensor.shape[0]
    max_pixel_value = np.max(old_tensor)

    if max_pixel_value == 0:
        # With an all-zero mask the threshold is 0 as well, every layer ties
        # and argmax would pick channel 0; such a mask is pure background.
        return np.full(old_tensor.shape[1:], background_index, dtype=np.intp)

    # pixels below this value in every channel are marked as background
    color_threshold = 0.5 * max_pixel_value

    # one additional layer, filled with the threshold, for the background class
    new_tensor = np.full((1, old_tensor.shape[1], old_tensor.shape[2]), color_threshold)
    stacked = np.concatenate((old_tensor, new_tensor), axis=0)

    # index of the layer holding the maximum value at each pixel
    return np.argmax(stacked, axis=0)

## Image retransformation test: Original-Reconstructed

In [None]:
# Round-trip check: convert one mask to class indices and paint it back
# with the class colours, then show original and reconstruction side by side.
# os.path.join supplies the separator that plain concatenation with
# dataset_path was missing.
whole_im_name = os.path.join(dataset_path, 'masks', '1112.png')
img = np.transpose(cv2.cvtColor(cv2.imread(whole_im_name), cv2.COLOR_BGR2RGB), (2, 0, 1))
classes = convert_into_1d(img)
print(np.unique(classes))

# back to height x width x channels for plotting
img = np.transpose(img, (1, 2, 0))
fig, axs = plt.subplots(1, 2, figsize=(10, 10))

# colour for each class index, in the order produced by convert_into_1d
choicelist = [
    [255, 0, 0],  # granulation tissue
    [0, 255, 0],  # slough tissue
    [0, 0, 255],  # necrotic tissue
    [0, 0, 0],    # background
]

axs[0].imshow(img)
# A trailing axis of length 1 lets np.choose broadcast each class index
# against the 3-element colours, whatever the image size is.
axs[1].imshow(np.choose(classes[..., np.newaxis], choicelist))

In [None]:
# Displaying the results
def show_diversity_piechart(df_list, name_list, figsize=(20,7)):
    """Draw pie charts of the red/green/blue pixel totals of one or more tables.

    Args:
        df_list: a single DataFrame, or a list/tuple of DataFrames, each with
            the columns 'number_of_RED_pixels', 'number_of_GREEN_pixels' and
            'number_of_BLUE_pixels'.
        name_list: the chart title (single DataFrame) or one title per
            DataFrame. The number of rows is appended to every title.
        figsize: figure size used when several charts are drawn side by side.

    Returns:
        None. If the number of tables and titles differ, a message is
        printed and nothing is drawn.
    """
    labels = 'GRANULATION TISSUE (red pixels)', 'SLOUGH TISSUE (green pixels)', 'NECROTIC TISSUE (blue pixels)'
    pixel_columns = ('number_of_RED_pixels', 'number_of_GREEN_pixels', 'number_of_BLUE_pixels')
    colors = ['#b56576', '#eaac8b', '#6d597a']

    def _draw_pie(ax, df, name):
        # One pie of the summed pixel counts, titled with the row count.
        sums = [df[column].sum() for column in pixel_columns]
        ax.pie(sums, labels=labels, autopct='%1.1f%%', colors=colors)
        ax.set_title(name + ' [{}]'.format(len(df)))

    if isinstance(df_list, (list, tuple)):
        if len(df_list) != len(name_list):
            print("different input shapes")
            return

        # squeeze=False always yields a 2-D array of axes, so a list holding
        # a single table works the same way as a longer one.
        fig, axs = plt.subplots(1, len(df_list), figsize=figsize, squeeze=False)
        for ax, df, name in zip(axs[0], df_list, name_list):
            _draw_pie(ax, df, name)
    else:
        fig, ax = plt.subplots()
        _draw_pie(ax, df_list, name_list)
    plt.show()
    

def imname_to_categorylist(im_name):
    """Count the pixels of each class in the mask stored at `im_name`.

    The image is read with OpenCV, reordered to RGB, moved to channel-first
    layout and classified with `convert_into_1d`.

    Args:
        im_name: path of the mask image.

    Returns:
        Tuple of four ints: the number of pixels with class index 0, 1, 2
        and 3 respectively.

    Raises:
        OSError: if OpenCV cannot read the file.
    """
    bgr = cv2.imread(im_name)
    if bgr is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        raise OSError('cannot read image file: {}'.format(im_name))

    img = np.transpose(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB), (2, 0, 1))
    classes = convert_into_1d(img)
    return tuple(np.count_nonzero(classes == i) for i in range(4))

# Column layout of the per-image table: the file name followed by one
# pixel count per class index returned by imname_to_categorylist.
column_names = [
    'filename',
    'number_of_RED_pixels',
    'number_of_GREEN_pixels',
    'number_of_BLUE_pixels',
    'number_of_BLACK_pixels',
]

In [None]:
# Directory holding the mask images; os.path.join supplies the separator
# that plain concatenation with dataset_path was missing.
path = os.path.join(dataset_path, 'masks')

# One row per mask: (file name, pixel count of each class). os.listdir
# returns entries in arbitrary order, so sort them to keep the row order
# (and therefore any seeded split) the same from run to run.
data = [
    (im_name, *imname_to_categorylist(os.path.join(path, im_name)))
    for im_name in sorted(os.listdir(path))
]

# Dataset split 
## *splitting into 70:15:15 {Train, validate, test}*

## Original dataset

In [None]:
# Per-image pixel-count table built from the collected rows.
df = pd.DataFrame(data, columns=column_names)

# First draw: each row lands in the training set with probability 0.7.
msk = np.random.rand(len(df)) < 0.7
train, test_val = df[msk], df[~msk]

# Second draw: the held-out rows are divided between test and validation
# with probability 0.5 each.
msk = np.random.rand(len(test_val)) < 0.5
test, val = test_val[msk], test_val[~msk]

In [None]:
# Compare the tissue-class proportions of the three subsets.
show_diversity_piechart(
    [train, val, test],
    ['Train dataset', 'Validate dataset', 'Test dataset'],
)

In [None]:
# Write the file names of each subset to <folder>/<subset>.csv.
folder = 'whole'
os.makedirs('./' + folder, exist_ok=True)
for subset_name, subset in (('train', train), ('test', test), ('val', val)):
    subset.to_csv('./' + folder + '/' + subset_name + '.csv', columns=['filename'])

## Drop images with only red pixels {Granulation tissue}

In [None]:
# Keep only the images that contain at least one green or blue pixel.
new_df = df[df[['number_of_GREEN_pixels', 'number_of_BLUE_pixels']].gt(0).any(axis=1)]

# First draw: each remaining row lands in the training set with probability 0.7.
msk = np.random.rand(len(new_df)) < 0.7
new_train, new_test_val = new_df[msk], new_df[~msk]

# Second draw: the held-out rows are divided between test and validation
# with probability 0.5 each.
msk = np.random.rand(len(new_test_val)) < 0.5
new_test, new_val = new_test_val[msk], new_test_val[~msk]

In [None]:
# Number of images left after filtering, then the class proportions per subset.
print(len(new_df))
show_diversity_piechart(
    [new_train, new_val, new_test],
    ['new Train dataset', 'new Validate dataset ', 'new Test dataset'],
)

In [None]:
# Write the file names of each filtered subset to <folder>/<subset>.csv.
folder = 'smallerv2'
os.makedirs('./' + folder, exist_ok=True)
for subset_name, subset in (('train', new_train), ('test', new_test), ('val', new_val)):
    subset.to_csv('./' + folder + '/' + subset_name + '.csv', columns=['filename'])

## Sort original first, then split

In [None]:
# sort_values returns a new, sorted DataFrame and leaves `df` untouched; the
# result has to be assigned, otherwise the table is never actually sorted.
sort_df = df.sort_values(by=['number_of_BLUE_pixels', 'number_of_GREEN_pixels', 'number_of_RED_pixels'])

# First draw: each row lands in the training set with probability 0.7.
msk = np.random.rand(len(sort_df)) < 0.7
sort_train    = sort_df[msk]
sort_test_val = sort_df[~msk]

# Second draw: the held-out rows are divided between test and validation
# with probability 0.5 each.
msk = np.random.rand(len(sort_test_val)) < 0.5
sort_test = sort_test_val[msk]
sort_val  = sort_test_val[~msk]

In [None]:
# Number of images in the table, then the class proportions per subset.
print(len(sort_df))
show_diversity_piechart(
    [sort_train, sort_val, sort_test],
    ['sort Train dataset', 'sort Validate dataset', 'sort Test dataset'],
)

In [None]:
# Write the file names of each subset to <folder>/<subset>.csv.
folder = 'sort_whole'
os.makedirs('./' + folder, exist_ok=True)
for subset_name, subset in (('train', sort_train), ('test', sort_test), ('val', sort_val)):
    subset.to_csv('./' + folder + '/' + subset_name + '.csv', columns=['filename'])