# Checagem de dados

Scripts com exemplos de como fazer o carregamento e divisão dos dados de forma estratificada.
 

In [1]:
# imports

import os
import sys
sys.path.append(os.path.abspath('..'))
from rasterio.coords import BoundingBox

import src.data.preprocess_data as data
import src.data.view as view
import src.data.utils as utils

import torch

### Definições
Quais tiles e o número de subtiels são definidos a seguir. 

Este primeiro conjunto é o de treino e tem 4 tiles.

O número de subtiles define a divisão do tile (de 10560 x 10560 pixels) por linha e coluna

Vão ser gerados num_subtiles² subtiles pra cada tile

In [2]:
tiles_treino = {
              'Manaus': '016009',
              'Porto Alegre': '025037',
              'Belo Horizonte': '032027',
              'Salvador': '038019',      
              }
tiles = tiles_treino.values() 
num_subtiles = 6


- Divisão do conjunto de treino, validação e teste, em que cada arquivo em data/processed dos tiles correspondentes são associados a um dos conjuntos.
- A divisão entre treino, validação e teste é de 60/20/20%
- A função train_val_test_stratify aplica a divisão de dados, de forma aleatória.
- Neste exemplo a seguir foi gerada a divisão dos datasets de um tile inteiro
- Salva um arquivo que mapeia os subtiles escolhidos para cada dataset. (na pasta config)
- Também tem como output, impresso na célula, o percentual de pixels de cada uma das 5 classes.

In [3]:
train_files_no_strat, val_files_no_strat, test_files_no_strat = data.train_val_test_stratify(tiles, 
                                                                  num_subtiles,
                                                                    train_size = 0.6, 
                                                                    val_size = 0.2,
                                                                    subfolder='q_12ch')
print('Percentual por classe, Dataset inteiro:')
data.count_classes(train_files_no_strat+val_files_no_strat+test_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de treino:')
data.count_classes(train_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de validação:')
data.count_classes(val_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de teste:')
data.count_classes(test_files_no_strat, num_subtiles=6, agregate_by='4types')


016009
025037
032027
038019
Training set size: 86
Validation set size: 29
Test set size: 29
creating /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split/6-subtiles/mode-/num_tiles-4
saved /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split/6-subtiles/mode-/num_tiles-4/train_val_test_split.yaml
Percentual por classe, Dataset inteiro:
Classe 0: 94.52%, Classe 1: 0.12%, Classe 2: 0.31%, Classe 3: 5.05%
----------------------
Percentual por classe, Dataset de treino:
Classe 0: 95.63%, Classe 1: 0.12%, Classe 2: 0.23%, Classe 3: 4.01%
----------------------
Percentual por classe, Dataset de validação:
Classe 0: 89.44%, Classe 1: 0.14%, Classe 2: 0.35%, Classe 3: 10.07%
----------------------
Percentual por classe, Dataset de teste:
Classe 0: 96.30%, Classe 1: 0.11%, Classe 2: 0.50%, Classe 3: 3.09%
----------------------


## Checagem de resultados da estratificação
- Se passar o argumento stratify_by, ele lê as máscaras e estratifica pelos labels da máscara.
- stratify_by aceita binary (binário), type (por tipo) e density (por densidade).
data.check_stratification imprime o número de pixels de cada classe por divisão de conjunto. 

In [4]:
train_files, val_files, test_files = data.train_val_test_stratify(tiles, 
                                                                  num_subtiles,
                                                                    train_size = 0.6, 
                                                                    val_size = 0.2, 
                                                                    stratify_by = '4types',
                                                                    subfolder = 'q_12ch')

train_files, val_files, test_files
print(len(train_files), len(val_files), len(test_files))

print('Percentual por classe, Dataset inteiro:')
data.count_classes(train_files_no_strat+val_files_no_strat+test_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de treino:')
data.count_classes(train_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de validação:')
data.count_classes(val_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de teste:')
data.count_classes(test_files_no_strat, num_subtiles=6, agregate_by='4types')


016009
025037
032027
038019
Training set size: 86
Validation set size: 29
Test set size: 29
saved /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split/6-subtiles/mode-4types/num_tiles-4/train_val_test_split.yaml
86 29 29
Percentual por classe, Dataset inteiro:
Classe 0: 94.52%, Classe 1: 0.12%, Classe 2: 0.31%, Classe 3: 5.05%
----------------------
Percentual por classe, Dataset de treino:
Classe 0: 95.63%, Classe 1: 0.12%, Classe 2: 0.23%, Classe 3: 4.01%
----------------------
Percentual por classe, Dataset de validação:
Classe 0: 89.44%, Classe 1: 0.14%, Classe 2: 0.35%, Classe 3: 10.07%
----------------------
Percentual por classe, Dataset de teste:
Classe 0: 96.30%, Classe 1: 0.11%, Classe 2: 0.50%, Classe 3: 3.09%
----------------------


In [None]:
train_dataset = data.SubtileDataset(train_files,
                    num_subtiles = num_subtiles,
                    classes_mode = '4types',
                    patch_size=256,
                    stride = 256,
                    dynamic_sampling=False,
                    data_augmentation = False, 
                    )

class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(class_counter_img)
print('Percentual de pixels de cada classe:')
print([100*t.item() for t in percentual_counter])
print('Percentual de classes presentes em cada patch:')
print([100*t.item() for t in img_percentual])
print('Pesos:')
[1/c for c in img_percentual]




tensor([267246529,    253239,    427684,   8241252])
tensor([4209,  114,  240,  947])
Percentual de pixels de cada classe:
[96.7693030834198, 0.09169721161015332, 0.15486330958083272, 2.984136901795864]
Percentual de pixels de cada classe presentes em cada patch:
[99.88134503364563, 2.7052681893110275, 5.695301294326782, 22.47270941734314]
Pesos:


[tensor(1.0012), tensor(36.9649), tensor(17.5583), tensor(4.4498)]

In [None]:
%%time
train_dataset = data.SubtileDataset(train_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=True,
                    data_augmentation = False 
                    )

class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(class_counter_img)
print('Percentual de pixels de cada classe:')
print([100*t.item() for t in percentual_counter])
print('Percentual de classes presentes em cada patch:')
print([100*t.item() for t in img_percentual])
print('Pesos:')
[1/c for c in img_percentual]


Doing dynamic sampling (DS)
Before DS:
Num of pixels for each class: tensor([267246529,    253239,    427684,   8241252])
% of pixels for each class: [tensor(96.7693), tensor(0.0917), tensor(0.1549), tensor(2.9841)]
Num of images with each class: tensor([4209,  114,  240,  947])
% of images with each class: [tensor(99.8813), tensor(2.7053), tensor(5.6953), tensor(22.4727)]
[3, 2, 1]
[128, 64, 64]


100%|██████████| 4214/4214 [02:37<00:00, 26.81it/s] 


tensor([622532908,   6800798,   9463548,  88586810])
tensor([11078,  2422,  4562,  7193])
Percentual de pixels de cada classe:
[85.58517694473267, 0.9349665604531765, 1.3010386377573013, 12.178821861743927]
Percentual de pixels de cada classe presentes em cada patch:
[99.810791015625, 21.82178646326065, 41.102802753448486, 64.80764150619507]
Pesos:
CPU times: user 13min 11s, sys: 4.48 s, total: 13min 16s
Wall time: 6min 6s


[tensor(1.0019), tensor(4.5826), tensor(2.4329), tensor(1.5430)]

ZeroDivisionError: division by zero

In [16]:
tiles_finetune = {
              'Boa Vista': '015002',  
              'Campo Grande': '021027',
              'Macapá': '025005',
              'Curitiba': '027032',
              'Brasília': '028022',                      
              'Rio de Janeiro': '033029',
              'Teresina': '034011',
              'Petrolina': '036016',
              }

tiles = tiles_finetune.values() 
num_subtiles = 6
classes_mode = '4types'

train_files_no_strat, val_files_no_strat, test_files_no_strat = data.train_val_test_stratify(tiles, 
                                                                  num_subtiles,
                                                                    train_size = 0.6, 
                                                                    val_size = 0.2,
                                                                    subfolder='q_12ch')
print('Percentual por classe, Dataset inteiro:')
data.count_classes(train_files_no_strat+val_files_no_strat+test_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de treino:')
data.count_classes(train_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de validação:')
data.count_classes(val_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de teste:')
data.count_classes(test_files_no_strat, num_subtiles=6, agregate_by='4types')

train_files_no_strat, val_files_no_strat, test_files_no_strat = data.train_val_test_stratify(tiles, 
                                                                  num_subtiles,
                                                                    train_size = 0.6, 
                                                                    val_size = 0.2,
                                                                    stratify_by = '4types',
                                                                    subfolder='q_12ch')
print('Percentual por classe, Dataset inteiro:')
data.count_classes(train_files_no_strat+val_files_no_strat+test_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de treino:')
data.count_classes(train_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de validação:')
data.count_classes(val_files_no_strat, num_subtiles=6, agregate_by='4types')
print('Percentual por classe, Dataset de teste:')
data.count_classes(test_files_no_strat, num_subtiles=6, agregate_by='4types')



015002
021027
025005
027032
028022
033029
034011
036016
Training set size: 173
Validation set size: 58
Test set size: 57
creating /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split/6-subtiles/mode-/num_tiles-8
saved /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split/6-subtiles/mode-/num_tiles-8/train_val_test_split.yaml
Percentual por classe, Dataset inteiro:
Classe 0: 96.90%, Classe 1: 0.08%, Classe 2: 0.11%, Classe 3: 2.91%
----------------------
Percentual por classe, Dataset de treino:
Classe 0: 97.26%, Classe 1: 0.04%, Classe 2: 0.09%, Classe 3: 2.60%
----------------------
Percentual por classe, Dataset de validação:
Classe 0: 96.55%, Classe 1: 0.11%, Classe 2: 0.14%, Classe 3: 3.21%
----------------------
Percentual por classe, Dataset de teste:
Classe 0: 96.17%, Classe 1: 0.16%, Classe 2: 0.13%, Classe 3: 3.54%
----------------------
015002
021027
025005
027032
028022
033029
034011
036016
Training set size: 173
Validation set size: 58
Test 

In [None]:
%%time
train_dataset = data.SubtileDataset(train_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=False,
                    data_augmentation = False 
                    )

class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(class_counter_img)
print('Percentual de pixels de cada classe:')
print([100*t.item() for t in percentual_counter])
print('Percentual de classes presentes em cada patch:')
print([100*t.item() for t in img_percentual])
print('Pesos:')
[1/c for c in img_percentual]

In [None]:
%%time
train_dataset = data.SubtileDataset(train_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=True,
                    data_augmentation = False 
                    )

class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(class_counter_img)
print('Percentual de pixels de cada classe:')
print([100*t.item() for t in percentual_counter])
print('Percentual de classes presentes em cada patch:')
print([100*t.item() for t in img_percentual])
print('Pesos:')
[1/c for c in img_percentual]

### Salvar e carregar

Ao chamar train_val_test_stratify pela primeira vez, um arquivo yaml é salvo automaticamente. Isso foi feito porque checar a estratificação pode levar muito tempo.
Ao chamar train_val_test_stratify após já ter sido criado, é carregado do arquivo salvo.
A célula a seguir é um exemplo, deve rodar rápido porque já tinha sido salvo o yaml.

In [None]:
train_files, val_files, test_files = data.train_val_test_stratify(tiles, 
                                                                  num_subtiles,
                                                                    train_size = 0.6, 
                                                                    val_size = 0.2, 
                                                                    stratify_by = '4types',
                                                                    subfolder='q_12ch')

015002
016009
021027
025005
025037
027032
028022
032027
033029
034011
036016
038019
Loading:  /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split-6_subtiles-015002_016009_021027_025005_025037_027032_028022_032027_033029_034011_036016_038019-stratified_by_4types.yaml
File already saved, loading it.


## Dataset e dataloader

O dataset foi feito com base na classe Dataset do pytorch, ele divide os subtiles e máscaras em patches e atribui ao dataset.

Na célula a seguir, é feito o carregamento de imagens e mascaras, por tipo.

É feita com janelamento com stride igual ao tamanho da imagem.


In [None]:
yaml_filename = data.yaml_filename(num_subtiles, tiles, stratified_by = '4types')

print(yaml_filename)
train_dataset = data.SubtileDataset(yaml_filename,
                                    set = 'train_files',
                                    patch_size=256, 
                                    stride=256, 
                                    dynamic_sampling=True,
                                    data_augmentation = False, # testando 
                                    )
print(len(train_dataset))
class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(percentual_counter)
print(class_counter_img)
print(img_percentual)



/home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split-6_subtiles-015002_016009_021027_025005_025037_027032_028022_032027_033029_034011_036016_038019-stratified_by_4types.yaml
Loading:  /home/jonathan/UrbanizedAreasSegmentation/config/train_val_test_split-6_subtiles-015002_016009_021027_025005_025037_027032_028022_032027_033029_034011_036016_038019-stratified_by_4types.yaml
{'num_subtiles': 6, 'test_files': ['/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_016009/6x6_subtiles/q_12ch/x=0_y=1760.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_027032/6x6_subtiles/q_12ch/x=0_y=7040.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_015002/6x6_subtiles/q_12ch/x=1760_y=3520.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_028022/6x6_subtiles/q_12ch/x=8800_y=1760.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_025037/6x6_subtiles/q_12ch/x=1760_y=3520.tif', '/ho

Doing data augmentation, stage 1...
Before data augmentation stage 1:
Pixels for each class: tensor([78086449,   119428,   175285,    11951,  1888487])
Num images with each class: tensor([1225,   82,   67,    5,  368])
[4, 1, 2, 3]
100%|██████████| 1225/1225 [08:12<00:00,  2.49it/s]
Before data augmentation stage 2:
Pixels for each class: tensor([217938265,   1339280,   6290601,   2197203,  38638491])
Num images with each class: tensor([4065, 1042, 2195,  965, 2596])
[4, 2, 1, 3]
[tensor(1), tensor(1), tensor(3), tensor(4)]
100%|██████████| 4065/4065 [00:11<00:00, 352.55it/s]
After data augmentation:
Pixels for each class: tensor([745605717,   3140102,  24110105,   4915387, 111290065])
Num images with each class: tensor([13566,  2399,  3515,  2197,  9073])
Starting from 1225 images
Dinamic Window step added 2840 images
Data augmentation added 9501 images with transform
Total: 13566
tensor([745605717,   3140102,  24110105,   4915387, 111290065])
[tensor(0.8386), tensor(0.0035), tensor(0.0271), tensor(0.0055), tensor(0.1252)]
tensor([13566,  2399,  3515,  2197,  9073])
[tensor(1.), tensor(0.1768), tensor(0.2591), tensor(0.1619), tensor(0.6688)]

Das 9800 imagens, 1455 delas tiveram mais de 1% de área de classes minoritárias. Essas são readicionadas no dataset em 7 transformações adicionais.

Isso gera, para este conjunto de treino, 1455*7 + 9800 = 19985 imagens.
 

In [None]:
print(len(train_dataset))
for img, mask in train_dataset:
    print(img.shape)
    print(mask.shape)
    break

### Exemplo de imagem

In [None]:
%%time
import matplotlib.pyplot as plt

i = 0
for img, mask in train_dataset:
    if  (mask != 0).any():
        #subtile_composition.display_images(img)
        plt.figure(figsize=(20,20))
        plt.subplot(1,2,1)
        plt.imshow(mask.squeeze())
        plt.subplot(1,2,2)
        plt.imshow(img[2].squeeze(),cmap='gray')
        plt.show()
        i+=1

    if i == 3:
        pass
        break
    print(i)
    #img,mask = next(train_dataset)

In [None]:
%%time
for i in range(len(train_dataset)):
    image, label = train_dataset[i]
    print(f"Sample {i}: Image shape = {image.shape}, Label shape = {label.shape}")
#4 min, with nan =  false


### Dataloader

Para carregar várias imagens em batches, usamos dataloaders

In [None]:
from torch.utils.data import DataLoader
batch_size = 16
train_loader = DataLoader(train_dataset, 
                            batch_size=16, 
                            shuffle=True)

In [None]:
for batch in train_loader:
    img = batch[0]
    mask = batch[1]
    print(img.shape)
    print(mask.shape)
    print(mask.unique(return_counts=True))
    break



In [None]:
# Compute PCA components
working_dir = os.path.abspath('..')
save_path = os.path.join(working_dir, 'config', 'pca_weights.npy')
pca_weights = data.compute_pca_from_dataloader(train_loader, save_path=save_path)
print("PCA Weights Shape:", pca_weights.shape)  # Expected: [3, 12]


In [None]:
for img,lab in train_loader:
    print(pca_weights.shape)
    print(img.shape)
    pca_img = data.apply_pca_weights(img, torch.Tensor(pca_weights.T))
    #plt.imshow(img)
    print(pca_img.shape)
    break



In [None]:




view.plot_pca_batch(pca_img, images_per_row=4)


### Mostrando 1 batch

In [None]:
#subtile_composition.display_images(img)
import matplotlib.pyplot as plt
plt.figure(figsize=(20,20))
for i in range(batch_size):
    plt.subplot(4,4,i+1)
    plt.imshow(img[i,1,:,:].squeeze(),cmap='gray')
    plt.imshow(mask[i,...])
    plt.tight_layout()
plt.show()


In [None]:
for batch in train_loader:
    img = batch[0]
    mask = batch[1]
    unique_classes, counts = torch.unique(mask, return_counts=True)
    print("Class distribution in y_true:")
    for cls, count in zip(unique_classes, counts):
        print(f"Class {cls.item()}: {count.item()} samples")

In [None]:
for i in range(len(train_dataset)):
    image, label = train_dataset[i]
    print(f"Sample {i}: Image shape = {image.shape}, Label shape = {label.shape}")
    
    unique_classes, counts = torch.unique(label, return_counts=True)
    print("Class distribution in y_true:")
    for cls, count in zip(unique_classes, counts):
        print(f"Class {cls.item()}: {count.item()} samples")
    print()

### Visualizando a interpolação

In [None]:
working_dir = os.path.abspath('..')
train_dataset_nans = data.SubtileDataset(train_files+val_files+test_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride=256, # sem overlap 
                    data_augmentation = False, 
                    ignore_most_nans= False,
                    return_imgidx = False,
                    treat_nans=False, # they supposelly are saved as negative, then a true here cut costs
                    debug=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
most_nan_img = None
most_nan_mask = None
num_nan = 0

indices = []
num_nans = []
for i, (img, mask) in enumerate(train_dataset_nans):
    negatives = torch.sum(img < 0).item()
    urban = torch.sum(mask > 0).item()
    #if negatives > 0 and urban > 0:
    indices.append(i)
    num_nans.append(negatives)

ordered_idx = np.argsort(num_nans)
len(ordered_idx)

In [None]:

print(num_nans)
print(indices)
print([indices[i] for i in ordered_idx])
print([num_nans[i] for i in ordered_idx])
print([num_nans[i]/img.numel() for i in ordered_idx])

data = [num_nans[i]/img.numel()*100 for i in ordered_idx]
plt.figure(figsize=(14, 8))
counts, bins, patches = plt.hist(data, bins=30, edgecolor='black', weights=[100/len(data)]*len(data))  # Histogram
# Calculate midpoints for labels
midpoints = 0.5 * (bins[1:] + bins[:-1])  # Midpoints for 20 bins
bin_labels = [f'{bins[i]:.2f}-{bins[i+1]:.2f}' for i in range(len(bins)-1)]

for count, bin_edge in zip(counts, bins[:-1]):
    plt.text(bin_edge + (bins[1] - bins[0]) / 2, count, f'{count:.2f}%', ha='center', va='bottom', rotation=45)  # Centered above each bar
# Apply labels to midpoints
plt.xticks(midpoints, bin_labels, rotation=45)  # Use midpoints instead of bin edges

plt.xlabel('Percentual de NaN na imagem')
plt.ylabel('Quantidade de imagens')
plt.title(f'Distribuição de quantidade de NaN em dataset de {len(train_dataset_nans)} imagens')
#plt.yscale('log')  # Use log scale to better visualize large ranges

save_to = os.path.join(working_dir, 'figs', 'nans_distribution.png')
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
print(len(ordered_idx))
num_nans.sort
most_nan_img = img.clone() 
num_nan = negatives
most_nan_mask = mask.clone()
#print(type(most_nan_img))
#print(img.shape)
num_nan
#choices = [int(ordered_idx[j]) for j in [9500, 9650, 9800-1]]
choices = [indices[i] for i in [j for j in ordered_idx[1000, 1200, 1343]]]

print(choices)
print([num_nans[i] for i in choices])
imgs = []
masks = []
for c in choices:
    img,mask = train_dataset_nans[c]
    imgs.append(img)
    masks.append(mask)
    print(torch.sum(img < 0).item())



In [None]:
import numpy as np
nan_value = most_nan_img.max()

plt.figure(figsize=(12,16))

for idx in range(3):
    plt.subplot(1,3,idx+1)
    img_ = imgs[idx]
    mask_ = masks[idx]
    r = img_[1,:,:].squeeze()
    g = img_[2,:,:].squeeze()
    b = img_[3,:,:].squeeze()
    rgb_image = torch.stack([r, g, b], axis=-1)
    min = rgb_image[rgb_image > 0].min().item()
    rgb_image = rgb_image-min
    rgb_image*=1/(rgb_image.max())
    for i in range(r.shape[0]):
        for j in range(r.shape[1]):
            if r[i,j]<0:
                rgb_image[i,j,0] = 1
                rgb_image[i,j,1] = 0
                rgb_image[i,j,2] = 0
    #rgb_image = rgb_image-rgb_image.min()
    #rgb_image*=1/(rgb_image.max())
    
    plt.imshow(rgb_image)
    #plt.imshow(mask_.squeeze(), alpha=0.1)
    plt.tight_layout()
    plt.axis('off')
    

save_to = os.path.join(working_dir, 'figs', 'invalid_pixels.png')
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
import numpy as np
nan_value = most_nan_img.max()

plt.figure(figsize=(12,16))

for idx in range(3):
    plt.subplot(1,3,idx+1)
    img_ = torch.abs(imgs[idx])
    mask_ = masks[idx]
    r = img_[1,:,:].squeeze()
    g = img_[2,:,:].squeeze()
    b = img_[3,:,:].squeeze()
    rgb_image = torch.stack([r, g, b], axis=-1)
    min = rgb_image[rgb_image > 0].min().item()
    rgb_image = rgb_image-min
    rgb_image*=1/(rgb_image.max())
    for i in range(r.shape[0]):
        for j in range(r.shape[1]):
            if r[i,j]<0:
                rgb_image[i,j,0] = 1
                rgb_image[i,j,1] = 0
                rgb_image[i,j,2] = 0
    #rgb_image = rgb_image-rgb_image.min()
    #rgb_image*=1/(rgb_image.max())
    
    plt.imshow(rgb_image)
    #plt.imshow(mask_.squeeze(), alpha=0.1)
    plt.tight_layout()
    plt.axis('off')
    
save_to = os.path.join(working_dir, 'figs', 'nearest_pixels.png')
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)

plt.show()

In [None]:
train_files, val_files, test_files
len(train_files), len(val_files), len(test_files)

In [None]:
test_files

In [None]:
import matplotlib.pyplot as plt
import numpy as np
working_dir = os.path.abspath('..')

def fill_with(f, rgb = (0,0,1)):
    x, y = utils.extract_integers(f)
    x = x//ratio
    y = y//ratio
    set_division[x+borda:x+subtile_size-borda, y+borda:y+subtile_size-borda, 0] = rgb[0]
    set_division[x+borda:x+subtile_size-borda, y+borda:y+subtile_size-borda, 1] = rgb[1]
    set_division[x+borda:x+subtile_size-borda, y+borda:y+subtile_size-borda, 2] = rgb[2]
ratio = 10
set_division = np.zeros(shape=(10560//ratio,10560//ratio, 3))
subtile_size = 10560//6//ratio
borda = 3

for f in train_files:
    fill_with(f, rgb = (1,0,0))
for f in val_files:
    fill_with(f, rgb = (0,1,0))
for f in test_files:
    fill_with(f, rgb = (0,0,1))

plt.figure(figsize=(15,15))
plt.imshow(set_division)

plt.tight_layout()
plt.axis('off')
    
save_to = os.path.join(working_dir, 'figs', 'set_division.png')
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)

In [None]:
#################

# Amostragem dinâmica e data augmentation

Aqui vamos testar qual o efeito da amostragem dinâmica e data augmentation na distribuição de classes entre as amostras

Vamos utilizar os dados de treino.

In [None]:
tiles = ['032027']#, '032026'] 
num_subtiles = 6
working_dir = os.path.abspath('..')
train_files, val_files, test_files = data.train_val_test_stratify(tiles, 
                                                                  num_subtiles,
                                                                    train_size = 0.6, 
                                                                    val_size = 0.2, 
                                                                    stratify_by = '4types',
                                                                    subfolder='q_12ch',
                                                                    debug = True)


##### Carregando um dataset sem amostragem dinamica nem data augmentation, e observando a distribuição.

O método stats printa na tela as distribuições por classe de quantidade de patches que a classe aparece, quantidade de pixels e os equivalentes em percentual.

Observe o alto desbalanceamento. Classe 3 aparece em 1% dos patches e representa 0.07% do total de pixels.

In [None]:
simple_dataset = data.SubtileDataset(train_files, #[file], #train_files,#train_files+val_files+test_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=False,
                    data_augmentation = False, 
                    ignore_most_nans = True, # testando 
                    )
simple_dataset.stats()

##### Carregando dataset e adicionando amostragem dinâmica

Os passos são respectivamente 256, 128, 64 e 32 para as classes 4, 1, 2, 3.

In [None]:

sample_dataset = data.SubtileDataset(train_files, #[file], #train_files,#train_files+val_files+test_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=True,
                    data_augmentation = False, 
                    ignore_most_nans = True, # testando 
                    )

In [None]:
sample_dataset.plot_sampled_outlines(area_limits=(0,1760,0,1760), save_to=os.path.join(working_dir, 'figs', 'dynamic.png'))

Foram adicionados cerca de 1600 patches (versóes com shift de patches previamente amostrados)

Agora a classe que menos aparece é a 1, aparecendo em 15% dos patches.

O efeito no percentual de pixels é menor, pois também ao amostrar também aumenta-se os pixels das classes super representadas.

In [None]:
sample_dataset.stats()

### Data augmentation
Carregando dataset e aplicando o data augmentation D4

São adicionados mais de 2000 patches, de rotações e reflexões de patches com classes minoritárias.

A classe 3 aparece em cerca de 2% dos patches, o efeito é menor que da amostragem dinâmica. 

Deve-se ao fato de data augmentation estar limitada a x8, enquanto a amostragem dinâmica permite até 64 repetições. 

In [None]:
da_dataset = data.SubtileDataset(train_files, #[file], #train_files,#train_files+val_files+test_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=False,
                    data_augmentation = True, 
                    ignore_most_nans = True, # testando 
                    )
da_dataset.stats()

### Amostragem dinâmica + data augmentation

In [None]:
full_da_dataset = data.SubtileDataset(train_files, #[file], #train_files,#train_files+val_files+test_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = '4types', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=True,
                    data_augmentation = True, 
                    ignore_most_nans = True, # testando 
                    )
full_da_dataset.stats()

In [None]:
full_da_dataset.plot_transformed(save_to=os.path.join(working_dir, 'figs', 'transformed.png'))

### Contagem de batches do experimento

In [None]:
num_subtiles = 6
tiles = ['032027']
classes_mode = '4types'


yaml_filename = data.yaml_filename(num_subtiles, tiles, classes_mode)

for patch_size, ds, bs in zip([64, 224, 256, 512, 64, 224, 256, 512], [False, False, False, False, True, True, True, True], [16, 16, 16, 4, 16, 16, 16, 4]):
    train_dataset = data.SubtileDataset(yaml_filename, 
                                    set = 'train_files',
                                    patch_size=patch_size, 
                                    stride=patch_size, 
                                    dynamic_sampling =  ds,
                                    data_augmentation = False, # testando 
                                    )
    
    val_dataset = data.SubtileDataset(yaml_filename, 
                                    set = 'val_files',
                                    patch_size=patch_size, 
                                    stride=patch_size, 
                                    dynamic_sampling = False,
                                    data_augmentation = False, # testando 
                                    )
    print('Dataset info:')
    print(patch_size, ds, len(train_dataset), len(train_dataset)/bs)
    print(patch_size, ds, len(val_dataset),len(val_dataset)/bs)


Loading:  <function yaml_filename at 0x717e64ff3400>
{'num_subtiles': 6, 'test_files': ['/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=8800_y=8800.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=0_y=1760.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=1760_y=0.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=5280_y=0.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=1760_y=7040.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=8800_y=7040.tif', '/home/jonathan/UrbanizedAreasSegmentation/data/processed/S2-16D_V2_032027/6x6_subtiles/S2-16D_V2_032027_x=8800_y=5280.tif'], 'tiles': ['032027'], 'train_files': ['/home/j