# Checagem de dados

Scripts com exemplos de como fazer o carregamento e divisão dos dados de forma estratificada.
 

In [None]:
# imports

import os
import sys

# Make the repository root importable so the `src` package below resolves.
sys.path.append(os.path.abspath('..'))

import matplotlib.pyplot as plt
import numpy as np
import torch
from rasterio.coords import BoundingBox
from torch.utils.data import DataLoader

import src.data.preprocess_data as data
import src.data.view as view

### Definições
Os tiles utilizados e o número de subtiles são definidos a seguir.

In [None]:
# Tiles to process; add "032026" to the list to include a second tile.
tiles = ["032027"]
# Number of subtiles handed to the split and dataset helpers in later cells.
num_subtiles = 6


Divisão dos dados em conjuntos de treino, validação e teste, em que cada arquivo em data/processed dos tiles correspondentes é associado a um dos conjuntos.

Primeiramente, sem estratificação


In [None]:
# Baseline split: `stratify_by` is not passed (per the notebook text this is
# the non-stratified case).
train_files_no_strat, val_files_no_strat, test_files_no_strat = data.train_val_test_stratify(
    tiles,
    num_subtiles,
    train_size=0.6,
    val_size=0.2,
)

# Class counts for the whole set first, then for each split. `num_subtiles`
# comes from the definitions cell (instead of a hard-coded 6) so the counts
# always match the split computed above.
all_files_no_strat = train_files_no_strat + val_files_no_strat + test_files_no_strat
data.count_classes(all_files_no_strat, num_subtiles=num_subtiles, agregate_by='type')

data.count_classes(train_files_no_strat, num_subtiles=num_subtiles, agregate_by='type')
data.count_classes(val_files_no_strat, num_subtiles=num_subtiles, agregate_by='type')
data.count_classes(test_files_no_strat, num_subtiles=num_subtiles, agregate_by='type')



Com estratificação, baseada nas máscaras, de acordo com o valor de classe escolhida pelo usuário.

Neste exemplo, é feita pelo tipo.


## Checagem de resultados da estratificação

data.check_stratification imprime o número de pixels de cada classe por divisão de conjunto. 

In [None]:
# Split stratified by the 'type' class.
train_files, val_files, test_files = data.train_val_test_stratify(
    tiles,
    num_subtiles,
    train_size=0.6,
    val_size=0.2,
    stratify_by='type',
)

print(len(train_files), len(val_files), len(test_files))

# Class counts for the whole set first, then for each split. `num_subtiles`
# comes from the definitions cell (instead of a hard-coded 6) so the counts
# always match the split computed above.
data.count_classes(train_files + val_files + test_files, num_subtiles=num_subtiles, agregate_by='type')

data.count_classes(train_files, num_subtiles=num_subtiles, agregate_by='type')
data.count_classes(val_files, num_subtiles=num_subtiles, agregate_by='type')
data.count_classes(test_files, num_subtiles=num_subtiles, agregate_by='type')


## Salvar e carregar

Um arquivo yaml é salvo automaticamente.


## Dataset e dataloader

Carregamento de imagens e máscaras, por tipo.

É feita com janelamento com stride de metade do tamanho da imagem.

Adicionalmente, é feito data augmentation. As classes minoritárias que têm 1% ou mais de área da imagem geram 8 cópias, uma para cada transformação do grupo diedral de ordem 8 ("Dihedral Group of Order 8", ou D₄).

In [None]:
%%time
train_dataset = data.SubtileDataset(train_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = 'type', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=False,
                    data_augmentation = False, 
                    ignore_most_nans = True, # testando 
                    )

class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(percentual_counter)
print(class_counter_img)
print(img_percentual)
[1/c for c in img_percentual]



In [None]:
%%time
train_dataset = data.SubtileDataset(train_files, 
                    num_subtiles = num_subtiles, 
                    classes_mode = 'type', 
                    patch_size=256, 
                    stride = 256,
                    dynamic_sampling=False,
                    data_augmentation = False, 
                    ignore_most_nans = False, # testando 
                    )

class_counter, class_counter_img = train_dataset.count_classes()
percentual_counter = [c/sum(class_counter) for c in class_counter]
img_percentual = [c/len(train_dataset) for c in class_counter_img]
print(class_counter)
print(percentual_counter)
print(class_counter_img)
print(img_percentual)
[1/c for c in img_percentual]



In [None]:
# Resolve the yaml file that holds the split for these tiles, then build the
# training dataset from its 'train_files' entry.
yaml_filename = data.yaml_filename(num_subtiles, tiles, stratified_by='type')
print(yaml_filename)

train_dataset = data.SubtileDataset(
    yaml_filename,
    set='train_files',
    patch_size=256,
    stride=256,
    dynamic_sampling=True,
    data_augmentation=True,  # under test
)
num_samples = len(train_dataset)
print(num_samples)

class_counter, class_counter_img = train_dataset.count_classes()
total_count = sum(class_counter)
percentual_counter = [c / total_count for c in class_counter]
img_percentual = [c / num_samples for c in class_counter_img]
for stats in (class_counter, percentual_counter, class_counter_img, img_percentual):
    print(stats)



Doing data augmentation, stage 1...
Before data augmentation stage 1:
Pixels for each class: tensor([78086449,   119428,   175285,    11951,  1888487])
Num images with each class: tensor([1225,   82,   67,    5,  368])
[4, 1, 2, 3]
100%|██████████| 1225/1225 [08:12<00:00,  2.49it/s]
Before data augmentation stage 2:
Pixels for each class: tensor([217938265,   1339280,   6290601,   2197203,  38638491])
Num images with each class: tensor([4065, 1042, 2195,  965, 2596])
[4, 2, 1, 3]
[tensor(1), tensor(1), tensor(3), tensor(4)]
100%|██████████| 4065/4065 [00:11<00:00, 352.55it/s]
After data augmentation:
Pixels for each class: tensor([745605717,   3140102,  24110105,   4915387, 111290065])
Num images with each class: tensor([13566,  2399,  3515,  2197,  9073])
Starting from 1225 images
Dinamic Window step added 2840 images
Data augmentation added 9501 images with transform
Total: 13566
tensor([745605717,   3140102,  24110105,   4915387, 111290065])
[tensor(0.8386), tensor(0.0035), tensor(0.0271), tensor(0.0055), tensor(0.1252)]
tensor([13566,  2399,  3515,  2197,  9073])
[tensor(1.), tensor(0.1768), tensor(0.2591), tensor(0.1619), tensor(0.6688)]

In [None]:
# Disabled experiment, kept for reference: the guard is always false, so
# nothing in this cell runs.
if False:
    train_dataset = data.SubtileDataset(
        train_files,
        num_subtiles=num_subtiles,
        classes_mode='type',
        patch_size=256,
        stride=256,
        data_augmentation=1,  # under test
        return_imgidx=False,
        treat_nans=True,  # NaNs are supposedly stored as negatives, so True here cuts cost
        debug=False,
        augmentation_thresholds=(0.05, 0.5),
    )

    class_counter, class_counter_img = train_dataset.count_classes()
    total_count = sum(class_counter)
    num_samples = len(train_dataset)
    percentual_counter = [c / total_count for c in class_counter]
    img_percentual = [c / num_samples for c in class_counter_img]
    for stats in (class_counter, percentual_counter, class_counter_img, img_percentual):
        print(stats)
        

Das 9800 imagens, 1455 delas tiveram mais de 1% de área de classes minoritárias. Essas são readicionadas no dataset em 7 transformações adicionais.

Isso gera, para este conjunto de treino, 1455*7 + 9800 = 19985 imagens.
 

In [None]:
# Dataset size, followed by the image and mask shapes of the first sample only.
print(len(train_dataset))
for img, mask in train_dataset:
    for tensor in (img, mask):
        print(tensor.shape)
    break

### Exemplo de imagem

In [None]:
%%time
import matplotlib.pyplot as plt

i = 0
for img, mask in train_dataset:
    if  (mask != 0).any():
        #subtile_composition.display_images(img)
        plt.figure(figsize=(20,20))
        plt.subplot(1,2,1)
        plt.imshow(mask.squeeze())
        plt.subplot(1,2,2)
        plt.imshow(img[2].squeeze(),cmap='gray')
        plt.show()
        i+=1

    if i == 3:
        pass
        break
    print(i)
    #img,mask = next(train_dataset)

In [None]:
%%time
for i in range(len(train_dataset)):
    image, label = train_dataset[i]
    print(f"Sample {i}: Image shape = {image.shape}, Label shape = {label.shape}")
#4 min, with nan =  false


### Dataloader

Para carregar várias imagens em batches, usamos dataloaders

In [None]:
# `batch_size` is also read by the cell that plots one batch, so the loader
# uses the variable rather than a second literal 16.
batch_size = 16
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Inspect the first batch only: tensor shapes and the mask's value histogram.
for batch in train_loader:
    img, mask = batch[0], batch[1]
    for tensor in (img, mask):
        print(tensor.shape)
    print(mask.unique(return_counts=True))
    break



In [None]:
# Compute the PCA components from the training loader; `save_path` is handed
# to the helper so the weights end up under <repo>/config.
working_dir = os.path.abspath('..')
save_path = os.path.join(working_dir, 'config', 'pca_weights.npy')
pca_weights = data.compute_pca_from_dataloader(
    train_loader,
    save_path=save_path,
)
# Expected shape according to the original author: [3, 12]
print("PCA Weights Shape:", pca_weights.shape)


In [None]:
# Project the first batch with the PCA weights (passed transposed, as a tensor).
pca_projection = torch.Tensor(pca_weights.T)
for img, lab in train_loader:
    print(pca_weights.shape)
    print(img.shape)
    pca_img = data.apply_pca_weights(img, pca_projection)
    print(pca_img.shape)
    break



In [None]:




# Plot the PCA-projected batch (`pca_img`, set by the PCA cell above), four images per row.
view.plot_pca_batch(pca_img, images_per_row=4)


### Mostrando 1 batch

In [None]:
# Draw one batch here so the image and its mask come from the same samples.
# Previously `img` and `mask` were leftovers of two different shuffled batches.
batch = next(iter(train_loader))
img, mask = batch[0], batch[1]

# Never index past the end of a short (last) batch.
num_shown = min(batch_size, len(img))
num_cols = 4
num_rows = -(-num_shown // num_cols)  # ceiling division

plt.figure(figsize=(20, 20))
for i in range(num_shown):
    plt.subplot(num_rows, num_cols, i + 1)
    plt.imshow(img[i, 1, :, :].squeeze(), cmap='gray')
    # Semi-transparent overlay: an opaque mask would hide the image underneath.
    plt.imshow(mask[i, ...].squeeze(), alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
# For every batch of the loader, print how often each mask value occurs.
for batch in train_loader:
    img, mask = batch[0], batch[1]
    unique_classes, counts = torch.unique(mask, return_counts=True)
    print("Class distribution in y_true:")
    for cls, count in zip(unique_classes.tolist(), counts.tolist()):
        print(f"Class {cls}: {count} samples")

In [None]:
# For every sample, print its shapes and how often each label value occurs.
num_samples = len(train_dataset)
for i in range(num_samples):
    image, label = train_dataset[i]
    print(f"Sample {i}: Image shape = {image.shape}, Label shape = {label.shape}")

    unique_classes, counts = torch.unique(label, return_counts=True)
    print("Class distribution in y_true:")
    for cls, count in zip(unique_classes.tolist(), counts.tolist()):
        print(f"Class {cls}: {count} samples")
    print()

### Visualizando a interpolação

In [None]:
working_dir = os.path.abspath('..')

# Dataset over all three splits with NaN handling switched off, so invalid
# pixels stay visible for the plots below.
all_files = train_files + val_files + test_files
train_dataset_nans = data.SubtileDataset(
    all_files,
    num_subtiles=num_subtiles,
    classes_mode='type',
    patch_size=256,
    stride=256,  # no overlap
    data_augmentation=False,
    ignore_most_nans=False,
    return_imgidx=False,
    treat_nans=False,  # NaNs are supposedly stored as negatives; True would cut cost
    debug=False,
)

In [None]:
# Count, for every patch, the pixels with a negative value
# (NOTE(review): negatives are assumed to encode NaN/invalid pixels — confirm
# against the preprocessing code).
indices = []
num_nans = []
for i, (img, mask) in enumerate(train_dataset_nans):
    negatives = torch.sum(img < 0).item()
    indices.append(i)
    num_nans.append(negatives)

# Positions of the patches sorted by ascending number of negative pixels.
ordered_idx = np.argsort(num_nans)
len(ordered_idx)

In [None]:

print(num_nans)
print(indices)
print([indices[i] for i in ordered_idx])
print([num_nans[i] for i in ordered_idx])

# NOTE(review): `img` is the last patch left over from the counting loop;
# this assumes every patch has the same number of elements — confirm.
pixels_per_image = img.numel()
print([num_nans[i] / pixels_per_image for i in ordered_idx])

# Must not be called `data`: that name is the preprocessing module alias and
# rebinding it breaks every later `data.<function>` call in the notebook.
nan_percentages = [num_nans[i] / pixels_per_image * 100 for i in ordered_idx]

plt.figure(figsize=(14, 8))
# Each image weighs 100/N, so the bar heights are percentages of the dataset.
counts, bins, patches = plt.hist(
    nan_percentages,
    bins=30,
    edgecolor='black',
    weights=[100 / len(nan_percentages)] * len(nan_percentages),
)
# One label per bin, placed at the bin midpoint.
midpoints = 0.5 * (bins[1:] + bins[:-1])
bin_labels = [f'{bins[i]:.2f}-{bins[i+1]:.2f}' for i in range(len(bins) - 1)]

# Bar height written centred above each bar.
for count, bin_edge in zip(counts, bins[:-1]):
    plt.text(bin_edge + (bins[1] - bins[0]) / 2, count, f'{count:.2f}%', ha='center', va='bottom', rotation=45)
plt.xticks(midpoints, bin_labels, rotation=45)

plt.xlabel('Percentual de NaN na imagem')
plt.ylabel('Percentual de imagens (%)')
plt.title(f'Distribuição de quantidade de NaN em dataset de {len(num_nans)} imagens')

save_to = os.path.join(working_dir, 'figs', 'nans_distribution.png')
os.makedirs(os.path.dirname(save_to), exist_ok=True)  # savefig does not create folders
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:

# Sample with the largest number of negative pixels. `ordered_idx` is sorted
# ascending, so the last entry is the maximum. `num_nans` itself must stay
# unsorted because `ordered_idx` indexes into it.
most_nan_idx = int(ordered_idx[-1])
most_nan_img, most_nan_mask = train_dataset_nans[most_nan_idx]
num_nan = num_nans[most_nan_idx]

# Three samples from the high-NaN end of the ranking, addressed relative to the
# dataset size (for 9800 samples these are positions 9500, 9650 and 9799).
num_ranked = len(ordered_idx)
choices = [int(ordered_idx[max(0, num_ranked - offset)]) for offset in (300, 150, 1)]

print(choices)
print([num_nans[i] for i in choices])
imgs = []
masks = []
for c in choices:
    img, mask = train_dataset_nans[c]
    imgs.append(img)
    masks.append(mask)
    print(torch.sum(img < 0).item())



In [None]:
# Render the selected samples as RGB composites, painting in red every pixel
# whose channel-1 value is negative.
# NOTE(review): channels 1, 2, 3 are used as R, G, B exactly as in the original
# cell — confirm the band order.
plt.figure(figsize=(12, 16))

for idx, img_ in enumerate(imgs):
    plt.subplot(1, len(imgs), idx + 1)
    r = img_[1, :, :].squeeze()
    g = img_[2, :, :].squeeze()
    b = img_[3, :, :].squeeze()
    rgb_image = torch.stack([r, g, b], dim=-1)

    # Stretch so the smallest positive value maps to 0 and the maximum to 1.
    # (Named `lowest_positive` so the builtin `min` is not shadowed.)
    lowest_positive = rgb_image[rgb_image > 0].min().item()
    rgb_image = rgb_image - lowest_positive
    rgb_image *= 1 / (rgb_image.max())

    # Boolean-mask assignment replaces the per-pixel Python double loop.
    rgb_image[r < 0] = torch.tensor([1.0, 0.0, 0.0], dtype=rgb_image.dtype, device=rgb_image.device)

    plt.imshow(rgb_image)
    plt.axis('off')
plt.tight_layout()

save_to = os.path.join(working_dir, 'figs', 'invalid_pixels.png')
os.makedirs(os.path.dirname(save_to), exist_ok=True)  # savefig does not create folders
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)
plt.show()

In [None]:
# Same composites as the invalid-pixels figure, but built from the absolute
# value of each image, so no pixel is negative and nothing is painted red.
# NOTE(review): presumably the negative entries hold interpolated values with
# the sign flipped — confirm against the preprocessing code.
plt.figure(figsize=(12, 16))

for idx, raw_img in enumerate(imgs):
    plt.subplot(1, len(imgs), idx + 1)
    img_ = torch.abs(raw_img)
    r = img_[1, :, :].squeeze()
    g = img_[2, :, :].squeeze()
    b = img_[3, :, :].squeeze()
    rgb_image = torch.stack([r, g, b], dim=-1)

    # Stretch so the smallest positive value maps to 0 and the maximum to 1.
    # (Named `lowest_positive` so the builtin `min` is not shadowed.)
    lowest_positive = rgb_image[rgb_image > 0].min().item()
    rgb_image = rgb_image - lowest_positive
    rgb_image *= 1 / (rgb_image.max())

    plt.imshow(rgb_image)
    plt.axis('off')
plt.tight_layout()

save_to = os.path.join(working_dir, 'figs', 'nearest_pixels.png')
os.makedirs(os.path.dirname(save_to), exist_ok=True)  # savefig does not create folders
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)

plt.show()

In [None]:
# Number of files in the train, validation and test splits, shown as a tuple.
tuple(len(split_files) for split_files in (train_files, val_files, test_files))

In [None]:
# Display the files assigned to the test split.
test_files

In [None]:
working_dir = os.path.abspath('..')

# NOTE(review): 10560 is taken to be the side length of a tile in pixels —
# confirm against the raster metadata.
TILE_SIZE = 10560
ratio = 10   # downscale factor of the overview image
borda = 3    # gap, in overview pixels, left uncoloured around each subtile
# Uses `num_subtiles` from the definitions cell rather than a hard-coded 6.
subtile_size = TILE_SIZE // num_subtiles // ratio
set_division = np.zeros(shape=(TILE_SIZE // ratio, TILE_SIZE // ratio, 3))


def fill_with(f, rgb=(0, 0, 1)):
    """Paint the subtile of file ``f`` on ``set_division`` with colour ``rgb``.

    The two integers returned by ``data.extract_integers(f)`` are divided by
    ``ratio`` and used as the subtile origin; a margin of ``borda`` pixels is
    left uncoloured on every side.

    Args:
        f: subtile file, as accepted by ``data.extract_integers``.
        rgb: three colour components, written to channels 0, 1 and 2.
    """
    x, y = data.extract_integers(f)
    x = x // ratio
    y = y // ratio
    rows = slice(x + borda, x + subtile_size - borda)
    cols = slice(y + borda, y + subtile_size - borda)
    set_division[rows, cols, :] = rgb


# Train in red, validation in green, test in blue.
for f in train_files:
    fill_with(f, rgb=(1, 0, 0))
for f in val_files:
    fill_with(f, rgb=(0, 1, 0))
for f in test_files:
    fill_with(f, rgb=(0, 0, 1))

plt.figure(figsize=(15, 15))
plt.imshow(set_division)

plt.tight_layout()
plt.axis('off')

save_to = os.path.join(working_dir, 'figs', 'set_division.png')
os.makedirs(os.path.dirname(save_to), exist_ok=True)  # savefig does not create folders
plt.savefig(save_to, bbox_inches='tight', pad_inches=0)

In [None]:

# Re-run the stratified split with a 70/15 train/validation share and
# debugging enabled. This overwrites the split variables defined earlier.
split_options = dict(
    train_size=0.7,
    val_size=0.15,
    stratify_by='type',
    debug=True,
)
train_files, val_files, test_files = data.train_val_test_stratify(tiles, num_subtiles, **split_options)