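This notebook prepares the data that is fed to the models: it builds the train, validation and test splits with their data loaders, and then inspects batch shapes and class balance.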

In [1]:
import pandas as pd
import seaborn as sns
import os 
import numpy as np
import matplotlib.pyplot as plt 
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import random_split

In [2]:
# Path handed to GRSDataset as its second argument (`path_osats`).
PATH_OSATS = "../data/raw/OSATS.xlsx"
# Path handed to GRSDataset as its first argument (`path_train`).
PATH_TRAIN = "../data/processed/"
# PATH_VIDEO_ARRAY = "../data/video_array/"
# Mini-batch size of the batched train/validation/test DataLoaders.
BATCH_SIZE = 4

In [3]:
from GRSDataset import GRSDataset

def prepare_data_loaders(particion, path_train, path_osats, val_fraction=0.1, batch_size=None, seed=42):
    """Build train/validation/test DataLoaders from a single GRSDataset.

    The dataset is split with two seeded ``random_split`` calls: first into a
    training subset and a remainder, then the remainder into validation and
    test subsets. Each subset is wrapped twice: once with mini-batches and
    once as a single batch that holds the whole subset.

    Args:
        particion: Fraction of the dataset assigned to training, in [0, 1].
        path_train: Forwarded to ``GRSDataset`` as its first argument.
        path_osats: Forwarded to ``GRSDataset`` as its second argument.
        val_fraction: Fraction of the dataset assigned to validation, in
            [0, 1]. Defaults to 0.1, the value that used to be hard-coded.
        batch_size: Mini-batch size of the batched loaders. ``None`` (default)
            uses the notebook-level ``BATCH_SIZE`` at call time.
        seed: Seed of the generators driving both splits. Defaults to 42.

    Returns:
        Tuple ``(train_dl, val_dl, test_dl, train_dl_all, val_dl_all,
        test_dl_all)``; the ``*_all`` loaders yield their subset as one batch.

    Raises:
        ValueError: If a fraction is outside [0, 1], or if the training and
            validation sizes together exceed the dataset size.
    """
    if not 0 <= particion <= 1:
        raise ValueError(f"particion must be within [0, 1], got {particion}")
    if not 0 <= val_fraction <= 1:
        raise ValueError(f"val_fraction must be within [0, 1], got {val_fraction}")
    if batch_size is None:
        # Resolved at call time so later edits to BATCH_SIZE are honoured.
        batch_size = BATCH_SIZE

    dataset_train = GRSDataset(path_train, path_osats, transforms=None)
    n_total = len(dataset_train)

    train_size = int(particion * n_total)
    val_size = int(val_fraction * n_total)
    test_size = n_total - train_size - val_size
    if test_size < 0:
        # random_split only checks the *sum* of the lengths, so a negative
        # test size would otherwise slip through and yield wrong subsets.
        raise ValueError(
            f"train ({train_size}) + validation ({val_size}) cases exceed "
            f"the dataset size ({n_total}); lower particion or val_fraction"
        )

    train_data, temp_data = random_split(
        dataset_train,
        [train_size, n_total - train_size],
        generator=torch.Generator().manual_seed(seed),
    )
    val_data, test_data = random_split(
        temp_data,
        [val_size, test_size],
        generator=torch.Generator().manual_seed(seed),
    )

    def _make_loader(split, size):
        # Shuffling is kept for every non-empty split (original behaviour).
        # It is disabled for empty splits because a shuffling sampler may
        # refuse a dataset with zero samples.
        return DataLoader(split, batch_size=size, shuffle=len(split) > 0)

    train_dl = _make_loader(train_data, batch_size)
    val_dl = _make_loader(val_data, batch_size)
    test_dl = _make_loader(test_data, batch_size)

    # One batch per subset; max(1, ...) keeps batch_size valid when a subset
    # is empty (DataLoader rejects batch_size=0).
    train_dl_all = _make_loader(train_data, max(1, len(train_data)))
    val_dl_all = _make_loader(val_data, max(1, len(val_data)))
    test_dl_all = _make_loader(test_data, max(1, len(test_data)))

    return train_dl, val_dl, test_dl, train_dl_all, val_dl_all, test_dl_all

In [4]:
# 70% of the cases go to training; prepare_data_loaders derives the
# validation and test sizes from what is left.
(
    train_dl,
    val_dl,
    test_dl,
    train_dl_all,
    val_dl_all,
    test_dl_all,
) = prepare_data_loaders(particion=0.70, path_train=PATH_TRAIN, path_osats=PATH_OSATS)

In [5]:
from IPython.display import display

def visualize_data(path):
    """Read the CSV at ``path`` (first row as header) and show it with ``display``."""
    display(pd.read_csv(path, header=0))

def visualize_dataset(train_dl, test_dl, dataset_train, dataset_test, val_dl=None):
    """Print split sizes, one-batch tensor shapes and the value range of a test batch.

    One batch is drawn from each loader (train, validation, test, in that
    order) and must unpack into ``(inputs, labels)``.

    Args:
        train_dl: DataLoader over the training split.
        test_dl: DataLoader over the test split.
        dataset_train: Unused; kept so existing calls keep working.
        dataset_test: Unused; kept so existing calls keep working.
        val_dl: DataLoader over the validation split. When omitted, the
            notebook-level variable ``val_dl`` is used, which is what this
            function previously read implicitly.

    Raises:
        NameError: If ``val_dl`` is neither passed nor defined at notebook level.
    """
    if val_dl is None:
        # Backward-compatible fallback to the notebook-level loader; prefer
        # passing val_dl explicitly so the function does not rely on hidden state.
        val_dl = globals().get("val_dl")
        if val_dl is None:
            raise NameError("name 'val_dl' is not defined; pass it as val_dl=...")

    print(f"Quantidade de casos de Treino:{len(train_dl.dataset)}")
    print(f"Quantidade de casos de Validação:{len(val_dl.dataset)}")
    print(f"Quantidade de casos de Teste:{len(test_dl.dataset)}")

    # Iterate each loader once to fetch a single batch of cases.
    x, y = next(iter(train_dl))
    print(f"Shape tensor batch casos treino, input: {x.shape}, output: {y.shape}")
    x, y = next(iter(val_dl))
    print(f"Shape tensor batch casos validação, input: {x.shape}, output: {y.shape}")
    x, y = next(iter(test_dl))
    print(f"Shape tensor batch casos test, input: {x.shape}, output: {y.shape}")

    # Value range of the test batch, reported from torch and again from numpy.
    print(f'Valor maximo:{torch.max(x)} Valor mínimo:{torch.min(x)}')
    # .cpu() makes the numpy conversion work for batches that live on a device.
    x = x.detach().cpu().numpy()
    print(f'Valor maximo:{np.max(x)} Valor mínimo:{np.min(x)}')
    print(y)

In [6]:
# visualize_data(PATH_TRAIN)  # left disabled
visualize_dataset(
    train_dl=train_dl,
    test_dl=test_dl,
    dataset_train=train_dl_all,
    dataset_test=test_dl_all,
)

Quantidade de casos de Treino:21
Quantidade de casos de Validação:3
Quantidade de casos de Teste:6
Shape tensor batch casos treino, input: torch.Size([4, 3, 1000, 224, 224]), output: torch.Size([4])
Shape tensor batch casos validação, input: torch.Size([3, 3, 1000, 224, 224]), output: torch.Size([3])
Shape tensor batch casos test, input: torch.Size([4, 3, 1000, 224, 224]), output: torch.Size([4])
Valor maximo:1.0 Valor mínimo:0.0
Valor maximo:1.0 Valor mínimo:0.0
tensor([1, 3, 0, 3])


In [7]:
def visualize_holdout_balance(dl):
    """Print the number of cases and the per-class label counts of a loader.

    Every batch yielded by ``dl`` is consumed and its labels are pooled, so
    the report covers the whole split whether ``dl`` yields it as a single
    batch or as several. An empty loader reports zero cases instead of
    raising.

    Args:
        dl: Iterable of ``(inputs, labels)`` batches, e.g. a DataLoader.
    """
    # Retained from the original implementation: sets seaborn's global style
    # even though this function itself draws no plot.
    sns.set_style('whitegrid')

    label_batches = []
    for _, batch_labels in dl:
        if isinstance(batch_labels, torch.Tensor):
            # numpy can only read host tensors that are detached from autograd.
            batch_labels = batch_labels.detach().cpu()
        label_batches.append(np.asarray(batch_labels))

    # Cases are counted along the first axis, as len(labels) did for one batch.
    print("Cases:", sum(len(batch) for batch in label_batches))

    if label_batches:
        pooled = np.concatenate([batch.reshape(-1) for batch in label_batches])
    else:
        pooled = np.array([], dtype=np.int64)
    classes, counts = np.unique(pooled, return_counts=True)

    print(f"Labels (indices): {[str(n) for n in classes]}")
    print(f"Class count: {counts}")
    print(f"Sum of counts: {np.sum(counts)}")

# Report the class balance of each split using the single-batch loaders.
for split_banner, split_loader in (
    ("----------------------------------- Train Cases -----------------------------------", train_dl_all),
    ("--------------------------------- Validation Cases ---------------------------------", val_dl_all),
    ("------------------------------------ Test Cases ------------------------------------", test_dl_all),
):
    print(split_banner)
    visualize_holdout_balance(split_loader)

----------------------------------- Train Cases -----------------------------------
Cases: 21
Labels (indices): ['0', '1', '2', '3']
Class count: [13  3  4  1]
Sum of counts: 21
----------------------------------- Validation Cases -----------------------------------
Cases: 3
Labels (indices): ['0', '1', '2']
Class count: [1 1 1]
Sum of counts: 3
----------------------------------- Test Cases -----------------------------------
Cases: 6
Labels (indices): ['0', '1', '2', '3']
Class count: [1 1 1 3]
Sum of counts: 6
