# Read Parquet Files


In [1]:
import numpy as np
import pyarrow.parquet as pq

# Open the Parquet dataset stored under the relative path 'data', read it into
# a single Arrow table and convert that table to a pandas DataFrame.
dataset = pq.ParquetDataset('data')
table = dataset.read()
df = table.to_pandas()
# Preview: per the rendered output, the frame has an `image` and a `mask`
# column, each cell being a dict whose 'bytes' entry holds the encoded file
# (JPEG for images, PNG for masks).
df.head()

Unnamed: 0,image,mask
0,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
1,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
2,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
3,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...
4,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...


# Train Test Split

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded so that
# Restart & Run All always produces the same partition; the normalisation
# statistics hard-coded in the transforms cell were computed on the training
# split and would otherwise drift from run to run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define Dataset

In [18]:
def convert2SimpleMask(mask: np.ndarray) -> np.ndarray:
    """Collapse fine-grained mask labels into a coarser label set.

    Labels 1, 3-10, 16 and 17 are rewritten to 1; labels 2 and 11-15 are
    rewritten to 2. Every other value (for example 0) is kept unchanged.

    Args:
        mask: Integer label array of any shape.

    Returns:
        A new array with the same shape and dtype as ``mask``. The input
        array is left untouched.
    """
    source = np.asarray(mask)
    simplified = source.copy()
    # Membership is always tested against the untouched input, so the two
    # rewrites cannot feed into each other regardless of their order.
    simplified[np.isin(source, [1, 3, 4, 5, 6, 7, 8, 9, 10, 16, 17])] = 1
    simplified[np.isin(source, [2, 11, 12, 13, 14, 15])] = 2
    return simplified
    

In [48]:
from PIL import Image
import torch
from torch.utils.data import Dataset
from io import BytesIO

class SegmentationDataset(Dataset):
    """Dataset of (image, mask) pairs decoded from a DataFrame of encoded files.

    Column 0 of ``dataframe`` must hold dicts whose ``'bytes'`` entry is the
    encoded image, and column 1 dicts whose ``'bytes'`` entry is the encoded
    mask. The mask is decoded and passed through ``convert2SimpleMask``.

    Args:
        dataframe: pandas DataFrame laid out as described above.
        transform: Optional callable applied to the decoded PIL image only.
        mask_transform: Optional callable applied to the simplified mask
            (a numpy array). When it is omitted and ``transform`` is given,
            the mask is converted to an int64 tensor and, if the transformed
            image is a tensor of a different spatial size, resized to that
            size with nearest-neighbour sampling.

    Note:
        The image transform is deliberately NOT applied to the mask: scaling,
        interpolating or normalising class labels corrupts them, and a
        3-channel ``Normalize`` cannot be applied to a single-channel mask.
        The default mask handling only follows a deterministic resize; if
        ``transform`` contains random geometric augmentation, supply a
        matching ``mask_transform`` or a joint transform instead.
    """

    def __init__(self, dataframe, transform=None, mask_transform=None):
        self.dataframe = dataframe
        self.transform = transform
        self.mask_transform = mask_transform

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return len(self.dataframe)

    @staticmethod
    def _mask_to_label_tensor(mask, image):
        """Turn a numpy label mask into an int64 tensor aligned with ``image``."""
        labels = torch.from_numpy(np.ascontiguousarray(mask)).long()
        # Only a plain (H, W) label map can be aligned without guessing the
        # channel layout; anything else is returned as converted.
        if torch.is_tensor(image) and image.dim() >= 2 and labels.dim() == 2:
            target_size = tuple(image.shape[-2:])
            if tuple(labels.shape) != target_size:
                # Nearest-neighbour sampling never blends two class ids.
                labels = torch.nn.functional.interpolate(
                    labels[None, None].float(), size=target_size, mode="nearest"
                )[0, 0].long()
        return labels

    def __getitem__(self, idx):
        """Decode and return the ``(image, mask)`` pair at position ``idx``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image = Image.open(BytesIO(self.dataframe.iloc[idx, 0]['bytes']))
        raw_mask = np.array(Image.open(BytesIO(self.dataframe.iloc[idx, 1]['bytes'])))
        mask = convert2SimpleMask(raw_mask)

        if self.transform:
            image = self.transform(image)

        if self.mask_transform is not None:
            mask = self.mask_transform(mask)
        elif self.transform:
            mask = self._mask_to_label_tensor(mask, image)

        return image, mask

# Add transforms

In [61]:
from torchvision import transforms

# Per-channel normalisation statistics; they are the rounded values printed
# by the commented-out statistics cell further down.
mean, std = (
    torch.tensor([0.593, 0.567, 0.534]),
    torch.tensor([0.247, 0.247, 0.247]),
)

# Pipeline order matters: convert to a tensor first, then resize to
# 600 x 400 (height, width), then normalise channel-wise.
trainsform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((600, 400)),
        transforms.Normalize(mean, std),
    ]
)

# Both splits share the same deterministic pipeline.
train_dataset, test_dataset = (
    SegmentationDataset(train_df, transform=trainsform),
    SegmentationDataset(test_df, transform=trainsform),
)

# DataLoader

In [62]:
from torch.utils.data import DataLoader

# Mini-batches of four samples; only the training split is shuffled, so the
# test split is always traversed in a fixed order.
train_loader, test_loader = (
    DataLoader(train_dataset, batch_size=4, shuffle=True),
    DataLoader(test_dataset, batch_size=4, shuffle=False),
)

In [59]:
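# One-off computation of the per-channel mean/std that are hard-coded in the
# transforms cell (kept commented out for reference). To re-run it, build the
# loader with a transform that omits `Normalize`; otherwise the statistics are
# those of already-normalized images. Note that `std` here is the average of
# the per-image standard deviations, not the standard deviation of the whole set.
# mean = torch.zeros(3)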
# std = torch.zeros(3)
# 
# for i, (image, _) in enumerate(train_loader):
#     batch_samples = image.size(0) # batch size (the last batch can have smaller size!)
#     image = image.view(batch_samples, image.size(1), -1)
#     mean += image.mean(2).sum(0)
#     std += image.std(2).sum(0)
# 
# mean /= len(train_loader.dataset)
# std /= len(train_loader.dataset)
# mean, std

(tensor([0.5927, 0.5667, 0.5338]), tensor([0.2466, 0.2467, 0.2466]))