# Happy Whale Images Converted to NetCDF for Faster Batch Loading

In [None]:
import h5py
from PIL import Image
import os
import numpy as np
from tqdm import tqdm
import pncpy
import pandas as pd

# paths
# Directories holding the train and test image files. Both are relative
# paths, so they resolve against the current working directory.
TRAIN_IMAGES = '../../../exercise/whale-and-dolphin/train_images'
TEST_IMAGES = '../../../exercise/whale-and-dolphin/test_images'



def list_files(gtdir):
    """Return the path of every file found beneath ``gtdir``.

    The directory tree is walked recursively with ``os.walk``; each returned
    entry is the containing directory joined with the file name. Directories
    themselves are not listed. A non-existent ``gtdir`` yields an empty list.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(gtdir)
        for name in names
    ]

def tonetcdf(gtdir, out_file_path, df, image_size=224):
    """Resize the images listed in ``df`` and pack them into one NetCDF file.

    The output file holds a single unsigned-byte variable named ``images``
    with dimensions ``(img_idx, Y, X, RGB)``. Image ``k`` along ``img_idx``
    is the ``k``-th entry of ``df.image``.

    Args:
        gtdir: Directory containing the image files.
        out_file_path: Path of the NetCDF file to create (opened with
            mode "w").
        df: DataFrame with an ``image`` column of file names relative to
            ``gtdir``.
        image_size: Edge length in pixels of the square images stored in the
            file. Defaults to 224.
    """
    print ('=> Converting images to netcdf')
    images_names = df.image.values
    print ('=> Total Images To Process : {}'.format(len(images_names)))

    # The progress bar is a context manager so it is closed even when a
    # read or write fails part-way through.
    with tqdm(total=len(images_names)) as pbar:
        with pncpy.File(out_file_path, mode = "w", format = "64BIT_DATA") as fnc:
            dim_y = fnc.def_dim("Y", image_size)
            dim_x = fnc.def_dim("X", image_size)
            dim_rgb = fnc.def_dim("RGB", 3)
            dim_n = fnc.def_dim("img_idx", len(images_names))
            var = fnc.def_var("images", pncpy.NC_UBYTE, (dim_n, dim_y, dim_x, dim_rgb))
            fnc.enddef()
            for k, img_name in enumerate(images_names):
                f_ = os.path.join(gtdir, img_name)
                # Close each source image as soon as its pixels are extracted.
                with Image.open(f_) as image:
                    # Every non-RGB mode (grayscale, palette, RGBA, CMYK, ...)
                    # must be converted, otherwise the array would not have
                    # exactly three channels and would not fit the variable.
                    if image.mode != 'RGB':
                        image = image.convert('RGB')
                    pixels = np.array(image.resize((image_size, image_size)))
                var[k] = pixels
                # tqdm.update() takes an increment, not the running total.
                pbar.update(1)

    print('=>  Finished Converting images to netcdf')
       
# print('=> ========= Converting Train Images ========= <=')
TRAIN_CSV = '../../../exercise/whale-and-dolphin/train.csv'
# Lives in the same dataset directory as TRAIN_CSV; a leading '.../' is not a
# parent-directory reference and would never resolve.
TEST_CSV = '../../../exercise/whale-and-dolphin/sample_submission.csv'
train_df = pd.read_csv(TRAIN_CSV)
# Pack every image listed in train.csv into train_images.nc in the current
# working directory.
tonetcdf(TRAIN_IMAGES,'train_images.nc', train_df)
print('=> ========= Converting Test Images ========= <=')
# NOTE: the test-image conversion below is disabled, so no test_images.nc is
# produced by this cell.
# file_list = list_files(TEST_IMAGES)[:100]
# tonc(file_list,out_file_path='test_images.nc')


# Example DataLoader and Dataset in PyTorch with PnetCDF-python

In [None]:
import torch
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
import pandas as pd


TRAIN_CSV = '../../../exercise/whale-and-dolphin/train.csv'
# Same dataset directory as TRAIN_CSV; a leading '.../' is not a
# parent-directory reference and would never resolve.
TEST_CSV = '../../../exercise/whale-and-dolphin/sample_submission.csv'
# img_lists = [file.split('/')[-1] for file in file_list]

# Change accordingly 
# if input
#DATASET_ROOT = '../input/happy-whale-to-hdf5-224x224'

# if output
# Directory searched for the '<split>_images.nc' files; './' is where the
# conversion cell writes train_images.nc.
DATASET_ROOT = './'

# Read CSV to DataFrame
train_df = pd.read_csv(TRAIN_CSV)


# Train Transforms
# Convert each image array to a tensor, then normalize every channel with the
# ImageNet mean / standard deviation.
train_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


# Label Encoder
def get_label_encoder_decoder(unique_values):
    """Build the two lookup tables between labels and integer indices.

    Args:
        unique_values: Iterable of hashable labels. It is consumed once, and
            each label is assigned its position in the iteration order.

    Returns:
        A ``(label_encoder, label_decoder)`` tuple: the first maps
        label -> index, the second maps index -> label. If a label occurs
        more than once, the encoder keeps its last position.
    """
    # Index -> label comes straight from enumerate; inverting it afterwards
    # keeps the work to a single pass over the input.
    index_to_label = dict(enumerate(unique_values))
    label_to_index = {name: pos for pos, name in index_to_label.items()}
    return label_to_index, index_to_label

# Module-level label <-> index tables built from the distinct individual_id
# values of train_df.
label_encoder_ind_id, label_decoder_ind_id = get_label_encoder_decoder(train_df['individual_id'].unique())

# torch dataloader
class DolphinWhaleDatasetNetCDF(Dataset):
    """Dataset that serves images from a NetCDF file through pncpy.

    The file ``<root_dir>/train_images.nc`` (or ``test_images.nc`` when
    ``is_train`` is false) is opened read-only on construction and images are
    read from its ``images`` variable by row index. Row ``i`` is paired with
    row ``i`` of ``data_frame``.

    The file handle stays open for the lifetime of the dataset. Call
    ``close()`` or use the dataset as a context manager to release it.

    Args:
        root_dir: Directory containing the ``*_images.nc`` file.
        data_frame: DataFrame with an ``image`` column and, for training,
            an ``individual_id`` column.
        is_train: Selects the train file and labelled samples when true, the
            test file and name-tagged samples otherwise.
        transforms: Optional callable applied to each image array.
        label_encoder: Optional mapping from ``individual_id`` to an integer
            class index. When omitted, the module-level
            ``label_encoder_ind_id`` is looked up at fetch time.
    """

    def __init__(self, root_dir, data_frame, is_train=True, transforms=None,
                 label_encoder=None):
        self.image_names = data_frame['image'].values
        self.is_train = is_train
        if is_train:
            self.labels = data_frame['individual_id'].values
        else:
            # Test samples carry no label; keep a placeholder of equal length.
            self.labels = [-1] * len(self.image_names)

        self.transforms = transforms
        self.label_encoder = label_encoder
        print ('=> Reading NetCDF File...')
        nc_path = os.path.join(root_dir,'{}_images.nc'.format('train' if is_train else 'test'))
        self.nc = pncpy.File(nc_path,'r')
        print('=> Dataset created, image nc file is : {}'.format(nc_path))

    def __len__(self):
        return len(self.image_names)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying NetCDF file; later calls are no-ops."""
        nc = getattr(self, 'nc', None)
        if nc is not None:
            nc.close()
            self.nc = None

    def _load_image(self, idx):
        """Read image ``idx`` from the file and apply the transform, if any."""
        image = np.array(self.nc.variables['images'][idx])
        if self.transforms is not None:
            image = self.transforms(image)
        return image

    def fetch_item_train(self, idx):
        """Return ``{'image': ..., 'label': ...}`` for training sample ``idx``."""
        image = self._load_image(idx)

        # Fall back to the module-level encoder when none was supplied.
        encoder = self.label_encoder
        if encoder is None:
            encoder = label_encoder_ind_id

        return {'image': image,
                'label': encoder[self.labels[idx]]}

    def fetch_item_test(self, idx):
        """Return ``{'image': ..., 'image_name': ...}`` for test sample ``idx``."""
        return {'image': self._load_image(idx),
                'image_name': self.image_names[idx]}

    def __getitem__(self, index):
        if self.is_train:
            return self.fetch_item_train(index)
        return self.fetch_item_test(index)
    

    
# Training and Validation Dataset
dataset = DolphinWhaleDatasetNetCDF(
    DATASET_ROOT,
    train_df,
    is_train=True,
    transforms=train_transforms,
)

# Batches of four samples, loaded by four worker processes.
train_loader = DataLoader(dataset=dataset, batch_size=4, num_workers=4)
print(len(train_loader))

# Walk the whole loader, printing each batch's tensor shape and first image.
for batch_idx, sample_ in enumerate(train_loader):
    inputs = sample_['image']
    print(inputs.shape)
    print(inputs[0])

