# Splitting the large image into many smaller ones

In [1]:
import os
import numpy as np
import imageio

In [2]:
# As FFCV needs each instance to fit on a single page, the large image is
# loaded here so that it can be cut into smaller tiles further below.
tif_path = "data/train_area.tif"
tif_image = imageio.imread(tif_path)
# np.asarray reuses the decoded pixel buffer, whereas np.array would hold a
# second full copy of this very large image in memory.
data = np.asarray(tif_image)
print(data.shape)

  tif_image = imageio.imread(tif_path)


(57832, 94303, 3)


In [3]:
# Number of whole 1200 x 2000 tiles that fit along each axis of the image
height, width = 57832 // 1200, 94303 // 2000
print(height, width, height * width)
# 2256 tiles of shape (1200, 2000, 3) will be cut out, one file per tile
# Later each batch contains 48 images

48 47 2256


In [5]:
folderpath = "data/train_small_npy"
height = 1200
width = 2000

# np.save does not create missing directories, so make sure the target exists
os.makedirs(folderpath, exist_ok=True)

# Only whole tiles are written: rows/columns left over at the bottom/right
# edge of the image (less than one tile) are dropped by the floor division.
height_parts = data.shape[0] // height
width_parts = data.shape[1] // width

for i in range(height_parts):
    for j in range(width_parts):
        # Tile (i, j) in the original (rows, columns, channels) layout
        part = data[i * height: (i + 1) * height, j * width: (j + 1) * width]

        # Convert the shape of each ndarray to (3, 1200, 2000) for processing in PyTorch
        part = np.transpose(part, (2, 0, 1))

        # File name encodes the tile position: part_<row index>_<column index>.npy
        filepath = os.path.join(folderpath, f"part_{i}_{j}.npy")
        np.save(filepath, part)
    print(f"Height part {i+1}/{height_parts} done.")

Height part 1/48 done.
Height part 2/48 done.
Height part 3/48 done.
Height part 4/48 done.
Height part 5/48 done.
Height part 6/48 done.
Height part 7/48 done.
Height part 8/48 done.
Height part 9/48 done.
Height part 10/48 done.
Height part 11/48 done.
Height part 12/48 done.
Height part 13/48 done.
Height part 14/48 done.
Height part 15/48 done.
Height part 16/48 done.
Height part 17/48 done.
Height part 18/48 done.
Height part 19/48 done.
Height part 20/48 done.
Height part 21/48 done.
Height part 22/48 done.
Height part 23/48 done.
Height part 24/48 done.
Height part 25/48 done.
Height part 26/48 done.
Height part 27/48 done.
Height part 28/48 done.
Height part 29/48 done.
Height part 30/48 done.
Height part 31/48 done.
Height part 32/48 done.
Height part 33/48 done.
Height part 34/48 done.
Height part 35/48 done.
Height part 36/48 done.
Height part 37/48 done.
Height part 38/48 done.
Height part 39/48 done.
Height part 40/48 done.
Height part 41/48 done.
Height part 42/48 done.
H

In [6]:
# Counts every entry in the tile folder. The label dictionary (labels.npy) is
# already included, hence the count is one higher than the number of tiles.
print(len(os.listdir(folderpath)))

2257


# Add dictionary for labels

In [7]:
file_list = os.listdir(folderpath)
# seed=None reseeds the global RNG non-deterministically, so the dummy labels
# differ from run to run.
np.random.seed(seed=None)

# One dummy label drawn uniformly from [0, 100) per tile file; any other entry
# of the folder (e.g. an existing labels.npy) is skipped.
labels_dict = {
    filename: np.random.uniform(0, 100)
    for filename in file_list
    if filename.startswith("part_")
}

# The dict is stored as a pickled object inside labels.npy
np.save(os.path.join(folderpath, "labels.npy"), labels_dict)

In [8]:
# Sanity check: one label per tile is expected (48 * 47 = 2256)
print(len(labels_dict))

2256


# Creating the .beton file for FFCV

In [10]:
# Select Numba's OpenMP threading layer before ffcv is imported.
# The variable is set through os.environ because `!export ...` runs in a
# throw-away subshell and never changes the environment of the kernel process.
import os
os.environ["NUMBA_THREADING_LAYER"] = "omp"
import ffcv



In [11]:
from ffcv.writer import DatasetWriter
from ffcv.fields import NDArrayField, FloatField
import numpy as np
from torch.utils.data import Dataset
import os

In [12]:
# Identical definition as in PyTorch
class CustomDataset(Dataset):
    """Map-style dataset over the ``part_*`` .npy files of one directory.

    Indexing yields ``(image, label)``: the array stored in the selected file
    and the entry stored under that file name in the directory's ``labels.npy``.
    """

    def __init__(self, data_dir):
        """Collect the ``part_*`` file names of ``data_dir`` and load its labels."""
        labels_path = os.path.join(data_dir, "labels.npy")
        self.data_dir = data_dir
        # Order follows os.listdir, i.e. whatever order the file system reports
        self.file_list = list(
            filter(lambda name: name.startswith("part_"), os.listdir(data_dir))
        )
        # labels.npy wraps a pickled object in a 0-d array; .item() unwraps it.
        # NOTE: allow_pickle=True unpickles the file, so only load trusted data.
        self.labels = np.load(labels_path, allow_pickle=True).item()

    def __len__(self):
        """Return the number of ``part_*`` files found at construction time."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load the array of the ``idx``-th file and look up its label by file name."""
        name = self.file_list[idx]
        image = np.load(os.path.join(self.data_dir, name))
        return image, self.labels[name]

In [13]:
# Folder that holds the part_*.npy tiles together with labels.npy
data_dir = "data/train_small_npy"

dataset = CustomDataset(data_dir)

In [14]:
# Field layout of one sample in the .beton file: a fixed-shape uint8 image
# of shape (3, 1200, 2000) plus a single float label.
beton_fields = {
    "image": NDArrayField(shape=(3, 1200, 2000), dtype=np.dtype("uint8")),
    "label": FloatField(),
}
writer = DatasetWriter("data/train_data.beton", beton_fields, num_workers=8)

In [15]:
# Read every (image, label) sample from the dataset and write it to the .beton file
writer.from_indexed_dataset(dataset)

100%|██████████| 2256/2256 [00:14<00:00, 159.46it/s]


# Copy the data to SSD storage

In [None]:
# These shell commands are meant to be run manually in a terminal; as a plain
# string literal they are not executed when this cell is evaluated.
'''Command line commands
cp -r saved_data/data/train_small_npy ssd
cp saved_data/data/train_data.beton ssd
'''

# Generate a 200 GB dataset that exceeds the available RAM

In [33]:
import os
# Counts all entries of the tile folder (the part_*.npy tiles plus labels.npy)
print(len(os.listdir("data/train_small_npy")))

2257


In [34]:
# Sanity check of the small tiles before they are expanded into the large dataset
data_dir = "data/train_small_npy"
# Keep only the tile files; labels.npy and anything else in the folder is ignored
file_list = list(filter(lambda f: f.startswith("part_"), os.listdir(data_dir)))
print(len(file_list))
# Load one tile to confirm the stored layout.
# Note: this rebinds `data`, which previously referred to the full image.
data = np.load('data/train_small_npy/part_45_12.npy')
print(data.shape)

2256
(3, 1200, 2000)


In [35]:
# Format: part_height_width_left/right_copy
# with height 0-47, width 0-46, left/right 0-1, copy 0-12
data_dir = "data/train_small_npy"
new_data_dir = "data/train_200GB"
# np.save does not create missing directories, so make sure the target exists
os.makedirs(new_data_dir, exist_ok=True)

counter = 0
for counter, file in enumerate(file_list, start=1):
    filename = file[:-4]  # strip the ".npy" extension
    image = np.load(os.path.join(data_dir, file))
    # Split every tile along its last axis into a left and a right half
    left_half = image[:, :, :1000]
    right_half = image[:, :, 1000:]
    # Each half is stored 13 times to inflate the dataset size
    for copy in range(13):
        left_name = f"{filename}_0_{copy}.npy"
        right_name = f"{filename}_1_{copy}.npy"
        np.save(os.path.join(new_data_dir, left_name), left_half)
        # The "_1_" files must contain the right half (not the left half again)
        np.save(os.path.join(new_data_dir, right_name), right_half)
    # 2256 tiles / 12 progress steps = 188 tiles per step
    if counter % 188 == 0:
        print(f"Progress: {counter // 188}/12 Done.")

Progress: 1/12 Done.
Progress: 2/12 Done.
Progress: 3/12 Done.
Progress: 4/12 Done.
Progress: 5/12 Done.
Progress: 6/12 Done.
Progress: 7/12 Done.
Progress: 8/12 Done.
Progress: 9/12 Done.
Progress: 10/12 Done.
Progress: 11/12 Done.
Progress: 12/12 Done.


In [36]:
# Addition of labels.npy
new_data_dir = "data/train_200GB"
# os.listdir returns entries in arbitrary order. Sorting fixes the iteration
# order so that the seeded RNG assigns the same label to the same file on
# every run and on every file system.
file_list = sorted(f for f in os.listdir(new_data_dir) if f.startswith("part_"))
np.random.seed(seed=1)

labels_dict = {}

# file_list only contains "part_" files, so no further filtering is needed
for filename in file_list:
    label = np.random.uniform(0, 100)
    labels_dict[filename] = label

# Expected entries: 2256 * 2 * 13 = 58656
print("Entries in dict:", len(labels_dict))
# The dict is stored as a pickled object inside labels.npy
np.save(os.path.join(new_data_dir, "labels.npy"), labels_dict)

Entries in dict: 58656


# Conversion of 200GB dataset to .beton

In [37]:
import os
import time

# Select Numba's OpenMP threading layer before ffcv is imported.
# The variable is set through os.environ because `!export ...` runs in a
# throw-away subshell and never changes the environment of the kernel process.
os.environ["NUMBA_THREADING_LAYER"] = "omp"

import numpy as np
from torch.utils.data import Dataset

import ffcv
from ffcv.writer import DatasetWriter
from ffcv.fields import NDArrayField, FloatField



In [38]:
class CustomDataset(Dataset):
    """Map-style dataset over the ``part_*`` .npy files of one directory.

    Indexing yields ``(image, label)``: the array stored in the selected file
    and the entry stored under that file name in the directory's ``labels.npy``.
    """

    def __init__(self, data_dir):
        """Collect the ``part_*`` file names of ``data_dir`` and load its labels."""
        labels_path = os.path.join(data_dir, "labels.npy")
        self.data_dir = data_dir
        # Order follows os.listdir, i.e. whatever order the file system reports
        self.file_list = list(
            filter(lambda name: name.startswith("part_"), os.listdir(data_dir))
        )
        # labels.npy wraps a pickled object in a 0-d array; .item() unwraps it.
        # NOTE: allow_pickle=True unpickles the file, so only load trusted data.
        self.labels = np.load(labels_path, allow_pickle=True).item()

    def __len__(self):
        """Return the number of ``part_*`` files found at construction time."""
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load the array of the ``idx``-th file and look up its label by file name."""
        name = self.file_list[idx]
        image = np.load(os.path.join(self.data_dir, name))
        return image, self.labels[name]

In [39]:
# Folder that holds the half-tile copies (part_*.npy) together with labels.npy
new_data_dir = "data/train_200GB"

dataset = CustomDataset(new_data_dir)

In [40]:
# Field layout of one sample in the .beton file: a fixed-shape uint8 image
# of shape (3, 1200, 1000) (one half tile) plus a single float label.
beton_fields = {
    "image": NDArrayField(shape=(3, 1200, 1000), dtype=np.dtype("uint8")),
    "label": FloatField(),
}
writer = DatasetWriter("data/train_200GB.beton", beton_fields, num_workers=8)

In [41]:
# Measure the wall-clock time (in seconds) needed to write the whole dataset
start_time = time.time()
writer.from_indexed_dataset(dataset)
end_time = time.time()
elapsed = end_time - start_time
print(f"Time taken: {elapsed}")

100%|██████████| 58656/58656 [22:23<00:00, 43.66it/s]


Time taken: 1343.9033253192902
