# Investigating the size of the datasets

This is necessary in order to figure out whether the whole dataset fits in GPU memory.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys


# Put the parent of the current working directory on the import path so that
# the `from src...` imports in the next cell resolve.
# NOTE(review): this relies on the kernel's working directory being exactly one
# level below the directory that contains `src` -- confirm when launching the
# notebook from a different location.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Only append once: without this guard every re-run of the cell would add
# another duplicate entry to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
from src.data.sliced_cube_dataset import make_sliced_dataset
from src.data.whole_cube_dataset import make_whole_dataset

## Load entire dataset

In [3]:
# Build the train / test / validation splits of the sliced dataset.
# The settings are collected in one dict first so they can be read at a glance;
# they are passed to make_sliced_dataset unchanged as keyword arguments.
# The three returned objects expose a `.dataset` attribute, which the cells
# below use to reach individual samples.
sliced_kwargs = dict(
    train_test_val_split=(0.6, 0.2, 0.2),
    batch_size=32,
    num_workers=4,
    redshifts=1.0,
    total_seeds=np.arange(1000),  # seeds 0..999
    random_seed=42,
    prefetch_factor=2,
)
train_dataset, test_dataset, val_dataset = make_sliced_dataset(**sliced_kwargs)

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:50<00:00, 23.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:17<00:00, 23.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [00:17<00:00, 22.42it/s]


In [4]:
# Build the train / test / validation splits of the whole-cube dataset.
# The settings are collected in one dict and passed on unchanged as keyword
# arguments. Judging by the recorded progress bars, this call is slow
# (roughly 17 minutes in total for the three splits).
whole_cube_kwargs = dict(
    train_test_val_split=(0.6, 0.2, 0.2),
    batch_size=32,
    num_workers=4,
    redshifts=1.0,
    total_seeds=np.arange(1000),  # seeds 0..999
    random_seed=42,
    prefetch_factor=2,
)
wtrain_dataset, wtest_dataset, wval_dataset = make_whole_dataset(**whole_cube_kwargs)

Making training dataset: 60% ...


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1200/1200 [09:41<00:00,  2.06it/s]


Making testing dataset: 20% ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [03:53<00:00,  1.71it/s]


Making validation dataset: 20% ...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [04:06<00:00,  1.62it/s]


## For sliced data

### Number of GB per sample

In [5]:
# Unwrap the dataset object behind each of the three sliced splits
# (train -> ds, validation -> ds_val, test -> ds_test).
ds, ds_val, ds_test = (
    split.dataset for split in (train_dataset, val_dataset, test_dataset)
)
# Display the first training sample to see what a sample looks like.
ds[0]

{'image': tensor([[[ 0.1970,  0.2670,  0.2721,  ..., -0.3677, -0.2241, -0.0173],
          [ 0.2979,  0.3097,  0.2938,  ..., -0.1604, -0.0313,  0.1539],
          [ 0.2934,  0.2783,  0.2712,  ..., -0.0011,  0.0469,  0.1926],
          ...,
          [ 0.3091,  0.3403,  0.2597,  ..., -0.4060, -0.3275, -0.0361],
          [ 0.2242,  0.2846,  0.2315,  ..., -0.5227, -0.3930, -0.0998],
          [ 0.1562,  0.2483,  0.2395,  ..., -0.5058, -0.3734, -0.1292]]]),
 'label': tensor([0.])}

In [6]:
# Fetch the first training sample once and take both fields from that same
# object, instead of indexing the dataset a second time just to read the label.
first_sample = ds[0]
img = first_sample["image"]
lbl = first_sample["label"]

In [7]:
# Size in bytes of a single element of the image tensor (a factor in the per-sample size estimate).
img.element_size()

4

In [8]:
# Memory footprint of one sliced sample: element count times bytes per element,
# for the image and for the label, converted to decimal gigabytes (1 GB = 1e9 bytes).
image_bytes = img.nelement() * img.element_size()
label_bytes = lbl.nelement() * lbl.element_size()
GB_per_sample = (image_bytes + label_bytes) / 1e9  # in GB
GB_per_sample

0.000262148

In [9]:
# Sanity check: compute the per-sample footprint for a sample of the test split;
# it is expected to match the value obtained from the training split.
# The sample is fetched once and both fields are read from that same object,
# instead of indexing the dataset separately for the image and the label.
first_test_sample = ds_test[0]
img_test = first_test_sample["image"]
lbl_test = first_test_sample["label"]
GB_per_sample_test = (img_test.nelement() * img_test.element_size() + lbl_test.nelement() * lbl_test.element_size()) / 1e9  # in GB
GB_per_sample_test

0.000262148

### Number of samples

In [10]:
# Count the samples in each sliced split, then add the three counts together.
n_train_samples, n_test_samples, n_val_samples = len(ds), len(ds_test), len(ds_val)
total_samples = sum((n_train_samples, n_test_samples, n_val_samples))
total_samples

1536000

### Total size of dataset (for each redshift)

In [11]:
# Footprint of the complete sliced dataset (all three splits), in GB:
# number of samples times the size of one sample.
total_size = total_samples * GB_per_sample
print(f"Total size of the dataset: {total_size} GB")

Total size of the dataset: 402.659328 GB


## For whole data

### Number of GB per sample

In [12]:
# Unwrap the dataset object behind each of the three whole-cube splits
# (train -> wds, validation -> wds_val, test -> wds_test).
wds, wds_val, wds_test = (
    split.dataset for split in (wtrain_dataset, wval_dataset, wtest_dataset)
)
# Display the first training sample to see what a whole-cube sample looks like.
wds[0]

{'image': tensor([[[ 1.9703e-01,  2.6700e-01,  2.7213e-01,  ..., -3.6772e-01,
           -2.2405e-01, -1.7319e-02],
          [ 2.9788e-01,  3.0972e-01,  2.9384e-01,  ..., -1.6039e-01,
           -3.1312e-02,  1.5389e-01],
          [ 2.9336e-01,  2.7827e-01,  2.7116e-01,  ..., -1.1068e-03,
            4.6882e-02,  1.9257e-01],
          ...,
          [ 3.0912e-01,  3.4028e-01,  2.5971e-01,  ..., -4.0599e-01,
           -3.2754e-01, -3.6118e-02],
          [ 2.2416e-01,  2.8464e-01,  2.3154e-01,  ..., -5.2271e-01,
           -3.9304e-01, -9.9776e-02],
          [ 1.5616e-01,  2.4832e-01,  2.3954e-01,  ..., -5.0583e-01,
           -3.7336e-01, -1.2919e-01]],
 
         [[ 1.7088e-01,  2.1625e-01,  2.8168e-01,  ..., -3.1048e-01,
           -1.7747e-01, -5.4072e-03],
          [ 2.0266e-01,  2.1738e-01,  2.6250e-01,  ..., -9.2962e-02,
           -4.4450e-03,  1.0160e-01],
          [ 1.7509e-01,  1.4940e-01,  1.9126e-01,  ...,  1.4148e-01,
            1.3591e-01,  1.5712e-01],
          

In [13]:
# Fetch the first whole-cube training sample once and take both fields from
# that same object, instead of indexing the dataset a second time for the label.
first_whole_sample = wds[0]
wimg = first_whole_sample["image"]
wlbl = first_whole_sample["label"]

In [14]:
# Size in bytes of a single element of the whole-cube image tensor.
wimg.element_size()

4

In [15]:
# Total number of elements in one whole-cube image tensor.
wimg.nelement()

16777216

In [16]:
# Memory footprint of one whole-cube sample: element count times bytes per element,
# for the image and for the label, converted to decimal gigabytes (1 GB = 1e9 bytes).
wimage_bytes = wimg.nelement() * wimg.element_size()
wlabel_bytes = wlbl.nelement() * wlbl.element_size()
wGB_per_sample = (wimage_bytes + wlabel_bytes) / 1e9  # in GB
wGB_per_sample

0.067108868

### Number of samples

In [17]:
# Count the samples in each whole-cube split, then add the three counts together.
wn_train_samples, wn_test_samples, wn_val_samples = len(wds), len(wds_test), len(wds_val)
wtotal_samples = sum((wn_train_samples, wn_test_samples, wn_val_samples))
wtotal_samples

2000

### Total size of dataset (for each redshift)

In [18]:
# Footprint of the complete whole-cube dataset (all three splits), in GB:
# number of samples times the size of one sample.
wtotal_size = wtotal_samples * wGB_per_sample
print(f"Total size of the dataset: {wtotal_size} GB")

Total size of the dataset: 134.217736 GB
