# GalaxyZoo

After the data have been downloaded for the website, we will show you how to setup a data pipeline in tensorflow using tf.data


In [12]:

import glob
import pandas as pd
from torchvision import transforms
from torch.utils.data import DataLoader

from utils import trim_file_list, img_label, CustomDataset

In [3]:
# Path to data and labels
img_dir = "data/images/images_training_rev1"
label_path = "data/labels.csv"
val_split = 0.2

In [4]:
# Get labels for all images by ID
# set_index means we can use loc for the exact ID rather than order in file
# only interested in subset of columns
labels_df = pd.read_csv(label_path).set_index('GalaxyID')[['Class1.1','Class1.2','Class1.3']]

In [5]:
# get list of files
files = glob.glob(f'{img_dir}/*')
print(files[:5])

['data/images/images_training_rev1/555529.jpg', 'data/images/images_training_rev1/426194.jpg', 'data/images/images_training_rev1/510506.jpg', 'data/images/images_training_rev1/945904.jpg', 'data/images/images_training_rev1/898518.jpg']


In [6]:
# Trim files
files = trim_file_list(files, labels_df=labels_df)
print(files[:5])


['data/images/images_training_rev1/555529.jpg', 'data/images/images_training_rev1/426194.jpg', 'data/images/images_training_rev1/510506.jpg', 'data/images/images_training_rev1/945904.jpg', 'data/images/images_training_rev1/898518.jpg']


In [7]:
# Create tensorflow datset from list of IDs
labels = [list(img_label(f, labels_df=labels_df).values) for f in files]

In [8]:
labels

[[np.float64(0.156511), np.float64(0.773751), np.float64(0.069738)],
 [np.float64(0.423676), np.float64(0.576324), np.float64(0.0)],
 [np.float64(0.577396), np.float64(0.404096), np.float64(0.018507)],
 [np.float64(0.715874), np.float64(0.220661), np.float64(0.063465)],
 [np.float64(0.744352), np.float64(0.222931), np.float64(0.032717)],
 [np.float64(0.635343), np.float64(0.337528), np.float64(0.027129)],
 [np.float64(0.383609), np.float64(0.587939), np.float64(0.028452)],
 [np.float64(0.202796), np.float64(0.797204), np.float64(0.0)],
 [np.float64(0.521823), np.float64(0.459679), np.float64(0.018498)],
 [np.float64(0.021846), np.float64(0.978154), np.float64(0.0)],
 [np.float64(0.543123), np.float64(0.456877), np.float64(0.0)],
 [np.float64(0.812297), np.float64(0.102102), np.float64(0.085601)],
 [np.float64(0.900922), np.float64(0.048842), np.float64(0.050236)],
 [np.float64(0.914403), np.float64(0.068995), np.float64(0.016602)],
 [np.float64(0.149755), np.float64(0.850245), np.float

In [9]:
# Example usage
transform = transforms.Compose([
    transforms.Resize((64, 64)),  # Downsample
    transforms.ToTensor()
])
# Create PyTorch dataset from list of files, apply image loading and decoding
image_ds = CustomDataset(files, labels, transform=transform)

In [10]:
# Crete dataloader
dataloader = DataLoader(image_ds, batch_size=32, shuffle=True)

In [11]:
# Demo of accessing data, can use train_ds directly in model.fit
for x,y in dataloader:
    break
print(x.shape)

torch.Size([32, 3, 64, 64])


This is how to create a dataloader/dataset from the images.
Remember to split it in to `train` and `val` can be used when training and validating the model