In [1]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import os
from sklearn.preprocessing import StandardScaler
from multiprocessing import cpu_count

In [2]:
# Load data into memory
# These datasets come pre-packaged with Colab, so it's best to run this lesson there.
# The paths are relative, so they resolve against the notebook's working directory.
housing = pd.read_csv('sample_data/california_housing_train.csv')
housing_test = pd.read_csv('sample_data/california_housing_test.csv')

In [3]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [4]:
housing.agg(['mean','std'])

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387


In [5]:
# Features are every column except the target; targets are the target column
# pulled out as a NumPy array. The same split is applied to both frames.
x_train = housing.drop(columns='median_house_value')
y_train = housing['median_house_value'].values

x_valid = housing_test.drop(columns='median_house_value')
y_valid = housing_test['median_house_value'].values

In [6]:
# Fit the scaler on the training features only, then reuse those fitted
# statistics to transform the validation features.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)

In [7]:
# Sanity-check the scaling: every column mean should be close to 0
# (the scalar broadcasts against the per-column vector)
assert np.allclose(x_train_scaled.mean(axis=0), 0.0)
# ...and every column standard deviation should be close to 1
assert np.allclose(x_train_scaled.std(axis=0), 1.0)

In [8]:
class HousingDataset(Dataset):
    """Map-style dataset that pairs ``X[idx]`` with ``y[idx]``."""

    def __init__(self, X, y):
        """Keep references to the features ``X`` and targets ``y`` exactly as given."""
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        """Return the length of the shorter of ``X`` and ``y``."""
        # What are some other ways we could do this?
        n_x, n_y = len(self.X), len(self.y)
        return n_x if n_x <= n_y else n_y

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

In [9]:
train_ds = HousingDataset(x_train_scaled, y_train)
valid_ds = HousingDataset(x_valid_scaled, y_valid)

In [10]:
# What is the number of items in each dataset?
len(train_ds), len(valid_ds)

(17000, 3000)

In [11]:
# What is the x and y at a given index?
idx = 8

In [12]:
x, y = train_ds[idx]
x, y

(array([ 2.47972161, -0.94289358,  0.42987474,  0.98415173,  1.50796844,
         1.48492555,  1.44282004, -0.89375701]),
 np.float64(58400.0))

In [15]:
# Choose an index in your dataset
idx = 12000
# Fetch an item at that index from train_ds
x, y = train_ds[idx]

# Print values (the messages follow `idx`, so changing it above stays consistent)
print(f"Training sample at idx={idx}:")
print("x (standardized features):", x)
print("y (median_house_value):", y)

# Verify x matches the same row of x_train_scaled
assert np.allclose(x, x_train_scaled[idx]), f"x does not match x_train_scaled[{idx}]"

# Verify y matches the same entry of y_train
assert np.allclose(y, y_train[idx]), f"y does not match y_train[{idx}]"

# Confirm
print(f"Assertions passed: x and y match x_train_scaled[{idx}] and y_train[{idx}]")

Training sample at idx=12000:
x (standardized features): [-0.91161787  1.36845874 -0.28517331 -0.21683018 -0.25721033 -0.32807775
 -0.20603375 -0.1864567 ]
y (median_house_value): 126200.0
Assertions passed: x and y match x_train_scaled[12000] and y_train[12000]


In [16]:
# Number of items per batch; reused by every DataLoader below.
BATCH_SIZE = 64
# Number of loader workers: the CPU count reported by multiprocessing.cpu_count().
N_WORKERS = cpu_count()
print(f"""
In this example, each batch will contain {BATCH_SIZE} items.
We will use {N_WORKERS} workers to load data more efficiently.
""")


In this example, each batch will contain 64 items.
We will use 2 workers to load data more efficiently.



In [17]:
# Batch the training dataset: BATCH_SIZE items per batch, loaded by N_WORKERS workers.
train_dl = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True, # We generally want to shuffle the train dataloader
    num_workers=N_WORKERS
)

In [None]:
# Batch the validation dataset with the same batch size and worker count as train_dl.
valid_dl = DataLoader(
    valid_ds,
    batch_size=BATCH_SIZE,
    shuffle=False, # Keep validation order fixed so evaluation is repeatable
    num_workers=N_WORKERS
)

In [None]:
# Pull one batch of data
for batch in train_dl:
    break

In [None]:
# What's the type? The length?
type(batch), len(batch)

In [None]:
# This looks like our X
batch[0].shape

In [None]:
# This looks like our y
batch[1].shape

In [None]:
for x_batch, y_batch in train_dl:
    break

In [None]:
x_batch.shape, y_batch.shape

In [None]:
x_batch

In [None]:
y_batch

In [None]:
# We're just using fastai for the datasets for now.
# We'll learn how to use it for modeling later on.
!pip install -Uqq fastai

In [None]:
# Download and extract the data
from fastai.data.all import URLs, untar_data
from fastcore.basics import Path
from PIL import Image
import numpy as np

path = untar_data(URLs.CIFAR)

In [None]:
# what files or directories are in the path variable?

In [None]:
# what is contained in path/'train'?

In [None]:
# find the paths for 10 images of airplanes from the train dataset.

In [None]:
def list_png_files(path):
    """Recursively collect every ``.png`` file under ``path``.

    Args:
        path: A ``pathlib.Path``-style directory to search.

    Returns:
        A list of matching paths in sorted order. Raw glob order depends on
        the filesystem, so sorting keeps slices and integer indexing into the
        result reproducible across runs and machines.
    """
    return sorted(path.glob('**/*.png'))

In [None]:
sample_files = list_png_files(path/'train')[:10]
sample_files

In [None]:
def label_from_path_parent(path:Path) -> str:
    """Return the name of the directory that directly contains ``path``.

    For files laid out as ``.../<label>/<file>``, that directory name is the label.
    """
    containing_dir = path.parent
    return containing_dir.name

In [None]:
# Sanity check for label_from_path_parent
assert label_from_path_parent(Path('/root/.fastai/data/cifar10/train/horse/42500_horse.png')) == 'horse'

In [None]:
def load_image_and_label(path):
    """Open the image at ``path`` and pair it with the label from its parent directory.

    Returns:
        An ``(img, label)`` tuple: the object returned by ``Image.open(path)``
        and the string returned by ``label_from_path_parent(path)``.
    """
    # The tuple is evaluated left to right, so the image is opened before the label lookup.
    return Image.open(path), label_from_path_parent(path)

In [None]:
img, label = load_image_and_label(sample_files[0])
print(label)
img

In [None]:
# Let's resize this image and inspect what it looks like
img.resize((224,224))

In [None]:
class CifarDataset(Dataset):
    """Map-style dataset over the PNG images found under a directory.

    Each item is the ``(image, label)`` pair returned by
    ``load_image_and_label`` for the file at that position.
    """

    def __init__(self, path):
        """Record ``path`` and index every PNG file beneath it."""
        super().__init__()
        self.path = path
        # Only the file paths are collected here; images are opened on access.
        self.files = list_png_files(path)

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load and return the ``(image, label)`` pair for the file at ``idx``."""
        return load_image_and_label(self.files[idx])

In [None]:
train_cifar = CifarDataset(path/'train')

In [None]:
img, label = train_cifar[8000]
print(label)
img.resize((224, 224))

In [None]:
def img_to_scaled_tensor(img, channels_first=True):
    """Convert an image to a float32 tensor with every value divided by 255.

    Args:
        img: Anything ``np.array`` accepts. The channels-first path permutes
            three axes, so it assumes a 3-axis input (presumably
            height x width x channels; confirm for non-RGB images).
        channels_first: When True, reorder the axes as ``(2, 0, 1)``;
            when False, keep the original axis order.

    Returns:
        The scaled ``torch`` tensor.
    """
    scaled = np.array(img) / 255
    tensor = torch.tensor(scaled).float()
    return tensor.permute(2, 0, 1) if channels_first else tensor

In [None]:
img_t = img_to_scaled_tensor(img)
img_t.shape

In [None]:
# Map each entry name under train/ to an integer class index.
# The names are sorted first so the mapping is the same on every run and
# machine; directory-listing order is not guaranteed.
classes = {name: i for i, name in enumerate(sorted(d.name for d in (path/'train').ls()))}
classes

In [None]:
def class_to_idx(class_name):
    """Look up the integer index stored for ``class_name`` in ``classes``.

    Returns ``None`` when ``class_name`` is not a key of ``classes``.
    """
    idx = classes.get(class_name)
    return idx

In [None]:
def collate_fn(batch):
    """Combine ``(image, class_name)`` items into one pair of batch tensors.

    Args:
        batch: Items in the format ``((x1, y1), (x2, y2), ..., (xn, yn))``.

    Returns:
        ``(xs, ys)``: the images converted by ``img_to_scaled_tensor`` and
        joined along a new leading axis, and a tensor of the class indices
        returned by ``class_to_idx``.
    """
    # Transpose the (x, y) pairs into one tuple of images and one of labels.
    images, labels = zip(*batch)
    # Stack the per-image tensors on a new first axis. This is the same as
    # unsqueeze(0) on each tensor followed by torch.cat along dim 0.
    xs = torch.stack([img_to_scaled_tensor(image) for image in images], dim=0)
    # One integer class index per item.
    ys = torch.tensor([class_to_idx(label) for label in labels])
    return xs, ys

In [None]:
# Test the collate function
items = (train_cifar[0], train_cifar[1])
items

In [None]:
x_b, y_b = collate_fn(items)

In [None]:
x_b.shape, y_b.shape

In [None]:
# Batch the CIFAR training dataset. The items are (image, label-string) pairs,
# so collate_fn is passed to turn each list of items into (xs, ys) tensors.
train_cifar_dl = DataLoader(
    train_cifar,
    batch_size=BATCH_SIZE,
    num_workers=N_WORKERS,
    shuffle=True,
    collate_fn=collate_fn
)

In [None]:
for x_b, y_b in train_cifar_dl:
    break

In [None]:
x_b.shape

In [None]:
y_b.shape