# Create a Custom Data Pipeline

Objectives:
- Create a Custom Data Pipeline using Pytorch
- Use PyTorch builtin layers to create a Deep Learning Model
- Python training techniques: Checkpointing, Training, and Evaluation

In [83]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import pandas as pd
import os
import torch.nn as nn

- Data Preparation stage (** TODAY's Focus)
- Model Building stage
- Training stage
- Evaluation stage (Additional)
- Deployment stage (Additional)

In [84]:
torch.manual_seed(42)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [85]:
data = pd.read_csv('../Datasets/digit_train.csv')
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [86]:
""" 28 * 28 = 784

[1 2 3
4 5 6
7 8 9]

flatten => 1 2 3 4 5 6 7 8 9
"""

idx = 0

pixels = data.iloc[idx].values[1:].astype('float32')
label = int(data.iloc[idx].values[0])

print(pixels.shape)
print(label)

(784,)
1


In [87]:
print(type(pixels))
print(type(label))

<class 'numpy.ndarray'>
<class 'int'>


In [88]:
pixels = torch.tensor(pixels)
label = torch.tensor(label)

print(pixels[210:230])
print(label)

tensor([  0.,   0.,   0.,  80., 247., 253., 208.,  13.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])
tensor(1)


In [89]:
pixels = pixels.reshape(-1, 28, 28) / 255.0
print(pixels.shape)

torch.Size([1, 28, 28])


## Data Processing
- pixels: standardization

RGB/Gray scale/Binary <br>

A channels is a representation length of a pixel<br>

If a Image format like RGB image, takes 3 values to make a pixel, then we call it has 3 channels. <br>

Pixel at (0, 0) => [253, 248, 147]<br>

Image representation: C x W x H

In [90]:
pixels_transformation = transforms.Compose([
    transforms.Normalize(
        mean=torch.tensor([0.1307]),
        std=torch.tensor([0.3081])
    ),
])

In [91]:
pixels = pixels_transformation(pixels)

print(pixels[0, 25, 24])
print(label)

tensor(-0.4242)
tensor(1)


# Create a custom Dataset

In [92]:
class DigitDataset(Dataset):
    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path)
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    """
    Output shape of __getitem__: B x C x W x H (B: batch size, C: channels, W: width, H: height)
    Initially, pixels 784 => 28 x 28 => 1 (C) x 28 (W) x 28 (H)
    """
    def __getitem__(self, idx):
        pixels = data.iloc[idx].values[1:].astype('float32')
        label = int(data.iloc[idx].values[0])
        
        pixels = torch.tensor(pixels)
        label = torch.tensor(label)
        
        pixels = pixels.reshape(28, 28).unsqueeze(0) / 255.0
        if self.transform:
            pixels = pixels_transformation(pixels)
            
        return pixels, label

In [93]:
dataset = DigitDataset('../Datasets/digit_train.csv')

# Create Data Loader

In [94]:
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset=dataset, 
    lengths=[train_size, val_size, test_size]
)

In [95]:
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False, 
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

In [96]:
for pixel_batch, label_batch in train_loader:
    print(pixel_batch.shape)
    print(label_batch.shape)
    
    print(label_batch)
    break

torch.Size([32, 1, 28, 28])
torch.Size([32])
tensor([6, 0, 0, 1, 7, 7, 1, 4, 0, 2, 1, 1, 5, 9, 2, 4, 3, 8, 6, 7, 2, 4, 1, 7,
        1, 7, 9, 5, 3, 2, 5, 0])


# Creating Model Using Pytorch

In [97]:
class DigitClassifier(nn.Module):
    def __init__(self):
        super(DigitClassifier, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        return x

In [98]:
def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    val_loss = 0
    accuracy = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item()
            
            _, predicted = torch.max(output.data, 1)
            accuracy += (predicted == target).sum().item()
        
        val_loss /= len(data_loader.dataset)
        accuracy /= len(data_loader.dataset)
    return val_loss, accuracy

In [99]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, checkpoint_path):
    model.train()
    
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
        
    best_val_accuracy = 0
    
    for epoch in range(num_epochs):
        ## Calculate training loss
        train_loss = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()    
            train_loss += loss.item()
        train_loss /= len(train_loader.dataset) 
    
        # calculate validation loss
        val_loss, val_accuracy = evaluate_model(
            model=model,
            data_loader=val_loader,
            criterion=criterion,
            device=device
        )
        
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), checkpoint_path)
            print("Best parameter so far.")
        
        print(f"epoch [{epoch+1}/{num_epochs}], loss: {train_loss:.4f}", end=" ")
        print(f"val_loss: {val_loss:.4f}", end=" ")
        print(f"val_acc: {val_accuracy:.4f}")

In [100]:
model = DigitClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [101]:
checkpoint_path = os.path.join(
    os.getcwd(), "../Model/", "digit_Classify_model.pth"
)

if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path))
    print("Model loaded successfully from checkpoints.")

Model loaded successfully from checkpoints.


  model.load_state_dict(torch.load(checkpoint_path))


In [102]:
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
    device=device,
    checkpoint_path=checkpoint_path,
)

Best parameter so far.
epoch [1/10], loss: 0.0009 val_loss: 0.0030 val_acc: 0.9762
Best parameter so far.
epoch [2/10], loss: 0.0007 val_loss: 0.0029 val_acc: 0.9768
epoch [3/10], loss: 0.0006 val_loss: 0.0032 val_acc: 0.9744
epoch [4/10], loss: 0.0005 val_loss: 0.0040 val_acc: 0.9673
epoch [5/10], loss: 0.0005 val_loss: 0.0032 val_acc: 0.9743
Best parameter so far.
epoch [6/10], loss: 0.0004 val_loss: 0.0033 val_acc: 0.9771
epoch [7/10], loss: 0.0004 val_loss: 0.0036 val_acc: 0.9738
Best parameter so far.
epoch [8/10], loss: 0.0003 val_loss: 0.0032 val_acc: 0.9790
Best parameter so far.
epoch [9/10], loss: 0.0003 val_loss: 0.0031 val_acc: 0.9794
epoch [10/10], loss: 0.0004 val_loss: 0.0039 val_acc: 0.9760


# Evaluate Performance

In [103]:
test_loss, test_accuracy = evaluate_model(
    model=model,
    data_loader=test_loader,
    criterion=criterion,
    device=device
)

In [104]:
print(f"test_loss: {test_loss:0.4f}, test_acc: {test_accuracy:0.4f}")

test_loss: 0.0058, test_acc: 0.9686


# Bonus

In [105]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [106]:
X_train = np.array([data[0].numpy().flatten() for data in train_dataset])
y_train = np.array([data[1] for data in train_dataset])
X_test = np.array([data[0].numpy().flatten() for data in test_dataset])
y_test = np.array([data[1] for data in test_dataset])

In [107]:
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'DecisionTreeClassifier test accuracy: {accuracy:.2f}')

DecisionTreeClassifier test accuracy: 0.86
