In [4]:
"""Class 16. Data Processing Pipeline using PyTorch

Objectives:
- Create a custom data processing pipeline using torch
- Use torch builtin layers to create a deep feed forward neural network
- Training techniques: Checkpointing, training, evaluation using torch
"""

import torch
from torchvision import transforms
import pandas as pd
import os
import torch.nn as nn

In [25]:
"""
- Data Preparation stage (** TODAY's Focus)
- Model Building stage
- Training stage
- Evaluation stage (Additional)
- Deployment stage (Additional)
- Monitoring stage (Additional)
"""

ROOT_DIR = "E:\\PyCharmProjects\\pythonProject\\"
DATA_DIR = os.path.join(ROOT_DIR, "data")

In [26]:
""" The data and the model must be onto the same device (either cpu or cuda)
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [27]:
dataset_path = os.path.join(DATA_DIR, "digit_train.csv")

In [3]:
data = pd.read_csv(dataset_path)
data.head()

NameError: name 'pd' is not defined

In [43]:
""" Split examples into feature and label """
idx = 3

pixels = data.iloc[idx].values[1:].astype('float32')
label = int(data.iloc[idx].values[0])

print(pixels.shape)
print(label)

(784,)
4


In [44]:
print(type(pixels))
print(type(label))

<class 'numpy.ndarray'>
<class 'int'>


In [45]:
"""Convert each features and label to tensor"""
pixels = torch.tensor(pixels)
label = torch.tensor(label)
print(pixels[210:230])
print(label)

tensor([  0.,   0.,   0.,  27., 254.,  63.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])
tensor(4)


In [52]:
"""
  [[[1 2],
   [3, 4]]] => (1, 2,2)
"""
pixels.reshape(28, 28).unsqueeze(0).shape

torch.Size([1, 28, 28])

In [32]:
""" 
Note 1: Torch process the data batchwise.
So, we need to add a batch dimension (if not available)
Example: pixels shape = (784,) => (B, 784)
         labels shape = (1,) => (B, 1)
"""

import numpy as np
my_np_array = np.arange(12)
my_np_array = my_np_array.reshape(-1, 12)
print(my_np_array.shape)

(1, 12)


In [33]:
pixels = pixels.reshape(-1, 28, 28)
print(pixels.shape)
print(pixels[0, 17, 12])

torch.Size([1, 28, 28])
tensor(245.)


In [36]:
pixels_transformation = transforms.Compose([
    transforms.Normalize(
        mean=torch.tensor([0.1307]), 
        std=torch.tensor([0.3081])),
])
pixels = pixels_transformation(pixels)

### Create a Dataset Class

In [41]:
from torch.utils.data import Dataset

class DigitDataset(Dataset):
    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path)
        print(data.head())
        self.transform = transform
        
    def __len__(self):
        """Returns the total number of examples in the dataset"""
        return len(self.data)
    
    def __getitem__(self, idx):
        """ Focus to process a single example in the dataset """
        pixels = self.data.iloc[idx].values[1:].astype('float32')
        label = int(self.data.iloc[idx].values[0])

        pixels = torch.tensor(pixels)
        label = torch.tensor(label)

        pixels = pixels.reshape(28, 28).unsqueeze(0) / 255.0
        if self.transform:
            pixels = pixels_transformation(pixels)

        return pixels, label

In [42]:
""" Concept of Dataset and Dataloader class
"""
dataset = DigitDataset(
    file_path=dataset_path, 
    transform=pixels_transformation
)

   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \
0       0  ...         0         0         0         0         0         0   
1       0  ...         0         0         0         0         0         0   
2       0  ...         0         0         0         0         0         0   
3       0  ...         0         0         0         0         0         0   
4       0  ...         0         0         0         0         0         0   

   pixel780  pixel781  pixel782  pixel783  
0         0         0         

In [1]:
from torch.utils.data import random_split

torch.manual_seed(42)

train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset=dataset, 
    lengths=[train_size, val_size, test_size],

)

Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe


NameError: name 'torch' is not defined

In [54]:
from torch.utils.data import DataLoader

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False, 
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

In [58]:
"""
1 is the chanel dimension i.e. grayscale no of channel is 1, RGB it is 3
pixels (B, C, H, W)
torch.Size([32, 1, 28, 28])
"""
for pixel_batch, label_batch in train_loader:
    print(pixel_batch.shape)
    print(label_batch.shape)
    print(label_batch)
    break

torch.Size([32, 1, 28, 28])
torch.Size([32])
tensor([7, 4, 2, 0, 0, 4, 6, 3, 2, 3, 2, 9, 7, 5, 0, 2, 0, 6, 4, 6, 7, 3, 3, 4,
        1, 9, 2, 3, 9, 0, 6, 9])


## Model

In [60]:
class DigitClassifier(nn.Module):
    def __init__(self):
        super(DigitClassifier, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        return x

In [61]:
def evaluate_model(model, data_loader, criterion, device):
    model.eval()
    val_loss = 0
    accuracy = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item()
            
            _, predicted = torch.max(output.data, 1)
            accuracy += (predicted == target).sum().item()
        
        val_loss /= len(data_loader.dataset)
        accuracy /= len(data_loader.dataset)
    return val_loss, accuracy

In [71]:
def train_model(
        model, 
        train_loader, val_loader, 
        criterion, optimizer, 
        num_epochs, 
        device,
        checkpoint_path
):
    model.train()
    
    best_val_accuracy = 0
    patience = 3
    
    # Checkpoints
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    
    for epoch in range(num_epochs):
        train_loss = 0
        for batch_idx, (pixel_batch, label_batch) in enumerate(train_loader):
            pixel_batch, label_batch = pixel_batch.to(device), label_batch.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label_batch)
            loss.backward()
            optimizer.step()    
            train_loss += loss.item()
        train_loss /= len(train_loader.dataset) 
        val_loss, val_accuracy = evaluate_model(
            model=model,
            data_loader=val_loader,
            criterion=criterion,
            device=device
        )
        
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), checkpoint_path)
            print("Best parameter so far.")
        else:
            """ early stopping """
            patience = patience - 1
        
        print(f"epoch [{epoch+1}/{num_epochs}], loss: {train_loss:.4f}", end=" ")
        print(f"val_loss: {val_loss:.4f}", end=" ")
        print(f"val_acc: {val_accuracy:.4f}")
        
        if patience == 0:
            """ Callback: Early stopping """
            print(f"Model performance is not improving. Exiting the training.")
            break

In [63]:
model = DigitClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [64]:
checkpoint_path = os.path.join(
    os.getcwd(), "checkpoints", "best_model.pth"
)

In [65]:
if os.path.exists(checkpoint_path):
    model.load_state_dict(torch.load(checkpoint_path))
    print("Model loaded successfully from checkpoints.")

In [66]:
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=100,
    device=device,
    checkpoint_path=checkpoint_path,
)

Best parameter so far.
epoch [1/100], loss: 0.0102 val_loss: 0.0052 val_acc: 0.9500
Best parameter so far.
epoch [2/100], loss: 0.0045 val_loss: 0.0040 val_acc: 0.9603
Best parameter so far.
epoch [3/100], loss: 0.0032 val_loss: 0.0031 val_acc: 0.9695
epoch [4/100], loss: 0.0024 val_loss: 0.0034 val_acc: 0.9659
Model performance is not improving. Exiting the training.


# Evaluate Performance

In [68]:
model.load_state_dict(torch.load(checkpoint_path))

val_loss, val_acc = evaluate_model(
    model=model,
    data_loader=val_loader,
    criterion=criterion,
    device=device
)

print(f"test_loss: {val_loss:0.4f}, test_acc: {val_acc:0.4f}")

test_loss: 0.0031, test_acc: 0.9695


In [69]:
test_loss, test_accuracy = evaluate_model(
    model=model,
    data_loader=test_loader,
    criterion=criterion,
    device=device
)

print(f"test_loss: {test_loss:0.4f}, test_acc: {test_accuracy:0.4f}")

test_loss: 0.0042, test_acc: 0.9603


## Bonus

In [70]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

X_train = np.array([data[0].numpy().flatten() for data in train_dataset])
y_train = np.array([data[1] for data in train_dataset])
X_test = np.array([data[0].numpy().flatten() for data in test_dataset])
y_test = np.array([data[1] for data in test_dataset])

dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'DecisionTreeClassifier test accuracy: {accuracy:.2f}')

DecisionTreeClassifier test accuracy: 0.85
