# Testing the convNet model on real input data

We will test the model on the [Malaria Infected Cells](https://www.kaggle.com/iarunava/cell-images-for-detecting-malaria) dataset. The goal is to predict which cells are infected by malaria.

In [53]:
from src.convNet.model import convNet
from torchsummary import summary
from torchvision import datasets, transforms
import torch
import numpy as np
import logging
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)

# Ensure reproducibility
torch.manual_seed(767365)

<torch._C.Generator at 0x7fea00d29670>

We'll scale down these images quite a bit so they can run comfortably on CPU. 

In [54]:
H = W = 32
# Define transformations
transform = {
    'train': transforms.Compose([
        transforms.Resize([H,W]),
        transforms.RandomHorizontalFlip(),
        transforms.RandomAffine(
            degrees=(-15,15),
            translate=(0,.2),
            scale=(.8, 1.2),
            shear=0.1,
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5))
    ]),
    'test': transforms.Compose([
        transforms.Resize([H,W]),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,0.5,0.5), std=(0.5,0.5,0.5))
    ])
}

Next, we set up the imageFolder datasets. Note that these take different transforms. We don't want to apply shears & random flips to the test data.

In [55]:
train_data_folder = "/Users/jasperginn/PycharmProjects/Pneumonia/data/cell_images/train"
val_data_folder = "/Users/jasperginn/PycharmProjects/Pneumonia/data/cell_images/val"
# Set up data loaders
train_dataset = datasets.ImageFolder(
    root = train_data_folder,
    transform = transform["train"],
)
# Get classes
#print(train_dataset.class_to_idx)
#train_dataset.class_to_idx = {"Uninfected": 0, "Parasitized": 1}

# Validation data
val_dataset = datasets.ImageFolder(
    root = val_data_folder,
    transform = transform["test"]
)
#val_dataset.class_to_idx = {"Uninfected": 0, "Parasitized": 1}

We set up the dataloaders

In [56]:
train_data_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4
)

val_data_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4
)

Finally, we set up the model. Given that the last layer of the model does not perform a transformation, we have to use BCEWithLogitsLoss(). This combines the sigmoid function and loss function into one layer.

In [57]:
net = convNet(dropout=0.2)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001)

summary(net, (3, H, W))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 8, 32, 32]             608
         MaxPool2d-2            [-1, 8, 16, 16]               0
convolutional_block-3            [-1, 8, 16, 16]               0
            Conv2d-4           [-1, 16, 12, 12]           3,216
         MaxPool2d-5             [-1, 16, 6, 6]               0
convolutional_block-6             [-1, 16, 6, 6]               0
            Conv2d-7             [-1, 32, 4, 4]           4,640
         MaxPool2d-8             [-1, 32, 2, 2]               0
convolutional_block-9             [-1, 32, 2, 2]               0
          Flatten-10                  [-1, 128]               0
          Dropout-11                  [-1, 128]               0
           Linear-12                   [-1, 32]           4,128
           Linear-13                    [-1, 8]             264
           Linear-14                

In [58]:
for epoch in range(5):
    running_loss = 0.0
    acc = 0
    batches = 0
    for i, data in enumerate(train_data_loader, 0):
        batch_x, batch_y = data
        # Zero gradients
        optimizer.zero_grad()
        # Forward pass, backward pass
        outputs = net(batch_x)
        loss = criterion(outputs.view(-1), batch_y.type(torch.FloatTensor))
        loss.backward()
        # Optimize parameters
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 100))
            running_loss = 0.0
        batches += 1
        outputs_class = outputs > 0
        acc_current = torch.sum(outputs_class.view(-1) == batch_y).numpy() / batch_y.shape[0]
        acc += acc_current
    acc /= batches
    print("Accuracy on train set is: %s" % acc)
    # On cross-validation set
    with torch.no_grad():
        acc = 0
        batches = 0
        for i, data in enumerate(val_data_loader, 0):
            batch_x, batch_y = data
            outputs = net(batch_x)
            loss = criterion(outputs.view(-1), batch_y.type(torch.FloatTensor)).item()
            # Predict
            outputs_class = outputs > 0
            acc_current = torch.sum(outputs_class.view(-1) == batch_y).numpy() / batch_y.shape[0]
            batches += 1
            acc += acc_current
        acc /= batches
        print("Accuracy on validation set is: %s" % acc)

[1,   100] loss: 0.690
[1,   200] loss: 0.654
[1,   300] loss: 0.621
[1,   400] loss: 0.584
Accuracy on train set is: 0.6309652390651801
Accuracy on validation set is: 0.8170797413793103
[2,   100] loss: 0.510
[2,   200] loss: 0.463
[2,   300] loss: 0.423
[2,   400] loss: 0.400
Accuracy on train set is: 0.8694977487135507
Accuracy on validation set is: 0.915948275862069
[3,   100] loss: 0.366
[3,   200] loss: 0.348
[3,   300] loss: 0.325
[3,   400] loss: 0.311
Accuracy on train set is: 0.9188230328044598
Accuracy on validation set is: 0.9362877155172413
[4,   100] loss: 0.299
[4,   200] loss: 0.294
[4,   300] loss: 0.276
[4,   400] loss: 0.260
Accuracy on train set is: 0.9322918900085763
Accuracy on validation set is: 0.9280711206896551
[5,   100] loss: 0.257
[5,   200] loss: 0.255
[5,   300] loss: 0.247
[5,   400] loss: 0.237
Accuracy on train set is: 0.9369720197255574
Accuracy on validation set is: 0.9562230603448276


We use this final model to predict the accuracy on the test data

In [60]:
test_data_folder = "/Users/jasperginn/PycharmProjects/Pneumonia/data/cell_images/test"
# Define the dataset
test_dataset = datasets.ImageFolder(
    root = test_data_folder,
    transform = transform["test"]
)
# Set up the data loader
test_data_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4
)

with torch.no_grad():
    acc = 0
    batches = 0
    for i, data in enumerate(test_data_loader, 0):
        batch_x, batch_y = data
        outputs = net(batch_x)
        loss = criterion(outputs.view(-1), batch_y.type(torch.FloatTensor)).item()
        # Predict
        outputs_class = outputs > 0
        acc_current = torch.sum(outputs_class.view(-1) == batch_y).numpy() / batch_y.shape[0]
        batches += 1
        acc += acc_current
    acc /= batches
    print("Accuracy on validation set is: %s" % acc)

Accuracy on validation set is: 0.984375


So this model works very well on this dataset. By performing hyperparameter optimization we could probably squeeze out a little more performance. 

In [62]:
torch.save(net.state_dict(), "../models/model.pt")