Dataset link https://www.kaggle.com/datasets/paultimothymooney/chest-xray-pneumonia

### **Week 1**
- Data preparation - Tariq
- feature engineering by using computer vision techniques such as edge detection etc. - Shang Hui
- basic CNN with pytorch - Jayanth

Get score for basic model with raw data

Try different models and engineer the data to get a higher score than the base model/base data

--------------------------
### **Week 2-3**
- Tariq
  1. DenseNet
  2. Depthwise convolution 

- Jayanth
  1. InceptionNet
  2. ResNet

- Shang Hui
  1. VGG (-16/19)

-----------------------------
### **Week 4**
- Ensemble
- Report



# Data Preparation


### Get File Paths

Imports

In [1]:
import os
import pandas as pd
from pathlib import Path
# import cv2
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms as tf
import time


### Paths and device

In [2]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('mps')
print(torch.cuda.is_available())


data_path = "./data/chest_xray"
train_path = f'{data_path}/train'
test_path = f'{data_path}/test'
val_path = f'{data_path}/val'

output_size = (512, 512)
batch_size = 48
num_epochs = 25

MODEL_SAVE_PATH = "./model-store"

False


### Load Training, Validation, and testing data

In [3]:
transform = tf.Compose({
    tf.ToTensor(),
    tf.Grayscale(),
    tf.Resize(output_size)
})

data = torchvision.datasets.ImageFolder(train_path, transform=transform)

train_size = int(len(data) * 0.85)
val_size = len(data) - train_size

train_data, validation_data = torch.utils.data.random_split(data, [train_size, val_size])
train_loader = torch.utils.data.DataLoader(train_data, batch_size, shuffle=True)

validation_loader = torch.utils.data.DataLoader(validation_data, batch_size, shuffle=True)

test_data = torchvision.datasets.ImageFolder(test_path, transform=transform)
test_loader = torch.utils.data.DataLoader(test_data, batch_size, shuffle=True)


### Model definition

In [4]:
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 8, 5),
            nn.BatchNorm2d(8),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Conv2d(8, 16, 5),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 5),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
        )
        self.fc = nn.Sequential(
            nn.Linear(32 * 121 * 121, 200),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(200, 50),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(50, 10),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(10, 2),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        # print(x.shape)
        x = self.conv(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        
        return x

## Early stopper
Implement an early stopper to stop training if `validation_loss > min_validation_loss + min_delta` for `patience` consecutive epochs.
Taken from https://stackoverflow.com/a/73704579

In [5]:
class EarlyStopper:
    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            print(f'reset early stop counter to {0}')
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            print(f'update early stop counter to {self.counter}')
            if self.counter >= self.patience:
                print('Trigger early stop')
                return True
        return False

### Utility code to compute statistics
Computes `new_loss` by adding `prev_loss` and `loss` (which is averaged over a batch) * batch_size.

In [6]:
def update_loss_and_accuracy(inputs, outputs, labels, loss, prev_loss, prev_accuracy):
    new_loss = prev_loss + loss.item() * inputs.size(0)
    ret, predictions = torch.max(outputs.data, 1)
    correct_counts = predictions.eq(labels.data.view_as(predictions))
    # Convert correct_counts to float and then compute the mean
    acc = torch.mean(correct_counts.type(torch.FloatTensor))
    # Compute total accuracy in the whole batch and add to train_acc
    new_acc = prev_accuracy + acc.item() * inputs.size(0)
    return new_loss, new_acc, acc

### Model Training

In [13]:
## Training config
train_config = {
    'OPTIMIZER': torch.optim.Adam,
    'LR': 0.001,
    'LOSS_FN': torch.nn.CrossEntropyLoss(),
    'EPOCHS': 25
}

In [14]:
model = SimpleCNN()
model = model.to(device)
early_stopper = EarlyStopper(2, 0.1)
lr = train_config['LR']
optimizer = train_config['OPTIMIZER'](model.parameters(), lr)
loss_fn = train_config['LOSS_FN']
num_epochs = train_config['EPOCHS']
# LR Scheduler - taken from
# https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
scheduler1 = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
scheduler2 = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[8, 20], gamma=0.1)

history = []
for epoch in range(num_epochs):
    epoch_start = time.time()
    print(f'Epoch {epoch + 1}/{num_epochs}')
    model.train()
    train_loss, train_acc, valid_loss, valid_acc = 0, 0, 0, 0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forwards pass
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss, train_acc, acc = update_loss_and_accuracy(inputs, outputs, labels, loss, train_loss, train_acc)
        print("Batch number: {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}".format(i, loss.item(), acc.item()))
    scheduler1.step()
    scheduler2.step()

    with torch.no_grad():
        # Set to evaluation mode
        model.eval()
        # Validation loop
        for j, (inputs, labels) in enumerate(validation_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Forward pass - compute outputs on input data using the model
            outputs = model(inputs)
            # Compute loss
            loss = loss_fn(outputs, labels)
            # Compute the total loss for the batch and add it to valid_loss
            valid_loss, valid_acc, acc = update_loss_and_accuracy(inputs, outputs, labels, loss, valid_loss, valid_acc)
            print("Validation Batch number: {:03d}, Validation: Loss: {:.4f}, Accuracy: {:.4f}".format(j, loss.item(), acc.item()))
    # Find average training loss and training accuracy
    avg_train_loss = train_loss / train_size
    avg_train_acc = train_acc / float(train_size)

    # Find average training loss and training accuracy
    avg_valid_loss = valid_loss / val_size
    avg_valid_acc = valid_acc / float(val_size)


    history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])
    epoch_end = time.time()

    print(
        "Epoch : {:03d}, Training: Loss: {:.4f}, Accuracy: {:.4f}%, nttValidation : Loss : {:.4f}, Accuracy: {:.4f}%, Time: {:.4f}s".format(
            epoch, avg_train_loss, avg_train_acc * 100, avg_valid_loss, avg_valid_acc * 100, epoch_end - epoch_start))
    if early_stopper.early_stop(avg_valid_loss):
        break

Epoch 1/25
Batch number: 000, Training: Loss: 0.6959, Accuracy: 0.3750


KeyboardInterrupt: 

### Save model

In [10]:
torch.save(model.state_dict(), f'{MODEL_SAVE_PATH}/model.pth')

### Test Error

In [16]:
model.eval()
loss_fn = train_config['LOSS_FN']
test_loss = 0
test_acc = 0
avg_test_acc = 0

test_size = len(test_loader.dataset)

for j, (inputs, labels) in enumerate(test_loader):
    inputs = inputs.to(device)
    labels = labels.to(device)
    # Forward pass - compute outputs on input data using the model
    outputs = model(inputs)
    # Compute loss
    loss = loss_fn(outputs, labels)
    # Compute the total loss for the batch and add it to valid_loss
    test_loss, test_acc, acc = update_loss_and_accuracy(inputs, outputs, labels, loss, test_loss, test_acc)
    print("Test Batch number: {:03d}, Validation: Loss: {:.4f}, Accuracy: {:.4f}"
            .format(j, loss.item(), acc.item()))
    avg_test_loss = test_loss / test_size
    avg_test_acc = test_acc / float(test_size)
print(f'overall test accuracy {avg_test_acc}%')



Test Batch number: 000, Validation: Loss: 0.6883, Accuracy: 0.6250
Test Batch number: 001, Validation: Loss: 0.6258, Accuracy: 0.6875
Test Batch number: 002, Validation: Loss: 0.5841, Accuracy: 0.7292
Test Batch number: 003, Validation: Loss: 0.8133, Accuracy: 0.5000


KeyboardInterrupt: 