## Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torch.utils.data import DataLoader , Dataset
from torch.utils.data import TensorDataset
import torchvision
from torchvision import datasets , transforms
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
from PIL import Image
import torch.nn.functional as F
import pytorch_lightning as pl

## Import Dataset

In [None]:

# Directory containing the image data
data_dir = '/home/NGS/HTA_Projects/group_5/lung_colon_image_set'

# Class folder name -> human-readable tissue label.
class_names = {
    'colon_aca': 'Colon Adenocarcinoma',
    'colon_n': 'Colon Benign Tissue',
    'lung_aca': 'Lung Adenocarcinoma',
    'lung_n': 'Lung Benign Tissue',
    'lung_scc': 'Lung Squamous Cell Carcinoma',
}

# Parallel lists: filepaths[i] is labelled by labels[i].
filepaths = []
labels = []

# Top-level subdirectories of the data directory.
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test split further down) the same on every
# machine.
folds = sorted(os.listdir(data_dir))

for fold in folds:
    foldpath = os.path.join(data_dir, fold)

    # Ignore stray files next to the image-set folders.
    if not os.path.isdir(foldpath):
        continue

    for f in sorted(os.listdir(foldpath)):
        f_path = os.path.join(foldpath, f)

        # Only walk recognised class folders. Appending a path without a
        # matching label would shift every later label onto the wrong image.
        if f not in class_names or not os.path.isdir(f_path):
            continue

        for file in sorted(os.listdir(f_path)):
            filepaths.append(os.path.join(f_path, file))
            labels.append(class_names[f])

# Concatenate data paths with labels into one DataFrame
Fseries = pd.Series(filepaths, name='filepaths')
Lseries = pd.Series(labels, name='labels')
df = pd.concat([Fseries, Lseries], axis=1)



In [None]:
# A LabelEncoder instance is created here, but the integer codes below are
# assigned by hand and do not go through it.
le = LabelEncoder()

# Integer code for each tissue-type label.
label_codes = {
    'Colon Adenocarcinoma': 0,
    'Colon Benign Tissue': 1,
    'Lung Adenocarcinoma': 2,
    'Lung Squamous Cell Carcinoma': 3,
    'Lung Benign Tissue': 4,
}

# Swap every label for its code; any value missing from the table is passed
# through unchanged.
df['labels'] = df['labels'].apply(lambda x: label_codes.get(x, x))



## Split Dataset

In [None]:
# Hold out part of the data for testing.

# x holds the image file paths, y the matching numeric tissue-type codes.
x, y = df['filepaths'], df['labels']

# 70% of the rows are assigned to training. The rows are shuffled before the
# split, and the fixed random_state makes the shuffle repeatable.
x_train_paths, x_test_paths, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=34,
)


## Convert Into Tensor using transform

In [None]:
# Preprocessing pipeline applied to every image, in the order listed.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),  # scale the image to 224 x 224 pixels
        transforms.ToTensor(),          # turn the image into a PyTorch tensor
    ]
)

def to_tensor(image_path):
    """Load one image from disk and return it as a preprocessed tensor.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The output of the module-level ``transform`` pipeline applied to the
        image after it has been converted to RGB.
    """
    # Pillow keeps the underlying file open until the image object is closed.
    # This function is called once per dataset image, so the context manager
    # releases each handle immediately instead of relying on garbage
    # collection.
    with Image.open(image_path) as image:
        # Force three channels: a greyscale, palette or RGBA file would
        # otherwise yield a tensor that cannot be stacked with the others or
        # fed to a 3-channel convolution.
        return transform(image.convert('RGB'))


In [None]:
# Run every training image path through `to_tensor`, collecting the resulting
# tensors in a list (same order as x_train_paths).
x_train = list(map(to_tensor, x_train_paths))

In [None]:
# Run every test image path through `to_tensor`, collecting the resulting
# tensors in a list (same order as x_test_paths).
x_test = list(map(to_tensor, x_test_paths))

## Create Dataloaders

In [None]:
# Wrap the training and testing label arrays in PyTorch tensors.
y_train_tensor, y_test_tensor = (
    torch.tensor(label_series.values) for label_series in (y_train, y_test)
)

In [None]:
# Stack the per-image tensors into one tensor per split and pair each with its
# label tensor.
train_images = torch.stack(x_train)
test_images = torch.stack(x_test)
train_dataset = TensorDataset(train_images, y_train_tensor)
test_dataset = TensorDataset(test_images, y_test_tensor)

# Both loaders serve batches of 120; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=120, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=120, shuffle=False)

In [None]:
# Display the training DataLoader object to confirm it was created.
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7f7618ef84d0>

In [None]:
# Display the batch size the training DataLoader was configured with.
train_loader.batch_size

120

## Model Building

In [None]:
# Convolutional Neural Network (CNN) class using PyTorch Lightning
class ConvolutionalNetwork(pl.LightningModule):
    """CNN classifier with three conv stages and four fully connected layers.

    Each conv stage is a 3x3 convolution (stride 1, no padding) followed by
    ReLU and 2x2 max pooling. ``fc1`` expects 32 * 26 * 26 features per
    sample, which is what a 3 x 224 x 224 input produces
    (224 -> 222 -> 111 -> 109 -> 54 -> 52 -> 26). The last layer has 5
    outputs and ``forward`` returns log-probabilities over them.
    """

    def __init__(self):
        # Initialise the LightningModule base class before registering layers.
        super().__init__()

        # Convolutional layers: (in_channels, out_channels, kernel_size, stride)
        self.conv1 = nn.Conv2d(3, 6, 3, 1)
        self.conv2 = nn.Conv2d(6, 16, 3, 1)
        self.conv3 = nn.Conv2d(16, 32, 3, 1)

        # Fully connected layers: (in_features, out_features)
        self.fc1 = nn.Linear(32 * 26 * 26, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 20)
        self.fc4 = nn.Linear(20, 5)

    def forward(self, X):
        """Return per-class log-probabilities for a batch of images.

        Args:
            X: Batch of images; must have 3 channels and a spatial size that
                reduces to 26 x 26 after the three conv/pool stages.

        Returns:
            Tensor with one row per sample and 5 columns of log-probabilities.
        """
        X = F.max_pool2d(F.relu(self.conv1(X)), 2, 2)
        X = F.max_pool2d(F.relu(self.conv2(X)), 2, 2)
        X = F.max_pool2d(F.relu(self.conv3(X)), 2, 2)

        # Flatten everything except the batch dimension. With a fixed batch
        # dimension, a wrongly sized input fails at fc1 with a shape error
        # rather than being silently regrouped into a different number of rows.
        X = torch.flatten(X, 1)

        X = F.relu(self.fc1(X))
        X = F.relu(self.fc2(X))
        X = F.relu(self.fc3(X))
        return F.log_softmax(self.fc4(X), dim=1)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with lr=0.004."""
        return torch.optim.Adam(self.parameters(), lr=0.004)

    def _shared_step(self, batch, stage):
        """Compute and log loss and accuracy for one batch.

        Args:
            batch: ``(images, targets)`` pair from a DataLoader.
            stage: Metric-name prefix: ``"train"``, ``"val"`` or ``"test"``.

        Returns:
            The scalar loss tensor for the batch.
        """
        X, y = batch
        log_probs = self(X)

        # forward() already applies log_softmax, so the matching loss is NLL.
        # cross_entropy would apply log_softmax a second time; the value is
        # the same, but the pairing is misleading.
        loss = F.nll_loss(log_probs, y)

        # Fraction of correct predictions, kept as a tensor so logging does
        # not force a host synchronisation on every step.
        acc = (log_probs.argmax(dim=1) == y).float().mean()

        self.log(f"{stage}_loss", loss)
        self.log(f"{stage}_acc", acc)
        return loss

    def training_step(self, train_batch, batch_idx):
        """Log ``train_loss`` / ``train_acc`` and return the loss to optimise."""
        return self._shared_step(train_batch, "train")

    def validation_step(self, val_batch, batch_idx):
        """Log ``val_loss`` / ``val_acc`` for one validation batch."""
        self._shared_step(val_batch, "val")

    def test_step(self, test_batch, batch_idx):
        """Log ``test_loss`` / ``test_acc`` for one test batch."""
        self._shared_step(test_batch, "test")


In [None]:
# Instantiate the network; the constructor takes no arguments.
model = ConvolutionalNetwork()

In [None]:
# Create a TensorBoard logger for monitoring training progress and metrics.
# It is constructed with save directory "logs/" and experiment name
# "ConvolutionalNetwork". (The second, identical construction that followed
# was redundant and has been dropped.)
logger = pl.loggers.TensorBoardLogger("logs/", name="ConvolutionalNetwork")

In [None]:
# Build the Lightning Trainer that drives the training run: at most 30 epochs,
# metrics sent to `logger`, running on a single GPU.
trainer = pl.Trainer(
    max_epochs=30,
    logger=logger,
    accelerator="gpu",
    devices=1,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
# Train the model with the Trainer instance.
#
# train_dataloaders: batches used for the optimisation steps (train_loader).
# val_dataloaders:   batches used for the validation passes. Note that this is
#                    test_loader, so the held-out test split doubles as the
#                    validation set in this notebook.
#
# fit is called once; the second, identical call that followed was redundant
# and has been dropped.
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=test_loader)

Missing logger folder: logs/ConvolutionalNetwork
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type   | Params
---------------------------------
0 | conv1 | Conv2d | 168   
1 | conv2 | Conv2d | 880   
2 | conv3 | Conv2d | 4.6 K 
3 | fc1   | Linear | 2.6 M 
4 | fc2   | Linear | 10.2 K
5 | fc3   | Linear | 1.7 K 
6 | fc4   | Linear | 105   
---------------------------------
2.6 M     Trainable params
0         Non-trainable params
2.6 M     Total params
10.454    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/student/miniconda3/envs/pytorch/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.
/home/student/miniconda3/envs/pytorch/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=39` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


## Evaluate model

In [None]:
from sklearn.metrics import classification_report

# Evaluate the model on the test set and gather predictions
model.eval()
predicted_labels = []
true_labels = []

with torch.no_grad():
    # The loop variable is named batch_labels so it does not overwrite the
    # notebook-level `labels` list built while scanning the dataset.
    for images, batch_labels in test_loader:
        # Move the inputs to wherever the model's parameters currently live so
        # the loop works whether the model is on CPU or GPU.
        outputs = model(images.to(model.device))
        predicted = outputs.argmax(dim=1)
        predicted_labels.extend(predicted.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

# Convert lists to numpy arrays (np comes from the notebook's import cell)
predicted_labels = np.array(predicted_labels)
true_labels = np.array(true_labels)

# Generate classification report
report = classification_report(true_labels, predicted_labels)
print(report)


              precision    recall  f1-score   support

           0       0.87      0.95      0.91      1519
           1       0.96      0.86      0.90      1466
           2       0.94      0.89      0.91      1481
           3       0.90      0.96      0.93      1493
           4       0.99      0.99      0.99      1541

    accuracy                           0.93      7500
   macro avg       0.93      0.93      0.93      7500
weighted avg       0.93      0.93      0.93      7500



In [None]:
# Evaluation loop for the test set: overall accuracy.
model.eval()
correct = 0
total = 0

with torch.no_grad():
    # The loop variable is named batch_labels so it does not overwrite the
    # notebook-level `labels` list built while scanning the dataset.
    for images, batch_labels in test_loader:
        # Keep inputs, targets and model on the same device so both the
        # forward pass and the comparison below work on CPU or GPU.
        images = images.to(model.device)
        batch_labels = batch_labels.to(model.device)

        predicted = model(images).argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

accuracy = correct / total
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


Accuracy on test set: 92.95%
