**Importing Libraries and Setting Environment**

In [1]:
# The torch modules are used for the Neural Network
import torch
import torch.nn as nn
# From this library, `models` gives us ResNet34, which will be used for feature extraction,
# and `transforms` is used to make the images usable by the model
from torchvision import models, transforms
#DataLoader is used to create the DataLoader object used for DL tasks
#ConcatDataset to create a concatenated dataset of all the three folds in the Dataset
from torch.utils.data import DataLoader, ConcatDataset, Dataset
# ImageFolder is used to create labelled data from the training_data folders
from torchvision.datasets import ImageFolder
import numpy as np
# SVC, LogisticRegression and DecisionTreeClassifier are the models provided by sklearn for the ML part
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#VotingClassifier is an ensemble method.
#Ensemble means it aggregates the output of all the three ML models and then selects
#the output depending on all the three's output(in our case)
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

import pandas as pd

import matplotlib.pyplot as plt

**Loading And Preprocessing Data**

In [2]:
# Preprocessing pipeline applied to every image so that all inputs share one format
preprocessing_steps = [
    # Resize to match ResNet's input size
    transforms.Resize((224, 224)),
    # Convert the image into a tensor
    transforms.ToTensor(),
    # ImageNet z-score normalization; images have three channels (RGB), hence one
    # mean and one std value per channel: z = (value - mean) / std
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

# Read the images of every fold to build the training data.
# ImageFolder creates the labels from the sub-folder names in alphabetical order,
# so with the folders `all` and `hem` we get all -> 0 and hem -> 1; every image
# inside a folder receives that folder's label.
training_root = "/kaggle/input/leukemia-classification/C-NMC_Leukemia/training_data"
fold_0_data, fold_1_data, fold_2_data = (
    ImageFolder(f"{training_root}/fold_{fold_idx}", transform=transform)
    for fold_idx in range(3)
)

# A single dataset that spans all three folds
train_dataset = ConcatDataset([fold_0_data, fold_1_data, fold_2_data])

In [3]:
import os
from PIL import Image

# Custom Dataset for the validation images, whose labels are stored in a CSV file.
# A custom dataset has to provide __init__, __len__ and __getitem__.
class ValidationDataset(Dataset):
    """Validation images paired with the labels listed in a CSV file.

    Args:
        img_dir: directory that contains the validation images.
        labels_csv: path to the CSV file describing those images; it must have a
            `labels` column.
        transform: optional callable applied to every loaded image (default None).
    """

    def __init__(self, img_dir, labels_csv, transform=None):
        self.img_dir = img_dir
        self.transform = transform

        label_table = pd.read_csv(labels_csv)
        # The training folders encode all -> 0 and hem -> 1, while the CSV uses
        # 1 for all and 0 for hem, so the CSV labels are inverted here.
        label_table['labels'] = label_table['labels'].apply(lambda raw: 0 if raw != 0 else 1)
        self.labels = label_table

    def __len__(self):
        """Number of rows (images) listed in the labels table."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return `(image, label)` for row `idx` of the labels table."""
        # Column 1 is assumed to hold the image file name
        image_path = os.path.join(self.img_dir, self.labels.iloc[idx, 1])
        picture = Image.open(image_path).convert("RGB")  # always load as RGB
        # Column 2 is assumed to hold the (already inverted) label
        target = int(self.labels.iloc[idx, 2])

        if not self.transform:
            return picture, target
        return self.transform(picture), target

# Load the validation dataset (images directory + CSV with their labels)
validation_dir = "/kaggle/input/leukemia-classification/C-NMC_Leukemia/validation_data/C-NMC_test_prelim_phase_data"
validation_dataset = ValidationDataset(
    validation_dir,
    "/kaggle/input/leukemia-classification/C-NMC_Leukemia/validation_data/C-NMC_test_prelim_phase_data_labels.csv",
    transform=transform,
)

# Batch the validation data with a DataLoader.
# Shuffling only helps during training; evaluation metrics do not depend on the
# sample order, so keep the order fixed to make every run reproducible.
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

In [None]:
# Function to undo the normalization and display an image
def imshow(image, label):
    """Display a normalized image tensor together with its label.

    Args:
        image: tensor laid out as Channels x Height x Width that was normalized
            with the per-channel mean/std used below.
        label: value shown in the plot title.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # Reorder to Height x Width x Channels, which is what matplotlib expects
    pixels = np.transpose(image.numpy(), (1, 2, 0))
    # Invert the z-score normalization and keep the values inside [0, 1]
    pixels = (channel_std * pixels + channel_mean).clip(0, 1)

    # Display the image
    plt.imshow(pixels)
    plt.title(f'Label: {label}')
    plt.show()

# Display the first 4 samples of the concatenated training dataset
for sample_idx in range(4):
    sample_image, sample_label = train_dataset[sample_idx]
    imshow(sample_image, sample_label)

**Using Resnet34 Model for Features Extraction**

In [None]:
# DataLoader that serves the training dataset in shuffled batches of 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# ResNet34 with pretrained weights
resnet = models.resnet34(pretrained=True)

# Keep every child module except the last one (the fully connected layer `fc`),
# so only the convolutional part is used for feature extraction
backbone_layers = list(resnet.children())[:-1]
feature_extractor = nn.Sequential(*backbone_layers)

# Put the model into evaluation mode (we don't want to train it).
# Note: eval() only switches layers such as batch norm / dropout to their inference
# behaviour; gradients are disabled separately via torch.no_grad() when the
# features are extracted.
feature_extractor.eval()

# Function to extract features from a DataLoader using the feature extractor model
def extract_features_batch(model, loader):
    """Run `model` over every batch of `loader` and collect the outputs.

    Args:
        model: torch module used as feature extractor; it is switched to
            evaluation mode.
        loader: iterable yielding `(inputs, label)` batches of tensors.

    Returns:
        A tuple `(features, labels)` of NumPy arrays, each the concatenation of
        the per-batch model outputs / labels along the first axis.
    """
    # Run the batches on the device the model already lives on, rather than
    # assuming the model was moved to the GPU whenever one is available
    # (that assumption fails with a device mismatch for a CPU model).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    features = []
    labels = []
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        for inputs, label in loader:
            # Pass the batch through the feature extractor on the model's device
            outputs = model(inputs.to(device))

            # Move the results to the CPU before converting them to NumPy arrays
            features.append(outputs.cpu().numpy())
            labels.append(label.cpu().numpy())

    # Concatenate all batches into a single array
    return np.concatenate(features), np.concatenate(labels)

# Run the feature extractor on the GPU when one is available
if torch.cuda.is_available():
    feature_extractor = feature_extractor.cuda()

# Extract the features of the whole training dataset, batch by batch
train_features, train_labels = extract_features_batch(feature_extractor, train_loader)

# Print the shape of the extracted features, e.g. (10661, 512, 1, 1):
#   10661 -> number of training images
#   512   -> features per image
#   1, 1  -> height and width left after the convolutions and pooling in ResNet34
print(f"Extracted feature shape: {train_features.shape}")

In [None]:
# Extract the features of the validation images in the same way
validation_features, validation_labels = extract_features_batch(feature_extractor, validation_loader)

# Reshape the features to (n_images, 512): the trailing 1, 1 dimensions carry no information
n_train_images = train_features.shape[0]
n_validation_images = validation_features.shape[0]
train_features_flat = np.reshape(train_features, (n_train_images, 512))
validation_features_flat = np.reshape(validation_features, (n_validation_images, 512))

# Print the shapes of the flattened features
print(f"Validation features shape: {validation_features_flat.shape}")
print(f"Train features shape: {train_features_flat.shape}")

In [None]:
# Two kinds of input are prepared to analyze model performance: the 512 extracted
# features as they are, and a version reduced to 50 dimensions with Principal
# Component Analysis (PCA).
# PCA determines the principal components of the data and keeps the top k components
# that cover the most variance.
# If you need to learn more about PCA, refer: PCA Learning Docs by IBM
from sklearn.decomposition import PCA

# Reduce dimensions to 50 using PCA.
# random_state pins the randomized SVD solver that PCA may select, so the reduced
# features are the same on every run.
pca = PCA(n_components=50, random_state=42)
# Fit on the training features only, then apply the same projection to validation
train_features_pca = pca.fit_transform(train_features_flat)
validation_features_pca = pca.transform(validation_features_flat)

print(f"Reduced training feature shape: {train_features_pca.shape}")
print(f"Reduced validation feature shape: {validation_features_pca.shape}")

In [None]:
# Instances of each ML technique that goes into the ensemble.
# probability=True makes the SVM expose class probabilities (needed for soft voting)
# instead of only class labels.
# class_weight='balanced' re-weights the classes inversely to their frequency, so an
# imbalanced dataset does not bias the model towards the majority class.
# random_state is fixed on the stochastic estimators so that results are reproducible.
svm_clf = SVC(probability=True, class_weight='balanced', random_state=42)
log_clf = LogisticRegression(max_iter=1000, class_weight='balanced')  # Logistic Regression
dt_clf = DecisionTreeClassifier(random_state=42)  # Decision Tree

# Soft voting averages the predicted class probabilities of the three models
voting_clf = VotingClassifier(estimators=[
    ('svm', svm_clf), ('log', log_clf), ('dt', dt_clf)], voting='soft')

voting_clf.fit(train_features_flat, train_labels)
#dt_clf.fit(train_features_pca, train_labels)

y_pred = voting_clf.predict(validation_features_flat)
# y_pred = dt_clf.predict(validation_features_pca)

accuracy = accuracy_score(validation_labels, y_pred)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# In the report, class 0 is `all` and class 1 is `hem`
print(classification_report(validation_labels, y_pred))

# Fix the label order explicitly so the matrix is always 2x2 with row/column 0 = all
# and row/column 1 = hem
conf_matrix = confusion_matrix(validation_labels, y_pred, labels=[0, 1])

# Display the confusion matrix.
# display_labels must follow the label encoding (all -> 0, hem -> 1); the previous
# ['Hem', 'All'] order swapped the class names on both axes.
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=['All', 'Hem'])
disp.plot(cmap='Blues')
plt.show()

In [None]:
import torch
import torch.nn as nn
from torchvision import models

# ResNet34 was built to classify 1k classes; we only need 2 (Hem and All), so the
# final fully connected layer is replaced by one with 2 outputs.
num_ftrs = resnet.fc.in_features
resnet.fc = nn.Linear(in_features=num_ftrs, out_features=2)  # 2 output classes for Hem and All

# Pick the GPU when one is available, otherwise stay on the CPU, and move the model there
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
resnet34 = resnet.to(device)


In [None]:
import torch.optim as optim

# Loss function: cross entropy, the usual choice for classification
criterion = nn.CrossEntropyLoss()

# Optimizer: searches for the best weights and biases over time.
# Adam is a well known one; its first argument says which parameters get updated
# (here: all of them), lr is the learning rate and weight_decay adds regularization.
trainable_params = resnet34.parameters()
optimizer = optim.Adam(trainable_params, lr=1e-3, weight_decay=0.0001)

# Training function for one epoch
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Train `model` for one pass over `train_loader` and report the statistics.

    Args:
        model: network to train; it is put into training mode.
        train_loader: iterable yielding `(inputs, labels)` batches.
        criterion: loss function called as `criterion(outputs, labels)`.
        optimizer: optimizer that updates the model parameters.
        device: device the batches are moved to.

    Returns:
        `(epoch_loss, epoch_acc, epoch_error)`: the sample-weighted mean loss, the
        fraction of correct predictions and `1 - epoch_acc`.
    """
    model.train()  # training mode
    loss_sum = 0.0
    correct = 0
    seen = 0

    # The data is processed batch by batch
    for batch_inputs, batch_labels in train_loader:
        # Put both tensors on the requested device
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # Reset the gradients, run the forward pass, then backpropagate and update
        optimizer.zero_grad()
        logits = model(batch_inputs)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        # Predicted class = index of the largest output value
        predicted = torch.max(logits, 1)[1]
        # Weight the batch loss by the batch size to accumulate a per-sample total
        loss_sum += batch_loss.item() * batch_inputs.size(0)
        # Count the correct predictions
        correct += (predicted == batch_labels).sum().item()
        # The last batch may be smaller, so count the actual number of samples
        seen += batch_labels.size(0)

    # Epoch-level loss, accuracy and error (error = 1 - accuracy)
    epoch_loss = loss_sum / seen
    epoch_acc = correct / seen
    epoch_error = 1 - epoch_acc

    print(f"Train Loss: {epoch_loss:.4f} | Train Accuracy: {epoch_acc * 100:.2f}% | Train Error: {epoch_error * 100:.2f}% | Correct: {correct}")

    return epoch_loss, epoch_acc, epoch_error

In [None]:
# Testing function for one epoch
def test_epoch(model, validation_loader, criterion, device):
    model.eval()  # Set model to evaluation mode
    running_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():  # Disable gradient computation for evaluation
        for inputs, labels in validation_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Compute validation statistics
            _, preds = torch.max(outputs, 1)
            running_loss += loss.item() * inputs.size(0)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            
            # Store all predictions and labels for further analysis
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Calculate validation loss and accuracy
    epoch_loss = running_loss / total
    epoch_acc = correct / total
    epoch_error = 1 - epoch_acc  # Error = 1 - Accuracy
    
    print(f"Validation Loss: {epoch_loss:.4f} | Validation Accuracy: {epoch_acc * 100:.2f}% | Validation Error: {epoch_error * 100:.2f}% | Correct: {correct}")
    
    return epoch_loss, epoch_acc, epoch_error

In [None]:
num_epochs = 8  # Number of epochs to train

# Per-epoch history of losses, accuracies and errors
train_losses, train_accuracies, train_errors = [], [], []
val_losses, val_accuracies, val_errors = [], [], []

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Train for one epoch and record its statistics
    train_loss, train_acc, train_error = train_epoch(resnet34, train_loader, criterion, optimizer, device)
    for history, value in zip((train_losses, train_accuracies, train_errors),
                              (train_loss, train_acc, train_error)):
        history.append(value)

    # Validate after each epoch and record the statistics as well
    val_loss, val_acc, val_error = test_epoch(resnet34, validation_loader, criterion, device)
    for history, value in zip((val_losses, val_accuracies, val_errors),
                              (val_loss, val_acc, val_error)):
        history.append(value)