In [1]:
import os
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import torch

# Define paths
data_dir = "data"
infected_dir = os.path.join(data_dir, "infected")
uninfected_dir = os.path.join(data_dir, "uninfected")


In [2]:
class CellDataset(Dataset):
    def __init__(self, file_paths, labels, transform=None):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        img_path = self.file_paths[idx]
        image = Image.open(img_path).convert("RGB")  # Convert .tif to RGB
        label = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, label


In [3]:
# Get all file paths and labels
infected_files = [os.path.join(infected_dir, f) for f in os.listdir(infected_dir) if f.endswith('.tif')]
uninfected_files = [os.path.join(uninfected_dir, f) for f in os.listdir(uninfected_dir) if f.endswith('.tif')]

infected_labels = [1] * len(infected_files)  # Label for infected
uninfected_labels = [0] * len(uninfected_files)  # Label for uninfected

all_files = infected_files + uninfected_files
all_labels = infected_labels + uninfected_labels

# Split into train and validation
train_files, val_files, train_labels, val_labels = train_test_split(all_files, all_labels, test_size=0.2, stratify=all_labels, random_state=42)


In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to match VGG input size
    transforms.ToTensor(),          # Convert image to tensor
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Normalize
])


In [5]:
# Create dataset objects
train_dataset = CellDataset(train_files, train_labels, transform=transform)
val_dataset = CellDataset(val_files, val_labels, transform=transform)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=42,shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=42, shuffle=False)


In [6]:
from torchvision import models
import torch.nn as nn

# Load pre-trained VGG16 using the new 'weights' parameter
vgg = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)

# Modify the final classifier layer to adapt for binary classification (Infected vs Uninfected)
vgg.classifier[6] = nn.Linear(vgg.classifier[6].in_features, 2)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg = vgg.to(device)


In [7]:
from tqdm import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        running_loss = 0.0
        
        # Create a tqdm progress bar for the training loop
        with tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch") as progress_bar:
            for inputs, labels in progress_bar:
                # Move data to device
                inputs, labels = inputs.to(device), labels.to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                # Backward pass and optimize
                loss.backward()
                optimizer.step()

                # Track loss
                running_loss += loss.item()

                # Update progress bar description with the average loss
                progress_bar.set_postfix(loss=running_loss / (progress_bar.n + 1))

        # Print training loss for the epoch
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {running_loss / len(train_loader):.4f}")

        # Evaluate on the validation set after every epoch
        validate_model(model, val_loader)



In [8]:
def validate_model(model, val_loader):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    running_loss = 0.0
    with torch.no_grad():  # Disable gradient calculations for validation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)

            # Calculate accuracy
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Validation Accuracy: {accuracy:.2f}%")
    return accuracy


In [9]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(vgg.parameters(), lr=0.0001)

# Train the model
train_model(vgg, train_loader, val_loader, criterion, optimizer, epochs=4)


Epoch 1/4: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 602/602 [2:11:51<00:00, 13.14s/batch, loss=0.178]


Epoch 1/4, Training Loss: 0.1776
Validation Accuracy: 95.50%


Epoch 2/4: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 602/602 [2:10:13<00:00, 12.98s/batch, loss=0.0925]


Epoch 2/4, Training Loss: 0.0925
Validation Accuracy: 95.01%


Epoch 3/4: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 602/602 [2:09:55<00:00, 12.95s/batch, loss=0.0599]


Epoch 3/4, Training Loss: 0.0599
Validation Accuracy: 97.10%


Epoch 4/4: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 602/602 [2:10:15<00:00, 12.98s/batch, loss=0.048]


Epoch 4/4, Training Loss: 0.0480
Validation Accuracy: 96.80%


In [10]:
# Save model
torch.save(vgg.state_dict(), "cancer_model_full_dataset.pth")

# Evaluate performance
validate_model(vgg, val_loader)


Validation Accuracy: 96.80%


96.80278569167459