In [1]:
import os
import glob
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Preparing Dataset

## 1. Loading the Balanced CSV & Creating a Path Map


In [3]:
# Labels table; the dataset class in this notebook reads its 'id' and 'label' columns.
df = pd.read_csv('oral_cancer_balanced.csv')

print("Mapping image paths...")
path_map = {}
IMAGE_SUFFIXES = ('.jpg', '.jpeg')

for root, dirs, files in os.walk('Data'):
    # Prune in place so os.walk never descends into a directory named exactly 'val'.
    dirs[:] = [subdir for subdir in dirs if subdir != 'val']

    # Extra guard: skip any directory whose path contains the substring 'val'.
    # NOTE(review): this is a substring match, so names such as 'interval' or
    # 'validation' are skipped as well — confirm that is intended.
    if 'val' in root:
        continue

    # Keyed by bare filename: a later file with the same name in another folder
    # replaces the earlier entry.
    path_map.update({
        name: os.path.join(root, name)
        for name in files
        if name.lower().endswith(IMAGE_SUFFIXES)
    })

print(f"Mapped {len(path_map)} image paths.")

Mapping image paths...
Mapped 240705 image paths.


## 2. Defining the Custom Dataset Class

In [4]:
class OralCancerDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs.

    Each row of ``dataframe`` names an image through its 'id' column; the image
    file is located via ``path_map`` and loaded lazily in ``__getitem__``.
    """

    def __init__(self, dataframe, path_map, transform=None):
        '''
        dataframe: Pandas dataframe containing 'id' and 'label'
        path_map: Dictionary mapping filenames to full file paths
        transform: Optional callable applied to the loaded RGB PIL image
                   (augmentation/normalization)
        '''
        self.data = dataframe
        self.path_map = path_map
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            (image, label): the RGB image (after ``transform`` if one was given)
            and the label converted to ``int``.

        Raises:
            FileNotFoundError: if the row's 'id' has no entry in ``path_map``.
        """
        # Fetch the row once instead of once per column.
        row = self.data.iloc[idx]
        img_id = row['id']
        label = int(row['label'])

        # .get() returns None for unknown ids so we can raise a descriptive error.
        img_path = self.path_map.get(img_id)
        if img_path is None:
            raise FileNotFoundError(f"Image {img_id} not found in Data folders.")

        # The context manager closes the underlying file deterministically;
        # convert() returns a new in-memory image that stays valid afterwards.
        with Image.open(img_path) as img:
            image = img.convert('RGB')  # Ensure it's RGB

        if self.transform is not None:
            image = self.transform(image)

        return image, label

## 3. Defining Transforms (Augmentation)

In [5]:
# Image settings
IMG_SIZE = 96
_TARGET_SHAPE = (IMG_SIZE, IMG_SIZE)

# Per-channel mean/std of 0.5 rescales ToTensor's [0, 1] output to [-1, 1].
_CHANNEL_MEAN = [0.5, 0.5, 0.5]
_CHANNEL_STD = [0.5, 0.5, 0.5]

# Random augmentations used for training only: flips, a rotation of up to
# +/-20 degrees, and a mild colour jitter.
_train_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
]

# 1. Training pipeline: resize -> augment -> tensor -> normalize
train_transforms = transforms.Compose(
    [transforms.Resize(_TARGET_SHAPE)]
    + _train_augmentations
    + [transforms.ToTensor(), transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD)]
)

# 2. Validation pipeline: deterministic — resize -> tensor -> normalize
val_transforms = transforms.Compose(
    [
        transforms.Resize(_TARGET_SHAPE),
        transforms.ToTensor(),
        transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
    ]
)

## 4. Splitting Data and Initializing Datasets

In [6]:
# Hold out 20% of the rows for validation. Stratifying on 'label' keeps the
# class proportions the same in both splits; the fixed seed makes the split
# repeatable.
VAL_FRACTION = 0.2
SPLIT_SEED = 42
train_df, val_df = train_test_split(
    df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=df['label'],
)

# Augmenting transforms go to the training set, the clean ones to validation.
train_dataset = OralCancerDataset(train_df, path_map, transform=train_transforms)
val_dataset = OralCancerDataset(val_df, path_map, transform=val_transforms)

n_train_images = len(train_dataset)
n_val_images = len(val_dataset)

print("Datasets successfully created.", "-" * 30, sep="\n")
print(f"Train data size:      {n_train_images} images")
print(f"Validation data size: {n_val_images} images")

Datasets successfully created.
------------------------------
Train data size:      192452 images
Validation data size: 48114 images


## 5. Creating DataLoaders

In [7]:
BATCH_SIZE = 64

# Settings shared by both loaders.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=2)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

n_train_batches = len(train_loader)
n_val_batches = len(val_loader)

print(f"DataLoaders ready with Batch Size {BATCH_SIZE}.")
print(f"Training Batches: {n_train_batches}")
print(f"Validation Batches: {n_val_batches}")

DataLoaders ready with Batch Size 64.
Training Batches: 3008
Validation Batches: 752


In [8]:
# Sanity check: pull one batch from each loader and report the tensor shapes.
for split_name, loader in (("Train", train_loader), ("Validation", val_loader)):
    images, labels = next(iter(loader))
    print(f"{split_name} batch images: {images.shape}, labels: {labels.shape}")

Train batch images: torch.Size([64, 3, 96, 96]), labels: torch.Size([64])
Validation batch images: torch.Size([64, 3, 96, 96]), labels: torch.Size([64])


# Model Architecture

In [11]:
import torch.nn as nn
import torch.nn.functional as F

In [13]:
class SimpleCNN(nn.Module):
    """Small three-block CNN that outputs one logit per image.

    Every block is Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2),
    so the spatial size is halved three times. The classifier head is sized for
    96x96 RGB inputs (96 -> 48 -> 24 -> 12), i.e. 128 * 12 * 12 = 18,432 features.
    Inputs of any other spatial size fail at ``fc1`` with a shape error.
    """

    def __init__(self):
        super().__init__()

        # --- Block 1 ---  (3, 96, 96) -> conv -> (32, 96, 96) -> pool -> (32, 48, 48)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)

        # --- Block 2 ---  (32, 48, 48) -> conv -> (64, 48, 48) -> pool -> (64, 24, 24)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)

        # --- Block 3 ---  (64, 24, 24) -> conv -> (128, 24, 24) -> pool -> (128, 12, 12)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)

        # Max Pooling Layer (shared by all blocks; it has no parameters)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # --- Classifier ---
        # Flattened input size: Depth (128) * Height (12) * Width (12) = 18,432
        self.fc1 = nn.Linear(128 * 12 * 12, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, 1)  # Output: 1 Logit

    def forward(self, x):
        """Compute logits for a batch of images.

        Args:
            x: float tensor of shape (batch, 3, 96, 96).

        Returns:
            Tensor of shape (batch, 1) holding raw logits (no sigmoid applied).

        Raises:
            RuntimeError: if the spatial size of ``x`` does not lead to
                128 * 12 * 12 features per sample.
        """
        x = self.pool(F.relu(self.bn1(self.conv1(x))))  # -> 48x48
        x = self.pool(F.relu(self.bn2(self.conv2(x))))  # -> 24x24
        x = self.pool(F.relu(self.bn3(self.conv3(x))))  # -> 12x12

        # Flatten per sample, keeping the batch dimension fixed. The previous
        # view(-1, 128 * 12 * 12) let a wrong-sized input be silently reshaped
        # into extra batch rows instead of raising an error.
        x = x.flatten(start_dim=1)

        x = self.dropout(F.relu(self.fc1(x)))
        return self.fc2(x)



## Dummy Testing

In [14]:

# Creating the model
model = SimpleCNN()

# Create a dummy batch of 1 image with shape (1, 3, 96, 96)
dummy_input = torch.randn(1, 3, 96, 96)

# Pass it through the model
output = model(dummy_input)

print("Model Architecture Created.")
print(f"Input Shape:  {dummy_input.shape}")
print(f"Output Shape: {output.shape} (Should be [1, 1])")

# Actually verify the shape, so "Test Passed" is only printed when it holds.
assert output.shape == (1, 1), f"Unexpected output shape: {tuple(output.shape)}"
print("Test Passed: Dimensions align correctly.")

Model Architecture Created.
Input Shape:  torch.Size([1, 3, 96, 96])
Output Shape: torch.Size([1, 1]) (Should be [1, 1])
Test Passed: Dimensions align correctly.


## Device Diagnostics and Moving the Model to CUDA (if Available)

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build a fresh network and move its parameters and buffers to that device.
simple_model = SimpleCNN()
simple_model = simple_model.to(device)

# Report the selected device and the layer-by-layer summary.
print('Simple model architecture:', f"GPU: {device}", simple_model, sep="\n")


Simple model architecture:
GPU: cuda
SimpleCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=18432, out_features=256, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=256, out_features=1, bias=True)
)


## Creating Optimizer and Loss Function

In [22]:
import torch.optim as optim
# Binary classification on raw logits: BCEWithLogitsLoss applies the sigmoid
# internally, so the model's last layer must not apply one itself.
loss_function_simple_model = nn.BCEWithLogitsLoss()

# Adam optimizer over all parameters of the simple model, learning rate 1e-3.
optimizer_simple_model = optim.Adam(params=simple_model.parameters(), lr=1e-3)


## Scheduler

In [24]:
# Halve the learning rate (never going below 1e-5) when the monitored metric
# has not improved for more than `patience` epochs. mode='max' because the
# metric passed to scheduler.step() is an accuracy, where higher is better.
_plateau_settings = dict(mode='max', factor=0.5, patience=2, min_lr=1e-5)

scheduler_simple_model = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_simple_model, **_plateau_settings
)

# Training Model

In [25]:
def training_model(num_epochs, optimizer, model, loss_function, scheduler, model_save_path):
    
    # Track the best accuracy to save the model
    best_val_acc = 0.0
    
    train_losses = []
    val_losses = []
    
    print(f"Starting training for {num_epochs} epochs...")
    print("-" * 80)

    for epoch in range(num_epochs):
        # --- TRAINING PHASE ---
        model.train()
        running_loss = 0.0
        train_correct = 0
        train_total = 0

        for images, labels in train_loader:
            # Move data to GPU
            images = images.to(device)
            # Reshape labels to (Batch_Size, 1) to match model output
            labels = labels.float().unsqueeze(1).to(device)

            # 1. Zero Gradients
            optimizer.zero_grad()
            
            # 2. Forward Pass
            outputs = model(images)
            
            # 3. Calculate Loss
            loss = loss_function(outputs, labels)
            
            # 4. Backward Pass
            loss.backward()
            
            # 5. Optimization Step
            optimizer.step()

            # --- Metrics Calculation ---
            running_loss += loss.item() * images.size(0)
            
            # Calculate Training Accuracy
            # Apply Sigmoid to convert logits to probabilities (0 to 1)
            probs = torch.sigmoid(outputs)
            preds = probs >= 0.5
            train_correct += (preds == labels).sum().item()
            train_total += labels.size(0)
        
        # Calculate Epoch Averages
        epoch_train_loss = running_loss / len(train_loader.dataset)
        epoch_train_acc = train_correct / train_total
        train_losses.append(epoch_train_loss)

        # --- VALIDATION PHASE ---
        model.eval()
        val_running_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad(): # No gradients needed for validation
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.float().unsqueeze(1).to(device)

                outputs = model(images)
                loss = loss_function(outputs, labels)
                
                val_running_loss += loss.item() * images.size(0)
                
                # Validation Accuracy
                probs = torch.sigmoid(outputs)
                preds = probs >= 0.5
                val_correct += (preds == labels).sum().item()
                val_total += labels.size(0)

        epoch_val_loss = val_running_loss / len(val_loader.dataset)
        epoch_val_acc = val_correct / val_total
        val_losses.append(epoch_val_loss)

        # --- LOGGING ---
        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"   Train Loss: {epoch_train_loss:.4f} | Train Acc: {epoch_train_acc:.4f}")
        print(f"   Val Loss:   {epoch_val_loss:.4f} | Val Acc:   {epoch_val_acc:.4f}")

        # --- SCHEDULER & CHECKPOINTING ---
        # Update Scheduler based on Validation Accuracy
        scheduler.step(epoch_val_acc)
        
        # Save Model if Validation Accuracy Improves
        if epoch_val_acc > best_val_acc:
            print(f"   --> Accuracy Improved ({best_val_acc:.4f} -> {epoch_val_acc:.4f}). Saving model...")
            best_val_acc = epoch_val_acc
            torch.save(model.state_dict(), model_save_path)
        
        print("-" * 80)
        
    return train_losses, val_losses

In [26]:
# Checkpoint file that receives the best-performing weights.
save_path = 'simple_cnn_best.pth'

# Collect the arguments for the run in one place, then start training.
training_kwargs = {
    "num_epochs": 10,
    "optimizer": optimizer_simple_model,
    "model": simple_model,
    "loss_function": loss_function_simple_model,
    "scheduler": scheduler_simple_model,
    "model_save_path": save_path,
}

# history holds the (train_losses, val_losses) pair returned by training_model.
history = training_model(**training_kwargs)

print("\nTraining Complete.")

Starting training for 10 epochs...
--------------------------------------------------------------------------------
Epoch [1/10]
   Train Loss: 0.5123 | Train Acc: 0.7748
   Val Loss:   0.4011 | Val Acc:   0.8220
   --> Accuracy Improved (0.0000 -> 0.8220). Saving model...
--------------------------------------------------------------------------------
Epoch [2/10]
   Train Loss: 0.4397 | Train Acc: 0.8056
   Val Loss:   0.5499 | Val Acc:   0.7729
--------------------------------------------------------------------------------
Epoch [3/10]
   Train Loss: 0.4055 | Train Acc: 0.8219
   Val Loss:   0.3564 | Val Acc:   0.8306
   --> Accuracy Improved (0.8220 -> 0.8306). Saving model...
--------------------------------------------------------------------------------
Epoch [4/10]
   Train Loss: 0.3695 | Train Acc: 0.8434
   Val Loss:   0.3018 | Val Acc:   0.8684
   --> Accuracy Improved (0.8306 -> 0.8684). Saving model...
----------------------------------------------------------------------