## Import Libraries

In [11]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

## Set Paths & Configs

In [12]:
#Dataset locations on the Kaggle input mount
excel_path = '/kaggle/input/huron-dataset/image_class_mapping.xlsx'
masks_dir = '/kaggle/input/huron-dataset/Sliced_masks'
mapped_df = pd.read_excel(excel_path)

#verify dataset count
print(f"Total mapped images: {len(mapped_df)}")
print(mapped_df.head())

#Seed the random number generators so that the initialisation of the new
#classifier head and the DataLoader shuffling order are repeatable between runs.
#42 is the same value passed as random_state to the train/test split.
#Some GPU kernels may still be non-deterministic, so results can differ slightly.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

#GPU configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Total mapped images: 995
    Image                                  Class
0  18.png  Preview Tiles Small Fragmented tissue
1  19.png  Preview Tiles Small Fragmented tissue
2  20.png  Preview Tiles Small Fragmented tissue
3  21.png  Preview Tiles Small Fragmented tissue
4  22.png  Preview Tiles Small Fragmented tissue
Using device: cuda


## Preprocessing Data

### Splitting Dataset

In [13]:
#os.listdir returns entries in arbitrary order; sorting makes the list of
#unmapped files (and therefore the row order of the exported predictions) stable.
all_mask_files = sorted(os.listdir(masks_dir))

#separate mapped and unmapped masks
#A set gives constant-time membership tests instead of scanning the column for every file.
mapped_names = set(mapped_df['Image'])
unmapped_files = [f for f in all_mask_files if f not in mapped_names]

#split mapped dataset (80/20, stratified so each class keeps its proportion in both parts)
train_mapped, test_mapped = train_test_split(
    mapped_df, test_size=0.2, random_state=42, stratify=mapped_df['Class'])

#split sizes
print(f"Training Mapped: {len(train_mapped)}, Test Mapped: {len(test_mapped)}")
print(f"Unmapped Files: {len(unmapped_files)}")


Training Mapped: 796, Test Mapped: 199
Unmapped Files: 16352


### Dataset Class

In [14]:
#Dataset class that loads a mask image and its class label for one dataframe row

class MaskDataset(Dataset):
    """Map-style dataset pairing each mask image with its class label.

    Args:
        dataframe: table with an 'Image' column (file name) and a 'Class' column (label).
        masks_dir: directory that the file names in 'Image' are joined onto.
        transform: optional callable applied to the grayscale numpy array.
    """

    def __init__(self, dataframe, masks_dir, transform=None):
        self.dataframe, self.masks_dir, self.transform = dataframe, masks_dir, transform

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(mask, label)`` for the row at position ``idx``.

        The mask is the image converted to single-channel grayscale, as a numpy
        array, or the result of ``transform`` on that array when one was given.
        The label is the raw value of the 'Class' column.
        """
        record = self.dataframe.iloc[idx]
        grayscale = Image.open(os.path.join(self.masks_dir, record['Image'])).convert("L")
        pixels = np.array(grayscale)
        target = record['Class']

        if not self.transform:
            return pixels, target
        return self.transform(pixels), target

#Preprocessing applied to every mask: convert to tensor, resize to 224x224,
#normalise with a single-channel mean/std, then tile to three channels.
def _to_three_channels(tensor):
    """Repeat the tensor three times along its first dimension."""
    return tensor.repeat(3, 1, 1)


transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485], std=[0.229]),
    transforms.Lambda(_to_three_channels),
])

#wrap the train and test splits in datasets that share the same preprocessing
train_dataset = MaskDataset(dataframe=train_mapped, masks_dir=masks_dir, transform=transform)
test_dataset = MaskDataset(dataframe=test_mapped, masks_dir=masks_dir, transform=transform)

## Model Setup & Training

### Setup

In [15]:
#Pick the compute device (note: this rebinds `device` to a plain string)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#One output unit per distinct label in the mapping sheet (7 according to the original note)
num_classes = len(mapped_df['Class'].unique())

#Pretrained ResNet-50 with its final fully connected layer swapped for a new classification head
backbone = models.resnet50(pretrained=True)
backbone.fc = nn.Linear(backbone.fc.in_features, num_classes)

#DataParallel is enabled to make use of Kaggle's dual GPUs (T4), per the original author's note
model = torch.nn.DataParallel(backbone.to(device))

#loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-4, weight_decay=0.0)

#model summary
print(model)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 173MB/s]


DataParallel(
  (module): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
      

### Training

In [19]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

#Collect the class names straight from the training dataframe instead of iterating
#the loader, which decoded every training image only to read its label.
#Sorting fixes the label -> index assignment: enumerating a set of strings yields an
#order that can change between kernel sessions, so the same class could receive a
#different index on every run.
unique_classes = sorted(train_dataset.dataframe['Class'].unique())

class_to_int = {cls: idx for idx, cls in enumerate(unique_classes)}
int_to_class = {idx: cls for cls, idx in class_to_int.items()}

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``, evaluating after each.

    Args:
        model: network to optimise; switched to train mode at the start of every epoch.
        train_loader: iterable of ``(inputs, labels)`` batches where ``labels`` holds
            class names that are keys of the module-level ``class_to_int``.
        test_loader: loader handed to ``test_model`` after every epoch.
        criterion: loss called as ``criterion(outputs, targets)``.
        optimizer: optimiser stepped once per batch.
        epochs: number of epochs to run. Must be given; the ``None`` default only
            marks it as unset.

    Raises:
        TypeError: if ``epochs`` is left as ``None``.

    Uses the module-level ``device`` and ``class_to_int``. Prints the mean training
    loss per batch for each epoch and returns nothing.
    """
    #Fail with a message that names the argument instead of the opaque error from range(None)
    if epochs is None:
        raise TypeError("train_model() needs an integer `epochs` argument, got None")

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader):
            #translate class names to integer targets, created directly on the target device
            targets = torch.tensor([class_to_int[label] for label in labels], device=device)
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch + 1}, Training Loss: {running_loss / len(train_loader)}")

        #call test
        test_model(model, test_loader, criterion)

def test_model(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            labels = [class_to_int[label] for label in labels]
            inputs, labels = inputs.to(device), torch.tensor(labels).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Test Loss: {test_loss / len(test_loader)}, Accuracy: {100 * correct / total}%")

#Train the model for 22 epochs; test loss and accuracy are printed after every epoch.
#NOTE(review): the recorded epoch-1 training loss is already ~0.05 and the execution
#count jumps from In [15] to In [19], which suggests this cell was run on a model that
#had already been trained -- confirm by re-running from a fresh kernel.
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=22)

100%|██████████| 50/50 [00:10<00:00,  4.79it/s]


Epoch 1, Training Loss: 0.04879610908217728
Test Loss: 0.6229245009330603, Accuracy: 84.42211055276383%


100%|██████████| 50/50 [00:10<00:00,  4.81it/s]


Epoch 2, Training Loss: 0.05620527969906106
Test Loss: 0.6206578775667227, Accuracy: 87.43718592964824%


100%|██████████| 50/50 [00:10<00:00,  4.88it/s]


Epoch 3, Training Loss: 0.04416210222989321
Test Loss: 0.45146862195374876, Accuracy: 90.95477386934674%


100%|██████████| 50/50 [00:10<00:00,  4.85it/s]


Epoch 4, Training Loss: 0.04680671736365184
Test Loss: 0.6511748627974436, Accuracy: 88.44221105527639%


100%|██████████| 50/50 [00:10<00:00,  4.83it/s]


Epoch 5, Training Loss: 0.03509345697239041
Test Loss: 0.4371964131983427, Accuracy: 90.45226130653266%


100%|██████████| 50/50 [00:10<00:00,  4.84it/s]


Epoch 6, Training Loss: 0.040776352728717026
Test Loss: 0.5536113668662997, Accuracy: 89.44723618090453%


100%|██████████| 50/50 [00:10<00:00,  4.75it/s]


Epoch 7, Training Loss: 0.06047896529082209
Test Loss: 0.5976255387067795, Accuracy: 87.43718592964824%


100%|██████████| 50/50 [00:10<00:00,  4.98it/s]


Epoch 8, Training Loss: 0.11822748314589263
Test Loss: 0.6455268395634798, Accuracy: 83.41708542713567%


100%|██████████| 50/50 [00:10<00:00,  4.95it/s]


Epoch 9, Training Loss: 0.08353885956574231
Test Loss: 0.6718590609156169, Accuracy: 84.42211055276383%


100%|██████████| 50/50 [00:10<00:00,  4.80it/s]


Epoch 10, Training Loss: 0.10843266671523452
Test Loss: 1.1789498627185822, Accuracy: 75.37688442211055%


100%|██████████| 50/50 [00:10<00:00,  4.87it/s]


Epoch 11, Training Loss: 0.07320373735390603
Test Loss: 0.6611417483251828, Accuracy: 85.42713567839196%


100%|██████████| 50/50 [00:10<00:00,  4.83it/s]


Epoch 12, Training Loss: 0.06553415951319039
Test Loss: 0.548593496473936, Accuracy: 87.43718592964824%


100%|██████████| 50/50 [00:10<00:00,  4.80it/s]


Epoch 13, Training Loss: 0.03485669936402701
Test Loss: 0.6347189276264265, Accuracy: 85.92964824120602%


100%|██████████| 50/50 [00:10<00:00,  4.94it/s]


Epoch 14, Training Loss: 0.03699510023812763
Test Loss: 0.8034522304168115, Accuracy: 86.4321608040201%


100%|██████████| 50/50 [00:10<00:00,  4.74it/s]


Epoch 15, Training Loss: 0.05129243897274136
Test Loss: 0.6490065545703356, Accuracy: 87.43718592964824%


100%|██████████| 50/50 [00:10<00:00,  4.84it/s]


Epoch 16, Training Loss: 0.04753495764569379
Test Loss: 0.5928940089562764, Accuracy: 86.93467336683418%


100%|██████████| 50/50 [00:10<00:00,  4.86it/s]


Epoch 17, Training Loss: 0.03555901162559166
Test Loss: 0.5549720643231502, Accuracy: 88.44221105527639%


100%|██████████| 50/50 [00:10<00:00,  4.76it/s]


Epoch 18, Training Loss: 0.032342234743991866
Test Loss: 0.5672657249065546, Accuracy: 87.93969849246231%


100%|██████████| 50/50 [00:10<00:00,  4.92it/s]


Epoch 19, Training Loss: 0.028577275122515858
Test Loss: 0.539278135706599, Accuracy: 88.44221105527639%


100%|██████████| 50/50 [00:10<00:00,  4.86it/s]


Epoch 20, Training Loss: 0.02641967629431747
Test Loss: 0.5120312485557336, Accuracy: 88.94472361809045%


100%|██████████| 50/50 [00:10<00:00,  4.79it/s]


Epoch 21, Training Loss: 0.029387922687456013
Test Loss: 0.5077387710603384, Accuracy: 88.44221105527639%


100%|██████████| 50/50 [00:10<00:00,  4.89it/s]


Epoch 22, Training Loss: 0.028872970604570582
Test Loss: 0.5406551597138437, Accuracy: 87.93969849246231%


## Predict Classes

In [20]:
def _confidence_category(score, high_threshold, medium_threshold):
    """Bucket a probability into 'High', 'Medium' or 'Low' using inclusive lower bounds."""
    if score >= high_threshold:
        return 'High'
    if score >= medium_threshold:
        return 'Medium'
    return 'Low'


def predict_unmapped_masks(model, unmapped_files, masks_dir, transform, int_to_class, output_path="mapped_results.xlsx",
                           high_threshold=0.7, medium_threshold=0.45):
    """Classify every unlabelled mask and write the predictions to an Excel file.

    Args:
        model: trained classifier; put into eval mode before inference.
        unmapped_files: file names, relative to ``masks_dir``, to classify.
        masks_dir: directory containing the mask images.
        transform: callable turning the grayscale numpy array into a tensor without a
            batch dimension. When falsy, the raw array is converted to a float tensor
            with a leading channel dimension and no scaling or resizing; the model
            must accept that input.
        int_to_class: mapping from predicted class index to class name.
        output_path: destination of the Excel workbook.
        high_threshold: minimum softmax probability labelled 'High'.
        medium_threshold: minimum softmax probability labelled 'Medium'; anything
            lower is labelled 'Low'.

    The workbook has the columns 'Image', 'Predicted Class', 'Confidence Score' and
    'Confidence Category'. Inputs are moved to the module-level ``device``. Returns nothing.
    """
    model.eval()
    predictions = []
    confidence_scores = []

    #a single no_grad context covers the whole loop instead of re-entering it per file
    with torch.no_grad():
        for file in tqdm(unmapped_files):
            mask_path = os.path.join(masks_dir, file)
            #context manager releases the file handle as soon as the pixels are read
            with Image.open(mask_path) as img:
                mask = np.array(img.convert("L"))

            if transform:
                mask = transform(mask)
            else:
                #previously a numpy array reached the model on this path and failed
                mask = torch.from_numpy(mask).float().unsqueeze(0)

            #batch dimension and device move happen whether or not a transform was given
            batch = mask.unsqueeze(0).to(device)

            outputs = model(batch)
            probs = torch.softmax(outputs, dim=1)
            confidence, predicted = torch.max(probs, 1)
            predictions.append(predicted.item())
            confidence_scores.append(confidence.item())

    predicted_classes = [int_to_class[p] for p in predictions]

    #confidence levels
    confidence_categories = [
        _confidence_category(c, high_threshold, medium_threshold) for c in confidence_scores
    ]

    #output structure
    output_df = pd.DataFrame({
        'Image': unmapped_files,
        'Predicted Class': predicted_classes,
        'Confidence Score': confidence_scores,
        'Confidence Category': confidence_categories
    })

    output_df.to_excel(output_path, index=False)
    print(f"Output saved to {output_path}")

#Classify every mask that has no label in the mapping sheet; the results are written
#to the function's default output workbook (mapped_results.xlsx)
predict_unmapped_masks(model, unmapped_files, masks_dir, transform,int_to_class)


100%|██████████| 16352/16352 [07:27<00:00, 36.53it/s]


Output saved to mapped_results.xlsx
