## Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

## Set Paths & Configs

In [2]:
# Dataset locations on the Kaggle input mount
excel_path = '/kaggle/input/huron-dataset/image_class_mapping.xlsx'
masks_dir = '/kaggle/input/huron-dataset/Sliced_masks'

# Reproducibility: seed NumPy and PyTorch so that DataLoader shuffling and the
# initialisation of newly created layers repeat across "Restart & Run All".
# NOTE(review): some CUDA kernels are still nondeterministic; this narrows, but
# does not fully eliminate, run-to-run variation on GPU.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Labelled subset: each row pairs a mask file name ('Image') with its 'Class'
mapped_df = pd.read_excel(excel_path)

#verify dataset count
print(f"Total mapped images: {len(mapped_df)}")
print(mapped_df.head())

#GPU configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Total mapped images: 995
    Image                                  Class
0  18.png  Preview Tiles Small Fragmented tissue
1  19.png  Preview Tiles Small Fragmented tissue
2  20.png  Preview Tiles Small Fragmented tissue
3  21.png  Preview Tiles Small Fragmented tissue
4  22.png  Preview Tiles Small Fragmented tissue
Using device: cuda


## Preprocessing Data

### Splitting Dataset

In [3]:
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# unmapped_files (and therefore of the prediction output rows) stable between runs
all_mask_files = sorted(os.listdir(masks_dir))

#separate mapped and unmapped masks
# A set gives constant-time membership tests instead of scanning the whole
# 'Image' column once per file in the masks folder
mapped_names = set(mapped_df['Image'])
unmapped_files = [f for f in all_mask_files if f not in mapped_names]

#split mapped dataset
# Stratified 80/20 split so each class keeps its proportion in both parts
train_mapped, test_mapped = train_test_split(
    mapped_df, test_size=0.2, random_state=42, stratify=mapped_df['Class'])

#split sizes
print(f"Training Mapped: {len(train_mapped)}, Test Mapped: {len(test_mapped)}")
print(f"Unmapped Files: {len(unmapped_files)}")


Training Mapped: 796, Test Mapped: 199
Unmapped Files: 16352


### Dataset Class

In [4]:
#We create a dataset class to help load and preprocess data

class MaskDataset(Dataset):
    """Dataset of grayscale mask images paired with their class label.

    Each row of ``dataframe`` supplies a file name in column ``'Image'``
    (resolved against ``masks_dir``) and a label in column ``'Class'`` that is
    returned unchanged. ``transform``, when given, is applied to the loaded
    mask array.
    """

    def __init__(self, dataframe, masks_dir, transform=None):
        self.dataframe, self.masks_dir, self.transform = dataframe, masks_dir, transform

    def __len__(self):
        """Return the number of rows in the backing dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load the mask at positional index ``idx`` and return ``(mask, label)``."""
        # Positional lookup: the split frames keep their original, shuffled index
        record = self.dataframe.iloc[idx]
        image_file = os.path.join(self.masks_dir, record['Image'])
        grayscale = Image.open(image_file).convert("L")  # "L" = single-channel grayscale
        mask = np.array(grayscale)
        label = record['Class']

        if not self.transform:
            return mask, label
        return self.transform(mask), label

#transform the masks
# Steps run in order: array -> tensor, resize to 224x224, normalise the single
# channel, then repeat that channel three times along dim 0 (the ResNet stem
# printed below takes 3 input channels).
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485], std=[0.229]),
    transforms.Lambda(lambda t: t.repeat(3, 1, 1)),
]
transform = transforms.Compose(preprocessing_steps)

#split the datasets
# Both splits share the same mask folder and the same preprocessing pipeline
train_dataset, test_dataset = (
    MaskDataset(split_df, masks_dir, transform)
    for split_df in (train_mapped, test_mapped)
)

## Model Setup & Training

### Setup

In [5]:
#loading the model
# Keep `device` a torch.device object, matching the configuration cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; kept as-is for the installed version - confirm before upgrading.
model = models.resnet50(pretrained=True)

# Replace the classification head with one output per distinct class label
num_classes = len(mapped_df['Class'].unique())
model.fc = nn.Linear(model.fc.in_features, num_classes)
model = model.to(device)

#loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

#model summary
print(model)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 182MB/s] 


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

### Training

In [6]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Build the label <-> index mapping straight from the label column:
#  * no need to decode every training image just to collect its label;
#  * sorting fixes the index of each class, whereas enumerating a set of strings
#    can yield a different order after every kernel restart (hash randomisation);
#  * using all of mapped_df keeps the mapping the same size as model.fc.
unique_classes = set(mapped_df['Class'])
class_names = sorted(unique_classes)

class_to_int = {cls: idx for idx, cls in enumerate(class_names)}
int_to_class = {idx: cls for cls, idx in class_to_int.items()}

def train_model(model, train_loader, test_loader, criterion, optimizer, epochs=None):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After every epoch the mean per-batch training loss is printed and
    ``test_model`` is called on ``test_loader``. String labels coming out of the
    loader are converted to class indices through the module-level
    ``class_to_int`` mapping, and tensors are moved to the module-level
    ``device``.

    Args:
        model: network to optimise; switched to train mode at the start of each epoch.
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        test_loader: iterable passed to ``test_model`` after each epoch.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: optimizer stepped once per batch.
        epochs: number of epochs to run; must be supplied (``0`` is a no-op).

    Raises:
        TypeError: if ``epochs`` is left as ``None``.
    """
    # Fail with an explicit message instead of the opaque error from range(None)
    if epochs is None:
        raise TypeError("train_model() needs an integer `epochs` value, got None")

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in tqdm(train_loader):
            # Map class names to integer targets expected by the loss
            targets = torch.tensor([class_to_int[label] for label in labels])
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # An empty loader reports NaN rather than raising ZeroDivisionError
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch + 1}, Training Loss: {mean_loss}")

        #call test
        test_model(model, test_loader, criterion)

def test_model(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            labels = [class_to_int[label] for label in labels]
            inputs, labels = inputs.to(device), torch.tensor(labels).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print(f"Test Loss: {test_loss / len(test_loader)}, Accuracy: {100 * correct / total}%")

#Train the model
# Number of full passes over the training set
NUM_EPOCHS = 15
train_model(model, train_loader, test_loader, criterion, optimizer, epochs=NUM_EPOCHS)

100%|██████████| 25/25 [00:12<00:00,  2.00it/s]


Epoch 1, Training Loss: 1.3103053069114685
Test Loss: 5.472892761230469, Accuracy: 26.633165829145728%


100%|██████████| 25/25 [00:11<00:00,  2.23it/s]


Epoch 2, Training Loss: 1.0275969076156617
Test Loss: 1.0958034992218018, Accuracy: 59.79899497487437%


100%|██████████| 25/25 [00:11<00:00,  2.21it/s]


Epoch 3, Training Loss: 0.9771419048309327
Test Loss: 1.8014924015317644, Accuracy: 55.778894472361806%


100%|██████████| 25/25 [00:11<00:00,  2.23it/s]


Epoch 4, Training Loss: 0.7855842232704162
Test Loss: 1.06117616380964, Accuracy: 59.2964824120603%


100%|██████████| 25/25 [00:11<00:00,  2.22it/s]


Epoch 5, Training Loss: 0.6746327233314514
Test Loss: 1.2357210687228612, Accuracy: 59.79899497487437%


100%|██████████| 25/25 [00:11<00:00,  2.23it/s]


Epoch 6, Training Loss: 0.5481437563896179
Test Loss: 1.3088818873677934, Accuracy: 73.36683417085428%


100%|██████████| 25/25 [00:11<00:00,  2.24it/s]


Epoch 7, Training Loss: 0.60168949842453
Test Loss: 1.2627747058868408, Accuracy: 63.81909547738694%


100%|██████████| 25/25 [00:11<00:00,  2.21it/s]


Epoch 8, Training Loss: 0.4738135862350464
Test Loss: 6.31914894921439, Accuracy: 20.603015075376884%


100%|██████████| 25/25 [00:11<00:00,  2.19it/s]


Epoch 9, Training Loss: 0.40006020188331604
Test Loss: 1.5636472957474845, Accuracy: 66.33165829145729%


100%|██████████| 25/25 [00:11<00:00,  2.19it/s]


Epoch 10, Training Loss: 0.3777608418464661
Test Loss: 1.201246210506984, Accuracy: 67.8391959798995%


100%|██████████| 25/25 [00:11<00:00,  2.21it/s]


Epoch 11, Training Loss: 0.3764644849300385
Test Loss: 1.9453699673925127, Accuracy: 59.79899497487437%


100%|██████████| 25/25 [00:11<00:00,  2.21it/s]


Epoch 12, Training Loss: 0.2629974794387817
Test Loss: 1.085607877799443, Accuracy: 70.35175879396985%


100%|██████████| 25/25 [00:11<00:00,  2.20it/s]


Epoch 13, Training Loss: 0.2615015587210655
Test Loss: 1.5726923942565918, Accuracy: 69.84924623115577%


100%|██████████| 25/25 [00:11<00:00,  2.20it/s]


Epoch 14, Training Loss: 0.20597845941781998
Test Loss: 1.6352858883993966, Accuracy: 65.32663316582915%


100%|██████████| 25/25 [00:11<00:00,  2.19it/s]


Epoch 15, Training Loss: 0.20367587611079216
Test Loss: 1.061292520591191, Accuracy: 76.38190954773869%


## Predict Classes

In [7]:
def _confidence_category(score, high_threshold=0.6, medium_threshold=0.3):
    """Bucket a confidence score into 'High', 'Medium' or 'Low'."""
    if score >= high_threshold:
        return 'High'
    if score >= medium_threshold:
        return 'Medium'
    return 'Low'


def predict_unmapped_masks(model, unmapped_files, masks_dir, transform, int_to_class,
                           output_path="mapped_results.xlsx",
                           high_threshold=0.6, medium_threshold=0.3):
    """Classify every file in ``unmapped_files`` and save the results to Excel.

    Each mask is loaded as grayscale, preprocessed, and run through ``model`` one
    file at a time on the module-level ``device``. The softmax maximum is
    recorded as the confidence score and bucketed into a category.

    Args:
        model: trained classifier; switched to eval mode.
        unmapped_files: file names, relative to ``masks_dir``, to classify.
        masks_dir: directory containing the mask images.
        transform: callable applied to the grayscale ndarray and expected to
            return a tensor without a batch dimension. If falsy, the raw pixel
            array is converted to a float tensor with one leading channel axis.
        int_to_class: mapping from predicted class index to class name.
        output_path: destination of the Excel sheet.
        high_threshold: minimum confidence labelled 'High'.
        medium_threshold: minimum confidence labelled 'Medium'; anything lower
            is 'Low'.
    """
    model.eval()
    predictions = []
    confidence_scores = []

    # One no_grad context for the whole loop instead of re-entering it per file
    with torch.no_grad():
        for file in tqdm(unmapped_files):
            mask_path = os.path.join(masks_dir, file)
            mask = np.array(Image.open(mask_path).convert("L"))
            if transform:
                mask = transform(mask)
            else:
                # Without a transform the model would receive an ndarray; hand it
                # a (1, H, W) float tensor of raw pixel values instead
                mask = torch.from_numpy(mask).float().unsqueeze(0)
            # Add the batch dimension and move to the device in every case
            batch = mask.unsqueeze(0).to(device)

            outputs = model(batch)
            probs = torch.softmax(outputs, dim=1)
            confidence, predicted = torch.max(probs, 1)
            predictions.append(predicted.item())
            confidence_scores.append(confidence.item())

    predicted_classes = [int_to_class[p] for p in predictions]

    #confidence levels
    confidence_categories = [
        _confidence_category(c, high_threshold, medium_threshold)
        for c in confidence_scores
    ]

    #output structure
    output_df = pd.DataFrame({
        'Image': unmapped_files,
        'Predicted Class': predicted_classes,
        'Confidence Score': confidence_scores,
        'Confidence Category': confidence_categories
    })

    output_df.to_excel(output_path, index=False)
    print(f"Output saved to {output_path}")

#find predictions
# Writes the sheet to the function's default output path
predict_unmapped_masks(
    model=model,
    unmapped_files=unmapped_files,
    masks_dir=masks_dir,
    transform=transform,
    int_to_class=int_to_class,
)


100%|██████████| 16352/16352 [05:24<00:00, 50.46it/s]


Output saved to mapped_results.xlsx
