In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torchvision import transforms, models
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
def extract_patient_id(path):
    # filename is last component
    fname = os.path.basename(path)
    # e.g. SOB_B_A-14-22549AB-100-001.png
    parts = fname.split("-")
    # parts = ["SOB_B_A", "14", "22549AB", "100", "001.png"]
    return parts[1] + parts[2]   # e.g. "1422549AB"

In [3]:
def check_patient_leakage(df):
    df["patient"] = df["filename"].apply(extract_patient_id)
    
    # Group by train/test
    train_patients = set(df[df.grp == "train"].patient)
    test_patients  = set(df[df.grp == "test"].patient)
    
    leaked = train_patients.intersection(test_patients)
    
    if leaked:
        print("❌ PATIENT LEAKAGE DETECTED!")
        print("Patients appearing in BOTH train and test:", leaked)
    else:
        print("✅ No patient-level leakage detected.")
    
    print()
    print("Train patients:", len(train_patients))
    print("Test patients:", len(test_patients))

In [4]:
from sklearn.model_selection import GroupKFold

def rebuild_patient_splits(df, n_splits=5):
    df["patient"] = df["filename"].apply(extract_patient_id)

    gkf = GroupKFold(n_splits=n_splits)

    df["fold"] = -1
    for i, (_, test_idx) in enumerate(gkf.split(df, groups=df["patient"])):
        df.loc[test_idx, "fold"] = i + 1
    
    return df

In [5]:
def patient_level_accuracy(preds, labels, patient_ids):
    df = pd.DataFrame({
        "pred": preds,
        "true": labels,
        "patient": patient_ids
    })
    # Majority vote for each patient
    patient_votes = df.groupby("patient").pred.apply(lambda x: x.mode()[0])
    patient_truth = df.groupby("patient").true.first()
    
    correct = (patient_votes == patient_truth).sum()
    total = len(patient_truth)
    
    return correct / total

In [6]:
csv_path = "/Users/melissamartinez/Downloads/Princ DS/Breast Cancer Histopathological Project/Folds.csv"
root_dir = "/Users/melissamartinez/Downloads/Princ DS/Breast Cancer Histopathological Project"

df = pd.read_csv(csv_path)
check_patient_leakage(df)

df["patient"] = df["filename"].apply(extract_patient_id)

from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)

df["fold"] = -1
for fold_idx, (_, test_idx) in enumerate(gkf.split(df, groups=df.patient)):
    df.loc[test_idx, "fold"] = fold_idx + 1

# Creating the full path of each image
df["fullpath"] = df["filename"].apply(lambda x: os.path.join(root_dir, x))

# Find if benign(0) or malignant(1)
df["label"] = df.filename.apply(lambda x: 0 if "benign" in x else 1)  

# Sanity Check
print(df.head())
print(df.label.value_counts())

❌ PATIENT LEAKAGE DETECTED!
Patients appearing in BOTH train and test: {'1419854C', '1412312', '1422549AB', '142980', '146241', '1422549CD', '1415696', '1413993', '1412204', '148168', '1419440', '144364', '1415570', '1417901', '1416601', '1429960CD', '1418650', '1418842D', '1411520', '1416188', '1419979', '1421998EF', '1416456', '149461', '1413413', '1416716', '1415704', '1423222AB', '1421998CD', '149133', '142985', '149146', '1414015', '1421998AB', '1416184', '1417614', '145694', '1414946', '1415275', '1412773', '1414134E', '1416184CD', '1422704', '1429960AB', '15190EF', '1416196', '145287', '1419979C', '1413418DE', '1415570C', '1410926', '1416448', '1415792', '1413412', '1422549G', '144372', '1417915', '142523', '1423060AB', '1415687B', '1421978AB', '1411951', '143909', '1423060CD', '1420629', '1420636', '143411F', '1429315EF'}

Train patients: 81
Test patients: 68
   fold  mag    grp                                           filename  \
0     2  100  train  BreaKHis_v1/histology_sli

In [7]:
class BreakHISDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load image from path
        img = Image.open(row.fullpath).convert("RGB")

        if self.transform:
            img = self.transform(img)

        label = int(row.label)

        return img, label

In [8]:
#Transfer learning needs 224x224 images

#Accounts for images being in different rotations and makes more variety, focuses on the density instead of the location
train_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(25),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

test_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [9]:
# The csv already has them split
train_df = df[df.grp == "train"]
test_df  = df[df.grp == "test"]

print(len(train_df), len(test_df))

25880 13665


In [10]:
train_dataset = BreakHISDataset(train_df, transform=train_transform)
test_dataset  = BreakHISDataset(test_df, transform=test_transform)

#Batched in 32 so my computer doesn't crash
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

In [11]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

device

'mps'

In [12]:
#18 is lighter, 50 is heavier but more accurate, although not needed bc good accuracy with 18
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

model.fc = nn.Linear(model.fc.in_features, 2)  # benign vs malignant

model = model.to(device)

#Loss function
criterion = nn.CrossEntropyLoss()

#Uses the moments from the gradients to fit the learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [13]:
EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    print(f"Epoch {epoch+1}/{EPOCHS} | Loss: {running_loss/len(train_loader):.4f} | Train Acc: {correct/total:.4f}")

Epoch 1/5 | Loss: 0.1734 | Train Acc: 0.9291
Epoch 2/5 | Loss: 0.0820 | Train Acc: 0.9692
Epoch 3/5 | Loss: 0.0632 | Train Acc: 0.9761
Epoch 4/5 | Loss: 0.0469 | Train Acc: 0.9835
Epoch 5/5 | Loss: 0.0417 | Train Acc: 0.9849


In [15]:
model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(device)
        outputs = model(imgs)
        pred = outputs.argmax(dim=1).cpu().numpy()

        y_pred.extend(pred)
        y_true.extend(labels.numpy())

# Extract patient IDs in the correct order
test_patients = df[df.grp == "test"]["patient"].values

import pandas as pd

def patient_level_accuracy(preds, labels, patient_ids):
    df_eval = pd.DataFrame({
        "pred": preds,
        "true": labels,
        "patient": patient_ids
    })
    # Majority vote per patient
    patient_pred = df_eval.groupby("patient").pred.apply(lambda x: x.mode()[0])
    patient_true = df_eval.groupby("patient").true.first()
    return (patient_pred == patient_true).mean()

#pl_acc = patient_level_accuracy(preds, true_labels, test_patients)
#print("Patient-Level Accuracy:", pl_acc)

test_acc = accuracy_score(y_true, y_pred)
print("Test Accuracy:", test_acc)

Test Accuracy: 0.9933406512989389
