This notebook shows how to fine-tune the MESC models on a partially labeled whole-slide image (WSI). The fine-tuned models can then be used to predict the labels of the unlabeled regions.
Data/Export/Temp/json2exp-output/Crop-256/WSI_NAME contains the extracted patches from the WSI_NAME WSI. 
Data/Partially_labeled/Labeled_data/WSI_NAME.xlsx contains the labeled patches of the WSI_NAME WSI, stored as an Excel file formatted as follows:
filename | M | E | S | C

The original MESC models can be found at: mescnn/classification/logs/cnn/holdout/MODEL_NAME
The fine-tuned models will be saved at: Data/Partially_labeled/models/WSI_NAME/MODEL_NAME

First, the user gives the WSI to the interface. Then the WSI is segmented with our segmentation model. The user then labels some glomerulus patches that will be used to fine-tune the MESC models. 
(This first part is not reproduced in this notebook because the code already exists in the interface.)

The interface then fine-tunes the MESC models on the labeled patches and saves the new models.

Finally, the interface uses the new models to predict the labels of the unlabeled patches.

In [1]:
import os 

# Dynamically get the root directory of the project
#ROOT_DIR = os.path.realpath(__file__) # Not working for notebooks but works for scripts
ROOT_DIR = "/home/ubuntu/M1"

In [2]:
# Read the labeled data from the excel file
import pandas as pd

labeled_data_dir = "Data/Partially_labeled/Labeled_data/test.XLSX"
df = pd.read_excel(os.path.join(ROOT_DIR, labeled_data_dir))

NUMBER_OF_LABELED_IMAGES = len(df)

# Replace the full paths in the filename column by keeping only the image name
df['filename'] = df['filename'].apply(lambda x: x.split("\\")[-1])

mesc_def = {
    "M": {
        0: "noM",
        1: "yesM",
    },
    "E": {
        0: "noE",
        1: "yesE"
    },
    "S": {
        "GGS": "GGS",
        0: "NoGS",
        1: "SGS"
    },
    "C": {
        0: "noC",
        1: "yesC"
    }
}
df["M"] = df["M"].replace(mesc_def["M"])
df["E"] = df["E"].replace(mesc_def["E"])
df["S"] = df["S"].replace(mesc_def["S"])
df["C"] = df["C"].replace(mesc_def["C"])

df

Unnamed: 0,filename,M,E,S,C
0,"glomerulus C1104066 [10884, 59188, 956, 948].jpeg",noM,noE,SGS,noC
1,"glomerulus C1104066 [142336, 49680, 744, 640]....",noM,noE,GGS,noC
2,"glomerulus C1104066 [142772, 48280, 1100, 864]...",yesM,noE,NoGS,noC
3,"glomerulus C1104066 [153544, 5020, 752, 628].jpeg",noM,noE,GGS,noC
4,"glomerulus C1104066 [28172, 21868, 736, 748].jpeg",noM,noE,SGS,noC
5,"glomerulus C1104066 [28344, 23428, 748, 708].jpeg",noM,noE,NoGS,noC
6,"glomerulus C1104066 [33508, 9732, 1396, 604].jpeg",noM,noE,NoGS,noC
7,"glomerulus C1104066 [8044, 62252, 752, 796].jpeg",noM,noE,GGS,noC
8,"glomerulus C1104066 [85212, 44372, 936, 880].jpeg",noM,noE,SGS,noC


In [26]:
import shutil

# Set the path to the Crop-256 folder
crop256_folder = "/home/ubuntu/M1/Data/Fine_tuning/cascade_R_50_FPN_1x/Crop-256"

# Set the path to the Data/Classification folder
dataset_folder = "/home/ubuntu/M1/Data/Partially_labeled/Classification"

# If the dataset folder does not exist, create it
if not os.path.exists(dataset_folder):
    os.makedirs(dataset_folder, exist_ok=True)
else: 
    # If the dataset folder exists, delete all its contents
    for file in os.listdir(dataset_folder):
        file_path = os.path.join(dataset_folder, file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path): shutil.rmtree(file_path)
        except Exception as e:
            print(e)
        
wsi = "C1104066"

for image in os.listdir(os.path.join(crop256_folder, wsi)):
    src_path = os.path.join(crop256_folder, wsi, image)
    dst_path = os.path.join(dataset_folder, image)
    shutil.copy(src_path, dst_path)

In [27]:
# Sort the images in the dataset folder into the labeled or unlabeled folders based on the dataframe labels
os.makedirs(os.path.join(dataset_folder, "labeled"), exist_ok=True)
os.makedirs(os.path.join(dataset_folder, "unlabeled"), exist_ok=True)

for image in os.listdir(dataset_folder):
    if os.path.isfile(os.path.join(dataset_folder, image)):
        if image in df["filename"].values:
            src_path = os.path.join(dataset_folder, image)
            dst_path = os.path.join(dataset_folder, "labeled", image)
            shutil.move(src_path, dst_path)
        else:
            src_path = os.path.join(dataset_folder, image)
            dst_path = os.path.join(dataset_folder, "unlabeled", image)
            shutil.move(src_path, dst_path)

In [28]:
# Set the path to the train folder
train_folder = "/home/ubuntu/M1/Data/Partially_labeled/Classification/labeled"
possible_labels = ["noM", "yesM", "noE", "yesE", "GGS", "NoGS", "SGS", "noC", "yesC", "nan_label"]

# Create new subdirectories for the labels in the train and val folders 
for label in possible_labels:
    os.makedirs(os.path.join(train_folder, label), exist_ok=True)
    
# Iterate over the rows in the df_combined dataframe
for index, row in df.iterrows():
    # Get the labels of the current row
    labels = row[["M", "E", "S", "C"]]
    
    # Get the name of the current patch
    patch_name = row["filename"]
    
    # Set the source path of the image
    source_path = os.path.join(train_folder, patch_name)
    
    # Set the destination paths of the image
    for label in labels:
        if label in possible_labels:
            dest_path = os.path.join(train_folder, label)
            if patch_name in os.listdir(dest_path):
                pass
            else:
                shutil.copy(source_path, dest_path)

In [29]:
# Delete all the images in the train folder that are not in subdirectories
for image in os.listdir(train_folder):
    if os.path.isfile(os.path.join(train_folder, image)):
        os.remove(os.path.join(train_folder, image))

In [30]:
# Create folders for each type of lesion
lesion_folders = ["M", "E", "S", "C"]
dataset_folder = os.path.join(dataset_folder, "labeled")
for lesion in lesion_folders:
    lesion_path = os.path.join(dataset_folder, lesion)
    os.makedirs(lesion_path, exist_ok=True)
    if lesion == "M":
        os.makedirs(os.path.join(lesion_path, "nan_label"), exist_ok=True)
        os.makedirs(os.path.join(lesion_path, "noM"), exist_ok=True)
        os.makedirs(os.path.join(lesion_path, "yesM"), exist_ok=True)
    if lesion == "E":
        os.makedirs(os.path.join(lesion_path, "noE"), exist_ok=True)
        os.makedirs(os.path.join(lesion_path, "yesE"), exist_ok=True)
    if lesion == "S":
        os.makedirs(os.path.join(lesion_path, "GGS"), exist_ok=True)
        os.makedirs(os.path.join(lesion_path, "NoGS"), exist_ok=True)
        os.makedirs(os.path.join(lesion_path, "SGS"), exist_ok=True)
    if lesion == "C":
        os.makedirs(os.path.join(lesion_path, "noC"), exist_ok=True)
        os.makedirs(os.path.join(lesion_path, "yesC"), exist_ok=True)
            
# Move the images to the appropriate folders
lesion_labels_dict = {
    "M": ["nan_label", "noM", "yesM"],
    "E": ["noE", "yesE"],
    "S": ["GGS", "NoGS", "SGS"],
    "C": ["noC", "yesC"]
}
                    
# Move the images to the appropriate folders                  
for lesion in lesion_labels_dict.keys():
    for label in lesion_labels_dict[lesion]:
        source_folder = os.path.join(dataset_folder, label)
        destination_folder = os.path.join(dataset_folder, lesion, label)
        for image in os.listdir(source_folder):
            source_path = os.path.join(source_folder, image)
            destination_path = os.path.join(destination_folder, image)
            shutil.move(source_path, destination_path)
        # # if the destination folder is empty, delete it
        # if len(os.listdir(destination_folder)) == 0:
        #     os.rmdir(destination_folder)
        os.rmdir(source_folder)


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
from PIL import Image

In [32]:
learning_rate = 10**(-5)
momentum = 0.8
batch_size = 1
epochs = 5
augmentation = ['HFlip', 'VFlip', 'BtnsCtst']
models_dir = os.path.join(ROOT_DIR, "mescnn/classification/logs/cnn/holdout")
model_name = "efficientnetv2-m_M_V3.pth"

# Load the M model
net = torch.load(os.path.join(models_dir, model_name))

# Freeze the net layers except the final layer
for param in net.parameters():
    param.requires_grad = False

# Unfreeze the final layer
for param in net.classifier.parameters():
    param.requires_grad = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)

In [33]:
lesion = model_name.split("_")[1]   

data_transforms = {
    'train': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

data_dir = "/home/ubuntu/M1/Data/Partially_labeled/Classification/labeled/" + lesion 

image_dataset = datasets.ImageFolder(data_dir, data_transforms['train'])
dataloader = torch.utils.data.DataLoader(image_dataset, batch_size=batch_size, shuffle=True)
dataset_size = len(image_dataset)


# Print the data, and each class, as well as the number associated with each class
print(image_dataset)
print(image_dataset.classes)
print(image_dataset.class_to_idx)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Dataset ImageFolder
    Number of datapoints: 9
    Root location: /home/ubuntu/M1/Data/Partially_labeled/Classification/labeled/M
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )
['nan_label', 'noM', 'yesM']
{'nan_label': 0, 'noM': 1, 'yesM': 2}


In [34]:
from tempfile import TemporaryDirectory

def train_model(model, criterion, optimizer, dataloaders, dataset_sizes, device, num_epochs=25):
    since = time.time()

    # Make sure the device is set correctly
    model.to(device)

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        best_model_params_path = os.path.join(tempdir, 'best_model_params.pt')

        # Initially save the current state of the model
        torch.save(model.state_dict(), best_model_params_path)
        best_acc = 0.0
        
        for epoch in range(num_epochs):
            print(f'\nEpoch {epoch+1}')
            print('-' * 10)

            model.train()  # Set model to training mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders:
                inputs.requires_grad = True
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(True):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # # Print outputs and labels to debug the model's predictions
                    # if epoch == 0 and phase == 'train':
                    #     print(f"First batch labels and predictions in training: {labels} {preds}")

                    # backward + optimize
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
                
            epoch_loss = running_loss / dataset_sizes
            epoch_acc = running_corrects.double() / dataset_sizes
            
            print(f'Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            # deep copy the model
            if epoch_acc > best_acc:
                best_acc = epoch_acc
                torch.save(model.state_dict(), best_model_params_path)

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
        print(f'Best Acc: {best_acc:4f}')

        # load best model weights
        model.load_state_dict(torch.load(best_model_params_path))
        
    return best_acc, model


In [35]:
# Train the model
threshold = 0.34042131979695434 
acc, model = train_model(net, criterion, optimizer, dataloader, dataset_size, device, num_epochs=epochs)

print(f"Training accuracy: {acc}")

# Save the model
fine_tuned_models_dir = os.path.join(ROOT_DIR, "Data/Partially_labeled/models", wsi, model_name)
os.makedirs(os.path.dirname(fine_tuned_models_dir), exist_ok=True)
torch.save(model, fine_tuned_models_dir)



Epoch 1
----------
Loss: 0.5776 Acc: 0.8889

Epoch 2
----------
Loss: 0.6446 Acc: 0.8889

Epoch 3
----------
Loss: 0.5813 Acc: 0.8889

Epoch 4
----------
Loss: 0.5535 Acc: 0.7778

Epoch 5
----------
Loss: 0.4356 Acc: 0.8889
Training complete in 0m 8s
Best Acc: 0.888889
Training accuracy: 0.8888888888888888


In [7]:
# Using the fine-tuned model make predictions on the unlabeled images 
unlabeled_folder = os.path.join(ROOT_DIR, "Data/Partially_labeled/Classification/unlabeled")

# Load the model
model_path = os.path.join(ROOT_DIR, "Data/Partially_labeled/models", wsi, model_name)
model = torch.load(model_path)

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the transformation
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Make predictions on the unlabeled images
predictions = []
for image in os.listdir(unlabeled_folder):
    image_path = os.path.join(unlabeled_folder, image)
    img = Image.open(image_path)
    img = transform(img)
    img = img.unsqueeze(0)
    img = img.to(device)
    model.eval()
    with torch.no_grad():
        output = model(img)
        _, preds = torch.max(output, 1)
        predictions.append(preds.item())
        
df_unlabeled = pd.DataFrame({
    "filename": os.listdir(unlabeled_folder),
    "predicted_label": predictions
})

df_unlabeled["predicted_label"] = df_unlabeled["predicted_label"].replace({0:"nan_label", 1: "noM", 2: "yesM"})

df_unlabeled

NameError: name 'wsi' is not defined

In [6]:
# print the original labels from the excel file
df = pd.read_excel(os.path.join(ROOT_DIR, "Data/Partially_labeled/Labeled_data/C1104066_JGI.XLSX"))

# Replace the full paths in the filename column by keeping only the image name
df['filename'] = df['filename'].apply(lambda x: x.split("\\")[-1])

# Replace the labels in the dataframe with the corresponding values in the mesc_def dictionary
df["M"] = df["M"].replace(mesc_def["M"])

# Display the M and filename columns of the dataframe
df_labeled = df[["filename", "M"]]
df_labeled

Unnamed: 0,filename,M
0,"glomerulus C1104066 [10884, 59188, 956, 948].jpeg",noM
1,"glomerulus C1104066 [142336, 49680, 744, 640]....",noM
2,"glomerulus C1104066 [142772, 48280, 1100, 864]...",yesM
3,"glomerulus C1104066 [153544, 5020, 752, 628].jpeg",noM
4,"glomerulus C1104066 [28172, 21868, 736, 748].jpeg",noM
5,"glomerulus C1104066 [28344, 23428, 748, 708].jpeg",noM
6,"glomerulus C1104066 [33508, 9732, 1396, 604].jpeg",noM
7,"glomerulus C1104066 [8044, 62252, 752, 796].jpeg",noM
8,"glomerulus C1104066 [85212, 44372, 936, 880].jpeg",noM
9,"glomerulus C1104066 [8552, 64920, 784, 788].jpeg",noM
